pax_global_header00006660000000000000000000000064125473010130014506gustar00rootroot0000000000000052 comment=29239bcf329218ec0bb764978ce5448d4609f464 libStatGen-1.0.14/000077500000000000000000000000001254730101300136255ustar00rootroot00000000000000libStatGen-1.0.14/.gitignore000066400000000000000000000000501254730101300156100ustar00rootroot00000000000000*~ *.o *.a *.bak dox/ dox_errors.txt *# libStatGen-1.0.14/Doxyfile000066400000000000000000001742711254730101300153470ustar00rootroot00000000000000# Doxyfile 1.6.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project # # All text after a hash (#) is considered a comment and will be ignored # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" ") #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded # by quotes) that should identify the project. PROJECT_NAME = "libStatGen Software" # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = 1 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. 
# If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = dox # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. 
REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. 
# If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = YES # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the DETAILS_AT_TOP tag is set to YES then Doxygen # # will output the detailed description near the top, like JavaDoc. # # If set to NO, the detailed description appears after the member # # documentation. # DETAILS_AT_TOP = YES # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements.
INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it parses. # With this tag you can assign which parser to use for a given extension. 
# Doxygen has a built-in mapping, but you can override or extend it using this tag. # The format is ext=language, where ext is a file extension, and language is one of # the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, # Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat # .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), # use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. EXTENSION_MAPPING = h=C++ # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate getter # and setter methods for a property. Setting this option to YES (the default) # will make doxygen replace the get and set methods by a property in the # documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO.
IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The SYMBOL_CACHE_SIZE determines the size of the internal cache used to # determine which symbols to keep in memory and which to flush to disk. # When the cache is full, less often used symbols will be written to disk. # For small to medium size projects (<1000 input files) the default value is # probably good enough. For larger projects a too small cache size can cause # doxygen to be busy swapping symbols to and from disk most of the time # causing a significant performance penalty. # If the system has enough physical memory increasing the cache will improve the # performance by keeping more symbols in memory.
Note that the value works on # a logarithmic scale so increasing the size by one will roughly double the # memory usage. The cache size is given by this formula: # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols SYMBOL_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden.
EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. 
If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = NO # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. SORT_MEMBERS_CTORS_1ST = YES # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
# Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if sectionname ... \endif. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or define consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. # The appearance of the initializer of individual variables and defines in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # If the sources in your project are distributed over multiple directories # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy # in the documentation. 
The default is NO. SHOW_DIRECTORIES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command <command> <input-file>, where <command> is the value of # the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by # doxygen. The layout file controls the global structure of the generated output files # in an output format independent way. To create the layout file that represents # doxygen's defaults, run doxygen with the -l option. You can optionally specify a # file name after the option, if omitted DoxygenLayout.xml will be used as the name # of the layout file. LAYOUT_FILE = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used.
WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = YES # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. WARN_LOGFILE = dox_errors.txt #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces.
INPUT = bam fastq general glf samtools Makefiles # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx # *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = .git include copyrights dox # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix filesystem feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.)
that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain images that are included in the documentation (see # the \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command <filter> <input-file>, where <filter> # is the value of the INPUT_FILTER tag, and <input-file> is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER # is applied to all files.
FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = YES # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = YES # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C and C++ comments will always remain visible. STRIP_CODE_COMMENTS = NO # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = YES # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = YES # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation. REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. 
The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = YES #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = NO # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. 
HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If the tag is left blank doxygen # will generate a default style sheet. Note that doxygen will try to copy # the style sheet file to the HTML output directory, so don't put your own # stylesheet in the HTML output directory as well, or it will be erased! HTML_STYLESHEET = # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, # files or namespaces will be aligned in HTML using tables. If set to # NO a bullet list will be used. HTML_ALIGN_MEMBERS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. For this to work a browser that supports # JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox # Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). HTML_DYNAMIC_SECTIONS = NO # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. 
Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = edu.umich.sph.csg.libStatGen # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). 
GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER # are set, an additional index file will be generated that can be used as input for # Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated # HTML documentation. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. # For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the custom filter to add.For more information please see # Qt Help Project / Custom Filters. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's # filter section matches. 
# Qt Help Project / Filter Attributes. QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # The DISABLE_INDEX tag can be used to turn on/off the condensed index at # top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. DISABLE_INDEX = NO # This tag can be used to set the number of enum values (range [1..20]) # that doxygen will group on one line in the generated HTML documentation. ENUM_VALUES_PER_LINE = 4 # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. GENERATE_TREEVIEW = YES # By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, # and Class Hierarchy pages using a tree view instead of an ordered list. USE_INLINE_TREES = YES # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # When the SEARCHENGINE tag is enable doxygen will generate a search box for the HTML output. 
The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP) or Qt help (GENERATE_QHP) # there is already a search function so this one should typically # be disabled. SEARCHENGINE = YES #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, a4wide, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = letter # The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. 
Notice: only use this tag if you know what you are doing! LATEX_HEADER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = YES # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. 
RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load stylesheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = YES # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. 
MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. 
GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. # On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. 
Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = NO # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # in the INCLUDE_PATH (see below) will be search if a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition. EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all function-like macros that are alone # on a line, have an all uppercase name, and do not end with a semicolon. 
Such # function macros are typically used for boiler-plate code, and will confuse # the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. # Optionally an initial location of the external documentation # can be added for each tagfile. The format of a tag file without # this location is as follows: # # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths or # URLs. If a location is present for each tag, the installdox tool # does not have to be run to correct the links. # Note that each tag file must have a unique name # (where the name does NOT include the path) # If a tag file is not located in the directory in which doxygen # is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). 
PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option is superseded by the HAVE_DOT option below. This is only a # fallback. It is recommended to install and use dot, since it yields more # powerful graphs. CLASS_DIAGRAMS = YES # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = YES # By default doxygen will write a font called FreeSans.ttf to the output # directory and reference it in all dot files that doxygen generates. This # font does not include all possible unicode characters however, so when you need # these (or just want a differently looking font) you can specify the font name # using DOT_FONTNAME. 
You need need to make sure dot is able to find the font, # which can be done by putting it in a standard location or by setting the # DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory # containing the font. DOT_FONTNAME = FreeSans # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the output directory to look for the # FreeSans.ttf font (which doxygen will put there itself). If you specify a # different font using DOT_FONTNAME you can set the path where dot # can find it using this tag. DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # the CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. 
INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are png, jpg, or gif # If left blank png will be used. DOT_IMAGE_FORMAT = png # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. 
DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that doxygen if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 0 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. 
DOT_MULTI_TARGETS = YES # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. DOT_CLEANUP = YES libStatGen-1.0.14/Makefile000066400000000000000000000013311254730101300152630ustar00rootroot00000000000000VERSION ?= 1.0.14 .PHONY: package SUBDIRS=general bam fastq glf samtools vcf include Makefiles/Makefile.base clean:$(SUBDIRS) rm -f $(STAT_GEN_LIB_OPT) rm -f $(STAT_GEN_LIB_DEBUG) rm -f $(STAT_GEN_LIB_PROFILE) # general depends on samtools general: samtools # other subdirectories depend on general bam fastq glf vcf: general RELEASE_FILE?=libStatGen.$(VERSION).tgz # Package the library. package : # the touch gets rid of a tar warning touch $(RELEASE_FILE) tar chvz --exclude="*~" --exclude=$(RELEASE_FILE) --exclude='obj/*' --exclude='*.a' --exclude='include/*' --exclude='bin/*' --exclude='test/results/*' --exclude-vcs -f $(RELEASE_FILE) --transform 's,^,libStatGen_$(VERSION)/,' * --show-transformed-names libStatGen-1.0.14/Makefiles/000077500000000000000000000000001254730101300155255ustar00rootroot00000000000000libStatGen-1.0.14/Makefiles/Makefile.base000066400000000000000000000005461254730101300201030ustar00rootroot00000000000000# SUBDIRS should be set by the Makefile that includes this one. # SUBDIRS = MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.include # Build in all subdirectories. 
.PHONY: $(SUBDIRS) include $(MAKEFILES_PATH)Makefile.help $(SUBDIRS): @$(MAKE) -C $@ $(MAKECMDGOALS) %: $(SUBDIRS) ; Makefile.%: ; Makefile: ; libStatGen-1.0.14/Makefiles/Makefile.common000066400000000000000000000104551254730101300204610ustar00rootroot00000000000000# This makefile is common for both Makefile.lib and Makefile.src TOOLBASE ?= COMPILE_ANY_CHANGE ?= SRCONLY ?= HDRONLY ?= USER_REMOVES ?= # USER_REMOVES are additional things that should be removed by clean. MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.include MAKE_DEPENDS=Makefile.depends # Source File Set TOOLHDR = $(TOOLBASE:=.h) $(HDRONLY) TOOLSRC = $(TOOLBASE:=.cpp) $(SRCONLY) TOOLOBJ = $(patsubst %.f,%.o,$(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(TOOLSRC)))) # Define the directory structure for opt/debug/profile for objs & bins SUBDIR_OPT ?= SUBDIR_DEBUG ?= /debug SUBDIR_PROFILE ?= /profile #Objs default to the obj directory. OBJDIR ?= ../obj OBJDIR_OPT ?= $(OBJDIR)$(SUBDIR_OPT) OBJDIR_DEBUG ?= $(OBJDIR)$(SUBDIR_DEBUG) OBJDIR_PROFILE ?= $(OBJDIR)$(SUBDIR_PROFILE) #Needed for cleaning up the library - need all three. OBJECTS_OPT := $(patsubst %,$(OBJDIR_OPT)/%,$(TOOLOBJ)) OBJECTS_DEBUG := $(patsubst %,$(OBJDIR_DEBUG)/%,$(TOOLOBJ)) OBJECTS_PROFILE := $(patsubst %,$(OBJDIR_PROFILE)/%,$(TOOLOBJ)) # Tests go in the test directory. TESTDIR ?= test ALWAYS_COMP_OBJ_OPT = $(patsubst %,$(OBJDIR_OPT)/%,$(COMPILE_ANY_CHANGE:=.o)) ALWAYS_COMP_OBJ_DEBUG = $(patsubst %,$(OBJDIR_DEBUG)/%,$(COMPILE_ANY_CHANGE:=.o)) ALWAYS_COMP_OBJ_PROFILE = $(patsubst %,$(OBJDIR_PROFILE)/%,$(COMPILE_ANY_CHANGE:=.o)) CCOMPILE=$(CC) $(COMPFLAGS) -o $@ -c $*.c CXXCOMPILE=$(CXX) $(COMPFLAGS) -o $@ -c $*.cpp -DVERSION="\"$(VERSION)\"" .PHONY: all test clean opt debug profile specific_clean # all, build as opt, debug, and profile. 
all: opt debug profile opt: OPTFLAG=$(OPTFLAG_OPT) debug: OPTFLAG=$(OPTFLAG_DEBUG) profile: OPTFLAG=$(OPTFLAG_PROFILE) ######### # Help include $(MAKEFILES_PATH)Makefile.help ####### # Objects # The objects are dependent on the object directory. $(OBJECTS_OPT): | $(OBJDIR_OPT) $(OBJECTS_DEBUG): | $(OBJDIR_DEBUG) $(OBJECTS_PROFILE): | $(OBJDIR_PROFILE) # Create the object directory $(OBJDIR_OPT) $(OBJDIR_DEBUG) $(OBJDIR_PROFILE): mkdir -p $@ # Compile the source # Must keep the 3 types (opt, debug, profile) separate, # otherwise on a "make all" makefile will think it has already compiled the # target. $(OBJDIR_OPT)/%.o: %.c $(CCOMPILE) $(OBJDIR_DEBUG)/%.o: %.c $(CCOMPILE) $(OBJDIR_PROFILE)/%.o: %.c $(CCOMPILE) $(OBJDIR_OPT)/%.o: %.cpp $(CXXCOMPILE) $(OBJDIR_DEBUG)/%.o: %.cpp $(CXXCOMPILE) $(OBJDIR_PROFILE)/%.o: %.cpp $(CXXCOMPILE) ifneq ($(COMPILE_ANY_CHANGE),) $(ALWAYS_COMP_OBJ_OPT): $(COMPILE_ANY_CHANGE:=.cpp) $(filter-out $(ALWAYS_COMP_OBJ_OPT),$(OBJECTS_OPT)) $(CXX) $(COMPFLAGS) -o $@ -c $(*F).cpp -DVERSION="\"$(VERSION)\"" $(ALWAYS_COMP_OBJ_DEBUG): $(COMPILE_ANY_CHANGE:=.cpp) $(filter-out $(ALWAYS_COMP_OBJ_DEBUG),$(OBJECTS_DEBUG)) $(CXX) $(COMPFLAGS) -o $@ -c $(*F).cpp -DVERSION="\"$(VERSION)\"" $(ALWAYS_COMP_OBJ_PROFILE): $(COMPILE_ANY_CHANGE:=.cpp) $(filter-out $(ALWAYS_COMP_OBJ_PROFILE),$(OBJECTS_PROFILE)) $(CXX) $(COMPFLAGS) -o $@ -c $(*F).cpp -DVERSION="\"$(VERSION)\"" endif ######### # Test test: all @if test -f $(TESTDIR)/Makefile; \ then \ $(MAKE) -C $(TESTDIR) --no-print-directory $@; \ fi ######### # clean clean : specific_clean $(USER_REMOVES) -rm -f $(OBJDIR_OPT)/*.o $(OBJDIR_DEBUG)/*.o $(OBJDIR_PROFILE)/*.o *~ @if test -f $(TESTDIR)/Makefile; \ then \ $(MAKE) -C $(TESTDIR) --no-print-directory $@; \ fi .SUFFIXES : .cpp .c .o .X.o $(SUFFIXES) # # The newest version of astyle will remove parens in a statement # header (the character following if/while/do). The current # version in Ubuntu does not. 
# # See http://astyle.sourceforge.net/astyle.html#_Padding_Options # style: (command -v astyle || echo "astyle not installed.") (command -v astyle && astyle --style=ansi --preserve-date --unpad-paren --indent-switches *.cpp *.h) ############### # Dependencies DFLAGS=-Y $(COMPFLAGS) cleandepend: makedepend -- $(DFLAGS) -- depend: touch $(MAKE_DEPENDS) makedepend -f$(MAKE_DEPENDS) -p$$\(OBJDIR_OPT\)/ -- $(DFLAGS) -- $(TOOLSRC) >/dev/null 2>&1 makedepend -f$(MAKE_DEPENDS) -a -p$$\(OBJDIR_DEBUG\)/ -- $(DFLAGS) -- $(TOOLSRC) >/dev/null 2>&1 makedepend -f$(MAKE_DEPENDS) -a -p$$\(OBJDIR_PROFILE\)/ -- $(DFLAGS) -- $(TOOLSRC) >/dev/null 2>&1 ifeq ($(wildcard $(MAKE_DEPENDS)),) $(OBJECTS_OPT) $(OBJECTS_DEBUG) $(OBJECTS_PROFILE) : $(TOOLHDR) else include $(MAKE_DEPENDS) endif libStatGen-1.0.14/Makefiles/Makefile.ext000066400000000000000000000044731254730101300177740ustar00rootroot00000000000000# Your Makefile should include this Makefile after defining: # EXE - the executable name # TOOLBASE - the base filename for files with .h & .cpp versions # SRCONLY - any cpp files without headers. # HDRONLY - any header files without cpp # COMPILE_ANY_CHANGE - any files that should be compiled if any of the # files change. These files MUST also be # included in TOOLBASE or SRCONLY. Here they are # just the base name without the extension. 
# VERSION - if not version in Makefile.include # BINDIR if it is not ../bin # USER_INCLUDES if any additional directories need to be included to pick up # header files (example: USER_INCLUDES=-ImyDir1 -ImyDir2) # INSTALLDIR if not /usr/local/bin EXE ?= BINDIR ?= ../bin TESTDIR ?= ../test MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.common # Set the bins for each build type BINDIR_OPT ?= $(BINDIR)$(SUBDIR_OPT) BINDIR_DEBUG ?= $(BINDIR)$(SUBDIR_DEBUG) BINDIR_PROFILE ?= $(BINDIR)$(SUBDIR_PROFILE) PROG_OPT ?= $(BINDIR_OPT)/$(EXE) PROG_DEBUG ?= $(BINDIR_DEBUG)/$(EXE) PROG_PROFILE ?= $(BINDIR_PROFILE)/$(EXE) .PHONY: $(STAT_GEN_LIB_OPT) $(STAT_GEN_LIB_DEBUG) $(STAT_GEN_LIB_PROFILE) # Build the appropriate program opt: $(PROG_OPT) debug: $(PROG_DEBUG) profile: $(PROG_PROFILE) # Build with the appropriate dependencies $(PROG_OPT): $(USER_LIBS) $(REQ_LIBS_OPT) $(OBJECTS_OPT) | $(BINDIR_OPT) $(OPT_BUILD) $(PROG_DEBUG): $(USER_LIBS) $(REQ_LIBS_DEBUG) $(OBJECTS_DEBUG) | $(BINDIR_DEBUG) $(DEBUG_BUILD) $(PROG_PROFILE): $(USER_LIBS) $(REQ_LIBS_PROFILE) $(OBJECTS_PROFILE) | $(BINDIR_PROFILE) $(PROFILE_BUILD) # Build the statgen lib if necessary $(STAT_GEN_LIB_OPT): $(MAKE) -C $(@D) --no-print-directory opt $(STAT_GEN_LIB_DEBUG): $(MAKE) -C $(@D) --no-print-directory debug $(STAT_GEN_LIB_PROFILE): $(MAKE) -C $(@D) --no-print-directory profile .SECONDEXPANSION: $(OBJECTS_OPT): TYPE = OPT $(OBJECTS_DEBUG): TYPE=DEBUG $(OBJECTS_PROFILE): TYPE:=PROFILE $(OBJECTS_OPT): $(INCLUDE_PATH)/*h | $(STAT_GEN_LIB_OPT) $(OBJECTS_DEBUG): $(INCLUDE_PATH)/*h | $(STAT_GEN_LIB_DEBUG) $(OBJECTS_PROFILE): $(INCLUDE_PATH)/*h | $(STAT_GEN_LIB_PROFILE) $(INCLUDE_PATH)/*h: $$(STAT_GEN_LIB_$$(TYPE)) $(BINDIR_OPT) $(BINDIR_DEBUG) $(BINDIR_PROFILE): mkdir -p $@ libStatGen-1.0.14/Makefiles/Makefile.extlib000066400000000000000000000025061254730101300204560ustar00rootroot00000000000000# Your Makefile should include this Makefile after defining: # EXE - the library name # 
TOOLBASE - the base filename for files with .h & .cpp versions # SRCONLY - any cpp files without headers. # HDRONLY - any header files without cpp # COMPILE_ANY_CHANGE - any files that should be compiled if any of the # files change. These files MUST also be # included in TOOLBASE or SRCONLY. Here they are # just the base name without the extension. # VERSION - if not version in Makefile.include # BINDIR if it is not ../lib # TESTDIR if it is not ../test # USER_INCLUDES if any additional directories need to be included to pick up # header files (example: USER_INCLUDES=-ImyDir1 -ImyDir2) EXE ?= BINDIR ?= ../lib TESTDIR ?= ../test MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.ext # Set the build commands for library OPT_BUILD = ar -cr $(PROG_OPT) $(OBJECTS_OPT) DEBUG_BUILD = ar -cr $(PROG_DEBUG) $(OBJECTS_DEBUG) PROFILE_BUILD = ar -cr $(PROG_PROFILE) $(OBJECTS_PROFILE) specific_clean : -$(AR) d $(PROG_OPT) $(OBJECTS_OPT) -$(RANLIB) $(PROG_OPT) -$(AR) d $(PROG_DEBUG) $(OBJECTS_DEBUG) -$(RANLIB) $(PROG_DEBUG) -$(AR) d $(PROG_PROFILE) $(OBJECTS_PROFILE) -$(RANLIB) $(PROG_PROFILE) libStatGen-1.0.14/Makefiles/Makefile.footer000066400000000000000000000010621254730101300204610ustar00rootroot00000000000000 # # The newest version of astyle will remove parens in a statement # header (the character following if/while/do). The current # version in Ubuntu does not. # # See http://astyle.sourceforge.net/astyle.html#_Padding_Options # style: (command -v astyle || echo "astyle not installed.") (command -v astyle && astyle --style=ansi --preserve-date --unpad-paren --indent-switches *.cpp *.h) # DFLAGS=-I$(HEADER_DIR) -I/usr/include/g++ DFLAGS=-Y $(CFLAGS) cleandepend: makedepend -- $(DFLAGS) -- depend: makedepend -- $(DFLAGS) -- $(SOURCES) >/dev/null 2>&1 libStatGen-1.0.14/Makefiles/Makefile.help000066400000000000000000000010041254730101300201070ustar00rootroot00000000000000help : @echo "Makefile help" @echo "-------------" @echo "Type... 
To..." @echo "make Compile opt " @echo "make help Display this help screen" @echo "make all Compile everything (opt, debug, & profile)" @echo "make opt Compile optimized" @echo "make debug Compile for debug" @echo "make profile Compile for profile" @echo "make clean Delete temporary files" @echo "make test Execute tests (if there are any)" $(ADDITIONAL_HELP) libStatGen-1.0.14/Makefiles/Makefile.include000066400000000000000000000034341254730101300206130ustar00rootroot00000000000000.DEFAULT_GOAL := opt # Determine the path to this Makefile MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) MAKEFILES_PATH_TMP := $(patsubst %Makefiles/, %, $(MAKEFILES_PATH)) # The base lib path is one above this Makefile. BASE_LIB_PATH := $(dir $(MAKEFILES_PATH_TMP)) # Path names that may be needed. DIR_PARTS := $(subst /, , $(CURDIR)) DIR_NAME := $(word $(words $(DIR_PARTS)), $(DIR_PARTS)) VERSION ?= 1.0.0 # Define the paths/names of the Stagen Libraries STAT_GEN_LIB = $(STAT_GEN_LIB_OPT) STAT_GEN_LIB_OPT = $(BASE_LIB_PATH)libStatGen.a STAT_GEN_LIB_DEBUG = $(BASE_LIB_PATH)libStatGen_debug.a STAT_GEN_LIB_PROFILE = $(BASE_LIB_PATH)libStatGen_profile.a # Currently only the statgen library is required. REQ_LIBS_OPT ?= $(STAT_GEN_LIB) REQ_LIBS_DEBUG ?= $(STAT_GEN_LIB_DEBUG) REQ_LIBS_PROFILE ?= $(STAT_GEN_LIB_PROFILE) INCLUDE_PATH = $(BASE_LIB_PATH)include include $(MAKEFILES_PATH)Makefile.toolchain # -fno-ftti is sometimes useful to increase speed # 2-5%, but makes linking consistently more difficult OPTFLAG_OPT?=-O4 OPTFLAG_DEBUG?=-ggdb -O0 OPTFLAG_PROFILE?=-pg # User specific variables listed here. # USER_INCLUDES = # USER_COMPILE_VARS = # USER_WARNINGS = # If you don't wnat to include the current directory, set in your Makefile: # CURRENT_DIR_INCLUDE= CURRENT_DIR_INCLUDE?=-I. 
ZLIB_AVAIL ?= 1 USE_ZLIB = -D__ZLIB_AVAILABLE__ ZLIB_LIB = -lz ifeq ($(ZLIB_AVAIL), 0) USE_ZLIB = ZLIB_LIB = endif KNET_ON ?= 0 USE_KNET ?= ifeq ($(KNET_ON), 1) USE_KNET = -D_USE_KNETFILE endif REQ_SETTINGS = CFLAGS ?= $(OPTFLAG) -pipe -Wall COMPFLAGS = $(CFLAGS) $(USER_WARNINGS) -I$(INCLUDE_PATH) $(CURRENT_DIR_INCLUDE) $(USER_INCLUDES) $(USE_KNET) $(USE_ZLIB) -D_FILE_OFFSET_BITS=64 -D__STDC_LIMIT_MACROS $(USER_COMPILE_VARS) # default installation directory INSTALLDIR?=/usr/local/bin libStatGen-1.0.14/Makefiles/Makefile.lib000066400000000000000000000034611254730101300177360ustar00rootroot00000000000000# Your Makefile should include this Makefile after defining: # TOOLBASE - the base filename for files with .h & .cpp versions # SRCONLY - any cpp files without headers. # HDRONLY - any header files without cpp # VERSION - if not version in Makefile.include MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) OBJDIR = obj include $(MAKEFILES_PATH)Makefile.common HEADERS=$(TOOLHDR) .PHONY: all test clean debug profile param install specific_clean $(STAT_GEN_LIB) $(STAT_GEN_LIB_DEBUG) $(STAT_GEN_LIB_PROFILE) # make everything, ensure headers are in the include direcotry. 
opt debug profile : $(addprefix $(INCLUDE_PATH)/, $(HEADERS)) install: opt # Link into the include directory $(INCLUDE_PATH)/%.h: %.h -ln -s ../$(DIR_NAME)/$< $@ ######### # Build the library opt: $(STAT_GEN_LIB_OPT) debug: $(STAT_GEN_LIB_DEBUG) profile: $(STAT_GEN_LIB_PROFILE) # To build the library, build the objects # Then add them to the library $(STAT_GEN_LIB_OPT): $(OBJECTS_OPT) ar -cru $@ $(OBJECTS_OPT) $(STAT_GEN_LIB_DEBUG): $(OBJECTS_DEBUG) ar -cru $@ $(OBJECTS_DEBUG) $(STAT_GEN_LIB_PROFILE): $(OBJECTS_PROFILE) ar -cru $@ $(OBJECTS_PROFILE) UNAME=$(shell uname) ifeq ($(UNAME), Darwin) specific_clean: -rm -f $(addprefix $(INCLUDE_PATH)/, $(HEADERS)) -$(AR) d $(STAT_GEN_LIB_OPT) $(OBJECTS_OPT) -$(RANLIB) $(STAT_GEN_LIB_OPT) -$(AR) d $(STAT_GEN_LIB_DEBUG) $(OBJECTS_DEBUG) -$(RANLIB) $(STAT_GEN_LIB_DEBUG) -$(AR) d $(STAT_GEN_LIB_PROFILE) $(OBJECTS_PROFILE) -$(RANLIB) $(STAT_GEN_LIB_PROFILE) else specific_clean: -rm -f $(addprefix $(INCLUDE_PATH)/, $(HEADERS)) $(AR) d $(STAT_GEN_LIB_OPT) $(OBJECTS_OPT) $(RANLIB) $(STAT_GEN_LIB_OPT) $(AR) d $(STAT_GEN_LIB_DEBUG) $(OBJECTS_DEBUG) $(RANLIB) $(STAT_GEN_LIB_DEBUG) $(AR) d $(STAT_GEN_LIB_PROFILE) $(OBJECTS_PROFILE) $(RANLIB) $(STAT_GEN_LIB_PROFILE) endif libStatGen-1.0.14/Makefiles/Makefile.src000066400000000000000000000041411254730101300177530ustar00rootroot00000000000000# Your Makefile should include this Makefile after defining: # EXE - the executable name # TOOLBASE - the base filename for files with .h & .cpp versions # SRCONLY - any cpp files without headers. # HDRONLY - any header files without cpp # COMPILE_ANY_CHANGE - any files that should be compiled if any of the # files change. These files MUST also be # included in TOOLBASE or SRCONLY. Here they are # just the base name without the extension. 
# VERSION - if not version in Makefile.include # BINDIR if it is not ../bin # USER_INCLUDES if any additional directories need to be included to pick up # header files (example: USER_INCLUDES=-ImyDir1 -ImyDir2) # INSTALLDIR if not /usr/local/bin EXE ?= BINDIR ?= ../bin TESTDIR ?= ../test MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.ext # Set the build commands for executable OPT_BUILD = $(CXX) $(COMPFLAGS) $(USER_LINK_OPTIONS) -o $(PROG_OPT) $(OBJECTS_OPT) $(USER_LIBS) $(REQ_LIBS_OPT) -lm $(ZLIB_LIB) $(UNAME_LIBS) $(OTHER_LIBS) DEBUG_BUILD = $(CXX) $(COMPFLAGS) $(USER_LINK_OPTIONS) -o $(PROG_DEBUG) $(OBJECTS_DEBUG) $(USER_LIBS) $(REQ_LIBS_DEBUG) -lm $(ZLIB_LIB) $(UNAME_LIBS) $(OTHER_LIBS) PROFILE_BUILD = $(CXX) $(COMPFLAGS) $(USER_LINK_OPTIONS) -o $(PROG_PROFILE) $(OBJECTS_PROFILE) $(USER_LIBS) $(REQ_LIBS_PROFILE) -lm $(ZLIB_LIB) $(UNAME_LIBS) $(OTHER_LIBS) ADDITIONAL_HELP= @echo "make install Install binaries in $(INSTALLDIR)";\ echo "make install INSTALLDIR=directory_for_binaries";\ echo " Install binaries in directory_for_binaries" .PHONY: install install : all $(INSTALLDIR) @echo " " @echo Installing to directory $(INSTALLDIR) @echo To select a different directory, run @echo " " @echo make install INSTALLDIR=your_preferred_dir @echo " " cp $(PROG_OPT) $(INSTALLDIR) $(INSTALLDIR) : @echo " " @echo Creating directory $(INSTALLDIR) @echo " " @mkdir -p $(INSTALLDIR) specific_clean : -rm -f $(BINDIR_OPT)/$(EXE) -rm -rf $(BINDIR_DEBUG) -rm -rf $(BINDIR_PROFILE) libStatGen-1.0.14/Makefiles/Makefile.test000066400000000000000000000032241254730101300201440ustar00rootroot00000000000000# Your Makefile should include this Makefile after defining: # TEST_COMMAND - the commands to run under make test # EXE - executable built for this test. # TOOLBASE - the base filename for files with .h & .cpp versions # SRCONLY - any cpp files without headers. 
# HDRONLY - any header files without cpp # VERSION - if not 0.0.1 TEST_COMMAND ?= EXE ?= TOOLBASE ?= SRCONLY ?= HDRONLY ?= VERSION ?= 0.0.1 MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.include # Use debug opt flag. OPTFLAG?=$(OPTFLAG_DEBUG) OBJDIR?=obj # # Goncalo's Generic Makefile -- Compiles and installs a Generic Goncalo Tool # (c) 2000-2007 Goncalo Abecasis # # Source File Set # For best results, consider editing this manually ... TOOLHDR = $(TOOLBASE:=.h) $(HDRONLY) TOOLSRC = $(TOOLBASE:=.cpp) $(SRCONLY) TOOLOBJ = $(TOOLSRC:.cpp=.o) LIBRARY = $(REQ_LIBS_DEBUG) OBJECTS=$(patsubst %,$(OBJDIR)/%,$(TOOLOBJ)) .DEFAULT_GOAL := all # make everything all debug: $(EXE) # dependencies for executables $(EXE) : $(LIBRARY) $(OBJECTS) $(CXX) $(COMPFLAGS) -o $@ $(OBJECTS) $(LIBRARY) -lm $(ZLIB_LIB) $(UNAME_LIBS) $(OBJECTS): $(TOOLHDR) $(LIBHDR) | $(OBJDIR) $(OBJDIR): mkdir $(OBJDIR) clean : -rm -rf $(OBJDIR) $(EXE) *~ results/* $(TEST_CLEAN) test : all $(TEST_COMMAND) $(OBJDIR)/%.o: %.c $(CXX) $(COMPFLAGS) -o $@ -c $*.c $(OBJDIR)/%.o: %.cpp $(CXX) $(COMPFLAGS) -o $@ -c $*.cpp -DVERSION="\"$(VERSION)\"" .SUFFIXES : .cpp .c .o .X.o $(SUFFIXES) DFLAGS=-Y cleandepend: makedepend -- $(DFLAGS) -- depend: makedepend -- $(DFLAGS) -- $(TOOLSRC) >/dev/null 2>&1 # DO NOT DELETE THIS LINE -- make depend depends on it libStatGen-1.0.14/Makefiles/Makefile.tool000066400000000000000000000026771254730101300201550ustar00rootroot00000000000000 MAKEFILES_PATH := $(dir $(lastword $(MAKEFILE_LIST))) include $(MAKEFILES_PATH)Makefile.base RELEASE_FILE?=$(DIR_NAME).$(VERSION).tgz ADDITIONAL_HELP= @echo "make install Install binaries in $(INSTALLDIR)";\ echo "make install INSTALLDIR=directory_for_binaries";\ echo " Install binaries in directory_for_binaries" .PHONY: package wholepackage # Does not include the library. 
package : # the touch gets rid of a tar warning touch $(RELEASE_FILE) tar chvz --exclude="*~" --exclude=$(RELEASE_FILE) --exclude='obj/*' --exclude='*.a' --exclude='include/*' --exclude='bin/*' --exclude='test/results/*' --exclude-vcs -f $(RELEASE_FILE) --transform 's,^,$(DIR_NAME)_$(VERSION)/,' * --show-transformed-names BASE_LIB_PARTS := $(subst /, , $(BASE_LIB_PATH)) BASE_LIB_DIRNAME := $(word $(words $(BASE_LIB_PARTS)), $(BASE_LIB_PARTS)) WHOLEPACKAGE_MAKE := $(BASE_LIB_DIRNAME)/Makefiles/Makefile.wholepackage DIR_ABOVE_LIB := $(patsubst %$(BASE_LIB_DIRNAME)/, %, $(BASE_LIB_PATH)) # also includes the library wholepackage: # the touch gets rid of a tar warning touch $(RELEASE_FILE) tar chvz --exclude="*~" --exclude=$(RELEASE_FILE) --exclude='obj/*' --exclude='*.a' --exclude='include/*' --exclude='bin/*' --exclude='test/results/*' --exclude-vcs -f $(RELEASE_FILE) --transform 's,^,$(DIR_NAME)_$(VERSION)/,;s,$(WHOLEPACKAGE_MAKE),Makefile,' -C .. $(DIR_NAME) -C $(DIR_NAME) -C $(DIR_ABOVE_LIB) $(BASE_LIB_DIRNAME) --show-transformed-names libStatGen-1.0.14/Makefiles/Makefile.toolchain000066400000000000000000000036361254730101300211540ustar00rootroot00000000000000# # This file allows us to create windows # 32 and 64 bit binaries instead of binaries # for the compiling host. It assumes that the # GNU compiler suite is the compiler for all # three targets. # # TARGET is a bit of a misnomer. The goal is # really to allow different toolchains to be used # to build the system. 
# # Supported TARGET values include: # # mingw32 # mingw64 (except c++, so mostly not) # llvm # clang # # Typical use is to say at the toplevel: # make # or # make TARGET=mingw32 # or # make TARGET=mingw64 # # But you can also just set it here: # # TARGET=mingw32 ifeq ($(TARGET),mingw) TARGET=mingw32 endif TOOLCHAIN_DIR= ifeq ($(TARGET), mingw32) TOOLCHAIN_PREFIX=i586-mingw32msvc- else ifeq ($(TARGET), mingw64) TOOLCHAIN_PREFIX=amd64-mingw32msvc- else ifeq ($(TARGET),llvm) TOOLCHAIN_PREFIX=llvm- else TOOLCHAIN_PREFIX= endif UNAME:=$(shell uname) UNAME_LIBS:= ifeq ($(UNAME:MINGW32%=MINGW32),MINGW32) UNAME_LIBS=-lwsock32 endif # CPP0X=-std=c++0x CXX = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)g++ $(CPP0X) CC = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)gcc LD = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)ld AR = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)ar RANLIB = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)ranlib OBJCOPY = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)objcopy OBJDUMP = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)objdump STRIP = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)strip NM = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)nm SIZE = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)size CPP = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)cpp AS = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)as F77 = $(TOOLCHAIN_DIR)$(TOOLCHAIN_PREFIX)gfortran # # clang is a drop in replacement for c++ and cc: # ifeq ($(TARGET), clang) CXX=clang CC=clang endif CCVERSION = $(shell $(CC) -dumpversion ) EXPORT_TOOLCHAIN="export RANLIB=$(RANLIB); export AR=$(AR); export CC='$(CC)'; export CXX='$(CXX)'; export LD=$(LD); export CCVERSION=$(CCVERSION)" libStatGen-1.0.14/Makefiles/Makefile.wholepackage000066400000000000000000000006721254730101300216230ustar00rootroot00000000000000SUBDIRS = $(sort $(dir $(wildcard */))) SUBDIRS_NO_STATGEN = $(filter-out libStatGen/,$(SUBDIRS)) MAKEFILES_PATH := libStatGen/Makefiles/ include $(MAKEFILES_PATH)Makefile.include # Build in all subdirectories. 
.PHONY: $(SUBDIRS) include $(MAKEFILES_PATH)Makefile.help $(SUBDIRS): @$(MAKE) -C $@ $(MAKECMDGOALS) $(SUBDIRS_NO_STATGEN): libStatGen/ %: $(SUBDIRS) ; Makefile.%: ; Makefile: ; SUBDIRS = $(sort $(dir $(wildcard */))) libStatGen-1.0.14/README.txt000066400000000000000000000052601254730101300153260ustar00rootroot00000000000000Dependencies ------------ On debian type systems (including Ubuntu), add the following packages if they are not already installed (or have your admin add them if you do not have permission): sudo apt-get install g++ libssl-dev zlib1g-dev Building -------- To compile, from the top level directory, type: "make" To compile with debug symbols, type: "make debug" To test (after compiling), from the top level directory, type: "make test" Under the main statgen repository, there are: * bam - library code for operating on bam files. * copyrights - copyrights for the library and any code included with it. * fastq - library code for operating on fastq files. * general - library code for general operations * glf - library code for operating on glf files. * include - after compiling, the library headers are linked here * Makefiles - directory containing Makefiles that are used in the library and can be used for developing programs using the library * samtools - library code used from samtools After Compiling: libStatGen.a, libStatGen_debug.a, libStatGen_profile.a are created at the top level. Makefiles --------- Makefiles/Makefile.include should contain the definitions that you need for creating software using this library. Makefiles/Makefile.lib and Makefiles/Makefile.src can be used as templates for creating Makefiles for new software. If possible, just include them within your Makefile. Just set the proper variables for your program in your Makefile first. (both Makefiles automatically include Makefile.include) A similar setup should be used for test code, by including Makefiles/Makefile.test and defining your test specific variables first. 
Other Notes ----------- * Typically the .o files are compiled into their own directory called obj. Compile Issues/Troubleshooting ------------------------------ See http://genome.sph.umich.edu/wiki/LibStatGen_Troubleshooting for the latest troubleshooting information. If you are compiling on OSX and you encounter errors like the following: GenomeSequence.cpp: In instantiation of 'std::basic_ostream<_CharT, _Traits>& std::operator<<(std::basic_ostream<_CharT, _Traits>&, const std::basic_string<_CharT, _Traits, _Alloc>&) [with _CharT = char, _Traits = std::char_traits, _Alloc = std::allocator]': GenomeSequence.cpp:161: instantiated from here GenomeSequence.cpp:161: error: explicit instantiation of 'std::basic_ostream<_CharT, _Traits>& std::operator<<(std::basic_ostream<_CharT, _Traits>&, const std::basic_string<_CharT, _Traits, _Alloc>&) [with _CharT = char, _Traits = std::char_traits, _Alloc = std::allocator]' but no definition available Try compiling with: make USER_COMPILE_VARS=-mmacosx-version-min=10.8 libStatGen-1.0.14/bam/000077500000000000000000000000001254730101300143645ustar00rootroot00000000000000libStatGen-1.0.14/bam/BamIndex.cpp000066400000000000000000000364371254730101300165740ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "BamIndex.h" #include BamIndex::BamIndex() : IndexBase(), maxOverallOffset(0), myUnMappedNumReads(-1) { } BamIndex::~BamIndex() { } // Reset the member data for a new index file. void BamIndex::resetIndex() { IndexBase::resetIndex(); maxOverallOffset = 0; myUnMappedNumReads = -1; } // Read & parse the specified index file. SamStatus::Status BamIndex::readIndex(const char* filename) { // Reset the index from anything that may previously be set. resetIndex(); IFILE indexFile = ifopen(filename, "rb"); // Failed to open the index file. if(indexFile == NULL) { return(SamStatus::FAIL_IO); } // generate the bam index structure. // Read the magic string. char magic[4]; if(ifread(indexFile, magic, 4) != 4) { // Failed to read the magic ifclose(indexFile); return(SamStatus::FAIL_IO); } // If this is not an index file, set num references to 0. if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1) { // Not a BAM Index file. ifclose(indexFile); return(SamStatus::FAIL_PARSE); } // It is a bam index file. // Read the number of reference sequences. if(ifread(indexFile, &n_ref, 4) != 4) { // Failed to read. ifclose(indexFile); return(SamStatus::FAIL_IO); } // Size the references. myRefs.resize(n_ref); for(int refIndex = 0; refIndex < n_ref; refIndex++) { // Read each reference. Reference* ref = &(myRefs[refIndex]); // Read the number of bins. if(ifread(indexFile, &(ref->n_bin), 4) != 4) { // Failed to read the number of bins. // Return failure. ifclose(indexFile); return(SamStatus::FAIL_PARSE); } // If there are no bins, then there are no // mapped/unmapped reads. if(ref->n_bin == 0) { ref->n_mapped = 0; ref->n_unmapped = 0; } // Resize the bins so they can be indexed by bin number. ref->bins.resize(ref->n_bin + 1); // Read each bin. for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) { uint32_t binNumber; // Read in the bin number. if(ifread(indexFile, &(binNumber), 4) != 4) { // Failed to read the bin number. // Return failure. 
ifclose(indexFile); return(SamStatus::FAIL_IO); } // Add the bin to the reference and get the // pointer back so the values can be set in it. Bin* binPtr = &(ref->bins[binIndex]); binPtr->bin = binNumber; // Read in the number of chunks. if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) { // Failed to read number of chunks. // Return failure. ifclose(indexFile); return(SamStatus::FAIL_IO); } // Read in the chunks. // Allocate space for the chunks. uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) { // Failed to read the chunks. // Return failure. ifclose(indexFile); return(SamStatus::FAIL_IO); } // Determine the min/max for this bin if it is not the max bin. if(binPtr->bin != MAX_NUM_BINS) { for(int i = 0; i < binPtr->n_chunk; i++) { if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset) { ref->minChunkOffset = binPtr->chunks[i].chunk_beg; } if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset) { ref->maxChunkOffset = binPtr->chunks[i].chunk_end; } if(binPtr->chunks[i].chunk_end > maxOverallOffset) { maxOverallOffset = binPtr->chunks[i].chunk_end; } } } else { // Mapped/unmapped are the last chunk of the // MAX BIN ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg; ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end; } } // Read the number of intervals. if(ifread(indexFile, &(ref->n_intv), 4) != 4) { // Failed to read, set to 0. ref->n_intv = 0; // Return failure. ifclose(indexFile); return(SamStatus::FAIL_IO); } // Allocate space for the intervals and read them. uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); ref->ioffsets = (uint64_t*)malloc(linearIndexSize); if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) { // Failed to read the linear index. // Return failure. 
ifclose(indexFile); return(SamStatus::FAIL_IO); } } int32_t numUnmapped = 0; if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t)) { myUnMappedNumReads = numUnmapped; } // Successfully read the bam index file. ifclose(indexFile); return(SamStatus::SUCCESS); } // Get the chunks for the specified reference id and start/end 0-based // coordinates. bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList& chunkList) { chunkList.clear(); // If start is >= to end, there will be no sections, return no // regions. if((start >= end) && (end != -1)) { std::cerr << "Warning, requesting region where start <= end, so " << "no values will be returned.\n"; return(false); } // Handle REF_ID_UNMAPPED. This uses a default chunk which covers // from the max offset to the end of the file. if(refID == REF_ID_UNMAPPED) { Chunk refChunk; // The start of the unmapped region is the max offset found // in the index file. refChunk.chunk_beg = getMaxOffset(); // The end of the unmapped region is the end of the file, so // set chunk end to the max value. refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE; return(chunkList.insert(refChunk)); } if((refID < 0) || (refID >= n_ref)) { // The specified refID is out of range, return false. std::cerr << "Warning, requesting refID is out of range, so " << "no values will be returned.\n"; return(false); } const Reference* ref = &(myRefs[refID]); // Handle where start/end are defaults. if(start == -1) { if(end == -1) { // This is whole chromosome, so take a shortcut. if(ref->maxChunkOffset == 0) { // No chunks for this region, but this is not an error. return(true); } Chunk refChunk; refChunk.chunk_beg = ref->minChunkOffset; refChunk.chunk_end = ref->maxChunkOffset; return(chunkList.insert(refChunk)); } else { start = 0; } } if(end == -1) { // MAX_POSITION is inclusive, but end is exclusive, so add 1. end = MAX_POSITION + 1; } // Determine the minimum offset for the given start position. 
This // is done by using the linear index for the specified start position. uint64_t minOffset = 0; getMinOffsetFromLinearIndex(refID, start, minOffset); bool binInRangeMap[MAX_NUM_BINS+1]; getBinsForRegion(start, end, binInRangeMap); // Loop through the bins in the ref and if they are in the region, get the chunks. for(int i = 0; i < ref->n_bin; ++i) { const Bin* bin = &(ref->bins[i]); if(binInRangeMap[bin->bin] == false) { // This bin is not in the region, so check the next one. continue; } // Add each chunk in the bin to the map. for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++) { // If the end of the chunk is less than the minimum offset // for the 16K block that starts our region, then no // records in this chunk will cross our region, so do // not add it to the chunks we need to use. if(bin->chunks[chunkIndex].chunk_end < minOffset) { continue; } // Add the chunk to the map. if(!chunkList.insert(bin->chunks[chunkIndex])) { // Failed to add to the map, return false. std::cerr << "Warning, Failed to add a chunk, so " << "no values will be returned.\n"; return(false); } } } // Now that all chunks have been added to the list, // handle overlapping chunks. return(chunkList.mergeOverlapping()); } // Get the max offset. uint64_t BamIndex::getMaxOffset() const { return(maxOverallOffset); } // Get the min & max file offsets for the reference ID. bool BamIndex::getReferenceMinMax(int32_t refID, uint64_t& minOffset, uint64_t& maxOffset) const { if((refID < 0) || (refID >= (int32_t)myRefs.size())) { // Reference ID is out of range for this index file. return(false); } // Get this reference. minOffset = myRefs[refID].minChunkOffset; maxOffset = myRefs[refID].maxChunkOffset; return(true); } // Get the number of mapped reads for this reference id. int32_t BamIndex::getNumMappedReads(int32_t refID) { // If it is the reference id of unmapped reads, return // that there are no mapped reads. 
if(refID == REF_ID_UNMAPPED) { // These are by definition all unmapped reads. return(0); } if((refID < 0) || (refID >= (int32_t)myRefs.size())) { // Reference ID is out of range for this index file. return(-1); } // Get this reference. return(myRefs[refID].n_mapped); } // Get the number of unmapped reads for this reference id. int32_t BamIndex::getNumUnMappedReads(int32_t refID) { // If it is the reference id of unmapped reads, return // that value. if(refID == REF_ID_UNMAPPED) { return(myUnMappedNumReads); } if((refID < 0) || (refID >= (int32_t)myRefs.size())) { // Reference ID is out of range for this index file. return(-1); } // Get this reference. return(myRefs[refID].n_unmapped); } // Print the bam index. void BamIndex::printIndex(int32_t refID, bool summary) { std::cout << "BAM Index: " << std::endl; std::cout << "# Reference Sequences: " << n_ref << std::endl; unsigned int startRef = 0; unsigned int endRef = myRefs.size() - 1; std::vector refsToProcess; if(refID != -1) { // Set start and end ref to the specified reference id. startRef = refID; endRef = refID; } // Print out the information for each bin. for(unsigned int i = startRef; i <= endRef; ++i) { std::cout << std::dec << "\tReference ID: " << std::setw(4) << i << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin << "; #Linear Index Entries: " << std::setw(6) << myRefs[i].n_intv << "; Min Chunk Offset: " << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset << "; Max Chunk Offset: " << std::setw(18) << myRefs[i].maxChunkOffset << std::dec; // Print the mapped/unmapped if set. if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) { std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads"; } if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) { std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads"; } std::cout << std::endl; // Only print more details if not summary. 
if(!summary) { std::vector::iterator binIter; for(binIter = myRefs[i].bins.begin(); binIter != myRefs[i].bins.end(); ++binIter) { Bin* binPtr = &(*binIter); if(binPtr->bin == Bin::NOT_USED_BIN) { // This bin is not used, continue. continue; } // Print the bin info. std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl; std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl; std::cout << std::hex << std::showbase; for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk; ++chunkIndex) { // If this is the last chunk of the MAX_NUM_BINS - it // contains a mapped/unmapped count rather than the regular // chunk addresses. if((binPtr->bin != MAX_NUM_BINS) || (chunkIndex != (binPtr->n_chunk - 1))) { std::cout << "\t\t\t\tchunk_beg: " << binPtr->chunks[chunkIndex].chunk_beg << std::endl; std::cout << "\t\t\t\tchunk_end: " << binPtr->chunks[chunkIndex].chunk_end << std::endl; } } } std::cout << std::dec; // Print the linear index. for(int linearIndex = 0; linearIndex < myRefs[i].n_intv; ++linearIndex) { if(myRefs[i].ioffsets[linearIndex] != 0) { std::cout << "\t\t\tLinearIndex[" << std::dec << linearIndex << "] Offset: " << std::hex << myRefs[i].ioffsets[linearIndex] << std::endl; } } } } } libStatGen-1.0.14/bam/BamIndex.h000066400000000000000000000071411254730101300162270ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __BAM_INDEX_H__ #define __BAM_INDEX_H__ #include #include #include #include #include "IndexBase.h" #include "InputFile.h" #include "SamStatus.h" class BamIndex : public IndexBase { public: BamIndex(); virtual ~BamIndex(); /// Reset the member data for a new index file. virtual void resetIndex(); // Read & parse the specified index file. /// \param filename the bam index file to be read. /// \return the status of the read. SamStatus::Status readIndex(const char* filename); /// Get the list of chunks associated with this region. /// For an entire reference ID, set start and end to -1. /// To start at the beginning of the region, set start to 0/-1. /// To go to the end of the region, set end to -1. bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList& chunkList); uint64_t getMaxOffset() const; /// Get the minimum and maximum file offsets for the specfied reference ID. /// \param refID the reference ID to locate in the file. /// \param minOffset returns the min file offset for the specified reference /// \param maxOffset returns the max file offset for the specified reference /// \return whether or not the reference was found in the file bool getReferenceMinMax(int32_t refID, uint64_t& minOffset, uint64_t& maxOffset) const; /// Get the number of mapped reads for this reference id. Returns -1 for /// out of range refIDs. /// \param refID reference ID for which to extract the number of mapped reads. /// \return number of mapped reads for the specified reference id. int32_t getNumMappedReads(int32_t refID); /// Get the number of unmapped reads for this reference id. Returns -1 for /// out of range refIDs. /// \param refID reference ID for which to extract the number of unmapped reads. /// \return number of unmapped reads for the specified reference id int32_t getNumUnMappedReads(int32_t refID); /// Print the index information. /// \param refID reference ID for which to print info for. -1 means print for all references. 
/// \param summary whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk. void printIndex(int32_t refID, bool summary = false); // Number of reference sequences. /// The number used for an unknown number of reads. static const int32_t UNKNOWN_NUM_READS = -1; /// The number used for the reference id of unmapped reads. static const int32_t REF_ID_UNMAPPED = -1; /// The number used to indicate that all reference ids should be used. static const int32_t REF_ID_ALL = -2; private: uint64_t maxOverallOffset; int32_t myUnMappedNumReads; }; #endif libStatGen-1.0.14/bam/BamInterface.cpp000066400000000000000000000160351254730101300174150ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "BamInterface.h" #include "CharBuffer.h" BamInterface::BamInterface() { } BamInterface::~BamInterface() { } // Read a BAM file's header. bool BamInterface::readHeader(IFILE filePtr, SamFileHeader& header, SamStatus& status) { if(filePtr == NULL) { // File is not open, return false. status.setStatus(SamStatus::FAIL_ORDER, "Cannot read header since the file pointer is null"); return(false); } if(filePtr->isOpen() == false) { status.setStatus(SamStatus::FAIL_ORDER, "Cannot read header since the file is not open"); return(false); } // Clear the passed in header. 
header.resetHeader(); int32_t headerLength; int readSize = ifread(filePtr, &headerLength, sizeof(headerLength)); if(readSize != sizeof(headerLength)) { String errMsg = "Failed to read the BAM header length, read "; errMsg += readSize; errMsg += " bytes instead of "; errMsg += (unsigned int)sizeof(headerLength); status.setStatus(SamStatus::FAIL_IO, errMsg.c_str()); return(false); } String headerStr; if(headerLength > 0) { // Read the header. readSize = ifread(filePtr, headerStr.LockBuffer(headerLength + 1), headerLength); headerStr[headerLength] = 0; headerStr.UnlockBuffer(); if(readSize != headerLength) { // Failed to read the header. status.setStatus(SamStatus::FAIL_IO, "Failed to read the BAM header."); return(false); } } // Parse the header that was read. if(!header.addHeader(headerStr)) { // Status is set in the method on failure. status.setStatus(SamStatus::FAIL_PARSE, header.getErrorMessage()); return(false); } int referenceCount; // Read the number of references sequences. ifread(filePtr, &referenceCount, sizeof(int)); // Get and clear the reference info so it can be set // from the bam reference table. SamReferenceInfo& refInfo = header.getReferenceInfoForBamInterface(); refInfo.clear(); CharBuffer refName; // Read each reference sequence for (int i = 0; i < referenceCount; i++) { int nameLength; int rc; // Read the length of the reference name. rc = ifread(filePtr, &nameLength, sizeof(int)); if(rc != sizeof(int)) { status.setStatus(SamStatus::FAIL_IO, "Failed to read the BAM reference dictionary."); return(false); } // Read the name. refName.readFromFile(filePtr, nameLength); // Read the length of the reference sequence. int32_t refLen; rc = ifread(filePtr, &refLen, sizeof(int)); if(rc != sizeof(int)) { status.setStatus(SamStatus::FAIL_IO, "Failed to read the BAM reference dictionary."); return(false); } refInfo.add(refName.c_str(), refLen); } // Successfully read the file. 
return(true); } bool BamInterface::writeHeader(IFILE filePtr, SamFileHeader& header, SamStatus& status) { if((filePtr == NULL) || (filePtr->isOpen() == false)) { // File is not open, return false. status.setStatus(SamStatus::FAIL_ORDER, "Cannot write header since the file pointer is null"); return(false); } char magic[4]; magic[0] = 'B'; magic[1] = 'A'; magic[2] = 'M'; magic[3] = 1; // Write magic to the file. ifwrite(filePtr, magic, 4); //////////////////////////////// // Write the header to the file. //////////////////////////////// // Construct a string containing the entire header. std::string headerString = ""; header.getHeaderString(headerString); int32_t headerLen = headerString.length(); int numWrite = 0; // Write the header length. numWrite = ifwrite(filePtr, &headerLen, sizeof(int32_t)); if(numWrite != sizeof(int32_t)) { status.setStatus(SamStatus::FAIL_IO, "Failed to write the BAM header length."); return(false); } // Write the header to the file. numWrite = ifwrite(filePtr, headerString.c_str(), headerLen); if(numWrite != headerLen) { status.setStatus(SamStatus::FAIL_IO, "Failed to write the BAM header."); return(false); } //////////////////////////////////////////////////////// // Write the Reference Information. const SamReferenceInfo& refInfo = header.getReferenceInfo(); // Get the number of sequences. int32_t numSeq = refInfo.getNumEntries(); ifwrite(filePtr, &numSeq, sizeof(int32_t)); // Write each reference sequence for (int i = 0; i < numSeq; i++) { const char* refName = refInfo.getReferenceName(i); // Add one for the null value. int32_t nameLength = strlen(refName) + 1; // Write the length of the reference name. ifwrite(filePtr, &nameLength, sizeof(int32_t)); // Write the name. ifwrite(filePtr, refName, nameLength); // Write the length of the reference sequence. 
// Interface for reading and writing the header and records of a BAM file
// through an already opened IFILE.
class BamInterface : public GenericSamInterface
{
public:
    BamInterface();
    ~BamInterface();

    // Reads the header section from the specified BAM file and stores it in
    // the passed in header.  The header is reset before reading and its
    // reference information is replaced by the reference table read from
    // the file.
    // Returns false and updates the status on failure (null or unopened
    // file pointer, short read, or a header that fails to parse).
    virtual bool readHeader(IFILE filePtr, SamFileHeader& header,
                            SamStatus& samStatus);

    // Writes the specified header into the specified BAM file: the BAM
    // magic bytes, the header text preceded by its length, and then the
    // name and length of each reference sequence.
    // Returns false and updates the status on failure.
    virtual bool writeHeader(IFILE filePtr, SamFileHeader& header,
                             SamStatus& samStatus);

    // Reads the next record from the specified BAM file and stores it in
    // the passed in record.  If the read does not succeed, the record's
    // status is added to samStatus.
    virtual void readRecord(IFILE filePtr, SamFileHeader& header,
                            SamRecord& record, SamStatus& samStatus);

    // Writes the specified record into the specified BAM file using the
    // requested sequence translation and returns the resulting status.
    // The header argument is not used by this implementation.
    virtual SamStatus::Status writeRecord(IFILE filePtr, SamFileHeader& header,
                                          SamRecord& record,
                                          SamRecord::SequenceTranslation translation);

private:
};
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>. libStatGen-1.0.14/bam/CigarHelper.cpp000066400000000000000000000270361254730101300172650ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ ////////////////////////////////////////////////////////////////////////// #include "CigarHelper.h" // Soft Clip from the beginning of the read to the specified reference position. 
int32_t CigarHelper::softClipBeginByRefPos(SamRecord& record, int32_t refPosition0Based, CigarRoller& newCigar, int32_t &new0BasedPosition) { newCigar.clear(); Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { // Failed to get the cigar. ErrorHandler::handleError("Soft clipping, but failed to read the cigar"); return(NO_CLIP); } // No cigar or position in the record, so return no clip. if((cigar->size() == 0) || (record.get0BasedPosition() == -1)) { return(NO_CLIP); } // Check to see if the reference position occurs before the record starts, // if it does, do no clipping. if(refPosition0Based < record.get0BasedPosition()) { // Not within this read, so nothing to clip. newCigar.Set(record.getCigar()); return(NO_CLIP); } // The position falls after the read starts, so loop through until the // position or the end of the read is found. int32_t readClipPosition = 0; bool clipWritten = false; new0BasedPosition = record.get0BasedPosition(); for(int i = 0; i < cigar->size(); i++) { const Cigar::CigarOperator* op = &(cigar->getOperator(i)); if(clipWritten) { // Clip point has been found, so just add everything. newCigar += *op; // Go to the next operation. continue; } // The clip point has not yet been found, so check to see if we found // it now. // Not a clip, check to see if the operation is found in the // reference. if(Cigar::foundInReference(*op)) { // match, mismatch, deletion, skip // increment the current reference position to just past this // operation. new0BasedPosition += op->count; // Check to see if this is also in the query, because otherwise // the operation is still being consumed. if(Cigar::foundInQuery(*op)) { // Also in the query, determine if the entire thing should // be clipped or just part of it. uint32_t numKeep = 0; // Check to see if we have hit our clip position. if(refPosition0Based < new0BasedPosition) { // The specified clip position is in this cigar operation. 
numKeep = new0BasedPosition - refPosition0Based - 1; if(numKeep > op->count) { // Keep the entire read. This happens because // we keep reading until the first match/mismatch // after the clip. numKeep = op->count; } } // Add the part of this operation that is being clipped // to the clip count. readClipPosition += (op->count - numKeep); // Only write the clip if we found a match/mismatch // to write. Otherwise we will keep accumulating clips // for the case of insertions. if(numKeep > 0) { new0BasedPosition -= numKeep; newCigar.Add(Cigar::softClip, readClipPosition); // Add the clipped part of this cigar to the clip // position. newCigar.Add(op->operation, numKeep); // Found a match after the clip point, so stop // consuming cigar operations. clipWritten = true; continue; } } } else { // Only add hard clips. The softclips will be added in // when the total number is found. if(op->operation == Cigar::hardClip) { // Check if this is the first operation, if so, just write it. if(i == 0) { newCigar += *op; } // Check if it is the last operation (otherwise skip it). else if(i == (cigar->size() - 1)) { // Check whether or not the clip was ever written, and if // not, write it. if(clipWritten == false) { newCigar.Add(Cigar::softClip, readClipPosition); // Since no match/mismatch was ever found, set // the new ref position to the original one. new0BasedPosition = record.get0BasedPosition(); clipWritten = true; } // Add the hard clip. newCigar += *op; } } // Not yet to the clip position, so do not add this operation. if(Cigar::foundInQuery(*op)) { // Found in the query, so update the read clip position. readClipPosition += op->count; } } } // End loop through cigar. // Check whether or not the clip was ever written, and if // not, write it. if(clipWritten == false) { newCigar.Add(Cigar::softClip, readClipPosition); // Since no match/mismatch was ever found, set // the new ref position to the original one. 
new0BasedPosition = record.get0BasedPosition(); } // Subtract 1 since readClipPosition atually contains the first 0based // position that is not clipped. return(readClipPosition - 1); } // Soft Clip from the end of the read at the specified reference position. int32_t CigarHelper::softClipEndByRefPos(SamRecord& record, int32_t refPosition0Based, CigarRoller& newCigar) { newCigar.clear(); Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { // Failed to get the cigar. ErrorHandler::handleError("Soft clipping, but failed to read the cigar"); return(NO_CLIP); } // No cigar or position in the record, so return no clip. if((cigar->size() == 0) || (record.get0BasedPosition() == -1)) { return(NO_CLIP); } // Check to see if the reference position occurs after the record ends, // if so, do no clipping. if(refPosition0Based > record.get0BasedAlignmentEnd()) { // Not within this read, so nothing to clip. newCigar.Set(record.getCigar()); return(NO_CLIP); } // The position falls before the read ends, so loop through until the // position is found. int32_t currentRefPosition = record.get0BasedPosition(); int32_t readClipPosition = 0; for(int i = 0; i < cigar->size(); i++) { const Cigar::CigarOperator* op = &(cigar->getOperator(i)); // If the operation is found in the reference, increase the // reference position. if(Cigar::foundInReference(*op)) { // match, mismatch, deletion, skip // increment the current reference position to just past // this operation. currentRefPosition += op->count; } // Check to see if we have hit our clip position. if(refPosition0Based < currentRefPosition) { // If this read is also in the query (match/mismatch), // write the partial op to the new cigar. int32_t numKeep = 0; if(Cigar::foundInQuery(*op)) { numKeep = op->count - (currentRefPosition - refPosition0Based); if(numKeep > 0) { newCigar.Add(op->operation, numKeep); readClipPosition += numKeep; } } else if(Cigar::isClip(*op)) { // This is a hard clip, so write it. 
newCigar.Add(op->operation, op->count); } else { // Not found in the query (skip/deletion), // so don't write any of the operation. } // Found the clip point, so break. break; } else if(refPosition0Based == currentRefPosition) { newCigar += *op; if(Cigar::foundInQuery(*op)) { readClipPosition += op->count; } } else { // Not yet to the clip position, so add this operation/size to // the new cigar. newCigar += *op; if(Cigar::foundInQuery(*op)) { // Found in the query, so update the read clip position. readClipPosition += op->count; } } } // End loop through cigar. // Before adding the softclip, read from the end of the cigar checking to // see if the operations are in the query, removing operations that are // not (pad/delete/skip) until a hardclip or an operation in the query is // found. We do not want a pad/delete/skip right before a softclip. for(int j = newCigar.size() - 1; j >= 0; j--) { const Cigar::CigarOperator* op = &(newCigar.getOperator(j)); if(!Cigar::foundInQuery(*op) && !Cigar::isClip(*op)) { // pad/delete/skip newCigar.Remove(j); } else if(Cigar::foundInQuery(*op) & Cigar::isClip(*op)) { // Soft clip, so increment the clip position for the return value. // Remove the softclip since the readClipPosition is used to // calculate teh size of the soft clip added. readClipPosition -= op->count; newCigar.Remove(j); } else { // Found a cigar operation that should not be deleted, so stop deleting. break; } } // Determine the number of soft clips. int32_t numSoftClips = record.getReadLength() - readClipPosition; // NOTE that if the previous operation is a softclip, the CigarRoller logic // will merge this with that one. newCigar.Add(Cigar::softClip, numSoftClips); // Check if an ending hard clip needs to be added. 
if(cigar->size() != 0) { const Cigar::CigarOperator* lastOp = &(cigar->getOperator(cigar->size() - 1)); if(lastOp->operation == Cigar::hardClip) { newCigar += *lastOp; } } return(readClipPosition); } libStatGen-1.0.14/bam/CigarHelper.h000066400000000000000000000060101254730101300167170ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __CIGAR_HELPER_H__ #define __CIGAR_HELPER_H__ #include "SamRecord.h" /// Class for helping to filter a SAM/BAM record. class CigarHelper { public: static const int32_t NO_CLIP = -1; /// Soft clip the cigar from the beginning of the read at the specified /// reference position. If the clip position is deleted/skipped /// or is immediately followed by a deletion/skip/pad/insert, that entire /// CIGAR operation is also removed. /// Nothing is clipped if the reference position is before the read starts, /// everything is clipped if the reference position is after the read ends. /// \param record record to calculate the clip for. /// \param refPosition0Based 0-based reference position to end the clip at /// \param newCigar cigar object to set with the updated cigar. /// \param new0BasedPosition new 0-based reference position of the read. /// \param read position where the clip ends (last clipped position) or // NO_CLIP if nothing is clipped. 
static int32_t softClipBeginByRefPos(SamRecord& record, int32_t refPosition0Based, CigarRoller& newCigar, int32_t &new0BasedPosition); /// Soft clip the cigar from the back of the read at the specified /// reference position. If the clip position is deleted/skipped /// or is immediately preceded by a deletion/skip/pad, that entire CIGAR /// operation is also removed. If the clip position is immediately /// preceded by an insertion, the insertion is left in the CIGAR. /// Nothing is clipped if the reference position is after the read ends, /// everything is clipped if the reference position is before the read /// starts (including insertions). /// \param record record to calculate the clip for. /// \param refPosition0Based 0-based reference position to start clip at /// \param newCigar cigar object to set with the updated cigar. /// \param read position where the clip starts or // NO_CLIP if nothing is clipped. static int32_t softClipEndByRefPos(SamRecord& record, int32_t refPosition0Based, CigarRoller& newCigar); }; #endif libStatGen-1.0.14/bam/GenericSamInterface.cpp000066400000000000000000000015471254730101300207350ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
 */

#include "GenericSamInterface.h"

// Nothing to initialize: the constructor body is empty.
GenericSamInterface::GenericSamInterface()
{
}


// Nothing to release: the destructor body is empty.  It is declared virtual
// in the class.
GenericSamInterface::~GenericSamInterface()
{
}
libStatGen-1.0.14/bam/GenericSamInterface.h000066400000000000000000000047211254730101300203770ustar00rootroot00000000000000/*
 *  Copyright (C) 2010  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __GENERIC_SAM_INTERFACE_H__
#define __GENERIC_SAM_INTERFACE_H__

#include "SamStatus.h"
#include "InputFile.h"
#include "SamFileHeader.h"
#include "SamRecord.h"

// Abstract interface for reading and writing the header and records of a
// file.  Every I/O method is pure virtual; per the notes on each method they
// are implemented specifically for sam/bam files.
class GenericSamInterface
{
public:
    GenericSamInterface();
    virtual ~GenericSamInterface();

    // Pure virtual method that reads the header section from the specified file
    // and stores it in the passed in header, returns false and sets the status
    // on failure.
    // Will be implemented specifically for sam/bam files.
    virtual bool readHeader(IFILE filePtr, SamFileHeader& header,
                            SamStatus& status) = 0;

    // Pure virtual method that writes the specified header into the specified
    // file, returns false and sets the status on failure.
    // Will be implemented specifically for sam/bam files.
    virtual bool writeHeader(IFILE filePtr, SamFileHeader& header,
                             SamStatus& status) = 0;

    // Pure virtual method that reads the next record from the specified file
    // and stores it in the passed in record.
    // Returns nothing; the samStatus parameter is the only output channel
    // for the outcome.
    // Will be implemented specifically for sam/bam files.
    // TODO On error, a more detailed message is appended to statusMsg.
    // NOTE(review): there is no parameter named statusMsg; presumably this
    // refers to samStatus - confirm.
    virtual void readRecord(IFILE filePtr, SamFileHeader& header,
                            SamRecord& record, SamStatus& samStatus) = 0;

    // Pure virtual method that writes the specified record into the specified
    // file.
    // Unlike the other methods, the outcome is returned directly as a
    // SamStatus::Status rather than through a SamStatus reference.
    // Will be implemented specifically for sam/bam files.
    virtual SamStatus::Status writeRecord(IFILE filePtr, SamFileHeader& header,
                                          SamRecord& record,
                                          SamRecord::SequenceTranslation translation) = 0;
};

#endif
libStatGen-1.0.14/bam/Makefile000066400000000000000000000007031254730101300160240ustar00rootroot00000000000000TOOLBASE = SamFileHeader SamFile GenericSamInterface SamInterface BamInterface SamRecord BamIndex SamHeaderHD SamHeaderPG SamHeaderRecord SamHeaderSQ SamHeaderRG SamHeaderTag SamValidation SamStatistics SamQuerySeqWithRefHelper SamFilter PileupElement PileupElementBaseQual SamReferenceInfo SamTags PosList CigarHelper SamRecordPool SamCoordOutput SamRecordHelper HDRONLY = Pileup.h SamHelper.h SamFlag.h SamStatus.h include ../Makefiles/Makefile.liblibStatGen-1.0.14/bam/Makefile.depends000066400000000000000000002350411254730101300174520ustar00rootroot00000000000000# DO NOT DELETE $(OBJDIR_OPT)/SamFileHeader.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamFileHeader.o: ../include/StringArray.h $(OBJDIR_OPT)/SamFileHeader.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamFileHeader.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamFileHeader.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamFileHeader.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_OPT)/SamFileHeader.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/SamFileHeader.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamFile.o: SamFile.h SamStatus.h ../include/StatGenStatus.h $(OBJDIR_OPT)/SamFile.o: ../include/ErrorHandler.h ../include/InputFile.h $(OBJDIR_OPT)/SamFile.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_OPT)/SamFile.o: SamReferenceInfo.h ../include/StringArray.h
$(OBJDIR_OPT)/SamFile.o: ../include/StringBasics.h ../include/StringHash.h $(OBJDIR_OPT)/SamFile.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamFile.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_OPT)/SamFile.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_OPT)/SamFile.o: SamHeaderRG.h SamHeaderPG.h SamRecord.h $(OBJDIR_OPT)/SamFile.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamFile.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_OPT)/SamFile.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamFile.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/SamFile.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamFile.o: ../include/MathVector.h ../include/CigarRoller.h $(OBJDIR_OPT)/SamFile.o: ../include/Cigar.h GenericSamInterface.h BamIndex.h $(OBJDIR_OPT)/SamFile.o: ../include/IndexBase.h SamStatistics.h $(OBJDIR_OPT)/SamFile.o: BamInterface.h SamInterface.h $(OBJDIR_OPT)/SamFile.o: ../include/BgzfFileType.h ../include/bgzf.h $(OBJDIR_OPT)/GenericSamInterface.o: GenericSamInterface.h SamStatus.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/InputFile.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_OPT)/GenericSamInterface.o: SamReferenceInfo.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/StringArray.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/StringBasics.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/StringHash.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_OPT)/GenericSamInterface.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/GenericSamInterface.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_OPT)/GenericSamInterface.o: SamHeaderPG.h SamRecord.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/GenomeSequence.h 
$(OBJDIR_OPT)/GenericSamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/Generic.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/MemoryMap.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/MathVector.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/CigarRoller.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/Cigar.h $(OBJDIR_OPT)/SamInterface.o: SamInterface.h GenericSamInterface.h $(OBJDIR_OPT)/SamInterface.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_OPT)/SamInterface.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamInterface.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamInterface.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamInterface.o: ../include/StringArray.h $(OBJDIR_OPT)/SamInterface.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamInterface.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamInterface.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_OPT)/SamInterface.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/SamInterface.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamInterface.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_OPT)/SamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamInterface.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/SamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/SamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamInterface.o: ../include/MathVector.h $(OBJDIR_OPT)/SamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamInterface.o: SamRecordHelper.h $(OBJDIR_OPT)/BamInterface.o: BamInterface.h GenericSamInterface.h $(OBJDIR_OPT)/BamInterface.o: SamStatus.h 
../include/StatGenStatus.h $(OBJDIR_OPT)/BamInterface.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/BamInterface.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/BamInterface.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/BamInterface.o: ../include/StringArray.h $(OBJDIR_OPT)/BamInterface.o: ../include/StringBasics.h $(OBJDIR_OPT)/BamInterface.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/BamInterface.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_OPT)/BamInterface.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/BamInterface.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/BamInterface.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_OPT)/BamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/BamInterface.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/BamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/BamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/BamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/BamInterface.o: ../include/MathVector.h $(OBJDIR_OPT)/BamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/BamInterface.o: ../include/CharBuffer.h $(OBJDIR_OPT)/SamRecord.o: ../include/bam.h SamRecord.h $(OBJDIR_OPT)/SamRecord.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamRecord.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_OPT)/SamRecord.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamRecord.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamRecord.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_OPT)/SamRecord.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/SamRecord.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamRecord.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_OPT)/SamRecord.o: ../include/Error.h ../include/MathVector.h $(OBJDIR_OPT)/SamRecord.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_OPT)/SamRecord.o: 
SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_OPT)/SamRecord.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamRecord.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/SamRecord.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamRecord.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamRecord.o: SamValidation.h SamFile.h GenericSamInterface.h $(OBJDIR_OPT)/SamRecord.o: BamIndex.h ../include/IndexBase.h SamStatistics.h $(OBJDIR_OPT)/SamRecord.o: ../include/BaseUtilities.h $(OBJDIR_OPT)/SamRecord.o: SamQuerySeqWithRefHelper.h $(OBJDIR_OPT)/BamIndex.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_OPT)/BamIndex.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/BamIndex.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/BamIndex.o: ../include/ErrorHandler.h SamStatus.h $(OBJDIR_OPT)/SamHeaderHD.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_OPT)/SamHeaderHD.o: ../include/StringArray.h $(OBJDIR_OPT)/SamHeaderHD.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamHeaderHD.o: ../include/FileType.h ../include/StringHash.h $(OBJDIR_OPT)/SamHeaderHD.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamHeaderHD.o: SamHeaderTag.h $(OBJDIR_OPT)/SamHeaderPG.o: SamHeaderPG.h SamHeaderRecord.h $(OBJDIR_OPT)/SamHeaderPG.o: ../include/StringArray.h $(OBJDIR_OPT)/SamHeaderPG.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamHeaderPG.o: ../include/FileType.h ../include/StringHash.h $(OBJDIR_OPT)/SamHeaderPG.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamHeaderPG.o: SamHeaderTag.h $(OBJDIR_OPT)/SamHeaderRecord.o: SamHeaderRecord.h ../include/StringArray.h $(OBJDIR_OPT)/SamHeaderRecord.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamHeaderRecord.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamHeaderRecord.o: ../include/StringHash.h $(OBJDIR_OPT)/SamHeaderRecord.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamHeaderRecord.o: SamHeaderTag.h 
$(OBJDIR_OPT)/SamHeaderSQ.o: SamHeaderSQ.h SamHeaderRecord.h $(OBJDIR_OPT)/SamHeaderSQ.o: ../include/StringArray.h $(OBJDIR_OPT)/SamHeaderSQ.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamHeaderSQ.o: ../include/FileType.h ../include/StringHash.h $(OBJDIR_OPT)/SamHeaderSQ.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamHeaderSQ.o: SamHeaderTag.h $(OBJDIR_OPT)/SamHeaderRG.o: SamHeaderRG.h SamHeaderRecord.h $(OBJDIR_OPT)/SamHeaderRG.o: ../include/StringArray.h $(OBJDIR_OPT)/SamHeaderRG.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamHeaderRG.o: ../include/FileType.h ../include/StringHash.h $(OBJDIR_OPT)/SamHeaderRG.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamHeaderRG.o: SamHeaderTag.h $(OBJDIR_OPT)/SamHeaderTag.o: SamHeaderTag.h $(OBJDIR_OPT)/SamValidation.o: SamValidation.h SamFile.h SamStatus.h $(OBJDIR_OPT)/SamValidation.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamValidation.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamValidation.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamValidation.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamValidation.o: ../include/StringArray.h $(OBJDIR_OPT)/SamValidation.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamValidation.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamValidation.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_OPT)/SamValidation.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/SamValidation.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamValidation.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_OPT)/SamValidation.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamValidation.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/SamValidation.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamValidation.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/SamValidation.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamValidation.o: ../include/MathVector.h 
$(OBJDIR_OPT)/SamValidation.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamValidation.o: GenericSamInterface.h BamIndex.h $(OBJDIR_OPT)/SamValidation.o: ../include/IndexBase.h SamStatistics.h $(OBJDIR_OPT)/SamValidation.o: SamTags.h $(OBJDIR_OPT)/SamStatistics.o: SamStatistics.h SamRecord.h $(OBJDIR_OPT)/SamStatistics.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamStatistics.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamStatistics.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/SamStatistics.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamStatistics.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamStatistics.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamStatistics.o: ../include/StringArray.h $(OBJDIR_OPT)/SamStatistics.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/SamStatistics.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamStatistics.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamStatistics.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamStatistics.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_OPT)/SamStatistics.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamStatistics.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamStatistics.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_OPT)/SamStatistics.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_OPT)/SamStatistics.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamStatistics.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamStatistics.o: SamFlag.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamQuerySeqWithRefHelper.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamRecord.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/Generic.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/MemoryMap.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: 
../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/InputFile.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/FileType.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/StringArray.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamStatus.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/LongHash.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/Error.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/MathVector.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/IntArray.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/StringHash.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/Constant.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamHeaderPG.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/CigarRoller.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/Cigar.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: ../include/BaseUtilities.h $(OBJDIR_OPT)/SamQuerySeqWithRefHelper.o: SamFlag.h $(OBJDIR_OPT)/SamFilter.o: SamFilter.h SamRecord.h $(OBJDIR_OPT)/SamFilter.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamFilter.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_OPT)/SamFilter.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamFilter.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamFilter.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_OPT)/SamFilter.o: 
../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/SamFilter.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamFilter.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_OPT)/SamFilter.o: ../include/Error.h ../include/MathVector.h $(OBJDIR_OPT)/SamFilter.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_OPT)/SamFilter.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_OPT)/SamFilter.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamFilter.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/SamFilter.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamFilter.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamFilter.o: SamQuerySeqWithRefHelper.h $(OBJDIR_OPT)/SamFilter.o: ../include/BaseUtilities.h SamFlag.h $(OBJDIR_OPT)/PileupElement.o: PileupElement.h SamRecord.h $(OBJDIR_OPT)/PileupElement.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/PileupElement.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/PileupElement.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/PileupElement.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/PileupElement.o: ../include/StringBasics.h $(OBJDIR_OPT)/PileupElement.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/PileupElement.o: ../include/StringArray.h $(OBJDIR_OPT)/PileupElement.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/PileupElement.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/PileupElement.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/PileupElement.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/PileupElement.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_OPT)/PileupElement.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/PileupElement.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/PileupElement.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_OPT)/PileupElement.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_OPT)/PileupElement.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/PileupElement.o: 
../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/PileupElementBaseQual.o: PileupElementBaseQual.h $(OBJDIR_OPT)/PileupElementBaseQual.o: PileupElement.h SamRecord.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/Generic.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/MemoryMap.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/StringBasics.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/InputFile.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/FileType.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/StringArray.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/PileupElementBaseQual.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/LongHash.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/Error.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/MathVector.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_OPT)/PileupElementBaseQual.o: SamReferenceInfo.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/StringHash.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/Constant.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_OPT)/PileupElementBaseQual.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/PileupElementBaseQual.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_OPT)/PileupElementBaseQual.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_OPT)/PileupElementBaseQual.o: ../include/Cigar.h $(OBJDIR_OPT)/SamReferenceInfo.o: SamReferenceInfo.h ../include/StringArray.h $(OBJDIR_OPT)/SamReferenceInfo.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamReferenceInfo.o: ../include/InputFile.h $(OBJDIR_OPT)/SamReferenceInfo.o: 
../include/FileType.h $(OBJDIR_OPT)/SamReferenceInfo.o: ../include/StringHash.h $(OBJDIR_OPT)/SamReferenceInfo.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamReferenceInfo.o: ../include/IntArray.h $(OBJDIR_OPT)/SamTags.o: SamTags.h SamRecord.h ../include/GenomeSequence.h $(OBJDIR_OPT)/SamTags.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_OPT)/SamTags.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamTags.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/SamTags.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_OPT)/SamTags.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/SamTags.o: ../include/StatGenStatus.h ../include/ErrorHandler.h $(OBJDIR_OPT)/SamTags.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamTags.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_OPT)/SamTags.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamTags.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamTags.o: ../include/Hash.h SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_OPT)/SamTags.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_OPT)/SamTags.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_OPT)/SamTags.o: ../include/Cigar.h ../include/BaseUtilities.h $(OBJDIR_OPT)/PosList.o: PosList.h $(OBJDIR_OPT)/CigarHelper.o: CigarHelper.h SamRecord.h $(OBJDIR_OPT)/CigarHelper.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/CigarHelper.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_OPT)/CigarHelper.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/CigarHelper.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_OPT)/CigarHelper.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_OPT)/CigarHelper.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/CigarHelper.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/CigarHelper.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_OPT)/CigarHelper.o: ../include/Error.h 
../include/MathVector.h $(OBJDIR_OPT)/CigarHelper.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_OPT)/CigarHelper.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_OPT)/CigarHelper.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/CigarHelper.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_OPT)/CigarHelper.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/CigarHelper.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamRecordPool.o: SamRecordPool.h SamRecord.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/StringArray.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_OPT)/SamRecordPool.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_OPT)/SamRecordPool.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_OPT)/SamRecordPool.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_OPT)/SamRecordPool.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamCoordOutput.o: SamCoordOutput.h SamFile.h SamStatus.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/InputFile.h ../include/FileType.h 
$(OBJDIR_OPT)/SamCoordOutput.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/StringArray.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_OPT)/SamCoordOutput.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_OPT)/SamCoordOutput.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_OPT)/SamCoordOutput.o: SamHeaderPG.h SamRecord.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/MathVector.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/SamCoordOutput.o: GenericSamInterface.h BamIndex.h $(OBJDIR_OPT)/SamCoordOutput.o: ../include/IndexBase.h SamStatistics.h $(OBJDIR_OPT)/SamCoordOutput.o: SamRecordPool.h SamHelper.h $(OBJDIR_OPT)/SamRecordHelper.o: SamRecordHelper.h SamRecord.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/MemoryMapArray.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/StringBasics.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/StringArray.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_OPT)/SamRecordHelper.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/ErrorHandler.h $(OBJDIR_OPT)/SamRecordHelper.o: 
../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/MathVector.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_OPT)/SamRecordHelper.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_OPT)/SamRecordHelper.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_OPT)/SamRecordHelper.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_OPT)/SamRecordHelper.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_OPT)/SamRecordHelper.o: ../include/Cigar.h $(OBJDIR_DEBUG)/SamFileHeader.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_DEBUG)/SamFileHeader.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamFileHeader.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamFileHeader.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamFileHeader.o: ../include/StringHash.h $(OBJDIR_DEBUG)/SamFileHeader.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamFileHeader.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_DEBUG)/SamFileHeader.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamFileHeader.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/SamFile.o: SamFile.h SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamFile.o: ../include/ErrorHandler.h ../include/InputFile.h $(OBJDIR_DEBUG)/SamFile.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_DEBUG)/SamFile.o: SamReferenceInfo.h ../include/StringArray.h $(OBJDIR_DEBUG)/SamFile.o: ../include/StringBasics.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamFile.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamFile.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_DEBUG)/SamFile.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_DEBUG)/SamFile.o: SamHeaderRG.h SamHeaderPG.h SamRecord.h $(OBJDIR_DEBUG)/SamFile.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamFile.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_DEBUG)/SamFile.o: 
../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamFile.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamFile.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/SamFile.o: ../include/MathVector.h ../include/CigarRoller.h $(OBJDIR_DEBUG)/SamFile.o: ../include/Cigar.h GenericSamInterface.h $(OBJDIR_DEBUG)/SamFile.o: BamIndex.h ../include/IndexBase.h SamStatistics.h $(OBJDIR_DEBUG)/SamFile.o: BamInterface.h SamInterface.h $(OBJDIR_DEBUG)/SamFile.o: ../include/BgzfFileType.h ../include/bgzf.h $(OBJDIR_DEBUG)/GenericSamInterface.o: GenericSamInterface.h SamStatus.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/InputFile.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_DEBUG)/GenericSamInterface.o: SamReferenceInfo.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/StringArray.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/StringHash.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/Constant.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/Hash.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_DEBUG)/GenericSamInterface.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/GenericSamInterface.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/GenericSamInterface.o: SamHeaderPG.h SamRecord.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/Generic.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/MemoryMap.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/LongHash.h 
$(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/Error.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/MathVector.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/CigarRoller.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/Cigar.h $(OBJDIR_DEBUG)/SamInterface.o: SamInterface.h GenericSamInterface.h $(OBJDIR_DEBUG)/SamInterface.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamInterface.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_DEBUG)/SamInterface.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamInterface.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/SamInterface.o: SamHeaderPG.h SamRecord.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/MathVector.h $(OBJDIR_DEBUG)/SamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamInterface.o: SamRecordHelper.h $(OBJDIR_DEBUG)/BamInterface.o: BamInterface.h GenericSamInterface.h $(OBJDIR_DEBUG)/BamInterface.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/BamInterface.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_DEBUG)/BamInterface.o: 
../include/StringArray.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_DEBUG)/BamInterface.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/BamInterface.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/BamInterface.o: SamHeaderPG.h SamRecord.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/MathVector.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/CharBuffer.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/bam.h SamRecord.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/Error.h ../include/MathVector.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/SamRecord.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamRecord.o: SamHeaderHD.h SamHeaderRecord.h 
SamHeaderTag.h $(OBJDIR_DEBUG)/SamRecord.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamRecord.o: SamValidation.h SamFile.h GenericSamInterface.h $(OBJDIR_DEBUG)/SamRecord.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_DEBUG)/SamRecord.o: SamStatistics.h ../include/BaseUtilities.h $(OBJDIR_DEBUG)/SamRecord.o: SamQuerySeqWithRefHelper.h $(OBJDIR_DEBUG)/BamIndex.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_DEBUG)/BamIndex.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/BamIndex.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/BamIndex.o: ../include/ErrorHandler.h SamStatus.h $(OBJDIR_DEBUG)/SamHeaderHD.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamHeaderHD.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamHeaderHD.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamHeaderHD.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamHeaderHD.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/SamHeaderHD.o: ../include/Hash.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamHeaderPG.o: SamHeaderPG.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamHeaderPG.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamHeaderPG.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamHeaderPG.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamHeaderPG.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/SamHeaderPG.o: ../include/Hash.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: SamHeaderRecord.h ../include/StringArray.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: ../include/InputFile.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: ../include/FileType.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: ../include/StringHash.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamHeaderRecord.o: SamHeaderTag.h $(OBJDIR_DEBUG)/SamHeaderSQ.o: SamHeaderSQ.h SamHeaderRecord.h 
$(OBJDIR_DEBUG)/SamHeaderSQ.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamHeaderSQ.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamHeaderSQ.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamHeaderSQ.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/SamHeaderSQ.o: ../include/Hash.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamHeaderRG.o: SamHeaderRG.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamHeaderRG.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamHeaderRG.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamHeaderRG.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamHeaderRG.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/SamHeaderRG.o: ../include/Hash.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamHeaderTag.o: SamHeaderTag.h $(OBJDIR_DEBUG)/SamValidation.o: SamValidation.h SamFile.h SamStatus.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamValidation.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/StringHash.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_DEBUG)/SamValidation.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamValidation.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/SamValidation.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/LongHash.h ../include/Error.h 
$(OBJDIR_DEBUG)/SamValidation.o: ../include/MathVector.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamValidation.o: GenericSamInterface.h BamIndex.h $(OBJDIR_DEBUG)/SamValidation.o: ../include/IndexBase.h SamStatistics.h $(OBJDIR_DEBUG)/SamValidation.o: SamTags.h $(OBJDIR_DEBUG)/SamStatistics.o: SamStatistics.h SamRecord.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamStatistics.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/MathVector.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/SamStatistics.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamStatistics.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamStatistics.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/SamStatistics.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_DEBUG)/SamStatistics.o: ../include/Cigar.h SamFlag.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamQuerySeqWithRefHelper.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamRecord.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/Generic.h 
$(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/InputFile.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/FileType.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamStatus.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/LongHash.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/Error.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/MathVector.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/IntArray.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamFileHeader.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamReferenceInfo.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/StringHash.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/Constant.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamHeaderPG.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/CigarRoller.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/Cigar.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: ../include/BaseUtilities.h $(OBJDIR_DEBUG)/SamQuerySeqWithRefHelper.o: SamFlag.h $(OBJDIR_DEBUG)/SamFilter.o: SamFilter.h SamRecord.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/MemoryMap.h 
../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/Error.h ../include/MathVector.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/SamFilter.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamFilter.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamFilter.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamFilter.o: SamQuerySeqWithRefHelper.h $(OBJDIR_DEBUG)/SamFilter.o: ../include/BaseUtilities.h SamFlag.h $(OBJDIR_DEBUG)/PileupElement.o: PileupElement.h SamRecord.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/StringArray.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/PileupElement.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/MathVector.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/PileupElement.o: SamReferenceInfo.h 
../include/StringHash.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/PileupElement.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/PileupElement.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/PileupElement.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_DEBUG)/PileupElement.o: ../include/Cigar.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: PileupElementBaseQual.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: PileupElement.h SamRecord.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/Generic.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/MemoryMap.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/InputFile.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/FileType.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/StringArray.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: SamStatus.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/LongHash.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/Error.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/MathVector.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/IntArray.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/StringHash.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/Constant.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: 
SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: SamHeaderPG.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/CigarRoller.h $(OBJDIR_DEBUG)/PileupElementBaseQual.o: ../include/Cigar.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: SamReferenceInfo.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/InputFile.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/FileType.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/StringHash.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamReferenceInfo.o: ../include/IntArray.h $(OBJDIR_DEBUG)/SamTags.o: SamTags.h SamRecord.h ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamTags.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_DEBUG)/SamTags.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamTags.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_DEBUG)/SamTags.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_DEBUG)/SamTags.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_DEBUG)/SamTags.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamTags.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_DEBUG)/SamTags.o: ../include/Error.h ../include/MathVector.h $(OBJDIR_DEBUG)/SamTags.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/SamTags.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamTags.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamTags.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamTags.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/SamTags.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamTags.o: ../include/BaseUtilities.h $(OBJDIR_DEBUG)/PosList.o: PosList.h $(OBJDIR_DEBUG)/CigarHelper.o: CigarHelper.h SamRecord.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/GenomeSequence.h 
$(OBJDIR_DEBUG)/CigarHelper.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/StringArray.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_DEBUG)/CigarHelper.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_DEBUG)/CigarHelper.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_DEBUG)/CigarHelper.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/CigarHelper.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamRecordPool.o: SamRecordPool.h SamRecord.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamRecordPool.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/MathVector.h 
$(OBJDIR_DEBUG)/SamRecordPool.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/SamRecordPool.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamRecordPool.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamRecordPool.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/SamRecordPool.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_DEBUG)/SamRecordPool.o: ../include/Cigar.h $(OBJDIR_DEBUG)/SamCoordOutput.o: SamCoordOutput.h SamFile.h SamStatus.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/InputFile.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_DEBUG)/SamCoordOutput.o: SamReferenceInfo.h ../include/StringArray.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/StringHash.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_DEBUG)/SamCoordOutput.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_DEBUG)/SamCoordOutput.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_DEBUG)/SamCoordOutput.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/MathVector.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/SamCoordOutput.o: GenericSamInterface.h BamIndex.h $(OBJDIR_DEBUG)/SamCoordOutput.o: ../include/IndexBase.h SamStatistics.h 
$(OBJDIR_DEBUG)/SamCoordOutput.o: SamRecordPool.h SamHelper.h $(OBJDIR_DEBUG)/SamRecordHelper.o: SamRecordHelper.h SamRecord.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/MemoryMapArray.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/Generic.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/MemoryMap.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/InputFile.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/FileType.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/StringArray.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/SamRecordHelper.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/ErrorHandler.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/MathVector.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_DEBUG)/SamRecordHelper.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_DEBUG)/SamRecordHelper.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_DEBUG)/SamRecordHelper.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_DEBUG)/SamRecordHelper.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_DEBUG)/SamRecordHelper.o: ../include/Cigar.h $(OBJDIR_PROFILE)/SamFileHeader.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamFileHeader.o: ../include/IntArray.h 
SamHeaderHD.h $(OBJDIR_PROFILE)/SamFileHeader.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/SamFileHeader.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamFile.o: SamFile.h SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamFile.o: ../include/ErrorHandler.h ../include/InputFile.h $(OBJDIR_PROFILE)/SamFile.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_PROFILE)/SamFile.o: SamReferenceInfo.h ../include/StringArray.h $(OBJDIR_PROFILE)/SamFile.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamFile.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_PROFILE)/SamFile.o: ../include/Hash.h ../include/IntArray.h $(OBJDIR_PROFILE)/SamFile.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/SamFile.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamFile.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamFile.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_PROFILE)/SamFile.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamFile.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamFile.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamFile.o: ../include/MathVector.h ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamFile.o: ../include/Cigar.h GenericSamInterface.h $(OBJDIR_PROFILE)/SamFile.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_PROFILE)/SamFile.o: SamStatistics.h BamInterface.h SamInterface.h $(OBJDIR_PROFILE)/SamFile.o: ../include/BgzfFileType.h ../include/bgzf.h $(OBJDIR_PROFILE)/GenericSamInterface.o: GenericSamInterface.h SamStatus.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/InputFile.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/FileType.h $(OBJDIR_PROFILE)/GenericSamInterface.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/StringArray.h 
$(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/StringHash.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/Constant.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/Hash.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_PROFILE)/GenericSamInterface.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/GenericSamInterface.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_PROFILE)/GenericSamInterface.o: SamHeaderPG.h SamRecord.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/Generic.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/LongHash.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/Error.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/MathVector.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/CigarRoller.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/Cigar.h $(OBJDIR_PROFILE)/SamInterface.o: SamInterface.h GenericSamInterface.h $(OBJDIR_PROFILE)/SamInterface.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_PROFILE)/SamInterface.o: SamReferenceInfo.h ../include/StringArray.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_PROFILE)/SamInterface.o: SamHeaderRecord.h 
SamHeaderTag.h $(OBJDIR_PROFILE)/SamInterface.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamInterface.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_PROFILE)/SamInterface.o: SamRecordHelper.h $(OBJDIR_PROFILE)/BamInterface.o: BamInterface.h GenericSamInterface.h $(OBJDIR_PROFILE)/BamInterface.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/InputFile.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_PROFILE)/BamInterface.o: SamReferenceInfo.h ../include/StringArray.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/StringHash.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_PROFILE)/BamInterface.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/BamInterface.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/BamInterface.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/MathVector.h 
$(OBJDIR_PROFILE)/BamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/CharBuffer.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/bam.h SamRecord.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_PROFILE)/SamRecord.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_PROFILE)/SamRecord.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_PROFILE)/SamRecord.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_PROFILE)/SamRecord.o: SamValidation.h SamFile.h $(OBJDIR_PROFILE)/SamRecord.o: GenericSamInterface.h BamIndex.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/IndexBase.h SamStatistics.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/BaseUtilities.h $(OBJDIR_PROFILE)/SamRecord.o: SamQuerySeqWithRefHelper.h $(OBJDIR_PROFILE)/BamIndex.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_PROFILE)/BamIndex.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/BamIndex.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/BamIndex.o: ../include/ErrorHandler.h SamStatus.h $(OBJDIR_PROFILE)/SamHeaderHD.o: 
SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamHeaderHD.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamHeaderHD.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamHeaderHD.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/SamHeaderHD.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamHeaderHD.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamHeaderHD.o: SamHeaderTag.h $(OBJDIR_PROFILE)/SamHeaderPG.o: SamHeaderPG.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamHeaderPG.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamHeaderPG.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamHeaderPG.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/SamHeaderPG.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamHeaderPG.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamHeaderPG.o: SamHeaderTag.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: SamHeaderRecord.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamHeaderRecord.o: SamHeaderTag.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: SamHeaderSQ.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamHeaderSQ.o: SamHeaderTag.h $(OBJDIR_PROFILE)/SamHeaderRG.o: SamHeaderRG.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamHeaderRG.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamHeaderRG.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamHeaderRG.o: 
../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/SamHeaderRG.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamHeaderRG.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamHeaderRG.o: SamHeaderTag.h $(OBJDIR_PROFILE)/SamHeaderTag.o: SamHeaderTag.h $(OBJDIR_PROFILE)/SamValidation.o: SamValidation.h SamFile.h SamStatus.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_PROFILE)/SamValidation.o: SamReferenceInfo.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_PROFILE)/SamValidation.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/SamValidation.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamValidation.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/Generic.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamValidation.o: ../include/Cigar.h GenericSamInterface.h $(OBJDIR_PROFILE)/SamValidation.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_PROFILE)/SamValidation.o: SamStatistics.h SamTags.h $(OBJDIR_PROFILE)/SamStatistics.o: SamStatistics.h SamRecord.h 
$(OBJDIR_PROFILE)/SamStatistics.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/Generic.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamStatistics.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_PROFILE)/SamStatistics.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamStatistics.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamStatistics.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_PROFILE)/SamStatistics.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamStatistics.o: ../include/Cigar.h SamFlag.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamQuerySeqWithRefHelper.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamRecord.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/Generic.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: 
../include/InputFile.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamStatus.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/LongHash.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/Error.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/IntArray.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamFileHeader.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamReferenceInfo.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/Constant.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamHeaderRecord.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/Cigar.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: ../include/BaseUtilities.h $(OBJDIR_PROFILE)/SamQuerySeqWithRefHelper.o: SamFlag.h $(OBJDIR_PROFILE)/SamFilter.o: SamFilter.h SamRecord.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/InputFile.h ../include/FileType.h 
$(OBJDIR_PROFILE)/SamFilter.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/MathVector.h ../include/IntArray.h $(OBJDIR_PROFILE)/SamFilter.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/StringHash.h ../include/Constant.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_PROFILE)/SamFilter.o: SamHeaderRecord.h SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_PROFILE)/SamFilter.o: SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_PROFILE)/SamFilter.o: SamQuerySeqWithRefHelper.h $(OBJDIR_PROFILE)/SamFilter.o: ../include/BaseUtilities.h SamFlag.h $(OBJDIR_PROFILE)/PileupElement.o: PileupElement.h SamRecord.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/Generic.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/InputFile.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/FileType.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/StringArray.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/PileupElement.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/MathVector.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_PROFILE)/PileupElement.o: 
SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/PileupElement.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_PROFILE)/PileupElement.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_PROFILE)/PileupElement.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_PROFILE)/PileupElement.o: ../include/Cigar.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: PileupElementBaseQual.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: PileupElement.h SamRecord.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/Generic.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/InputFile.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/FileType.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/StringArray.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: SamStatus.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/LongHash.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/Error.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/MathVector.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/IntArray.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: SamFileHeader.h SamReferenceInfo.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/StringHash.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/Constant.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/Hash.h SamHeaderHD.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: 
SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: SamHeaderPG.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/CigarRoller.h $(OBJDIR_PROFILE)/PileupElementBaseQual.o: ../include/Cigar.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: SamReferenceInfo.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamReferenceInfo.o: ../include/IntArray.h $(OBJDIR_PROFILE)/SamTags.o: SamTags.h SamRecord.h $(OBJDIR_PROFILE)/SamTags.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamTags.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_PROFILE)/SamTags.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamTags.o: ../include/StringBasics.h ../include/InputFile.h $(OBJDIR_PROFILE)/SamTags.o: ../include/FileType.h ../include/StringArray.h $(OBJDIR_PROFILE)/SamTags.o: ../include/GenomeSequenceHelpers.h SamStatus.h $(OBJDIR_PROFILE)/SamTags.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamTags.o: ../include/ErrorHandler.h ../include/LongHash.h $(OBJDIR_PROFILE)/SamTags.o: ../include/Error.h ../include/MathVector.h $(OBJDIR_PROFILE)/SamTags.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_PROFILE)/SamTags.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_PROFILE)/SamTags.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamTags.o: SamHeaderHD.h SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/SamTags.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamTags.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_PROFILE)/SamTags.o: ../include/BaseUtilities.h 
$(OBJDIR_PROFILE)/PosList.o: PosList.h $(OBJDIR_PROFILE)/CigarHelper.o: CigarHelper.h SamRecord.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/Generic.h ../include/MemoryMap.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/StringArray.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/CigarHelper.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/MathVector.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_PROFILE)/CigarHelper.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/CigarHelper.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_PROFILE)/CigarHelper.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_PROFILE)/CigarHelper.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_PROFILE)/CigarHelper.o: ../include/Cigar.h $(OBJDIR_PROFILE)/SamRecordPool.o: SamRecordPool.h SamRecord.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/Generic.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/StringArray.h 
$(OBJDIR_PROFILE)/SamRecordPool.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamRecordPool.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_PROFILE)/SamRecordPool.o: SamReferenceInfo.h ../include/StringHash.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamRecordPool.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamRecordPool.o: SamHeaderTag.h SamHeaderSQ.h SamHeaderRG.h $(OBJDIR_PROFILE)/SamRecordPool.o: SamHeaderPG.h ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamRecordPool.o: ../include/Cigar.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamCoordOutput.h SamFile.h SamStatus.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/FileType.h SamFileHeader.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamReferenceInfo.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/IntArray.h SamHeaderHD.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamHeaderRecord.h SamHeaderTag.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamHeaderSQ.h SamHeaderRG.h SamHeaderPG.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamRecord.h ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/Generic.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamCoordOutput.o: 
../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamCoordOutput.o: ../include/Cigar.h GenericSamInterface.h $(OBJDIR_PROFILE)/SamCoordOutput.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamStatistics.h SamRecordPool.h $(OBJDIR_PROFILE)/SamCoordOutput.o: SamHelper.h $(OBJDIR_PROFILE)/SamRecordHelper.o: SamRecordHelper.h SamRecord.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/Generic.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/MemoryMap.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/InputFile.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/FileType.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/StringArray.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/SamRecordHelper.o: SamStatus.h ../include/StatGenStatus.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/ErrorHandler.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/MathVector.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/IntArray.h SamFileHeader.h $(OBJDIR_PROFILE)/SamRecordHelper.o: SamReferenceInfo.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/StringHash.h $(OBJDIR_PROFILE)/SamRecordHelper.o: ../include/Constant.h ../include/Hash.h $(OBJDIR_PROFILE)/SamRecordHelper.o: SamHeaderHD.h SamHeaderRecord.h $(OBJDIR_PROFILE)/SamRecordHelper.o: SamHeaderTag.h SamHeaderSQ.h $(OBJDIR_PROFILE)/SamRecordHelper.o: SamHeaderRG.h SamHeaderPG.h 
/// Default analysis functor for Pileup: forwards to the element's own
/// analyze() method.
template <class PILEUP_TYPE>
class defaultPileup
{
public:
    /// Invoke analyze() on the caller's element.
    /// \param element pileup element to analyze (analyzed in place).
    /// \return always true.
    bool operator() (PILEUP_TYPE& element)
    {
        element.analyze();
        return true;
    }

    /// Invoke analyze() on the element passed in.
    /// NOTE(review): the parameter is taken by value, so analyze() runs on
    /// a copy made with PILEUP_TYPE's copy constructor, not on the caller's
    /// object - confirm this is intended.
    void analyze(PILEUP_TYPE element)
    {
        element.analyze();
    }
};
Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp = FUNC_CLASS()); /// Perform pileup with a reference and a specified window size. Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp = FUNC_CLASS()); /// Destructor virtual ~Pileup(); /// Performs a pileup on the specified file. /// \param excludeFlag if specified, if any bit set in the exclude flag /// is set in the record's flag, it will be dropped. /// Defaulted to exclude: /// * unmapped, /// * not primary alignment /// * failed platform/vendor quality check /// * PCR or optical duplicate /// \param includeFlag if specified, every bit must be set in the /// record's flag for it to be included - /// defaulted to 0, no bits are required to be set. /// \return 0 for success and non-zero for failure. virtual int processFile(const std::string& fileName, uint16_t excludeFlag = 0x0704, uint16_t includeFlag = 0); /// Add an alignment to the pileup. virtual void processAlignment(SamRecord& record); /// Add only positions that fall within the specified region of the /// alignment to the pileup and outside of the specified excluded positions. /// \param record alignment to be added to the pileup. /// \param startPos 0-based start position of the bases that should be /// added to the pileup. /// \param endPos 0-based end position of the bases that should be added /// to the pileup (this position is not added). /// Set to -1 if there is no end position to the region. /// \param excludeList list of refID/positions to exclude from processing. virtual void processAlignmentRegion(SamRecord& record, int startPos, int endPos, PosList* excludeList = NULL); /// Done processing, flush every position that is currently being stored /// in the pileup. void flushPileup(); protected: FUNC_CLASS myAnalyzeFuncPtr; // Always need the reference position. 
void addAlignmentPosition(int refPosition, SamRecord& record); virtual void flushPileup(int refID, int refPosition); void flushPileup(int refPosition); // Get the position in the myElements container that is associated // with the specified position. If the specified position cannot // fit within the myElements container, -1 is returned. int pileupPosition(int refPosition); virtual void resetElement(PILEUP_TYPE& element, int position); virtual void addElement(PILEUP_TYPE& element, SamRecord& record); virtual void analyzeElement(PILEUP_TYPE& element); virtual void analyzeHead(); std::vector myElements; int pileupStart; int pileupHead; int pileupTail; // last used position int pileupWindow; int myCurrentRefID; GenomeSequence* myRefPtr; }; template Pileup::Pileup(const FUNC_CLASS& fp) : myAnalyzeFuncPtr(fp), myElements(), pileupStart(0), pileupHead(0), pileupTail(-1), pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE), myCurrentRefID(-2), myRefPtr(NULL) { // Not using pointers since this is templated. myElements.resize(pileupWindow); } template Pileup::Pileup(int window, const FUNC_CLASS& fp) : myAnalyzeFuncPtr(fp), myElements(), pileupStart(0), pileupHead(0), pileupTail(-1), pileupWindow(window), myCurrentRefID(-2), myRefPtr(NULL) { // Not using pointers since this is templated. myElements.resize(window); } template Pileup::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp) : myAnalyzeFuncPtr(fp), myElements(), pileupStart(0), pileupHead(0), pileupTail(-1), pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE), myCurrentRefID(-2), myRefPtr(NULL) { myRefPtr = new GenomeSequence(refSeqFileName.c_str()); // Not using pointers since this is templated. 
myElements.resize(pileupWindow); PILEUP_TYPE::setReference(myRefPtr); } template Pileup::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp) : myAnalyzeFuncPtr(fp), myElements(), pileupStart(0), pileupHead(0), pileupTail(-1), pileupWindow(window), myCurrentRefID(-2), myRefPtr(NULL) { myRefPtr = new GenomeSequence(refSeqFileName.c_str()); // Not using pointers since this is templated. myElements.resize(window); PILEUP_TYPE::setReference(myRefPtr); } template Pileup::~Pileup() { flushPileup(); if(myRefPtr != NULL) { delete myRefPtr; myRefPtr = NULL; } } template int Pileup::processFile(const std::string& fileName, uint16_t excludeFlag, uint16_t includeFlag) { SamFile samIn; SamFileHeader header; SamRecord record; if(myRefPtr != NULL) { samIn.SetReference(myRefPtr); } if(!samIn.OpenForRead(fileName.c_str())) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } if(!samIn.ReadHeader(header)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // The file needs to be sorted by coordinate. samIn.setSortedValidation(SamFile::COORDINATE); // Iterate over all records while (samIn.ReadRecord(header, record)) { uint16_t flag = record.getFlag(); if(flag & excludeFlag) { // This record has an excluded flag set, // so continue to the next one. continue; } if((flag & includeFlag) != includeFlag) { // This record does not have all required flags set, // so continue to the next one. continue; } processAlignment(record); } flushPileup(); int returnValue = 0; if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed to read a record. 
fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnValue = samIn.GetStatus(); } return(returnValue); } template void Pileup::processAlignment(SamRecord& record) { int refPosition = record.get0BasedPosition(); int refID = record.getReferenceID(); // Flush any elements from the pileup that are prior to this record // since the file is sorted, we are done with those positions. flushPileup(refID, refPosition); // Loop through for each reference position covered by the record. // It is up to the PILEUP_TYPE to handle insertions/deletions, etc // that are related with the given reference position. for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition) { addAlignmentPosition(refPosition, record); } } template void Pileup::processAlignmentRegion(SamRecord& record, int startPos, int endPos, PosList* excludeList) { int refPosition = record.get0BasedPosition(); int refID = record.getReferenceID(); // Flush any elements from the pileup that are prior to this record // since the file is sorted, we are done with those positions. flushPileup(refID, refPosition); // Check if the region starts after this reference starts. If so, // we only want to start adding at the region start position. if(startPos > refPosition) { refPosition = startPos; } // Loop through for each reference position covered by the record. // It is up to the PILEUP_TYPE to handle insertions/deletions, etc // that are related with the given reference position. for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition) { // Check to see if we have gone past the end of the region, in which // case we can stop processing this record. Check >= since the // end position is not in the region. if((endPos != -1) && (refPosition >= endPos)) { break; } // Check to see if this position is in the exclude list. bool addPos = true; if(excludeList != NULL) { // There is an exclude list, so lookup the position. 
if(excludeList->hasPosition(refID, refPosition)) { // This position is in the exclude list, so don't add it. addPos = false; } } if(addPos) { addAlignmentPosition(refPosition, record); } } } template void Pileup::flushPileup() { // while there are still entries between the head and tail, flush, // but no need to flush if pileupTail == -1 because in that case // no entries have been added while ((pileupHead <= pileupTail) && (pileupTail != -1)) { flushPileup(pileupHead+1); } pileupStart = pileupHead = 0; pileupTail = -1; } // Always need the reference position. template void Pileup::addAlignmentPosition(int refPosition, SamRecord& record) { int offset = 0; try{ offset = pileupPosition(refPosition); } catch(std::runtime_error& err) { const char* overflowErr = "Overflow on the pileup buffer:"; String errorMessage = err.what(); if(strncmp(err.what(), overflowErr, strlen(overflowErr)) == 0) { errorMessage += "\n\tPileup Buffer Overflow: recordName = "; errorMessage += record.getReadName(); errorMessage += "; Cigar = "; errorMessage += record.getCigar(); } throw std::runtime_error(errorMessage.c_str()); } if((offset < 0) || (offset >= pileupWindow)) { std::cerr << "Pileup Buffer Overflow: position = " << refPosition << "; refID = " << record.getReferenceID() << "; recStartPos = " << record.get1BasedPosition() << "; pileupStart = " << pileupStart << "; pileupHead = " << pileupHead << "; pileupTail = " << pileupTail; } addElement(myElements[offset], record); } template void Pileup::flushPileup(int refID, int position) { // if new chromosome, flush the entire pileup. if(refID != myCurrentRefID) { // New chromosome, flush everything. flushPileup(); myCurrentRefID = refID; } else { // on the same chromosome, so flush just up to this new position. flushPileup(position); } } template void Pileup::flushPileup(int position) { // Flush up to this new position, but no reason to flush if // pileupHead has not been set. 
while((pileupHead < position) && (pileupHead <= pileupTail)) { analyzeHead(); pileupHead++; if(pileupHead - pileupStart >= pileupWindow) pileupStart += pileupWindow; } if(pileupHead > pileupTail) { // All positions have been flushed, so reset pileup info pileupHead = pileupStart = 0; pileupTail = -1; } } // Get the position in the myElements container that is associated // with the specified position. If the specified position cannot // fit within the myElements container, -1 is returned. template int Pileup::pileupPosition(int position) { // Check to see if this is the first position (pileupTail == -1) if(pileupTail == -1) { pileupStart = pileupHead = position; // This is the first time this position is being used, so // reset the element. resetElement(myElements[0], position); pileupTail = position; return(0); } if((position < pileupHead) || (position > (pileupHead + pileupWindow))) { String errorMessage = "Overflow on the pileup buffer: specifiedPosition = "; errorMessage += position; errorMessage += ", pileup buffer start position: "; errorMessage += pileupHead; errorMessage += ", pileup buffer end position: "; errorMessage += pileupHead + pileupWindow; throw std::runtime_error(errorMessage.c_str()); } // int offset = position - pileupStart; int offset = position - pileupStart; if(offset >= pileupWindow) { offset -= pileupWindow; } // Check to see if position is past the end of the currently // setup pileup positions. while(position > pileupTail) { // Increment pileupTail to the next position since the current // pileupTail is already in use. ++pileupTail; // Figure out the offset for this next position. offset = pileupTail - pileupStart; if(offset >= pileupWindow) { offset -= pileupWindow; } // This is the first time this position is being used, so // reset the element. 
resetElement(myElements[offset], pileupTail); } return(offset); } template void Pileup::resetElement(PILEUP_TYPE& element, int position) { element.reset(position); } template void Pileup::addElement(PILEUP_TYPE& element, SamRecord& record) { element.addEntry(record); } template void Pileup::analyzeElement(PILEUP_TYPE& element) { myAnalyzeFuncPtr(element); } template void Pileup::analyzeHead() { myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]); } #endif libStatGen-1.0.14/bam/PileupElement.cpp000066400000000000000000000044431254730101300176450ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "PileupElement.h" GenomeSequence* PileupElement::myRefPtr = NULL; PileupElement::PileupElement() : myRefPosition(UNSET_POSITION), myChromosome("") { } // NOTE that this method does not actually copy, it just resets. PileupElement::PileupElement(const PileupElement& q) : myRefPosition(UNSET_POSITION), myChromosome("") { } PileupElement::~PileupElement() { } // Add an entry to this pileup element. void PileupElement::addEntry(SamRecord& record) { if(myChromosome.empty()) { // First entry, save chromosme name. myChromosome = record.getReferenceName(); } } // Perform the alalysis associated with this class. May be a simple print, // a calculation, or something else. 
Typically performed when this element // has been fully populated by all records that cover the reference position. void PileupElement::analyze() { if(myRefPosition != UNSET_POSITION) { std::cout << myChromosome << "\t" << myRefPosition << "\n"; } } // Resets the entry, setting the new position associated with this element. void PileupElement::reset(int32_t refPosition) { myChromosome.clear(); myRefPosition = refPosition; } char PileupElement::getRefBase() { if(myRefPtr != NULL) { // Add 1 to pos because getBase expects 1-based index. return(myRefPtr->getBase(myChromosome.c_str(), myRefPosition+1)); } return('N'); } // Resets the entry, setting the new position associated with this element. void PileupElement::setReference(GenomeSequence* reference) { myRefPtr = reference; } libStatGen-1.0.14/bam/PileupElement.h000066400000000000000000000046041254730101300173110ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __PILEUP_ELEMENT_H__ #define __PILEUP_ELEMENT_H__ #include "SamRecord.h" /// This is a base class pileup component, representing the information /// for one reference position. Child classes will be defined to detail more /// information that needs to be saved and how it should be analyzed. class PileupElement { public: static const int32_t UNSET_POSITION = -1; /// Pileup element constructor. 
PileupElement(); /// Constructor that resets the pileup element, does not copy, just resets. PileupElement(const PileupElement& q); /// Pileup element destructor. virtual ~PileupElement(); /// Add an entry to this pileup element. virtual void addEntry(SamRecord& record); /// Perform the analysis associated with this class. virtual void analyze(); /// Resets the entry, setting the new position associated with this element. virtual void reset(int32_t refPosition); /// Get the chromosome name stored in this element. const char* getChromosome() const { return(myChromosome.c_str()); } /// Get the reference position stored in this element. int32_t getRefPosition() const { return(myRefPosition); } /// Returns the reference base for this pileup element. /// Only works if a reference has been set, otherwise, 'N' is returned. char getRefBase(); /// Set the reference to use for all pilepElements. static void setReference(GenomeSequence* reference); protected: /// Get a pointer to the reference. static GenomeSequence* getReference() { return(myRefPtr); } private: int32_t myRefPosition; std::string myChromosome; static GenomeSequence* myRefPtr; }; #endif libStatGen-1.0.14/bam/PileupElementBaseQual.cpp000066400000000000000000000105721254730101300212630ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "PileupElementBaseQual.h" PileupElementBaseQual::PileupElementBaseQual() : PileupElement(), myBases(NULL), myQualities(NULL), myAllocatedSize(0), myIndex(-1), myAddDelAsBase(false) { myAllocatedSize = 1024; myBases = (char*)malloc(myAllocatedSize + 1); myQualities = (char*)malloc(myAllocatedSize + 1); if((myBases == NULL ) || (myQualities == NULL)) { // TODO, check for malloc failures. std::cerr << "Failed Memory Allocation\n"; } } // NOTE that this method does not actually copy, it just resets. PileupElementBaseQual::PileupElementBaseQual(const PileupElementBaseQual& q) : PileupElement(), myBases(NULL), myQualities(NULL), myAllocatedSize(0), myIndex(-1) { myAllocatedSize = 1024; myBases = (char*)malloc(myAllocatedSize + 1); myQualities = (char*)malloc(myAllocatedSize + 1); myAddDelAsBase = q.myAddDelAsBase; if((myBases == NULL ) || (myQualities == NULL)) { // TODO, check for malloc failures. std::cerr << "Failed Memory Allocation\n"; } } PileupElementBaseQual::~PileupElementBaseQual() { if(myBases != NULL) { free(myBases); myBases = NULL; } if(myQualities != NULL) { free(myQualities); myQualities = NULL; } } // Add an entry to this pileup element. void PileupElementBaseQual::addEntry(SamRecord& record) { // Call the base class: PileupElement::addEntry(record); // Increment the index ++myIndex; // if the index has gone beyond the allocated space, double the size. 
if(myIndex >= myAllocatedSize) { char* tempBuffer = (char*)realloc(myBases, myAllocatedSize * 2); if(tempBuffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myBases = tempBuffer; tempBuffer = (char*)realloc(myQualities, myAllocatedSize * 2); if(tempBuffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myQualities = tempBuffer; myAllocatedSize = myAllocatedSize * 2; } Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { throw std::runtime_error("Failed to retrieve cigar info from the record."); } int32_t readIndex = cigar->getQueryIndex(getRefPosition(), record.get0BasedPosition()); // If the readPosition is N/A, this is a deletion. if(readIndex != CigarRoller::INDEX_NA) { char base = record.getSequence(readIndex); char qual = record.getQuality(readIndex); if(qual == UNSET_QUAL) { qual = ' '; } myBases[myIndex] = base; myQualities[myIndex] = qual; } else if(myAddDelAsBase) { // This version adds deletions as bases. myBases[myIndex] = '-'; myQualities[myIndex] = '0'; } else { // Do not add a deletion. // Did not add any entries, so decrement the index counter since the // index was not used. --myIndex; } } void PileupElementBaseQual::analyze() { if(getRefPosition() != UNSET_POSITION) { myBases[myIndex+1] = '\0'; myQualities[myIndex+1] = '\0'; std::cout << getChromosome() << "\t" << getRefPosition() << "\tN\t" << myIndex+1 << "\t"; std::cout << myBases << "\t"; std::cout << myQualities; std::cout << "\n"; } } void PileupElementBaseQual::reset(int refPosition) { // Call the base class. 
PileupElement::reset(refPosition); myIndex = -1; } libStatGen-1.0.14/bam/PileupElementBaseQual.h000066400000000000000000000033561254730101300207320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __PILEUP_ELEMENT_BASE_QUAL_H__ #define __PILEUP_ELEMENT_BASE_QUAL_H__ #include #include "PileupElement.h" /// This class inherits from the base class and stores base and qualities. class PileupElementBaseQual : public PileupElement { public: PileupElementBaseQual(); // NOTE that this method does not actually copy, it just resets. PileupElementBaseQual(const PileupElementBaseQual& q); virtual ~PileupElementBaseQual(); // Add an entry to this pileup element. virtual void addEntry(SamRecord& record); // Perform the alalysis associated with this class. In this case, it is // a print of the base & quality information associated with this position. virtual void analyze(); // Resets the entry, setting the new position associated with this element. 
virtual void reset(int32_t refPosition); private: static const char UNSET_QUAL = 0xFF; char* myBases; char* myQualities; int myAllocatedSize; int myIndex; bool myAddDelAsBase; }; #endif libStatGen-1.0.14/bam/PosList.cpp000066400000000000000000000057441254730101300164770ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "PosList.h" #include PosList::PosList() : myNumRefs(24), myNumPos(100) { initVars(); } PosList::PosList(int numRefs, int numPositions) : myNumRefs(numRefs), myNumPos(numPositions) { initVars(); } PosList::~PosList() { myPosList.clear(); } void PosList::addPosition(int refID, int refPosition) { // Check for negative numbers, if so, just return. if((refID < 0) || (refPosition < 0)) { return; } // If the position list is smaller or equal to refID, it cannot handle an index, // so increase the size. if(myPosList.size() <= (unsigned int)refID) { // The position list does not currently have space for this reference id, // so add it. myPosList.resize(refID+1, std::vector(myNumPos, false)); myNumRefs = refID + 1; } // The matrix is now sized for this reference id. // Check to see if this id holds this position. if((myPosList[refID]).size() <= (unsigned int)refPosition) { // The index for this position has not yet been created, // so increase the size for it. 
if(myNumPos <= refPosition) { // Our number of positions is smaller than // the current reference id, so reset // myNumPos for future use to be this position +1. myNumPos = refPosition + 1; } // Increase the size for this reference id to hold at least myNumPos. (myPosList[refID]).resize(myNumPos, false); } // It now holds this position, so set it to true. myPosList[refID][refPosition] = true; } bool PosList::hasPosition(int refID, int refPosition) { // Check for negative numbers, if so, just return false, not found. if((refID < 0) || (refPosition < 0)) { return(false); } bool found = false; try { if((myPosList.at(refID)).at(refPosition)) { found = true; } } catch (std::out_of_range& oor) { // Nothing to do here, if it was out of range, then // the position was not found (already set to false). } return(found); } void PosList::initVars() { myPosList.clear(); myPosList.resize(myNumRefs, std::vector(myNumPos, false)); } libStatGen-1.0.14/bam/PosList.h000066400000000000000000000031711254730101300161340ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __POSLIST_H__ #define __POSLIST_H__ #include /// Store refID/position, but does not store values < 0. class PosList { public: /// Constructor PosList(); /// Reserves space for numRefs reference ids and numPositions for each id. 
PosList(int numRefs, int numPositions); /// Destructor virtual ~PosList(); /// Add the specified reference id/position (negative values will not be /// added). void addPosition(int refID, int refPosition); /// Return whether or not this list contains the specified reference ID /// and position (negative values will automatically return false). bool hasPosition(int refID, int refPosition); protected: PosList(const PosList& p); void initVars(); // 2-D vector. // indexed by [referenceID][position]. std::vector < std::vector > myPosList; int myNumRefs; int myNumPos; }; #endif libStatGen-1.0.14/bam/SamCoordOutput.cpp000066400000000000000000000053201254730101300200200ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamCoordOutput.h" #include "SamHelper.h" SamCoordOutput::SamCoordOutput(SamRecordPool& pool) : myOutputFile(NULL), myHeader(NULL), myPool(&pool) { } SamCoordOutput::~SamCoordOutput() { // Flush the rest of the records. 
flush(-1, -1); myOutputFile = NULL; myHeader = NULL; } void SamCoordOutput::setOutputFile(SamFile* outFile, SamFileHeader* header) { myOutputFile = outFile; myHeader = header; } bool SamCoordOutput::add(SamRecord* record) { if(record != NULL) { int32_t chrom = record->getReferenceID(); uint64_t chromPos = SamHelper::combineChromPos(chrom, record->get0BasedPosition()); myReadBuffer.insert(std::pair(chromPos, record)); return(true); } return(false); } bool SamCoordOutput::flushAll() { return(flush(-1,-1)); } bool SamCoordOutput::flush(int32_t chromID, int32_t pos0Based) { static std::multimap::iterator iter; uint64_t chromPos = SamHelper::combineChromPos(chromID, pos0Based); bool returnVal = true; iter = myReadBuffer.begin(); if((myOutputFile == NULL) || (myHeader == NULL)) { std::cerr << "SamCoordOutput::flush, no output file/header is set, so records removed without being written\n"; returnVal = false; } while((iter != myReadBuffer.end()) && (((*iter).first <= chromPos) || (chromID == -1))) { if((myOutputFile != NULL) && (myHeader != NULL)) { returnVal &= myOutputFile->WriteRecord(*myHeader, *((*iter).second)); } if(myPool != NULL) { myPool->releaseRecord((*iter).second); } else { delete((*iter).second); } ++iter; } // Remove the elements from the begining up to, // but not including the current iterator position. myReadBuffer.erase(myReadBuffer.begin(), iter); return(returnVal); } libStatGen-1.0.14/bam/SamCoordOutput.h000066400000000000000000000055121254730101300174700ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_COORD_OUTPUT_H__ #define __SAM_COORD_OUTPUT_H__ #include "SamFile.h" #include "SamRecordPool.h" /// Class for buffering up output reads to ensure that it is sorted. /// They are added in almost sorted order. /// Flush writes any records that start at/before the specified position. class SamCoordOutput { public: /// Create an output buffer returning any written records to the specified pool. /// \param pool pool that any written records should be returned to, a pointer /// to this pool is stored, so it should not go out of scope until the output buffer /// has written all the records. SamCoordOutput(SamRecordPool& pool); ~SamCoordOutput(); /// Set the already opened output file to write to when flushed. /// The user should not close/delete the SamFile until this class is done /// with it. This class does NOT close/delete the SamFile. /// \param outFile pointer to an already opened (and header written) /// SAM/BAM output file. /// \param header pointer to an already written header that should be /// used for writing the records. void setOutputFile(SamFile* outFile, SamFileHeader* header); /// Add the specified record to this read buffer. bool add(SamRecord* record); /// Flush the entire buffer, writing all records. /// If no output buffer is set, the files cannot be written, but the /// flushed records are removed/freed. bool flushAll(); /// Flush the buffer based on the specified chromosome id/position, writing /// any records that start at/before the specified chromosome id/position. /// If no output buffer is set, the files cannot be written, but the /// flushed records are removed/freed. 
/// A chromID of -1 will flush everything regardless of pos0Based. bool flush(int32_t chromID, int32_t pos0Based); protected: private: // Require a sam record pool, so make the constructor with // no parameters private. SamCoordOutput(); SamFile* myOutputFile; SamFileHeader* myHeader; std::multimap myReadBuffer; SamRecordPool* myPool; }; #endif libStatGen-1.0.14/bam/SamFile.cpp000066400000000000000000001260301254730101300164120ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "SamFile.h" #include "SamFileHeader.h" #include "SamRecord.h" #include "BamInterface.h" #include "SamInterface.h" #include "BgzfFileType.h" // Constructor, init variables. SamFile::SamFile() : myStatus() { init(); resetFile(); } // Constructor, init variables. SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType) : myStatus(errorHandlingType) { init(); resetFile(); } // Constructor, init variables and open the specified file based on the // specified mode (READ/WRITE). SamFile::SamFile(const char* filename, OpenType mode) : myStatus() { init(filename, mode, NULL); } // Constructor, init variables and open the specified file based on the // specified mode (READ/WRITE). Default is READ.. 
SamFile::SamFile(const char* filename, OpenType mode, ErrorHandler::HandlingType errorHandlingType) : myStatus(errorHandlingType) { init(filename, mode, NULL); } // Constructor, init variables and open the specified file based on the // specified mode (READ/WRITE). SamFile::SamFile(const char* filename, OpenType mode, SamFileHeader* header) : myStatus() { init(filename, mode, header); } // Constructor, init variables and open the specified file based on the // specified mode (READ/WRITE). Default is READ.. SamFile::SamFile(const char* filename, OpenType mode, ErrorHandler::HandlingType errorHandlingType, SamFileHeader* header) : myStatus(errorHandlingType) { init(filename, mode, header); } SamFile::~SamFile() { resetFile(); if(myStatistics != NULL) { delete myStatistics; } } // Open a sam/bam file for reading with the specified filename. bool SamFile::OpenForRead(const char * filename, SamFileHeader* header) { // Reset for any previously operated on files. resetFile(); int lastchar = 0; while (filename[lastchar] != 0) lastchar++; // If at least one character, check for '-'. if((lastchar >= 1) && (filename[0] == '-')) { // Read from stdin - determine type of file to read. // Determine if compressed bam. if(strcmp(filename, "-.bam") == 0) { // Compressed bam - open as bgzf. // -.bam is the filename, read compressed bam from stdin filename = "-"; myFilePtr = new InputFile; // support recover mode - this switches in a reader // capable of recovering from bad BGZF compression blocks. myFilePtr->setAttemptRecovery(myAttemptRecovery); myFilePtr->openFile(filename, "rb", InputFile::BGZF); myInterfacePtr = new BamInterface; // Read the magic string. char magic[4]; ifread(myFilePtr, magic, 4); } else if(strcmp(filename, "-.ubam") == 0) { // uncompressed BAM File. // -.ubam is the filename, read uncompressed bam from stdin. // uncompressed BAM is still compressed with BGZF, but using // compression level 0, so still open as BGZF since it has a // BGZF header. 
filename = "-"; // Uncompressed, so do not require the eof block. #ifdef __ZLIB_AVAILABLE__ BgzfFileType::setRequireEofBlock(false); #endif myFilePtr = ifopen(filename, "rb", InputFile::BGZF); myInterfacePtr = new BamInterface; // Read the magic string. char magic[4]; ifread(myFilePtr, magic, 4); } else if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.sam") == 0)) { // SAM File. // read sam from stdin filename = "-"; myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED); myInterfacePtr = new SamInterface; } else { std::string errorMessage = "Invalid SAM/BAM filename, "; errorMessage += filename; errorMessage += ". From stdin, can only be '-', '-.sam', '-.bam', or '-.ubam'"; myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str()); delete myFilePtr; myFilePtr = NULL; return(false); } } else { // Not from stdin. Read the file to determine the type. myFilePtr = new InputFile; // support recovery mode - this conditionally enables a reader // capable of recovering from bad BGZF compression blocks. myFilePtr->setAttemptRecovery(myAttemptRecovery); bool rc = myFilePtr->openFile(filename, "rb", InputFile::DEFAULT); if (rc == false) { std::string errorMessage = "Failed to Open "; errorMessage += filename; errorMessage += " for reading"; myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str()); delete myFilePtr; myFilePtr = NULL; return(false); } char magic[4]; ifread(myFilePtr, magic, 4); if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' && magic[3] == 1) { myInterfacePtr = new BamInterface; // Set that it is a bam file open for reading. This is needed to // determine if an index file can be used. myIsBamOpenForRead = true; } else { // Not a bam, so rewind to the beginning of the file so it // can be read. ifrewind(myFilePtr); myInterfacePtr = new SamInterface; } } // File is open for reading. myIsOpenForRead = true; // Read the header if one was passed in. if(header != NULL) { return(ReadHeader(*header)); } // Successfully opened the file. 
myStatus = SamStatus::SUCCESS; return(true); } // Open a sam/bam file for writing with the specified filename. bool SamFile::OpenForWrite(const char * filename, SamFileHeader* header) { // Reset for any previously operated on files. resetFile(); int lastchar = 0; while (filename[lastchar] != 0) lastchar++; if (lastchar >= 4 && filename[lastchar - 4] == 'u' && filename[lastchar - 3] == 'b' && filename[lastchar - 2] == 'a' && filename[lastchar - 1] == 'm') { // BAM File. // if -.ubam is the filename, write uncompressed bam to stdout if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.')) { filename = "-"; } myFilePtr = ifopen(filename, "wb0", InputFile::BGZF); myInterfacePtr = new BamInterface; } else if (lastchar >= 3 && filename[lastchar - 3] == 'b' && filename[lastchar - 2] == 'a' && filename[lastchar - 1] == 'm') { // BAM File. // if -.bam is the filename, write compressed bam to stdout if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.')) { filename = "-"; } myFilePtr = ifopen(filename, "wb", InputFile::BGZF); myInterfacePtr = new BamInterface; } else { // SAM File // if - (followed by anything is the filename, // write uncompressed sam to stdout if((lastchar >= 1) && (filename[0] == '-')) { filename = "-"; } myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED); myInterfacePtr = new SamInterface; } if (myFilePtr == NULL) { std::string errorMessage = "Failed to Open "; errorMessage += filename; errorMessage += " for writing"; myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str()); return(false); } myIsOpenForWrite = true; // Write the header if one was passed in. if(header != NULL) { return(WriteHeader(*header)); } // Successfully opened the file. myStatus = SamStatus::SUCCESS; return(true); } // Read BAM Index file. bool SamFile::ReadBamIndex(const char* bamIndexFilename) { // Cleanup a previously setup index. if(myBamIndex != NULL) { delete myBamIndex; myBamIndex = NULL; } // Create a new bam index. 
myBamIndex = new BamIndex(); SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename); if(indexStat != SamStatus::SUCCESS) { std::string errorMessage = "Failed to read the bam Index file: "; errorMessage += bamIndexFilename; myStatus.setStatus(indexStat, errorMessage.c_str()); delete myBamIndex; myBamIndex = NULL; return(false); } myStatus = SamStatus::SUCCESS; return(true); } // Read BAM Index file. bool SamFile::ReadBamIndex() { if(myFilePtr == NULL) { // Can't read the bam index file because the BAM file has not yet been // opened, so we don't know the base filename for the index file. std::string errorMessage = "Failed to read the bam Index file -" " the BAM file needs to be read first in order to determine" " the index filename."; myStatus.setStatus(SamStatus::FAIL_ORDER, errorMessage.c_str()); return(false); } const char* bamBaseName = myFilePtr->getFileName(); std::string indexName = bamBaseName; indexName += ".bai"; bool foundFile = true; try { if(ReadBamIndex(indexName.c_str()) == false) { foundFile = false; } } catch (std::exception&) { foundFile = false; } // Check to see if the index file was found. if(!foundFile) { // Not found - try without the bam extension. // Locate the start of the bam extension size_t startExt = indexName.find(".bam"); if(startExt == std::string::npos) { // Could not find the .bam extension, so just return false since the // call to ReadBamIndex set the status. return(false); } // Remove ".bam" and try reading the index again. indexName.erase(startExt, 4); return(ReadBamIndex(indexName.c_str())); } return(true); } // Sets the reference to the specified genome sequence object. void SamFile::SetReference(GenomeSequence* reference) { myRefPtr = reference; } // Set the type of sequence translation to use when reading the sequence. 
void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation) { myReadTranslation = translation; } // Set the type of sequence translation to use when writing the sequence. void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation) { myWriteTranslation = translation; } // Close the file if there is one open. void SamFile::Close() { // Resetting the file will close it if it is open, and // will reset all other variables. resetFile(); } // Returns whether or not the file has been opened. // return: int - true = open; false = not open. bool SamFile::IsOpen() { if (myFilePtr != NULL) { // File Pointer is set, so return if it is open. return(myFilePtr->isOpen()); } // File pointer is not set, so return false, not open. return false; } // Returns whether or not the end of the file has been reached. // return: int - true = EOF; false = not eof. bool SamFile::IsEOF() { if (myFilePtr != NULL) { // File Pointer is set, so return if eof. return(ifeof(myFilePtr)); } // File pointer is not set, so return true, eof. return true; } // Read the header from the currently opened file. bool SamFile::ReadHeader(SamFileHeader& header) { myStatus = SamStatus::SUCCESS; if(myIsOpenForRead == false) { // File is not open for read myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read header since the file is not open for reading"); return(false); } if(myHasHeader == true) { // The header has already been read. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read header since it has already been read."); return(false); } if(myInterfacePtr->readHeader(myFilePtr, header, myStatus)) { // The header has now been successfully read. myHasHeader = true; return(true); } return(false); } // Write the header to the currently opened file. bool SamFile::WriteHeader(SamFileHeader& header) { myStatus = SamStatus::SUCCESS; if(myIsOpenForWrite == false) { // File is not open for write // -OR- // The header has already been written. 
myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot write header since the file is not open for writing"); return(false); } if(myHasHeader == true) { // The header has already been written. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot write header since it has already been written"); return(false); } if(myInterfacePtr->writeHeader(myFilePtr, header, myStatus)) { // The header has now been successfully written. myHasHeader = true; return(true); } // return the status. return(false); } // Read a record from the currently opened file. bool SamFile::ReadRecord(SamFileHeader& header, SamRecord& record) { myStatus = SamStatus::SUCCESS; if(myIsOpenForRead == false) { // File is not open for read myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read record since the file is not open for reading"); throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file.")); return(false); } if(myHasHeader == false) { // The header has not yet been read. // TODO - maybe just read the header. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read record since the header has not been read."); throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header.")); return(false); } // Check to see if a new region has been set. If so, determine the // chunks for that region. if(myNewSection) { if(!processNewSection(header)) { // Failed processing a new section. Could be an // order issue like the file not being open or the // indexed file not having been read. // processNewSection sets myStatus with the failure reason. return(false); } } // Read until a record is not successfully read or there are no more // requested records. while(myStatus == SamStatus::SUCCESS) { record.setReference(myRefPtr); record.setSequenceTranslation(myReadTranslation); // If reading by index, this method will setup to ensure it is in // the correct position for the next record (if not already there). 
// Sets myStatus if it could not move to a good section. // Just returns true if it is not setup to read by index. if(!ensureIndexedReadPosition()) { // Either there are no more records in the section // or it failed to move to the right section, so there // is nothing more to read, stop looping. break; } // File is open for reading and the header has been read, so read the // next record. myInterfacePtr->readRecord(myFilePtr, header, record, myStatus); if(myStatus != SamStatus::SUCCESS) { // Failed to read the record, so break out of the loop. break; } // Successfully read a record, so check if we should filter it. // First check if it is out of the section. Returns true // if not reading by sections, returns false if the record // is outside of the section. Sets status to NO_MORE_RECS if // there is nothing left ot read in the section. if(!checkRecordInSection(record)) { // The record is not in the section. // The while loop will detect if NO_MORE_RECS was set. continue; } // Check the flag for required/excluded flags. uint16_t flag = record.getFlag(); if((flag & myRequiredFlags) != myRequiredFlags) { // The record does not conatain all required flags, so // continue looking. continue; } if((flag & myExcludedFlags) != 0) { // The record contains an excluded flag, so continue looking. continue; } //increment the record count. myRecordCount++; if(myStatistics != NULL) { // Statistics should be updated. myStatistics->updateStatistics(record); } // Successfully read the record, so check the sort order. if(!validateSortOrder(record, header)) { // ValidateSortOrder sets the status on a failure. return(false); } return(true); } // End while loop that checks if a desired record is found or failure. // Return true if a record was found. return(myStatus == SamStatus::SUCCESS); } // Write a record to the currently opened file. 
bool SamFile::WriteRecord(SamFileHeader& header, SamRecord& record) { if(myIsOpenForWrite == false) { // File is not open for writing myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot write record since the file is not open for writing"); return(false); } if(myHasHeader == false) { // The header has not yet been written. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot write record since the header has not been written"); return(false); } // Before trying to write the record, validate the sort order. if(!validateSortOrder(record, header)) { // Not sorted like it is supposed to be, do not write the record myStatus.setStatus(SamStatus::INVALID_SORT, "Cannot write the record since the file is not properly sorted."); return(false); } if(myRefPtr != NULL) { record.setReference(myRefPtr); } // File is open for writing and the header has been written, so write the // record. myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record, myWriteTranslation); if(myStatus == SamStatus::SUCCESS) { // A record was successfully written, so increment the record count. myRecordCount++; return(true); } return(false); } // Set the flag to validate that the file is sorted as it is read/written. // Must be called after the file has been opened. void SamFile::setSortedValidation(SortedType sortType) { mySortedType = sortType; } // Return the number of records that have been read/written so far. uint32_t SamFile::GetCurrentRecordCount() { return(myRecordCount); } // Sets what part of the SamFile should be read. bool SamFile::SetReadSection(int32_t refID) { // No start/end specified, so set back to default -1. return(SetReadSection(refID, -1, -1)); } // Sets what part of the SamFile should be read. bool SamFile::SetReadSection(const char* refName) { // No start/end specified, so set back to default -1. return(SetReadSection(refName, -1, -1)); } // Sets what part of the BAM file should be read. 
bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end, bool overlap) { // If there is not a BAM file open for reading, return failure. // Opening a new file clears the read section, so it must be // set after the file is opened. if(!myIsBamOpenForRead) { // There is not a BAM file open for reading. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot set section since there is no bam file open"); return(false); } myNewSection = true; myOverlapSection = overlap; myStartPos = start; myEndPos = end; myRefID = refID; myRefName.clear(); myChunksToRead.clear(); // Reset the end of the current chunk. We are resetting our read, so // we no longer have a "current chunk" that we are reading. myCurrentChunkEnd = 0; myStatus = SamStatus::SUCCESS; // Reset the sort order criteria since we moved around in the file. myPrevCoord = -1; myPrevRefID = 0; myPrevReadName.Clear(); return(true); } // Sets what part of the BAM file should be read. bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end, bool overlap) { // If there is not a BAM file open for reading, return failure. // Opening a new file clears the read section, so it must be // set after the file is opened. if(!myIsBamOpenForRead) { // There is not a BAM file open for reading. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot set section since there is no bam file open"); return(false); } myNewSection = true; myOverlapSection = overlap; myStartPos = start; myEndPos = end; if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0)) { // No Reference name specified, so read just the "-1" entries. myRefID = BamIndex::REF_ID_UNMAPPED; } else { // save the reference name and revert the reference ID to unknown // so it will be calculated later. myRefName = refName; myRefID = BamIndex::REF_ID_ALL; } myChunksToRead.clear(); // Reset the end of the current chunk. We are resetting our read, so // we no longer have a "current chunk" that we are reading. 
myCurrentChunkEnd = 0; myStatus = SamStatus::SUCCESS; // Reset the sort order criteria since we moved around in the file. myPrevCoord = -1; myPrevRefID = 0; myPrevReadName.Clear(); return(true); } void SamFile::SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags) { myRequiredFlags = requiredFlags; myExcludedFlags = excludedFlags; } // Get the number of mapped reads in the specified reference id. // Returns -1 for out of range refIDs. int32_t SamFile::getNumMappedReadsFromIndex(int32_t refID) { // The bam index must have already been read. if(myBamIndex == NULL) { myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot get num mapped reads from the index until it has been read."); return(false); } return(myBamIndex->getNumMappedReads(refID)); } // Get the number of unmapped reads in the specified reference id. // Returns -1 for out of range refIDs. int32_t SamFile::getNumUnMappedReadsFromIndex(int32_t refID) { // The bam index must have already been read. if(myBamIndex == NULL) { myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot get num unmapped reads from the index until it has been read."); return(false); } return(myBamIndex->getNumUnMappedReads(refID)); } // Get the number of mapped reads in the specified reference id. // Returns -1 for out of range references. int32_t SamFile::getNumMappedReadsFromIndex(const char* refName, SamFileHeader& header) { // The bam index must have already been read. if(myBamIndex == NULL) { myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot get num mapped reads from the index until it has been read."); return(false); } int32_t refID = BamIndex::REF_ID_UNMAPPED; if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0)) { // Reference name specified, so read just the "-1" entries. refID = header.getReferenceID(refName); } return(myBamIndex->getNumMappedReads(refID)); } // Get the number of unmapped reads in the specified reference id. // Returns -1 for out of range refIDs. 
int32_t SamFile::getNumUnMappedReadsFromIndex(const char* refName, SamFileHeader& header) { // The bam index must have already been read. if(myBamIndex == NULL) { myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot get num unmapped reads from the index until it has been read."); return(false); } int32_t refID = BamIndex::REF_ID_UNMAPPED; if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0)) { // Reference name specified, so read just the "-1" entries. refID = header.getReferenceID(refName); } return(myBamIndex->getNumUnMappedReads(refID)); } // Returns the number of bases in the passed in read that overlap the // region that is currently set. uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord) { if(myRefPtr != NULL) { samRecord.setReference(myRefPtr); } samRecord.setSequenceTranslation(myReadTranslation); // Get the overlaps in the sam record for the region currently set // for this file. return(samRecord.getNumOverlaps(myStartPos, myEndPos)); } void SamFile::GenerateStatistics(bool genStats) { if(genStats) { if(myStatistics == NULL) { // Want to generate statistics, but do not yet have the // structure for them, so create one. myStatistics = new SamStatistics(); } } else { // Do not generate statistics, so if myStatistics is not NULL, // delete it. if(myStatistics != NULL) { delete myStatistics; myStatistics = NULL; } } } const BamIndex* SamFile::GetBamIndex() { return(myBamIndex); } // initialize. void SamFile::init() { myFilePtr = NULL; myInterfacePtr = NULL; myStatistics = NULL; myBamIndex = NULL; myRefPtr = NULL; myReadTranslation = SamRecord::NONE; myWriteTranslation = SamRecord::NONE; myAttemptRecovery = false; myRequiredFlags = 0; myExcludedFlags = 0; } void SamFile::init(const char* filename, OpenType mode, SamFileHeader* header) { init(); resetFile(); bool openStatus = true; if(mode == READ) { // open the file for read. openStatus = OpenForRead(filename, header); } else { // open the file for write. 
openStatus = OpenForWrite(filename, header); } if(!openStatus) { // Failed to open the file - print error and abort. fprintf(stderr, "%s\n", GetStatusMessage()); std::cerr << "FAILURE - EXITING!!!" << std::endl; exit(-1); } } // Reset variables for each file. void SamFile::resetFile() { // Close the file. if (myFilePtr != NULL) { // If we already have an open file, close it. ifclose(myFilePtr); myFilePtr = NULL; } if(myInterfacePtr != NULL) { delete myInterfacePtr; myInterfacePtr = NULL; } myIsOpenForRead = false; myIsOpenForWrite = false; myHasHeader = false; mySortedType = UNSORTED; myPrevReadName.Clear(); myPrevCoord = -1; myPrevRefID = 0; myRecordCount = 0; myStatus = SamStatus::SUCCESS; // Reset indexed bam values. myIsBamOpenForRead = false; myRefID = BamIndex::REF_ID_ALL; myStartPos = -1; myEndPos = -1; myNewSection = false; myOverlapSection = true; myCurrentChunkEnd = 0; myChunksToRead.clear(); if(myBamIndex != NULL) { delete myBamIndex; myBamIndex = NULL; } // If statistics are being generated, reset them. if(myStatistics != NULL) { myStatistics->reset(); } myRefName.clear(); } // Validate that the record is sorted compared to the previously read record // if there is one, according to the specified sort order. // If the sort order is UNSORTED, true is returned. bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header) { if(myRefPtr != NULL) { record.setReference(myRefPtr); } record.setSequenceTranslation(myReadTranslation); bool status = false; if(mySortedType == UNSORTED) { // Unsorted, so nothing to validate, just return true. status = true; } else { // Check to see if mySortedType is based on the header. if(mySortedType == FLAG) { // Determine the sorted type from what was read out of the header. mySortedType = getSortOrderFromHeader(header); } if(mySortedType == QUERY_NAME) { // Validate that it is sorted by query name. // Get the query name from the record. 
const char* readName = record.getReadName(); // Check if it is sorted either in samtools way or picard's way. if((myPrevReadName.Compare(readName) > 0) && (strcmp(myPrevReadName.c_str(), readName) > 0)) { // return false. String errorMessage = "ERROR: File is not sorted by read name at record "; errorMessage += myRecordCount; errorMessage += "\n\tPrevious record was "; errorMessage += myPrevReadName; errorMessage += ", but this record is "; errorMessage += readName; myStatus.setStatus(SamStatus::INVALID_SORT, errorMessage.c_str()); status = false; } else { myPrevReadName = readName; status = true; } } else { // Validate that it is sorted by COORDINATES. // Get the leftmost coordinate and the reference index. int32_t refID = record.getReferenceID(); int32_t coord = record.get0BasedPosition(); // The unmapped reference id is at the end of a sorted file. if(refID == BamIndex::REF_ID_UNMAPPED) { // A new reference ID that is for the unmapped reads // is always valid. status = true; myPrevRefID = refID; myPrevCoord = coord; } else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED) { // Previous reference ID was for unmapped reads, but the // current one is not, so this is not sorted. String errorMessage = "ERROR: File is not coordinate sorted at record "; errorMessage += myRecordCount; errorMessage += "\n\tPrevious record was unmapped, but this record is "; errorMessage += header.getReferenceLabel(refID) + ":" + coord; myStatus.setStatus(SamStatus::INVALID_SORT, errorMessage.c_str()); status = false; } else if(refID < myPrevRefID) { // Current reference id is less than the previous one, //meaning that it is not sorted. 
String errorMessage = "ERROR: File is not coordinate sorted at record "; errorMessage += myRecordCount; errorMessage += "\n\tPrevious record was "; errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord; errorMessage += ", but this record is "; errorMessage += header.getReferenceLabel(refID) + ":" + coord; myStatus.setStatus(SamStatus::INVALID_SORT, errorMessage.c_str()); status = false; } else { // The reference IDs are in the correct order. if(refID > myPrevRefID) { // New reference id, so set the previous coordinate to -1 myPrevCoord = -1; } // Check the coordinates. if(coord < myPrevCoord) { // New Coord is less than the previous position. String errorMessage = "ERROR: File is not coordinate sorted at record "; errorMessage += myRecordCount; errorMessage += "\n\tPreviousRecord was "; errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord; errorMessage += ", but this record is "; errorMessage += header.getReferenceLabel(refID) + ":" + coord; myStatus.setStatus(SamStatus::INVALID_SORT, errorMessage.c_str()); status = false; } else { myPrevRefID = refID; myPrevCoord = coord; status = true; } } } } return(status); } SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header) { const char* tag = header.getSortOrder(); // Default to unsorted since if it is not specified in the header // that is the value that should be used. SortedType headerSortOrder = UNSORTED; if(strcmp(tag, "queryname") == 0) { headerSortOrder = QUERY_NAME; } else if(strcmp(tag, "coordinate") == 0) { headerSortOrder = COORDINATE; } return(headerSortOrder); } bool SamFile::ensureIndexedReadPosition() { // If no sections are specified, return true. if(myRefID == BamIndex::REF_ID_ALL) { return(true); } // Check to see if we have more to read out of the current chunk. // By checking the current position in relation to the current // end chunk. If the current position is >= the end of the // current chunk, then we must see to the next chunk. 
uint64_t currentPos = iftell(myFilePtr); if(currentPos >= myCurrentChunkEnd) { // If there are no more chunks to process, return failure. if(myChunksToRead.empty()) { myStatus = SamStatus::NO_MORE_RECS; return(false); } // There are more chunks left, so get the next chunk. Chunk newChunk = myChunksToRead.pop(); // Move to the location of the new chunk if it is not adjacent // to the current chunk. if(newChunk.chunk_beg != currentPos) { // New chunk is not adjacent, so move to it. if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true) { // seek failed, cannot retrieve next record, return failure. myStatus.setStatus(SamStatus::FAIL_IO, "Failed to seek to the next record"); return(false); } } // Seek succeeded, set the end of the new chunk. myCurrentChunkEnd = newChunk.chunk_end; } return(true); } bool SamFile::checkRecordInSection(SamRecord& record) { bool recordFound = true; if(myRefID == BamIndex::REF_ID_ALL) { return(true); } // Check to see if it is in the correct reference/position. if(record.getReferenceID() != myRefID) { // Incorrect reference ID, return no more records. myStatus = SamStatus::NO_MORE_RECS; return(false); } // Found a record. recordFound = true; // If start/end position are set, verify that the alignment falls // within those. // If the alignment start is greater than the end of the region, // return NO_MORE_RECS. // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside // of the region. if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos)) { myStatus = SamStatus::NO_MORE_RECS; return(false); } // We know the start is less than the end position, so the alignment // overlaps the region if the alignment end position is greater than the // start of the region. if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos)) { // If it does not overlap the region, so go to the next // record...set recordFound back to false. recordFound = false; } if(!myOverlapSection) { // Needs to be fully contained. 
Not fully contained if // 1) the record start position is < the region start position. // or // 2) the end position is specified and the record end position // is greater than or equal to the region end position. // (equal to since the region is exclusive. if((record.get0BasedPosition() < myStartPos) || ((myEndPos != -1) && (record.get0BasedAlignmentEnd() >= myEndPos))) { // This record is not fully contained, so move on to the next // record. recordFound = false; } } return(recordFound); } bool SamFile::processNewSection(SamFileHeader &header) { myNewSection = false; // If there is no index file, return failure. if(myBamIndex == NULL) { // No bam index has been read. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read section since there is no index file open"); throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file.")); return(false); } // If there is not a BAM file open for reading, return failure. if(!myIsBamOpenForRead) { // There is not a BAM file open for reading. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read section since there is no bam file open"); throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file.")); return(false); } if(myHasHeader == false) { // The header has not yet been read. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot read record since the header has not been read."); throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header.")); return(false); } // Indexed Bam open for read, so disable read buffering because iftell // will be used. // Needs to be done here after we already know that the header has been // read. myFilePtr->disableBuffering(); myChunksToRead.clear(); // Reset the end of the current chunk. We are resetting our read, so // we no longer have a "current chunk" that we are reading. 
myCurrentChunkEnd = 0; // Check to see if the read section was set based on the reference name // but not yet converted to reference id. if(!myRefName.empty()) { myRefID = header.getReferenceID(myRefName.c_str()); // Clear the myRefName length so this code is only executed once. myRefName.clear(); // Check to see if a reference id was found. if(myRefID == SamReferenceInfo::NO_REF_ID) { myStatus = SamStatus::NO_MORE_RECS; return(false); } } // Get the chunks associated with this reference region. if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, myChunksToRead) == true) { myStatus = SamStatus::SUCCESS; } else { String errorMsg = "Failed to get the specified region, refID = "; errorMsg += myRefID; errorMsg += "; startPos = "; errorMsg += myStartPos; errorMsg += "; endPos = "; errorMsg += myEndPos; myStatus.setStatus(SamStatus::FAIL_PARSE, errorMsg); } return(true); } // // When the caller to SamFile::ReadRecord() catches an // exception, it may choose to call this method to resync // on the underlying binary stream. // // Arguments: a callback function that will requires length bytes // of data to validate a record header. // // The expected use case is to re-sync on the next probably valid // BAM record, so that we can resume reading even after detecting // a corrupted BAM file. // bool SamFile::attemptRecoverySync(bool (*checkSignature)(void *data) , int length) { if(myFilePtr==NULL) return false; // non-recovery aware objects will just return false: return myFilePtr->attemptRecoverySync(checkSignature, length); } // Default Constructor. SamFileReader::SamFileReader() : SamFile() { } // Constructor that opens the specified file for read. SamFileReader::SamFileReader(const char* filename) : SamFile(filename, READ) { } // Constructor that opens the specified file for read. 
SamFileReader::SamFileReader(const char* filename, ErrorHandler::HandlingType errorHandlingType) : SamFile(filename, READ, errorHandlingType) { } // Constructor that opens the specified file for read. SamFileReader::SamFileReader(const char* filename, SamFileHeader* header) : SamFile(filename, READ, header) { } // Constructor that opens the specified file for read. SamFileReader::SamFileReader(const char* filename, ErrorHandler::HandlingType errorHandlingType, SamFileHeader* header) : SamFile(filename, READ, errorHandlingType, header) { } SamFileReader::~SamFileReader() { } // Default Constructor. SamFileWriter::SamFileWriter() : SamFile() { } // Constructor that opens the specified file for write. SamFileWriter::SamFileWriter(const char* filename) : SamFile(filename, WRITE) { } // Constructor that opens the specified file for write. SamFileWriter::SamFileWriter(const char* filename, ErrorHandler::HandlingType errorHandlingType) : SamFile(filename, WRITE, errorHandlingType) { } // Constructor that opens the specified file for write. SamFileWriter::SamFileWriter(const char* filename, SamFileHeader* header) : SamFile(filename, WRITE, header) { } // Constructor that opens the specified file for write. SamFileWriter::SamFileWriter(const char* filename, ErrorHandler::HandlingType errorHandlingType, SamFileHeader* header) : SamFile(filename, WRITE, errorHandlingType, header) { } SamFileWriter::~SamFileWriter() { } libStatGen-1.0.14/bam/SamFile.h000066400000000000000000000540001254730101300160540ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_FILE_H__ #define __SAM_FILE_H__ #include "SamStatus.h" #include "InputFile.h" #include "SamFileHeader.h" #include "SamRecord.h" #include "GenericSamInterface.h" #include "BamIndex.h" #include "SamStatistics.h" /// Allows the user to easily read/write a SAM/BAM file. /// The SamFile class contains additional functionality that allows a user /// to read specific sections of sorted & indexed BAM files. In order to /// take advantage of this capability, the index file must be read prior to /// setting the read section. This logic saves the time of having to read /// the entire file and takes advantage of the seeking capability of BGZF. class SamFile { public: /// Enum for indicating whether to open the file for read or write. enum OpenType { READ, ///< open for reading. WRITE ///< open for writing. }; /// Enum for indicating the type of sort expected in the file. enum SortedType { UNSORTED = 0, ///< file is not sorted. FLAG, ///< SO flag from the header indicates the sort type. COORDINATE, ///< file is sorted by coordinate. QUERY_NAME ///< file is sorted by queryname. }; /// Default Constructor, initializes the variables, but does not open /// any files. SamFile(); /// Constructor that sets the error handling type. /// \param errorHandlingType how to handle errors. SamFile(ErrorHandler::HandlingType errorHandlingType); /// Constructor that opens the specified file based on the specified mode /// (READ/WRITE), aborts if the file could not be opened. /// \param filename name of the file to open. /// \param mode mode to use for opening the file. 
SamFile(const char* filename, OpenType mode); /// Constructor that opens the specified file based on the specified mode /// (READ/WRITE) and handles errors per the specified handleType. /// \param filename name of the file to open. /// \param mode mode to use for opening the file. /// \param errorHandlingType how to handle errors. SamFile(const char* filename, OpenType mode, ErrorHandler::HandlingType errorHandlingType); /// Constructor that opens the specified file based on the specified mode /// (READ/WRITE) and reads the header, aborts if the file could not be /// opened or the header not read. /// \param filename name of the file to open. /// \param mode mode to use for opening the file. /// \param header to read into or write from SamFile(const char* filename, OpenType mode, SamFileHeader* header); /// Constructor that opens the specified file based on the specified mode /// (READ/WRITE) and reads the header, handling errors per the specified /// handleType. /// \param filename name of the file to open. /// \param mode mode to use for opening the file. /// \param errorHandlingType how to handle errors. /// \param header to read into or write from SamFile(const char* filename, OpenType mode, ErrorHandler::HandlingType errorHandlingType, SamFileHeader* header); /// Destructor virtual ~SamFile(); /// Open a sam/bam file for reading with the specified filename, /// determing the type of file and SAM/BAM by reading the file /// (if not stdin). /// \param filename the sam/bam file to open for reading. /// \param header to read into or write from (optional) /// \return true = success; false = failure. bool OpenForRead(const char * filename, SamFileHeader* header = NULL); /// Open a sam/bam file for writing with the specified filename, /// determining SAM/BAM from the extension (.bam = BAM). /// \param filename the sam/bam file to open for writing. /// \param header to read into or write from (optional) /// \return true = success; false = failure. 
bool OpenForWrite(const char * filename, SamFileHeader* header = NULL); /// Read the specified bam index file. It must be read prior to setting a /// read section, for seeking and reading portions of a bam file. /// \param filename the name of the bam index file to be read. /// \return true = success; false = failure. bool ReadBamIndex(const char * filename); /// Read the bam index file using the BAM filename as a base. /// It must be read prior to setting a read section, for seeking /// and reading portions of a bam file. /// Must be read after opening the BAM file since it uses the /// BAM filename as a base name for the index file. /// First it tries filename.bam.bai. If that fails, it tries /// it without the .bam extension, filename.bai. /// \return true = success; false = failure. bool ReadBamIndex(); /// Sets the reference to the specified genome sequence object. /// \param reference pointer to the GenomeSequence object. void SetReference(GenomeSequence* reference); /// Set the type of sequence translation to use when reading /// the sequence. Passed down to the SamRecord when it is read. /// The default type (if this method is never called) is /// NONE (the sequence is left as-is). /// \param translation type of sequence translation to use. void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation); /// Set the type of sequence translation to use when writing /// the sequence. Passed down to the SamRecord when it is written. /// The default type (if this method is never called) is /// NONE (the sequence is left as-is). /// \param translation type of sequence translation to use. void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation); /// Close the file if there is one open. void Close(); /// Returns whether or not the file has been opened successfully. /// \return true = open; false = not open. bool IsOpen(); /// Returns whether or not the end of the file has been reached. /// \return true = EOF; false = not eof. 
/// If the file is not open, true is returned. bool IsEOF(); /// Reads the header section from the file and stores it in /// the passed in header. /// \return true = success; false = failure. bool ReadHeader(SamFileHeader& header); /// Writes the specified header into the file. /// \return true = success; false = failure. bool WriteHeader(SamFileHeader& header); /// Reads the next record from the file & stores it in the passed in record. /// /// If it is an indexed BAM file and SetReadSection was called, /// only alignments in the section specified by SetReadSection are read. /// If they all have already been read, this method returns false. /// /// Validates that the record is sorted according to the value set by /// setSortedValidation. No sorting validation is done if specified to be /// unsorted, or setSortedValidation was never called. /// \return true = record was successfully set (and sorted if applicable), /// false = record was not successfully set /// (or not sorted as expected). bool ReadRecord(SamFileHeader& header, SamRecord& record); /// Writes the specified record into the file. /// Validates that the record is sorted according to the value set by /// setSortedValidation. No sorting validation is done if specified to /// be unsorted, or setSortedValidation was never called. Returns false /// and does not write the record if the record was not properly sorted. /// \return true = success; false = failure. bool WriteRecord(SamFileHeader& header, SamRecord& record); /// Set the flag to validate that the file is sorted as it is read/written. /// Must be called after the file has been opened. /// Sorting validation is reset everytime SetReadPosition is called since /// it can jump around in the file. /// \param sortType specifies the type of sort to be checked for. void setSortedValidation(SortedType sortType); /// Return the number of records that have been read/written so far. 
uint32_t GetCurrentRecordCount(); /// Deprecated, get the Status of the last call that sets status. /// To remain backwards compatable - will be removed later. inline SamStatus::Status GetFailure() { return(GetStatus()); } /// Get the Status of the last call that sets status. inline SamStatus::Status GetStatus() { return(myStatus.getStatus()); } /// Get the Status Message of the last call that sets status. inline const char* GetStatusMessage() { return(myStatus.getStatusMessage()); } /// Sets which reference id (index into the BAM list of reference /// information) of the BAM file should be read. The records /// for that reference id will be retrieved on each ReadRecord call. /// Reference ids start at 0, and -1 indicates reads with no reference. /// When all records have been retrieved for the specified reference id, /// ReadRecord will return failure until a new read section is set. /// Must be called only after the file has been opened for reading. /// Sorting validation is reset everytime SetReadPosition is called since /// it can jump around in the file. /// \param refID the reference ID of the records to read from the file. /// \return true = success; false = failure. bool SetReadSection(int32_t refID); /// Sets which reference name of the BAM file should be read. The records /// for that reference name will be retrieved on each ReadRecord call. /// Specify "" or "*" to read records not associated with a reference. /// When all records have been retrieved for the specified reference name, /// ReadRecord will return failure until a new read section is set. /// Must be called only after the file has been opened for reading. /// Sorting validation is reset everytime SetReadPosition is called since /// it can jump around in the file. /// \param refName the reference name of the records to read from the file. /// \return true = success; false = failure. 
bool SetReadSection(const char* refName); /// Sets which reference id (index into the BAM list of reference /// information) & start/end positions of the BAM file should be read. /// The records for that reference id and positions will be retrieved on /// each ReadRecord call. Reference ids start at 0, and -1 indicates /// reads with no reference. When all records have been retrieved for the /// specified reference id, ReadRecord will return failure until a new read /// section is set. Must be called only after the file has been opened /// for reading. Sorting validation is reset everytime SetReadPosition is /// called since it can jump around in the file. /// \param refID the reference ID of the records to read from the file. /// \param start inclusive 0-based start position of records that should be read for this refID. /// \param end exclusive 0-based end position of records that should be read for this refID. /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region /// \return true = success; false = failure. bool SetReadSection(int32_t refID, int32_t start, int32_t end, bool overlap = true); /// Sets which reference name & start/end positions of the BAM file should /// be read. The records for this reference name & positions will be /// retrieved on each ReadRecord call. Specify "" or "*" to indicate /// reads with no reference. When all records have been retrieved for /// the specified section, ReadRecord will return failure until a new read /// section is set. Must be called only after the file has been opened for /// reading. Sorting validation is reset everytime SetReadSection is /// called since it can jump around in the file. /// \param refName the reference name of the records to read from the file. /// \param start inclusive 0-based start position of records that should be read for this refID. 
/// \param end exclusive 0-based end position of records that should be read for this refID. /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region /// \return true = success; false = failure. bool SetReadSection(const char* refName, int32_t start, int32_t end, bool overlap = true); /// Specify which reads should be returned by ReadRecord. /// Reads will only be returned by ReadRecord that contain the specified /// required flags and that do not contain any of the specified excluded /// flags. ReadRecord will continue to read from the file until a record /// that complies with these flag settings is found or until the end of the /// file/region. /// \param requiredFlags flags that are required to be in records /// returned by ReadRecord (set to 0x0 if there are no required flags). /// \param excludedFlags flags that are required to not be in records /// returned by ReadRecord (set to 0x0 if there are no excluded flags). void SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags); /// Get the number of mapped reads in the specified reference id. /// Returns -1 for out of range refIDs. /// \param refID reference ID for which to extract the number of mapped reads. /// \return number of mapped reads for the specified reference id. int32_t getNumMappedReadsFromIndex(int32_t refID); /// Get the number of unmapped reads in the specified reference id. /// Returns -1 for out of range refIDs. /// \param refID reference ID for which to extract the number of unmapped reads. /// \return number of unmapped reads for the specified reference id. int32_t getNumUnMappedReadsFromIndex(int32_t refID); /// Get the number of mapped reads in the specified reference name. /// Returns -1 for unknown reference names. /// \param refName reference name for which to extract the number of mapped reads. 
/// \param header header object containing the map from refName to refID /// \return number of mapped reads for the specified reference name. int32_t getNumMappedReadsFromIndex(const char* refName, SamFileHeader& header); /// Get the number of unmapped reads in the specified reference name. /// Returns -1 for unknown reference names. /// \param refName reference name for which to extract the number of unmapped reads. /// \param header header object containing the map from refName to refID /// \return number of unmapped reads for the specified reference name. int32_t getNumUnMappedReadsFromIndex(const char* refName, SamFileHeader& header); /// Returns the number of bases in the passed in read that overlap the /// region that is currently set. Overlapping means that the bases occur /// in both the read and the reference as either matches or mismatches. /// This does not count insertions, deletions, clips, pads, or skips. /// \param samRecord to check for overlapping bases. /// \return number of bases that overlap region that is currently set. uint32_t GetNumOverlaps(SamRecord& samRecord); /// Whether or not statistics should be generated for this file. /// The value is carried over between files and is not reset, but /// the statistics themselves are reset between files. /// \param genStats set to true if statistics should be generated, false if not. void GenerateStatistics(bool genStats); /// Return the bam index if one has been opened. /// \return const pointer to the bam index, or null if one has not been opened. const BamIndex* GetBamIndex(); /// Get the current file position. /// \return current position in the file. inline int64_t GetCurrentPosition() { return(iftell(myFilePtr)); } /// Turn off file read buffering. inline void DisableBuffering() { if(myFilePtr != NULL) { myFilePtr->disableBuffering(); } } /// Print the statistics that have been recorded due to a call to /// GenerateStatistics. 
inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();} protected: void init(); void init(const char* filename, OpenType mode, SamFileHeader* header); /// Resets the file prepping for a new file. void resetFile(); /// Validate that the record is sorted compared to the previously read /// record if there is one, according to the specified sort order. /// If the sort order is UNSORTED, true is returned. /// Sorting validation is reset everytime SetReadPosition is called since /// it can jump around in the file. bool validateSortOrder(SamRecord& record, SamFileHeader& header); // Return the sort order as defined by the header. If it is undefined // or set to an unknown value, UNSORTED is returned. SortedType getSortOrderFromHeader(SamFileHeader& header); bool processNewSection(SamFileHeader &header); // Check if there is more to read in the current chunk, if not, // move to the next chunk. // If no sections are specified or it successfully found a chunk to read, // return true. // Sets the status and returns false if it was unable to move to a new chunk // or there are no more chunks to read, otherwise returns true. bool ensureIndexedReadPosition(); // Check whether or not the record falls within the specified section. // If no sections are specified or this read falls within the // specified sections, return true. // If it does not, return false. // If the record position indicates there will be no more records within the // region, return false AND set the sam status to indicate NO_MORE_RECS. bool checkRecordInSection(SamRecord& record); IFILE myFilePtr; GenericSamInterface* myInterfacePtr; /// Flag to indicate if a file is open for reading. bool myIsOpenForRead; /// Flag to indicate if a file is open for writing. bool myIsOpenForWrite; /// Flag to indicate if a header has been read/written - required before /// being able to read/write a record. 
bool myHasHeader; SortedType mySortedType; /// Previous values used for checking if the file is sorted. int32_t myPrevCoord; int32_t myPrevRefID; String myPrevReadName; /// Keep a count of the number of records that have been read/written so far. uint32_t myRecordCount; /// Pointer to the statistics for this file. SamStatistics* myStatistics; /// The status of the last SamFile command. SamStatus myStatus; /// Values for reading Sorted BAM files via the index. bool myIsBamOpenForRead; bool myNewSection; // whether to return reads that overlap (true) the section or // are fully enclosed (false) in the section. bool myOverlapSection; int32_t myRefID; int32_t myStartPos; int32_t myEndPos; uint64_t myCurrentChunkEnd; SortedChunkList myChunksToRead; BamIndex* myBamIndex; GenomeSequence* myRefPtr; SamRecord::SequenceTranslation myReadTranslation; SamRecord::SequenceTranslation myWriteTranslation; std::string myRefName; private: bool myAttemptRecovery; uint16_t myRequiredFlags; uint16_t myExcludedFlags; public: bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length); void setAttemptRecovery(bool flag = false) { myAttemptRecovery = flag; } }; /// Child class of SamFile for reading files. class SamFileReader : public SamFile { public: /// Default Constructor. SamFileReader(); /// Constructor that opens the specified file for read. SamFileReader(const char* filename); /// Constructor that opens the specified file for read. SamFileReader(const char* filename, ErrorHandler::HandlingType errorHandlingType); /// Constructor that opens the specified file for read and reads /// the header from the file. SamFileReader(const char* filename, SamFileHeader* header); /// Constructor that opens the specified file for read and reads /// the header from the file. SamFileReader(const char* filename, ErrorHandler::HandlingType errorHandlingType, SamFileHeader* header); virtual ~SamFileReader(); }; /// Child class of SamFile for writing files. 
class SamFileWriter : public SamFile { public: /// Default Constructor. SamFileWriter(); /// Constructor that opens the specified file for write. SamFileWriter(const char* filename); /// Constructor that opens the specified file for write. SamFileWriter(const char* filename, ErrorHandler::HandlingType errorHandlingType); /// Constructor that opens the specified file for write and write /// the specified header into the file. SamFileWriter(const char* filename, SamFileHeader* header); /// Constructor that opens the specified file for write and write /// the specified header into the file. SamFileWriter(const char* filename, ErrorHandler::HandlingType errorHandlingType, SamFileHeader* header); virtual ~SamFileWriter(); }; #endif libStatGen-1.0.14/bam/SamFileHeader.cpp000066400000000000000000001012021254730101300175150ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "SamFileHeader.h" #include "SamHeaderSQ.h" #include "SamHeaderRG.h" const std::string SamFileHeader::EMPTY_RETURN = ""; SamFileHeader::SamFileHeader() : myHD(NULL), myReferenceInfo(), myErrorMessage("") { resetHeader(); mySQs.setCaseSensitive(true); myRGs.setCaseSensitive(true); myPGs.setCaseSensitive(true); } SamFileHeader::~SamFileHeader() { resetHeader(); } // Copy Constructor SamFileHeader::SamFileHeader(const SamFileHeader& header) { copy(header); } // Overload operator = to copy the passed in header into this header. SamFileHeader & SamFileHeader::operator = (const SamFileHeader& header) { copy(header); return(*this); } bool SamFileHeader::copy(const SamFileHeader& header) { // Check to see if the passed in value is the same as this. if(this == &header) { return(true); } resetHeader(); // Copy the records by getting the other header's header string // and parsing it. std::string newString; bool status = header.getHeaderString(newString); String newHeaderString = newString.c_str(); status &= parseHeader(newHeaderString); myCurrentHeaderIndex = header.myCurrentHeaderIndex; myCurrentCommentIndex = header.myCurrentCommentIndex; // Clear the reference info and copy it to ensure it is the same. myReferenceInfo.clear(); // Copy Reference contigs, hash, lengths. myReferenceInfo = header.myReferenceInfo; return(status); } // Reset the header for a new entry, clearing out previous values. void SamFileHeader::resetHeader() { myReferenceInfo.clear(); // Clear the pointers to the header records. They are deleted when the // vector is cleaned up. myHD = NULL; mySQs.Clear(); myRGs.Clear(); myPGs.Clear(); // Delete the header records and clear the vector. for(unsigned int headerIndex = 0; headerIndex < myHeaderRecords.size(); headerIndex++) { if(myHeaderRecords[headerIndex] != NULL) { delete myHeaderRecords[headerIndex]; myHeaderRecords[headerIndex] = NULL; } } myHeaderRecords.clear(); // Reset the iterator for the header lines. 
resetHeaderRecordIter(); // Reset the comment iterator. resetCommentIter(); // Reset the individual type header iterators. resetSQRecordIter(); resetRGRecordIter(); resetPGRecordIter(); // Clear the comments myComments.clear(); } // Set the passed in string to the entire header string. Clearing its // current contents. bool SamFileHeader::getHeaderString(std::string& header) const { header.clear(); // Keep getting header lines until there are no more - false returned. unsigned int index = 0; while(getHeaderLine(index, header) != false) { ++index; } return(true); } int SamFileHeader::getReferenceID(const String & referenceName, bool addID) { return(myReferenceInfo.getReferenceID(referenceName, addID)); } int SamFileHeader::getReferenceID(const char* referenceName, bool addID) { return(myReferenceInfo.getReferenceID(referenceName, addID)); } const String & SamFileHeader::getReferenceLabel(int id) const { return(myReferenceInfo.getReferenceLabel(id)); } // Get the Reference Information const SamReferenceInfo& SamFileHeader::getReferenceInfo() const { return(myReferenceInfo); } // Get the Reference Information for updating separately when reading // BAMs...should only be called by BamInterface. SamReferenceInfo& SamFileHeader::getReferenceInfoForBamInterface() { return(myReferenceInfo); } // Add a header line that has an const char* value. bool SamFileHeader::addHeaderLine(const char* type, const char* tag, const char* value) { String headerLine; headerLine += "@"; headerLine += type; headerLine += "\t"; headerLine += tag; headerLine += ":"; headerLine += value; return(addHeaderLine(headerLine.c_str())); } // Add a header line that is already preformatted in a const char*. bool SamFileHeader::addHeaderLine(const char* headerLine) { // Parse the added header line. String headerString = headerLine; return(parseHeader(headerString)); } // Add a header line that is already preformatted in a const char*. 
bool SamFileHeader::addHeader(const char* header) { // Parse the added header line. String headerString = header; return(parseHeader(headerString)); } // Add a comment. bool SamFileHeader::addComment(const char* comment) { if((comment != NULL) && (strcmp(comment, EMPTY_RETURN.c_str()) != 0)) { // Valid comment, so add it. myComments.push_back(comment); } return(true); } // Add the specified tag and value to the HD header. bool SamFileHeader::setHDTag(const char* tag, const char* value) { if(myHD == NULL) { // Need to create the HD line. myHD = new SamHeaderHD(); if(myHD == NULL) { // New failed, return false. myErrorMessage = "SamFileHeader: Failed to allocate a new HD tag"; return(false); } // Succeeded to create the line, add it to the // list. myHeaderRecords.push_back(myHD); } if(!myHD->setTag(tag, value)) { myErrorMessage = "SamFileHeader: Failed to set the specified HD tag"; return(false); } return(true); } // Add the specified tag and value to the SQ header with the specified name. // If the header does not yet exist, the header is added. bool SamFileHeader::setSQTag(const char* tag, const char* value, const char* name) { // Get the SQ record for the specified name. SamHeaderSQ* sq = getSQ(name); if(sq == NULL) { // The SQ does not yet exist. // Make sure the tag is LN. if(strcmp(tag, "LN") != 0) { // LN is required so must be the first tag added myErrorMessage = "SamFileHeader:Failed to add the specified SQ key, LN not specified."; return(false); } // Add it. sq = new SamHeaderSQ(); if(sq == NULL) { // Could not create the header record. myErrorMessage = "SamFileHeader: Failed to allocate a new SQ tag"; return(false); } // Created the header record, so add it to the list of SQ lines. mySQs.Add(name, sq); myHeaderRecords.push_back(sq); // value is the length, so update the reference info. myReferenceInfo.add(name, atoi(value)); // Add the key tag if(!sq->addKey(name)) { // Failed to add the key tag, return false. 
myErrorMessage = "SamFileHeader:Failed to add the specified SQ key"; return(false); } } else if(strcmp(tag, "LN") == 0) { // Cannot modify/remove the LN tag. myErrorMessage = "SamFileHeader:Cannot modify/remove the SQ's LN tag"; return(false); } if(!sq->setTag(tag, value)) { myErrorMessage = "Failed to set the specified SQ tag"; return(false); } return(true); } // Add the specified tag and value to the RG header with the read group // identifier. If the header does not yet exist, the header is added. bool SamFileHeader::setRGTag(const char* tag, const char* value, const char* id) { // Get the RG record for the specified name. SamHeaderRG* rg = getRG(id); if(rg == NULL) { // The RG does not yet exist. // Add it. rg = new SamHeaderRG(); if(rg == NULL) { // Could not create the header record. myErrorMessage = "Failed to allocate a new RG tag"; return(false); } // Created the header record, so add it to the list of RG lines. myRGs.Add(id, rg); myHeaderRecords.push_back(rg); // Add the key tag if(!rg->addKey(id)) { // Failed to add the key tag, return false. myErrorMessage = "Failed to add the specified RG key"; return(false); } } if(!rg->setTag(tag, value)) { myErrorMessage = "Failed to set the specified RG tag"; return(false); } return(true); } // Add the specified tag and value to the PG header with the specified id. // If the header does not yet exist, the header is added. // Add the specified tag and value to the PG header. bool SamFileHeader::setPGTag(const char* tag, const char* value, const char* id) { // Get the PG record for the specified name. SamHeaderPG* pg = getPG(id); if(pg == NULL) { // The PG does not yet exist. // Add it. pg = new SamHeaderPG(); if(pg == NULL) { // Could not create the header record. myErrorMessage = "Failed to allocate a new PG tag"; return(false); } // Created the header record, so add it to the list of PG lines. 
myPGs.Add(id, pg); myHeaderRecords.push_back(pg); // Add the key tag if(!pg->addKey(id)) { // Failed to add the key tag, return false. myErrorMessage = "Failed to add the specified PG key"; return(false); } } if(!pg->setTag(tag, value)) { myErrorMessage = "Failed to set the specified PG tag"; return(false); } return(true); } // Add the HD record to the header. bool SamFileHeader::addHD(SamHeaderHD* hd) { // If there is already an HD header or if null // was passed in, return false. if(myHD != NULL) { myErrorMessage = "Failed add an HD tag - there is already one"; return(false); } if(hd == NULL) { myErrorMessage = "Failed add an HD tag - no tag specified"; return(false); } myHD = hd; myHeaderRecords.push_back(myHD); return(true); } // Add the SQ record to the header. bool SamFileHeader::addSQ(SamHeaderSQ* sq) { if(sq == NULL) { // null pointer passed in, can't add it. myErrorMessage = "SAM/BAM Header line failed to allocate SQ."; return(false); } const char* name = sq->getTagValue("SN"); const char* length = sq->getTagValue("LN"); if(strcmp(name, EMPTY_RETURN.c_str()) == 0) { // SN is not set, so can't add it. myErrorMessage = "SAM/BAM Header line failure: Skipping SQ line that is missing the SN field."; return(false); } if(strcmp(length, EMPTY_RETURN.c_str()) == 0) { // LN is not set, so can't add it. myErrorMessage = "SAM/BAM Header line failure: Skipping SQ line that is missing the LN field."; return(false); } // Determine whether or not a record with this // key is already in the hash. if(mySQs.Find(name) < 0) { // It is not already in the hash so add it. mySQs.Add(name, sq); myHeaderRecords.push_back(sq); myReferenceInfo.add(name, atoi(length)); return(true); } // It is already in the hash, so cannot be added. myErrorMessage = "SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field."; return(false); } // Add the RG record to the header. bool SamFileHeader::addRG(SamHeaderRG* rg) { if(rg == NULL) { // null pointer passed in, can't add it. 
myErrorMessage = "SAM/BAM Header line failed to allocate RG."; return(false); } const char* id = rg->getTagValue("ID"); if(strcmp(id, EMPTY_RETURN.c_str()) == 0) { // ID is not set, so can't add it. myErrorMessage = "SAM/BAM Header line failure: Skipping RG line that is missing the ID field."; return(false); } // Determine whether or not a record with this // key is already in the hash. if(myRGs.Find(id) < 0) { // It is not already in the hash so // add it. myRGs.Add(id, rg); myHeaderRecords.push_back(rg); return(true); } // It is already in the hash, so cannot be added. myErrorMessage = "SAM/BAM Header line failure: Skipping RG line that has a repeated ID field."; return(false); } // Add the PG record to the header. bool SamFileHeader::addPG(SamHeaderPG* pg) { // If a null pointer was passed in, return false. if(pg == NULL) { myErrorMessage = "SAM/BAM Header line failed to allocate PG."; return(false); } const char* id = pg->getTagValue("ID"); if(strcmp(id, EMPTY_RETURN.c_str()) == 0) { // ID is not set, so can't add the header record. myErrorMessage = "SAM/BAM Header line failure: Skipping PG line that is missing the ID field."; return(false); } // Determine whether or not a record with this // key is already in the hash. if(myPGs.Find(id) < 0) { // It is not already in the hash so // add it. myPGs.Add(id, pg); myHeaderRecords.push_back(pg); return(true); } // It is already in the hash, so cannot be added. myErrorMessage = "SAM/BAM Header line failure: Skipping PG line that has a repeated ID field."; return(false); } // Add the RG record to the header. 
bool SamFileHeader::addRecordCopy(const SamHeaderRecord& hdrRec) { SamHeaderRecord* newRec = hdrRec.createCopy(); bool returnVal = true; switch(newRec->getType()) { case SamHeaderRecord::HD: returnVal = addHD((SamHeaderHD*)newRec); break; case SamHeaderRecord::PG: returnVal = addPG((SamHeaderPG*)newRec); break; case SamHeaderRecord::RG: returnVal = addRG((SamHeaderRG*)newRec); break; case SamHeaderRecord::SQ: returnVal = addSQ((SamHeaderSQ*)newRec); break; default: myErrorMessage = "Failed to copy a header record, unknown type."; returnVal = false; break; } return(returnVal); } // Remove the HD record. bool SamFileHeader::removeHD() { if(myHD != NULL) { // Reset the record. Do not delete it since it is in the headerRecords // vector and it is not worth the time to remove it from the middle of // that vector since this is the header and the space does not need // to be conserved. myHD->reset(); // Set myHD to null so a new HD could be added. myHD = NULL; } return(true); } // Remove the SQ record associated with the specified name. bool SamFileHeader::removeSQ(const char* name) { // Look up the name in the hash. int hashIndex = mySQs.Find(name); if(hashIndex < 0) { // Not found in the hash, so nothing to // delete, return true it does not exist // in the hash. return(true); } // Get the SQ. SamHeaderSQ* sq = (SamHeaderSQ*)(mySQs.Object(hashIndex)); if(sq == NULL) { // sq is null, this is an error since hashIndex was greater than 0, // so it should have been found. myErrorMessage = "SAM/BAM Header line failed to get SQ object."; return(false); } // Reset the record. Do not delete it since it is in the headerRecords // vector and it is not worth the time to remove it from the middle of // that vector since this is the header and the space does not need // to be conserved. sq->reset(); // Delete the entry from the hash. mySQs.Delete(hashIndex); return(true); } // Remove the RG record associated with the specified id. 
bool SamFileHeader::removeRG(const char* id) { // Look up the id in the hash. int hashIndex = myRGs.Find(id); if(hashIndex < 0) { // Not found in the hash, so nothing to // delete, return true it does not exist // in the hash. return(true); } // Get the RG. SamHeaderRG* rg = (SamHeaderRG*)(myRGs.Object(hashIndex)); if(rg == NULL) { // rg is null, this is an error since hashIndex was greater than 0, // so it should have been found. myErrorMessage = "SAM/BAM Header line failed to get RG object."; return(false); } // Reset the record. Do not delete it since it is in the headerRecords // vector and it is not worth the time to remove it from the middle of // that vector since this is the header and the space does not need // to be conserved. rg->reset(); // Delete the entry from the hash. myRGs.Delete(hashIndex); return(true); } // Remove the PG record associated with the specified id. bool SamFileHeader::removePG(const char* id) { // Look up the id in the hash. int hashIndex = myPGs.Find(id); if(hashIndex < 0) { // Not found in the hash, so nothing to // delete, return true it does not exist // in the hash. return(true); } // Get the PG. SamHeaderPG* pg = (SamHeaderPG*)(myPGs.Object(hashIndex)); if(pg == NULL) { // pg is null, this is an error since hashIndex was greater than 0, // so it should have been found. myErrorMessage = "SAM/BAM Header line failed to get PG object."; return(false); } // Reset the record. Do not delete it since it is in the headerRecords // vector and it is not worth the time to remove it from the middle of // that vector since this is the header and the space does not need // to be conserved. pg->reset(); // Delete the entry from the hash. myPGs.Delete(hashIndex); return(true); } const char* SamFileHeader::getHDTagValue(const char* tag) { if(myHD == NULL) { // return blank since there is no HD type. 
return(EMPTY_RETURN.c_str()); } return(myHD->getTagValue(tag)); } // Get the value associated with the specified tag on the SQ line with // the specified sequence name. const char* SamFileHeader::getSQTagValue(const char* tag, const char* name) { // Look up the name in the hash to get the associated SQ object. SamHeaderSQ* sq = (SamHeaderSQ*)(mySQs.Object(name)); // If it is NULL - the tag was not found, so return if(sq == NULL) { return(EMPTY_RETURN.c_str()); } // Found the object, so return the SQ Tag. return(sq->getTagValue(tag)); } // Get the value associated with the specified tag on the RG line with // the specified read group identifier. const char* SamFileHeader::getRGTagValue(const char* tag, const char* id) { // Look up the id in the hash to get the associated RG object. SamHeaderRG* rg = (SamHeaderRG*)(myRGs.Object(id)); // If it is NULL - the tag was not found, so return if(rg == NULL) { return(EMPTY_RETURN.c_str()); } // Found the object, so return the RG Tag. return(rg->getTagValue(tag)); } const char* SamFileHeader::getPGTagValue(const char* tag, const char* id) { // Look up the id in the hash to get the associated PG object. SamHeaderPG* pg = (SamHeaderPG*)(myPGs.Object(id)); // If it is NULL - the tag was not found, so return if(pg == NULL) { return(EMPTY_RETURN.c_str()); } // Found the object, so return the PG Tag. return(pg->getTagValue(tag)); } // Get the number of SQ objects. int SamFileHeader::getNumSQs() { return(mySQs.Entries()); } // Get the number of RG objects. int SamFileHeader::getNumRGs() { return(myRGs.Entries()); } // Get the number of PG objects. int SamFileHeader::getNumPGs() { return(myPGs.Entries()); } // Get the HD object. SamHeaderHD* SamFileHeader::getHD() { return(myHD); } // Get the SQ object with the specified sequence name. SamHeaderSQ* SamFileHeader::getSQ(const char* name) { return((SamHeaderSQ*)(mySQs.Object(name))); } // Get the RG object with the specified read group identifier. 
SamHeaderRG* SamFileHeader::getRG(const char* id) { return((SamHeaderRG*)(myRGs.Object(id))); } // Get the PG object. SamHeaderPG* SamFileHeader::getPG(const char* id) { return((SamHeaderPG*)(myPGs.Object(id))); } // Return the value of the SO tag. // If this field does not exist, EMPTY_RETURN.c_str() is returned. const char* SamFileHeader::getSortOrder() { if(myHD == NULL) { // No HD, so return blank EMPTY_RETURN.c_str() return(EMPTY_RETURN.c_str()); } return(myHD->getSortOrder()); } // Deprecated way of getting the sort order from the file. const char* SamFileHeader::getTagSO() { return(getSortOrder()); } // Get the next SQ header record. After all SQ headers have been retrieved, // NULL is returned until a reset is called. SamHeaderRecord* SamFileHeader::getNextSQRecord() { return(getNextHeaderRecord(myCurrentSQIndex, SamHeaderRecord::SQ)); } // Get the next RG header record. After all RG headers have been retrieved, // NULL is returned until a reset is called. SamHeaderRecord* SamFileHeader::getNextRGRecord() { return(getNextHeaderRecord(myCurrentRGIndex, SamHeaderRecord::RG)); } // Get the next PG header record. After all PG headers have been retrieved, // NULL is returned until a reset is called. SamHeaderRecord* SamFileHeader::getNextPGRecord() { return(getNextHeaderRecord(myCurrentPGIndex, SamHeaderRecord::PG)); } // Reset to the beginning of the header records so the next call // to getNextSQRecord returns the first SQ header record. void SamFileHeader::resetSQRecordIter() { myCurrentSQIndex = 0; } // Reset to the beginning of the header records so the next call // to getNextRGRecord returns the first RG header record. void SamFileHeader::resetRGRecordIter() { myCurrentRGIndex = 0; } // Reset to the beginning of the header records so the next call // to getNextPGRecord returns the first PG header record. void SamFileHeader::resetPGRecordIter() { myCurrentPGIndex = 0; } // Get the next header record of the specified type. 
// Pass in the index to start looking at and the type to look for. // Update the index. // After all headers of that type have been retrieved, // NULL is returned until a reset is called for that type. SamHeaderRecord* SamFileHeader::getNextHeaderRecord(uint32_t& index, SamHeaderRecord::SamHeaderRecordType headerType) { SamHeaderRecord* foundRecord = NULL; // Loop until a record is found or until out of range of the // headerRecord vector. while((index < myHeaderRecords.size()) && (foundRecord == NULL)) { // Get the next record. foundRecord = myHeaderRecords[index]; // Either way, increment the index. ++index; // Check to see if the next record is active. if(!foundRecord->isActiveHeaderRecord()) { // Not active, so clear the pointer. foundRecord = NULL; } // Check to see if the record is the right type. else if(foundRecord->getType() != headerType) { // Not the right type, so clear the pointer. foundRecord = NULL; } } // Return the record if it was found. Will be null if none were found. return(foundRecord); } // Get the next header record. After all headers have been retrieved, // NULL is returned until a reset is called. Does not return the // Comment lines. // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the // same iterator. SamHeaderRecord* SamFileHeader::getNextHeaderRecord() { // Get the next header record SamHeaderRecord* foundRecord = NULL; // Loop until a record is found or until out of range of the // headerRecord vector. while((myCurrentHeaderIndex < myHeaderRecords.size()) && (foundRecord == NULL)) { // Get the next record. foundRecord = myHeaderRecords[myCurrentHeaderIndex]; // Either way, increment the index. ++myCurrentHeaderIndex; // Check to see if the next record is active. if(!foundRecord->isActiveHeaderRecord()) { // Not active, so clear the pointer. foundRecord = NULL; } } // Return the record if it was found. Will be null if none were found. return(foundRecord); } // Set the passed in string to the next header line. 
The passed in // string will be overwritten. If there are no more header lines or there // is an error, false is returned and the passed in string is set to EMPTY_RETURN.c_str() // until a rest is called. // Will also return the comment lines. // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the // same iterator. bool SamFileHeader::getNextHeaderLine(std::string &headerLine) { headerLine = EMPTY_RETURN.c_str(); // Until the header is set, keep reading. // Header could return EMPTY_RETURN.c_str() if the header line is blank. while(headerLine == EMPTY_RETURN.c_str()) { if(getHeaderLine(myCurrentHeaderIndex, headerLine) == false) { // getHeaderLine failed, so stop processing, and return false. return(false); } else { // In range, increment the index. ++myCurrentHeaderIndex; } } return(true); } // Reset to the beginning of the header records so the next call // to getNextHeaderRecord returns the first header line. void SamFileHeader::resetHeaderRecordIter() { myCurrentHeaderIndex = 0; } void SamFileHeader::appendCommentLines(std::string &commentLines) { for(unsigned int i = 0; i < myComments.size(); i++) { commentLines += "@CO\t";; commentLines += myComments[i]; commentLines += "\n"; } } // Returns the comment on the next comment line. Returns EMPTY_RETURN.c_str() if all comment // lines have been returned, until resetCommentIter is called. const char* SamFileHeader::getNextComment() { if(myCurrentCommentIndex < myComments.size()) { return(myComments[myCurrentCommentIndex++].c_str()); } // Already gone through all the comments, return EMPTY_RETURN.c_str(). return(EMPTY_RETURN.c_str()); } // Resets to the beginning of the comments so getNextComment returns // the first comment. void SamFileHeader::resetCommentIter() { myCurrentCommentIndex = 0; } // Parse the header. bool SamFileHeader::parseHeader(String& header) { std::string errorMessage = ""; int numErrors = 0; int numValid = 0; // Split the header into lines. 
std::vector* types = header.Split('\n'); // Loop through each header line, parsing that line. for(uint32_t index = 0; index < types->size(); index++) { // Parse the header line. if(!parseHeaderLine(types->at(index))) { errorMessage += myErrorMessage; errorMessage += "\n"; ++numErrors; } else { // valid header line ++numValid; } } // Delete the types vector. delete types; types = NULL; myErrorMessage = errorMessage; if((numErrors > 0) && (numValid == 0)) { // Only errors. std::cerr << numErrors << " invalid SAM/BAM Header lines were skipped due to:\n" << errorMessage << std::endl; return(false); } else if(numErrors > 0) { // Some valid & some invalid. // Going to return true, but add note about the invalid lines. std::cerr << numErrors << " invalid SAM/BAM Header lines were skipped due to:\n" << errorMessage << std::endl; } return(true); } // Parse one line of the header. bool SamFileHeader::parseHeaderLine(const String& headerLine) { // Check if the line starts with @CO. if((headerLine.Length() >= 4) && (headerLine[0] == '@') && (headerLine[1] == 'C') && (headerLine[2] == 'O') && (headerLine[3] == '\t')) { // Comment line. String comment = headerLine.SubStr(4); return(addComment(comment)); } StringArray tokens; // Split the line by tabs. tokens.ReplaceColumns(headerLine, '\t'); if(tokens.Length() < 1) { // Nothing on this line, just return true. return(true); } // Get the header type, the first column. if((tokens[0].Length() != 3) || (tokens[0][0] != '@')) { // The header type string is incorrect. Should be 3 characters // with the first one @. myErrorMessage = "SAM/BAM Header line does not start with @ & at least 2 chars."; return(false); } bool status = true; if(tokens[0] == "@HD") { if(myHD == NULL) { // Create a new hd. myHD = new SamHeaderHD(); if(myHD == NULL) { // Failed to allocate HD, so return false. 
myErrorMessage = "SAM/BAM Header line failed to allocate HD."; return(false); } myHeaderRecords.push_back(myHD); if(!myHD->setFields(tokens)) { myErrorMessage = "SAM/BAM Header line failed to store HD record."; status = false; } } else { // HD already set, so return false. myErrorMessage = "SAM/BAM Header line failure: multiple HD records."; status = false; } } else if(tokens[0] == "@SQ") { // Create a new SQ record. SamHeaderSQ* sq = new SamHeaderSQ(); if(sq->setFields(tokens)) { // sq fields were properly set, so add it to the list of // SQ lines. // myStatus set in the method. status &= addSQ(sq); } else { myErrorMessage = "SAM/BAM Header line failed to store SQ record."; status = false; } } else if(tokens[0] == "@RG") { // Create a new RG record. SamHeaderRG* rg = new SamHeaderRG(); if(rg->setFields(tokens)) { // rg fields were properly set, so add it to the list of // RG lines. // myStatus set in the method. status &= addRG(rg); } else { myErrorMessage = "SAM/BAM Header line failed to store RG record."; status = false; } } else if(tokens[0] == "@PG") { // Create a new PG record. SamHeaderPG* pg = new SamHeaderPG(); if(pg->setFields(tokens)) { // pg fields were properly set, so add it to the list of // PG lines. // myStatus set in the method. status &= addPG(pg); } else { myErrorMessage = "SAM/BAM Header line failed to store PG record."; status = false; } } else { // Unknown header type. myErrorMessage = "SAM/BAM Header line failure: Skipping unknown header type, "; myErrorMessage += (const char*)(tokens[0]); status = false; } return(status); } // Set the passed in string to the header line at the specified index. // It does NOT clear the current contents of header. // NOTE: some indexes will return blank if the entry was deleted. bool SamFileHeader::getHeaderLine(unsigned int index, std::string& header) const { // Check to see if the index is in range of the header records vector. 
if(index < myHeaderRecords.size()) { // In range of the header records vector, so get the string for // that record. SamHeaderRecord* hdrRec = myHeaderRecords[index]; hdrRec->appendString(header); return(true); } else { unsigned int commentIndex = index - myHeaderRecords.size(); // Check to see if it is in range of the comments. if(commentIndex < myComments.size()) { // It is in range of the comments, so add the type. header += "@CO\t"; // Add the comment. header += myComments[commentIndex]; // Add the new line. header += "\n"; return(true); } } // Invalid index. return(false); } libStatGen-1.0.14/bam/SamFileHeader.h000066400000000000000000000463131254730101300171750ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_FILE_HEADER_H__ #define __SAM_FILE_HEADER_H__ #include #include #include "SamReferenceInfo.h" #include "SamHeaderHD.h" #include "SamHeaderSQ.h" #include "SamHeaderRG.h" #include "SamHeaderPG.h" /// This class allows a user to get/set the fields in a SAM/BAM Header. /// Sam/Bam headers contain comments and multiple SamHeaderRecords /// (HD, SQs, RGs, PGs) comprised of tag/value pairs with each tag only /// appearing once within a specific record. 
class SamFileHeader { public: SamFileHeader(); ~SamFileHeader(); ///////////////////////////// /// @name Copying a Header /// These methods are ways of copying the contents of one header into /// another one. //@{ /// Copy Constructor copies the specified header into this one. SamFileHeader(const SamFileHeader& header); /// Overload operator = to copy the passed in header into this header. SamFileHeader & operator = (const SamFileHeader& header); /// Copy method copies the passed in header into this header. /// Returns true if at least one header line was successfully copied. bool copy(const SamFileHeader& header); //@} /// Initialize the header. void resetHeader(); ///////////////////////////// /// @name Get the Entire Header /// Get the entire header as a single string. //@{ /// Set the passed in string to the entire header string, clearing its /// current contents. /// \return true if successfully set (even if set to "") bool getHeaderString(std::string& header) const; //@} /// Get the reference ID for the specified reference name (chromosome). /// If addID is set to true, a reference id will be created for the /// referenceName if one does not already exist. If addID is set to /// false (default), it will return SamReferenceInfo::NO_REF_ID. int getReferenceID(const String & referenceName, bool addID = false); /// Get the reference ID for the specified reference name (chromosome). /// If addID is set to true, a reference id will be created for the /// referenceName if one does not already exist. If addID is set to /// false (default), it will return SamReferenceInfo::NO_REF_ID. int getReferenceID(const char* referenceName, bool addID = false); /// Return the reference name (chromosome) for the specified reference id. 
const String & getReferenceLabel(int id) const; /// Get the Reference Information const SamReferenceInfo& getReferenceInfo() const; // Get the Reference Information for updating separately when reading // BAMs...should only be called by BamInterface. SamReferenceInfo& getReferenceInfoForBamInterface(); //////////////////////////////////////////////////////////////////////// // Set Values in the header //////////////////////////////////////////////////////////////////////// ///////////////////////////////////////// /// @name Adding an entire header/comment line. /// These methods are ways of adding an entire header line at once. //@{ /// Add a header line that is just one tag with a const char* value. /// Note: This method will only do one tag per type on a line, so if a /// type has multiple tags, the whole line needs to be added at once, /// and a different method should be used. bool addHeaderLine(const char* type, const char* tag, const char* value); /// Add a header line that is already preformatted in a const char*. /// Returns true if at least one header line was successfully added. bool addHeaderLine(const char* headerLine); /// Add a header that is already preformatted in a const char*. /// Returns true if at least one header line was successfully added. bool addHeader(const char* header); /// Add the specified comment to the header (do not include "@CO" or "\n"). /// \return true if successfully added, false if not. bool addComment(const char* comment); //@} ///////////////////////////////////////// /// @name Set/Add/Remove a Single Tag /// The passed in tag should be the two character SAM tag as defined /// in the SAM spec. A tag is removed from the header record by setting /// it to "". For the SQ and RG header types, the key tags (SN for SQ /// and ID for RG) may not be modified or removed once set. This is /// because these values are used as a lookup key for the header record, /// so the entire record must be removed. 
//@{ // // Set the specified header type tag to the specified value in the // // header with the specified keyID. keyID must be specified when // // type = SQ, RG, or PG. // bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag, // const char* value, const char* keyID = NULL); /// Set the specified tag to the specified value in the HD header, remove /// the tag by specifying value="". /// \return true if the tag was successfully set, false if not. bool setHDTag(const char* tag, const char* value); /// Set the specified tag to the specified value in the SQ header with /// the specified name, remove the tag by specifying value="". If the /// header does not yet exist, the tag must be "LN" and the header is added /// with the specified LN value and the SN value passed in name. /// The SN & LN tags may not be modified or removed after they are /// set unless the entire record is deleted. /// \return true if the tag was successfully set, false if not. bool setSQTag(const char* tag, const char* value, const char* name); /// Set the specified tag to the specified value in the RG header with /// the specified id, remove the tag by specifying value="". If the /// header does not yet exist, the header is added and so is the ID tag /// with the value set to the passed in id. The ID tag may not be /// modified or removed after it is set unless the entire record is deleted. /// \return true if the tag was successfully set, false if not. bool setRGTag(const char* tag, const char* value, const char* id); /// Set the specified tag to the specified value in the PG header with /// the specified id, remove the tag by specifying value="". If the /// header does not yet exist, the header is added and so is the ID tag /// with the value set to the passed in id. The ID tag may not be /// modified or removed after it is set unless the entire record is deleted. /// \return true if the tag was successfully set, false if not. 
bool setPGTag(const char* tag, const char* value, const char* id); //@} ///////////////////////////////////////// /// @name Add an Already Setup SamHeaderRecord /// NOTE: These methods add a pointer to the passed in record. /// The header record will be deleted when it's cleaned up from this header. /// NOTE: Do NOT delete the passed in record, the SamFileHeader class /// takes care of that itself. //@{ /// Add the HD record to the header. /// Note: it adds a pointer to the passed in header record. The header /// record will be deleted when it is cleaned up from this header. /// \return true if the record was successfully added, false otherwise. bool addHD(SamHeaderHD* hd); /// Add the SQ record to the header. /// Note: it adds a pointer to the passed in header record. The header /// record will be deleted when it is cleaned up from this header. /// \return true if the record was successfully added, false otherwise. bool addSQ(SamHeaderSQ* sq); /// Add the RG record to the header. /// Note: it adds a pointer to the passed in header record. The header /// record will be deleted when it is cleaned up from this header. /// \return true if the record was successfully added, false otherwise. bool addRG(SamHeaderRG* rg); /// Add the PG record to the header. /// Note: it adds a pointer to the passed in header record. The header /// record will be deleted when it is cleaned up from this header. /// \return true if the record was successfully added, false otherwise. bool addPG(SamHeaderPG* pg); /// Add a copy of the specified header record to the header. /// Note: it creates a new header record that is identical to the specified /// one and adds it to the header. The passed in pointer will not be /// deleted due to this. /// \return true if the record was successfully added, false otherwise. 
bool addRecordCopy(const SamHeaderRecord& hdrRec); //@} //////////////////////////////////////////////////////////////////////// /// @name Remove an Entire Header Record //@{ /// Remove the HD record. /// \return true if successfully removed or did not exist, false if /// the record still exists. bool removeHD(); /// Remove SQ record with the specified key. /// NOTE: Does not remove it from the BAM index. /// \return true if successfully removed or did not exist, false if /// the record still exists. bool removeSQ(const char* name); /// Remove RG record with the specified key. /// \return true if successfully removed or did not exist, false if /// the record still exists. bool removeRG(const char* id); /// Remove PG record with the specified key. /// \return true if successfully removed or did not exist, false if /// the record still exists. bool removePG(const char* id); //@} //////////////////////////////////////////////////////////////////////// /// @name Get a Specific Tag /// These methods return the value associated with the specified tag. /// If the tag does not exist in the record "" is returned. /// /// For SQ, RG, and PG the value returned is for the tag associated with /// the specified key (name/id). If a record with that key does not exist /// or if the tag does not exist for the record with that key, "" is /// returned. //@{ /// Returns the value associated with the specified HD tag, returning "" if /// the tag does not exist in the header. const char* getHDTagValue(const char* tag); /// Get the value associated with the specified tag on the SQ line with /// the specified sequence name, returning "" if the tag or key does /// not exist. const char* getSQTagValue(const char* tag, const char* name); /// Get the value associated with the specified tag on the RG line with /// the specified read group identifier, returning "" if the tag or key does /// not exist. 
const char* getRGTagValue(const char* tag, const char* id); /// Get the value associated with the specified tag on the PG line with /// the specified id, returning "" if the tag or key does /// not exist. const char* getPGTagValue(const char* tag, const char* id); //@} /// Get the number of SQ objects. int getNumSQs(); /// Get the number of RG objects. int getNumRGs(); /// Get the number of PG objects. int getNumPGs(); //////////////////////////////////////////////////////////////////////// /// @name Get a Specific Header Record /// These methods return a pointer to the specific record that was /// requested, returning NULL if that record does not exist in the header. /// /// The returned record can be modified to add/remove some tags. /// Since a pointer is returned, the SamFileHeader automatically /// reflects these changes. //@{ /// Get the HD object, returning NULL if there is no HD record. SamHeaderHD* getHD(); /// Get the SQ object with the specified sequence name, returning NULL /// if there is no SQ object with that key. SamHeaderSQ* getSQ(const char* name); /// Get the RG object with the specified read group identifier, returning /// NULL if there is no RG object with that key. SamHeaderRG* getRG(const char* id); /// Get the PG object with the specified id, returning NULL /// if there is no PG object with that key. SamHeaderPG* getPG(const char* id); //@} // ////////////////////////////////// // // Set methods for header fields.
// bool setVersion(const char* version); // bool setSortOrder(const char* sortOrder); // bool addSequenceName(const char* sequenceName); // bool setSequenceLength(const char* keyID, int sequenceLength); // bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId); // bool setMD5Checksum(const char* keyID, const char* md5sum); // bool setURI(const char* keyID, const char* uri); // bool setSpecies(const char* keyID, const char* species); // bool addReadGroupID(const char* readGroupID); // bool setSample(const char* keyID, const char* sample); // bool setLibrary(const char* keyID, const char* library); // bool setDescription(const char* keyID, const char* description); // bool setPlatformUnit(const char* keyID, const char* platform); // bool setPredictedMedianInsertSize(const char* keyID, const char* isize); // bool setSequencingCenter(const char* keyID, const char* center); // bool setRunDate(const char* keyID, const char* runDate); // bool setTechnology(const char* keyID, const char* technology); // bool addProgram(const char* programID); // bool setProgramVersion(const char* keyID, const char* version); // bool setCommandLine(const char* keyID, const char* commandLine); // /////////////////////////////////// // // Get methods for header fields. // // Returns the number of SQ entries in the header. // int32_t getSequenceDictionaryCount(); /// Return the Sort Order value that is set in the Header, returning "" /// if this field does not exist. const char* getSortOrder(); /// DEPRECATED const char* getTagSO(); ///////////////////////////// /// @name Get the Header Record/Comment/Line by Record/Comment/Line /// These methods iterate through the header. /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the /// same iterator. getNextHeaderRecord that takes a header type /// uses the same iterator as the getNextXXRecord with that type. /// Otherwise the iterators are independent. //@{ /// Get the next SQ header record. 
After all SQ headers have been /// retrieved, NULL is returned until a reset is called. /// Independent from getNextHeaderRecord, getNextHeaderLine and the /// other getNextXXRecord methods and the associated reset methods. SamHeaderRecord* getNextSQRecord(); /// Get the next RG header record. After all RG headers have been /// retrieved, NULL is returned until a reset is called. /// Independent from getNextHeaderRecord, getNextHeaderLine and the /// other getNextXXRecord methods and the associated reset methods. SamHeaderRecord* getNextRGRecord(); /// Get the next PG header record. After all PG headers have been /// retrieved, NULL is returned until a reset is called. /// Independent from getNextHeaderRecord, getNextHeaderLine and the /// other getNextXXRecord methods and the associated reset methods. SamHeaderRecord* getNextPGRecord(); /// Reset to the beginning of the header records so the next call /// to getNextSQRecord returns the first SQ header record. void resetSQRecordIter(); /// Reset to the beginning of the header records so the next call /// to getNextRGRecord returns the first RG header record. void resetRGRecordIter(); /// Reset to the beginning of the header records so the next call /// to getNextPGRecord returns the first PG header record. void resetPGRecordIter(); /// Get the next header record of the specified type starting from the /// specified index and update the index. /// After all headers of that type have been retrieved, /// NULL is returned until a reset is called for that type. SamHeaderRecord* getNextHeaderRecord(uint32_t& index, SamHeaderRecord::SamHeaderRecordType headerType); /// Get the next header record, but not comment line. After all headers /// have been retrieved, NULL is returned until a reset is called. /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the /// same iterator. SamHeaderRecord* getNextHeaderRecord(); /// Set the passed in string to the next header line, overwriting /// the passed in string.
If there are no more header lines or there /// is an error, false is returned and the passed in string is set to "" /// until a reset is called. /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the /// same iterator. bool getNextHeaderLine(std::string &headerLine); /// Reset to the beginning of the header records so the next call /// to getNextHeaderRecord returns the first header line. void resetHeaderRecordIter(); /// Append all of the comment lines to the specified string. void appendCommentLines(std::string &commentLines); /// Returns the comment on the next comment line. Returns "" if all comment /// lines have been returned, until resetCommentIter is called. const char* getNextComment(); /// Resets to the beginning of the comments so getNextComment returns /// the first comment. void resetCommentIter(); //@} /// Get the failure message if a method returned failure. const char* getErrorMessage() { return(myErrorMessage.c_str()); } static const std::string EMPTY_RETURN; private: // Parse the header string. bool parseHeader(String& header); // Parse the specified line of the header. bool parseHeaderLine(const String& headerLine); // Set the passed in string to the header line at the specified index. // It does NOT clear the current contents of header. bool getHeaderLine(unsigned int index, std::string& header) const; int16_t makeKey(char ch1, char ch2) { return((ch1 << 8) + ch2); } // Only one HD type is allowed per file. SamHeaderHD* myHD; // There can be multiple SQ Types, indexed by SN. StringHash mySQs; // There can be multiple RG Types, indexed by ID. StringHash myRGs; // There can be multiple PG types, indexed by ID.
StringHash myPGs; // Reference Name information SamReferenceInfo myReferenceInfo; // Vector of comments std::vector myComments; std::vector myHeaderRecords; std::string myErrorMessage; uint32_t myCurrentSQIndex; uint32_t myCurrentRGIndex; uint32_t myCurrentPGIndex; uint32_t myCurrentHeaderIndex; uint32_t myCurrentCommentIndex; }; #endif libStatGen-1.0.14/bam/SamFilter.cpp000066400000000000000000000450471254730101300167700ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ ////////////////////////////////////////////////////////////////////////// #include "SamFilter.h" #include "SamQuerySeqWithRefHelper.h" #include "BaseUtilities.h" #include "SamFlag.h" SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record, GenomeSequence& refSequence, double mismatchThreshold) { // Read & clip from the left & right. SamQuerySeqWithRefIter iterFromFront(record, refSequence, true); SamQuerySeqWithRefIter iterFromBack(record, refSequence, false); SamSingleBaseMatchInfo baseMatchInfo; int32_t readLength = record.getReadLength(); // Init last front clip to be prior to the lastFront index (0). const int32_t initialLastFrontClipPos = -1; int32_t lastFrontClipPos = initialLastFrontClipPos; // Init first back clip to be past the last index (readLength). 
int32_t firstBackClipPos = readLength; bool fromFrontComplete = false; bool fromBackComplete = false; int32_t numBasesFromFront = 0; int32_t numBasesFromBack = 0; int32_t numMismatchFromFront = 0; int32_t numMismatchFromBack = 0; ////////////////////////////////////////////////////////// // Determining the clip positions. while(!fromFrontComplete || !fromBackComplete) { // Read from the front (left to right) of the read until // more have been read from that direction than the opposite direction. while(!fromFrontComplete && ((numBasesFromFront <= numBasesFromBack) || (fromBackComplete))) { if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false) { // Nothing more to read in this direction. fromFrontComplete = true; break; } // Got a read. Check to see if it is to or past the last clip. if(baseMatchInfo.getQueryIndex() >= firstBackClipPos) { // This base is past where we are clipping, so we // are done reading in this direction. fromFrontComplete = true; break; } // This is an actual base read from the left to the // right, so up the counter and determine if it was a mismatch. ++numBasesFromFront; if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Mismatch ++numMismatchFromFront; // Check to see if it is over the threshold. double mismatchPercent = (double)numMismatchFromFront / numBasesFromFront; if(mismatchPercent > mismatchThreshold) { // Need to clip. lastFrontClipPos = baseMatchInfo.getQueryIndex(); // Reset the counters. numBasesFromFront = 0; numMismatchFromFront = 0; } } } // Now, read from right to left until more have been read // from the back than from the front. while(!fromBackComplete && ((numBasesFromBack <= numBasesFromFront) || (fromFrontComplete))) { if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false) { // Nothing more to read in this direction. fromBackComplete = true; break; } // Got a read. Check to see if it is to or past the first clip. 
if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos) { // This base is past where we are clipping, so we // are done reading in this direction. fromBackComplete = true; break; } // This is an actual base read from the right to the // left, so up the counter and determine if it was a mismatch. ++numBasesFromBack; if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Mismatch ++numMismatchFromBack; // Check to see if it is over the threshold. double mismatchPercent = (double)numMismatchFromBack / numBasesFromBack; if(mismatchPercent > mismatchThreshold) { // Need to clip. firstBackClipPos = baseMatchInfo.getQueryIndex(); // Reset the counters. numBasesFromBack = 0; numMismatchFromBack = 0; } } } } ////////////////////////////////////////////////////////// // Done determining the clip positions, so clip. // To determine the number of clips from the front, add 1 to the // lastFrontClipPos since the index starts at 0. // To determine the number of clips from the back, subtract the // firstBackClipPos from the readLength. // Example: // Pos: 012345 // Read: AAAAAA // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2. return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos)); } // Soft clip the record from the front and/or the back. SamFilter::FilterStatus SamFilter::softClip(SamRecord& record, int32_t numFrontClips, int32_t numBackClips) { ////////////////////////////////////////////////////////// Cigar* cigar = record.getCigarInfo(); FilterStatus status = NONE; int32_t startPos = record.get0BasedPosition(); CigarRoller updatedCigar; status = softClip(*cigar, numFrontClips, numBackClips, startPos, updatedCigar); if(status == FILTERED) { ///////////////////////////// // The entire read is clipped, so rather than clipping it, // filter it out. 
filterRead(record); return(FILTERED); } else if(status == CLIPPED) { // Part of the read was clipped, and now that we have // an updated cigar, update the read. record.setCigar(updatedCigar); // Update the starting position. record.set0BasedPosition(startPos); } return(status); } // Soft clip the cigar from the front and/or the back, writing the value // into the new cigar. SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar, int32_t numFrontClips, int32_t numBackClips, int32_t& startPos, CigarRoller& updatedCigar) { int32_t readLength = oldCigar.getExpectedQueryBaseCount(); int32_t endClipPos = readLength - numBackClips; FilterStatus status = NONE; if((numFrontClips != 0) || (numBackClips != 0)) { // Clipping from front and/or from the back. // Check to see if the entire read was clipped. int32_t totalClips = numFrontClips + numBackClips; if(totalClips >= readLength) { ///////////////////////////// // The entire read is clipped, so rather than clipping it, // filter it out. return(FILTERED); } // Part of the read was clipped. status = CLIPPED; // Loop through, creating an updated cigar. int origCigarOpIndex = 0; // Track how many read positions are covered up to this // point by the cigar to determine up to what // point in the cigar is affected by this clipping. int32_t numPositions = 0; // Track if any non-clips are in the new cigar. bool onlyClips = true; const Cigar::CigarOperator* op = NULL; ////////////////// // Clip from front while((origCigarOpIndex < oldCigar.size()) && (numPositions < numFrontClips)) { op = &(oldCigar.getOperator(origCigarOpIndex)); switch(op->operation) { case Cigar::hardClip: // Keep this operation as the new clips do not // affect other clips.
updatedCigar += *op; break; case Cigar::del: case Cigar::skip: // Skip and delete are going to be dropped, and // are not in the read, so the read index doesn't // need to be updated break; case Cigar::insert: case Cigar::match: case Cigar::mismatch: case Cigar::softClip: // Update the read index as these types // are found in the read. numPositions += op->count; break; case Cigar::none: default: // Nothing to do for none. break; }; ++origCigarOpIndex; } // If bases were clipped from the front, add the clip and // any partial cigar operation as necessary. if(numFrontClips != 0) { // Add the softclip to the front of the read. updatedCigar.Add(Cigar::softClip, numFrontClips); // Add the rest of the last Cigar operation if // it is not entirely clipped. int32_t newCount = numPositions - numFrontClips; if(newCount > 0) { // Before adding it, check to see if the same // operation is clipped from the end. // numPositions greater than the endClipPos // means that it is equal or past that position, // so shorten the number of positions. if(numPositions > endClipPos) { newCount -= (numPositions - endClipPos); } if(newCount > 0) { updatedCigar.Add(op->operation, newCount); if(!Cigar::isClip(op->operation)) { onlyClips = false; } } } } // Add operations until the point of the end clip is reached. // For example... // 2M1D3M = MMDMMM readLength = 5 // readIndex: 01 234 // at cigarOpIndex 0 (2M), numPositions = 2. // at cigarOpIndex 1 (1D), numPositions = 2. // at cigarOpIndex 2 (3M), numPositions = 5. // if endClipPos = 2, we still want to consume the 1D, so // need to keep looping until numPositions > endClipPos while((origCigarOpIndex < oldCigar.size()) && (numPositions <= endClipPos)) { op = &(oldCigar.getOperator(origCigarOpIndex)); // Update the numPositions count if the operations indicates // bases within the read. 
if(!Cigar::foundInQuery(op->operation)) { // This operation is not in the query read sequence, // so it is not yet to the endClipPos, just add the // operation and do not increment the number of positions. updatedCigar += *op; if(!Cigar::isClip(op->operation)) { onlyClips = false; } } else { // This operation appears in the query sequence, so // check to see if the clip occurs in this operation. // endClipPos is 0 based & numPositions is a count. // If endClipPos is 4, then it is the 5th position. // If 4 positions are covered so far (numPositions = 4), // then we are right at endClipPos: 4-4 = 0, none of // this operation should be kept. // If only 3 positions were covered, then we are at offset // 3, so offset 3 should be added: 4-3 = 1. // The enclosing loop condition (numPositions <= endClipPos) keeps // this difference non-negative before it is stored as unsigned. uint32_t numPosTilClip = endClipPos - numPositions; if(numPosTilClip < op->count) { // This operation is partially clipped, write the part // that was not clipped if it is not all clipped. if(numPosTilClip != 0) { updatedCigar.Add(op->operation, numPosTilClip); if(!Cigar::isClip(op->operation)) { onlyClips = false; } } } else { // This operation is not clipped, so add it updatedCigar += *op; if(!Cigar::isClip(op->operation)) { onlyClips = false; } } // This operation occurs in the query sequence, so // increment the number of positions covered. numPositions += op->count; } // Move to the next cigar position. ++origCigarOpIndex; } ////////////////// // Add the softclip to the back. if(numBackClips != 0) { // Add the softclip to the end updatedCigar.Add(Cigar::softClip, numBackClips); } ////////////////// // Add any hardclips remaining in the original cigar to the back. while(origCigarOpIndex < oldCigar.size()) { op = &(oldCigar.getOperator(origCigarOpIndex)); if(op->operation == Cigar::hardClip) { // Keep this operation as the new clips do not // affect other clips. updatedCigar += *op; } ++origCigarOpIndex; } // Check to see if the new cigar is only clips.
if(onlyClips) { // Only clips in the new cigar, so mark the read as filtered // instead of updating the cigar. // NOTE: updatedCigar has already been written to above and is not // reset here, so callers should ignore it when FILTERED is returned. ///////////////////////////// // The entire read was clipped. status = FILTERED; } else { // Part of the read was clipped. // Update the starting position if a clip was added to // the front. if(numFrontClips > 0) { // Convert from query index to reference position (from the // old cigar) // Get the position for the last front clipped position by // getting the position associated with the clipped base on // the reference. Then add one to get to the first // non-clipped position. If no reference position is returned // (Cigar::INDEX_NA), startPos is left unchanged. int32_t lastFrontClipPos = numFrontClips - 1; int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos, startPos); if(newStartPos != Cigar::INDEX_NA) { // Add one to get first non-clipped position. startPos = newStartPos + 1; } } } } return(status); } SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record, GenomeSequence& refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt) { uint32_t totalMismatchQuality = sumMismatchQuality(record, refSequence, defaultQualityInt); // If the total mismatch quality is over the threshold, // filter the read. if(totalMismatchQuality > qualityThreshold) { filterRead(record); return(FILTERED); } return(NONE); } // NOTE: Only positions where the reference and read both have bases that // are different and not 'N' are considered mismatches. uint32_t SamFilter::sumMismatchQuality(SamRecord& record, GenomeSequence& refSequence, uint8_t defaultQualityInt) { // Track the mismatch info. // NOTE(review): numMismatch is incremented for each mismatch but is // never read; only mismatchQual is returned. int mismatchQual = 0; int numMismatch = 0; SamQuerySeqWithRefIter sequenceIter(record, refSequence); SamSingleBaseMatchInfo baseMatchInfo; while(sequenceIter.getNextMatchMismatch(baseMatchInfo)) { if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Got a mismatch, get the associated quality.
char readQualityChar = record.getQuality(baseMatchInfo.getQueryIndex()); uint8_t readQualityInt = BaseUtilities::getPhredBaseQuality(readQualityChar); if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT) { // Quality was not specified, so use the configured setting. readQualityInt = defaultQualityInt; } mismatchQual += readQualityInt; ++numMismatch; } } return(mismatchQual); } void SamFilter::filterRead(SamRecord& record) { // Filter the read by marking it as unmapped. uint16_t flag = record.getFlag(); SamFlag::setUnmapped(flag); // Clear N/A flags. flag &= ~SamFlag::PROPER_PAIR; flag &= ~SamFlag::SECONDARY_ALIGNMENT; flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT; record.setFlag(flag); // Clear Cigar record.setCigar("*"); // Clear mapping quality record.setMapQuality(0); } libStatGen-1.0.14/bam/SamFilter.h000066400000000000000000000113351254730101300164260ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_FILTER_H__ #define __SAM_FILTER_H__ #include "SamRecord.h" #include "GenomeSequence.h" /// Class for helping to filter a SAM/BAM record. class SamFilter { public: /// Enum describing what sort of filtering was done. enum FilterStatus { NONE, ///< The filter did not affect the read. CLIPPED, ///< Filtering clipped the read. FILTERED ///< Filtering caused the read to be modified to unmapped. 
}; /// Clip the read based on the specified mismatch threshold. /// \return how the read was affected, /// NONE if the read was not modified, /// CLIPPED if the read was clipped, /// FILTERED if the whole read would have been clipped so instead the /// read was modified to unmapped. static FilterStatus clipOnMismatchThreshold(SamRecord& record, GenomeSequence& refSequence, double mismatchThreshold); /// Soft clip the record from the front and/or the back. /// \param record record to be clipped (input/output parameter). /// \param numFrontClips number of bases that should be clipped from the /// front of the sequence read. (total count, including any that are /// already clipped.) /// \param numBackClips number of bases that should be clipped from the /// back of the sequence read. (total count, including any that are /// already clipped.) /// \return NONE if no clipping was requested, /// CLIPPED if the record's cigar and start position were updated, /// FILTERED if the whole read would have been clipped so instead the /// read was filtered via filterRead. static FilterStatus softClip(SamRecord& record, int32_t numFrontClips, int32_t numBackClips); /// Soft clip the cigar from the front and/or the back, writing the value /// into the new cigar. startPos is only updated if the returned /// FilterStatus is CLIPPED, and updatedCigar is only meaningful in that /// case (it may already have been written to when FILTERED is returned). /// \param oldCigar cigar prior to clipping /// \param numFrontClips number of bases that should be clipped from the /// front of the sequence read. (total count, including any that are /// already clipped.) /// \param numBackClips number of bases that should be clipped from the /// back of the sequence read. (total count, including any that are /// already clipped.) /// \param startPos 0-based start position associated with the /// cigar prior to updating (input) and set to the 0-based start position /// after updating (output) the cigar if it was CLIPPED. /// \param updatedCigar set to the clipped cigar if CLIPPED (output param). /// Operations are added to it without clearing it first. /// \return NONE if both clip counts are 0, /// FILTERED if the clips cover the whole read or leave only clips, /// CLIPPED otherwise. static FilterStatus softClip(Cigar& oldCigar, int32_t numFrontClips, int32_t numBackClips, int32_t& startPos, CigarRoller& updatedCigar); /// Filter the read based on the specified quality threshold.
/// \return how the read was affected, /// NONE if the read was not modified, /// FILTERED if the read was modified to unmapped because it was over /// the quality threshold. static FilterStatus filterOnMismatchQuality(SamRecord& record, GenomeSequence& refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt); /// Get the sum of the qualities of all mismatches in the record. /// \param record record on which to calculate the sum the mismatch qualities /// \param refSequence reference to use to check for mismatches. /// \param defaultQualityInt default value to use for the quality if no /// quality was specified in the read. /// \return sum of the qualities of mismatches static uint32_t sumMismatchQuality(SamRecord& record, GenomeSequence& refSequence, uint8_t defaultQualityInt); /// Filter the read by marking it as unmapped. static void filterRead(SamRecord& record); }; #endif libStatGen-1.0.14/bam/SamFlag.h000066400000000000000000000106441254730101300160540ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_FLAG_H__ #define __SAM_FLAG_H__ #include #ifdef DUPLICATE #undef DUPLICATE #endif /// Class for extracting information from a SAM Flag. class SamFlag { public: /////////////////////// /// @name Constants for parsing a flag. 
//@{ static const int16_t PAIRED = 0x0001; static const int16_t PROPER_PAIR = 0x0002; static const int16_t UNMAPPED = 0x0004; static const int16_t MATE_UNMAPPED = 0x0008; static const int16_t REVERSE = 0x0010; static const int16_t MATE_REVERSED = 0x0020; static const int16_t FIRST_READ = 0x0040; static const int16_t SECOND_READ = 0x0080; static const int16_t SECONDARY_ALIGNMENT = 0x0100; static const int16_t FAILED_QUALITY = 0x0200; static const int16_t DUPLICATE = 0x0400; static const int16_t SUPPLEMENTARY_ALIGNMENT = 0x0800; static const int16_t FRAGMENT_INFO = 0x00C0; static const int16_t FRAGMENT_SHIFT = 6; //@} /////////////////////// /// @name Static methods for determining the contents of a flag. //@{ static inline bool isMapped(uint16_t flag) {return(!(flag & UNMAPPED));} static inline bool isMateMapped(uint16_t flag) {return(!(flag & MATE_UNMAPPED));} static inline bool isPaired(uint16_t flag) {return(flag & PAIRED);} static inline bool isReverse(uint16_t flag) {return(flag & REVERSE);} static inline bool isMateReverse(uint16_t flag) {return(flag & MATE_REVERSED);} static inline bool isProperPair(uint16_t flag) { // Proper pair is only applicable if also paired. return(isPaired(flag) && (flag & PROPER_PAIR)); } static inline bool isDuplicate(uint16_t flag) {return(flag & DUPLICATE);} static inline bool isQCFailure(uint16_t flag) {return(flag & FAILED_QUALITY);} static inline bool isSecondary(uint16_t flag) {return(flag & SECONDARY_ALIGNMENT);} /// Return if it is the first fragment or not /// (if FIRST_READ is set and SECOND_READ is not). static inline bool isFirstFragment(uint16_t flag) { // first fragment if FIRST_READ is set and SECOND_READ is not. return((flag & FIRST_READ) && !(flag & SECOND_READ)); } /// Return if it is the last fragment or not /// (if FIRST_READ is not set and SECOND_READ is). static inline bool isLastFragment(uint16_t flag) { // last fragment if FIRST_READ is not set and SECOND_READ is set. 
return(!(flag & FIRST_READ) && (flag & SECOND_READ)); } /// Return if it is a middle fragment or not /// (if FIRST_READ is set and SECOND_READ is also set). static inline bool isMidFragment(uint16_t flag) { // mid fragment if both FIRST_READ and SECOND_READ are set. return((flag & FIRST_READ) && (flag & SECOND_READ)); } /// Return if it is an unknown fragment fragment or not /// (if FIRST_READ is not set and SECOND_READ is also not set). static inline bool isUnknownFragment(uint16_t flag) { // unknown fragment index if neither FIRST_READ nor SECOND_READ are not. return(!(flag & FIRST_READ) && !(flag & SECOND_READ)); } static inline uint8_t getFragmentType(uint16_t flag) { return((flag & FRAGMENT_INFO) >> FRAGMENT_SHIFT); } /// Mark the passed in flag as unmapped. static inline void setUnmapped(uint16_t& flag) { flag |= UNMAPPED;} /// Mark the passed in flag as not duplicate. static inline void setNotDuplicate(uint16_t& flag) { flag ^= DUPLICATE;} /// Mark the passed in flag as not duplicate. static inline void setDuplicate(uint16_t& flag) { flag |= DUPLICATE;} //@} private: SamFlag(); }; #endif libStatGen-1.0.14/bam/SamHeaderHD.cpp000066400000000000000000000025251254730101300171410ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

#include "SamHeaderHD.h"

// Constructor
// Configures the record as the HD type: "VN" is the only required tag and
// the key tag is left empty.
SamHeaderHD::SamHeaderHD()
{
    // Add required tags for this type.
    myType = SamHeaderRecord::HD;
    myTypeString = "HD";
    addRequiredTag("VN");
    myKeyTag.clear();
}


// Destructor
SamHeaderHD::~SamHeaderHD()
{
}


// Return the value of the "SO" tag as reported by getTagValue.
const char* SamHeaderHD::getSortOrder()
{
    return(getTagValue("SO"));
}


// Create a new SamHeaderHD and copy this record's contents into it via
// internalCopy.  The caller owns the returned pointer.
SamHeaderRecord* SamHeaderHD::createCopy() const
{
    SamHeaderHD* newHD = new SamHeaderHD();
    // NOTE: a plain new-expression reports failure by throwing, so this
    // NULL branch is defensive only.
    if(newHD == NULL)
    {
        std::cerr << "Failed to create a copy of an HD Header Record\n" ;
        return(NULL);
    }
    internalCopy(*newHD);

    return(newHD);
}
libStatGen-1.0.14/bam/SamHeaderHD.h000066400000000000000000000027201254730101300166030ustar00rootroot00000000000000/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __SAMHEADERHD_H__
#define __SAMHEADERHD_H__

#include "SamHeaderRecord.h"

/// Header record of type HD.
class SamHeaderHD : public SamHeaderRecord
{
public:
    // Constructor
    SamHeaderHD();

    // Destructor
    virtual ~SamHeaderHD();

    /// Return the value of the SO tag.
    const char* getSortOrder();

    /// Return a pointer to a newly created header record of the appropriate type
    /// that is a copy of this record. The newly created record will not be
    /// deleted by this class and it is the responsibility of the calling method
    /// to handle the deletion.
    /// Returns NULL on failure to copy.
    virtual SamHeaderRecord* createCopy() const;

private:
    // Copy construction and assignment are declared private (not available
    // to users of the class); use createCopy() instead.
    SamHeaderHD(const SamHeaderHD& samHeaderHD);
    SamHeaderHD& operator=(const SamHeaderHD& samHeaderHD);
};

#endif
libStatGen-1.0.14/bam/SamHeaderPG.cpp000066400000000000000000000024061254730101300171520ustar00rootroot00000000000000/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "SamHeaderPG.h"

// Constructor
// Configures the record as the PG type: "ID" is required and is the key tag.
SamHeaderPG::SamHeaderPG()
{
    // Add required tags for this type.
    myType = SamHeaderRecord::PG;
    myTypeString = "PG";
    addRequiredTag("ID");
    myKeyTag = "ID";
}


// Destructor
SamHeaderPG::~SamHeaderPG()
{
}


// Create a new SamHeaderPG and copy this record's contents into it via
// internalCopy.  The caller owns the returned pointer.
SamHeaderRecord* SamHeaderPG::createCopy() const
{
    SamHeaderPG* newPG = new SamHeaderPG();
    // NOTE: a plain new-expression reports failure by throwing, so this
    // NULL branch is defensive only.
    if(newPG == NULL)
    {
        std::cerr << "Failed to create a copy of an PG Header Record\n" ;
        return(NULL);
    }
    internalCopy(*newPG);

    return(newPG);
}
libStatGen-1.0.14/bam/SamHeaderPG.h000066400000000000000000000027011254730101300166150ustar00rootroot00000000000000/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAMHEADERPG_H__ #define __SAMHEADERPG_H__ #include #include "SamHeaderRecord.h" class SamHeaderPG : public SamHeaderRecord { public: // Constructor SamHeaderPG(); // Destructor virtual ~SamHeaderPG(); /// Return a pointer to a newly created header record of the appropriate type /// that is a copy of this record. The newly created record will not be /// deleted by this class and it is the responsibility of the calling method /// to handle the deletion. /// Returns NULL on failure to copy. virtual SamHeaderRecord* createCopy() const; private: SamHeaderPG(const SamHeaderPG& samHeaderPG); SamHeaderPG& operator=(const SamHeaderPG& samHeaderPG); }; #endif libStatGen-1.0.14/bam/SamHeaderRG.cpp000066400000000000000000000024401254730101300171520ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamHeaderRG.h" // Constructor SamHeaderRG::SamHeaderRG() { // Add required tags for this type. 
myType = SamHeaderRecord::RG; myTypeString = "RG"; addRequiredTag("ID"); addRequiredTag("SM"); myKeyTag = "ID"; } // Destructor SamHeaderRG::~SamHeaderRG() { } SamHeaderRecord* SamHeaderRG::createCopy() const { SamHeaderRG* newRG = new SamHeaderRG(); if(newRG == NULL) { std::cerr << "Failed to create a copy of an RG Header Record\n" ; return(NULL); } internalCopy(*newRG); return(newRG); } libStatGen-1.0.14/bam/SamHeaderRG.h000066400000000000000000000026571254730101300166310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAMHEADERRG_H__ #define __SAMHEADERRG_H__ #include "SamHeaderRecord.h" class SamHeaderRG : public SamHeaderRecord { public: // Constructor SamHeaderRG(); // Destructor virtual ~SamHeaderRG(); /// Return a pointer to a newly created header record of the appropriate type /// that is a copy of this record. The newly created record will not be /// deleted by this class and it is the responsibility of the calling method /// to handle the deletion. /// Returns NULL on failure to copy. 
virtual SamHeaderRecord* createCopy() const; private: SamHeaderRG(const SamHeaderRG& samHeaderRG); SamHeaderRG& operator=(const SamHeaderRG& samHeaderRG); }; #endif libStatGen-1.0.14/bam/SamHeaderRecord.cpp000066400000000000000000000235501254730101300200650ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamHeaderRecord.h" // Constructor SamHeaderRecord::SamHeaderRecord() : myTagHash(), myTags(), myNumActiveTags(0) { } // Destructor SamHeaderRecord::~SamHeaderRecord() { reset(); } // Set the fields from the passed in line. // Return true if successfully set. bool SamHeaderRecord::setFields(const StringArray& tokens) { bool status = true; // Loop through the tags for this type. // The tags start in column 1 since column 0 contains the type. for(int columnIndex = 1; columnIndex < tokens.Length(); columnIndex++) { // Validate that the tag is at least 3 characters. Two for the token, // one for the ':'. if((tokens[columnIndex].Length() < 3) || (tokens[columnIndex][2] != ':')) { // Continue to the next tag, this one is too small/invalid. status = false; std::cerr << "ERROR: Poorly formatted tag in header: " << tokens[columnIndex] << std::endl; continue; } // Get the tag from the token. char tag[3]; tag[0] = tokens[columnIndex][0]; tag[1] = tokens[columnIndex][1]; tag[2] = 0; // The tag value is the rest of the substring. 
String tagValue = (tokens[columnIndex]).SubStr(3); // Set the tag. status &= setTag(tag, tagValue.c_str()); } status &= isValid(); return(status); } // Check to see if the record is valid. bool SamHeaderRecord::isValid() { bool status = true; // Check that the required tags are set. If they aren't, return false. for(unsigned int reqIndex = 0; reqIndex < myRequiredTags.size(); reqIndex++) { // Check to see if the required tag at this index exists and has // a value. int index = myTagHash.Integer(myRequiredTags[reqIndex].c_str()); if((index < 0) || !(myTags[index]->hasValue())) { // Did not find the tag, stet status to false. std::cerr << "ERROR: Missing required tag: " << myRequiredTags[reqIndex] << "." << std::endl; status = false; } } return(status); } // Return the value associated with the specified tag. const char* SamHeaderRecord::getTagValue(const char* tag) const { // Look up the tag in myTags. int index = myTagHash.Integer(tag); if(index < 0) { // The tag was not found in the hash, so return "". return(""); } // The tag was found in the hash, so return the tag value found at the // index associated with the tag. return(myTags[index]->getValue()); } // Set the value of the specified tag to the specified value. // Set value to NULL in order to delete the tag. // Returns whether or not it was successful. bool SamHeaderRecord::setTag(const char* tag, const char* value) { // Lookup the tag in the hash. int vectorIndex = myTagHash.Integer(tag); if(vectorIndex < 0) { // The tag was not found in the hash, so create a new one. SamHeaderTag* tagPtr = new SamHeaderTag(tag, value); if(tagPtr == NULL) { // Failed to allocate the tag, return false. std::cerr << "Failed to allocate space (new) for a SamHeaderTag.\n"; return(false); } // Add the new tag to the back of the tag values. vectorIndex = myTags.size(); myTags.push_back(tagPtr); // If the value is not null, increment the number of active tags. if(value[0] != 0) { ++myNumActiveTags; } // Add the tag to the hash. 
int hashIndex = myTagHash.Add(tag, vectorIndex); if((myTagHash.Integer(hashIndex) != vectorIndex) || (myTagHash[hashIndex] != tag)) { // Failed to add the tag, so return false. std::cerr << "Failed to add tag, " << tag << ", to the hash." << std::endl; return(false); } return(true); } else if((unsigned int)vectorIndex < myTags.size()) { // Found the tag in the hash. So, update the tag if it // is not the key. if(myKeyTag != tag) { // Not the key, so update the tag. // If the new value is null and the old one is not, decrement the // number of active tags. if((value[0] == 0) && ((myTags[vectorIndex]->getValue())[0] != 0)) { // Tag was deleted since the new value is blank but the old // value was not. --myNumActiveTags; } else if((value[0] != 0) && ((myTags[vectorIndex]->getValue())[0] == 0)) { // Tag was added since the old value was blank and the new value // is not. ++myNumActiveTags; } // Just modifying a tag, so this does not affect the number // of active tags. return(myTags[vectorIndex]->setValue(value)); } else if(strcmp(value, myTags[vectorIndex]->getValue()) == 0) { // The new key value is the same as the previous value, so // it is not a change, return true. return(true); } else { // Can't modify the key tag's value since that will // screw up the hash. std::cerr << "Can't modify the key tag, " << tag << " from " << myTags[vectorIndex]->getValue() << " to " << value << std::endl; return(false); } } // Got an invalid index from the hash. This is not supposed to happen. // so return false. std::cerr << "Invalid tag index found: " << vectorIndex << ", but max index is " << myTags.size() << " for tag: " << tag << std::endl; return(false); } // Reset this header record to an empty state. void SamHeaderRecord::reset() { // Delete the tag hash. myTagHash.Clear(); // Loop through deleting all the tags in the vector. 
for(unsigned int vectorIndex = 0; vectorIndex < myTags.size(); vectorIndex++) { delete myTags[vectorIndex]; myTags[vectorIndex] = NULL; } // Clear the tag vector. myTags.clear(); myNumActiveTags = 0; } // Appends the string representation of this header record // to the passed in string. bool SamHeaderRecord::appendString(std::string& header) { // Track whether or not the header type has been written. // Only write the header type if at least one of the tags has // an associated value. bool writtenHeader = false; if(isActiveHeaderRecord() && isValid()) { // Loop through all the entries in the tag vector. for(unsigned int vectorIndex = 0; vectorIndex < myTags.size(); vectorIndex++) { if(!writtenHeader && (myTags[vectorIndex]->hasValue())) { // The tag has a value and the header type has not yet been written, // so write it. header += "@"; header += myTypeString; writtenHeader = true; } myTags[vectorIndex]->getTagString(header); } // If a header has been written, add a new line character. if(writtenHeader) { header += "\n"; return(true); } } // Nothing was written, return false. return(false); } // Add the key tag with the specified value. bool SamHeaderRecord::addKey(const char* value) { if(myKeyTag.size() == 0) { return(false); } return(setTag(myKeyTag.data(), value)); } // Return the value associated with the specified tag. const char* SamHeaderRecord::getKeyValue() const { // Look up the tag in myTags. int index = myTagHash.Integer(myKeyTag.c_str()); if(index < 0) { // The tag was not found in the hash, so return "". return(""); } // The tag was found in the hash, so return the tag value found at the // index associated with the tag. return(myTags[index]->getValue()); } // This header is active if there is at least one tag set. bool SamHeaderRecord::isActiveHeaderRecord() { return(myNumActiveTags != 0); } // Return the type of this header record. 
const char* SamHeaderRecord::getTypeString() { return(myTypeString.c_str()); } // Return the type of this header record. SamHeaderRecord::SamHeaderRecordType SamHeaderRecord::getType() { return(myType); } void SamHeaderRecord::addRequiredTag(const char* requiredTag) { myRequiredTags.push_back(requiredTag); } void SamHeaderRecord::internalCopy(SamHeaderRecord& newRec) const { newRec.myTagHash = myTagHash; newRec.myTags.clear(); // Loop through copying the tags. for(unsigned int vectorIndex = 0; vectorIndex < myTags.size(); vectorIndex++) { if(myTags[vectorIndex] != NULL) { newRec.myTags.push_back(new SamHeaderTag(*(myTags[vectorIndex]))); } } newRec.myRequiredTags = myRequiredTags; newRec.myNumActiveTags = myNumActiveTags; } libStatGen-1.0.14/bam/SamHeaderRecord.h000066400000000000000000000077251254730101300175400ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAMHEADER_RECORD_H__ #define __SAMHEADER_RECORD_H__ #include "StringArray.h" #include "StringHash.h" #include "SamHeaderTag.h" /// This class encapsulates the tag value pairs contained with a SAM Header /// line with accessors for getting and setting the tags within this header. class SamHeaderRecord { public: /// Specifies the Type for the sam header record (line). 
enum SamHeaderRecordType { HD, ///< Header SQ, ///< Sequence Dictionary RG, ///< Read Group PG ///< Program }; /// Constructor SamHeaderRecord(); /// Destructor virtual ~SamHeaderRecord(); /// Return a pointer to a newly created header record of the appropriate type /// that is a copy of this record. The newly created record will not be /// deleted by this class and it is the responsibility of the calling method /// to handle the deletion. /// Returns NULL on failure to copy. virtual SamHeaderRecord* createCopy() const = 0; /// Set the fields from the passed in line. /// Return true if successfully set. bool setFields(const StringArray& tokens); /// Check to see if the record is valid. bool isValid(); /// Return the value associated with the specified tag. Returns "" if it /// is not set. const char* getTagValue(const char* tag) const; /// Set the value of the specified tag to the specified value, deletes /// the tag when value is NULL. /// Returns whether or not it was successful, fails if tag is the key tag /// and the key tag already exists. bool setTag(const char* tag, const char* value); /// Reset this header record to an empty state with no tags. void reset(); /// Appends the string representation of this header record /// to the passed in string. bool appendString(std::string& header); /// Add the key tag with the specified value (not for HD headers). bool addKey(const char* value); /// Get the value associated with the key tag. Returns "" if it is not set. const char* getKeyValue() const; /// This record is active (true) if there is at least one tag set. bool isActiveHeaderRecord(); /// Return the type of this header record (HD, SQ, RG, or PG) as a string. const char* getTypeString(); /// Return the type of this header record (HD, SQ, RG, or PG) as an enum. SamHeaderRecordType getType(); protected: void addRequiredTag(const char* requiredTag); // Copy this record into the specified new one. 
virtual void internalCopy(SamHeaderRecord& newRec) const; // The type for this header record. std::string myTypeString; // The type for this header record. SamHeaderRecordType myType; // The TAG name that is the key for this record // Only applicable if more than one of this type // of record is allowed. std::string myKeyTag; private: SamHeaderRecord(const SamHeaderRecord& samHeaderRecord); SamHeaderRecord& operator=(const SamHeaderRecord& samHeaderRecord); // hash from tag name to index into the tag values vector. StringIntHash myTagHash; std::vector myTags; // The tags that are required for this record. std::vector myRequiredTags; int myNumActiveTags; }; #endif libStatGen-1.0.14/bam/SamHeaderSQ.cpp000066400000000000000000000024401254730101300171650ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamHeaderSQ.h" // Constructor SamHeaderSQ::SamHeaderSQ() { // Add required tags for this type. 
myType = SamHeaderRecord::SQ;
    myTypeString = "SQ";
    // "SN" and "LN" must both be present; "SN" is the key tag.
    addRequiredTag("SN");
    addRequiredTag("LN");
    myKeyTag = "SN";
}


// Destructor
SamHeaderSQ::~SamHeaderSQ()
{
}


// Create a new SamHeaderSQ and copy this record's contents into it via
// internalCopy.  The caller owns the returned pointer.
SamHeaderRecord* SamHeaderSQ::createCopy() const
{
    SamHeaderSQ* newSQ = new SamHeaderSQ();
    // NOTE: a plain new-expression reports failure by throwing, so this
    // NULL branch is defensive only.
    if(newSQ == NULL)
    {
        std::cerr << "Failed to create a copy of an SQ Header Record\n" ;
        return(NULL);
    }
    internalCopy(*newSQ);

    return(newSQ);
}
libStatGen-1.0.14/bam/SamHeaderSQ.h000066400000000000000000000026571254730101300166440ustar00rootroot00000000000000/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __SAMHEADERSQ_H__
#define __SAMHEADERSQ_H__

#include "SamHeaderRecord.h"

/// Header record of type SQ.
class SamHeaderSQ : public SamHeaderRecord
{
public:
    // Constructor
    SamHeaderSQ();

    // Destructor
    virtual ~SamHeaderSQ();

    /// Return a pointer to a newly created header record of the appropriate type
    /// that is a copy of this record. The newly created record will not be
    /// deleted by this class and it is the responsibility of the calling method
    /// to handle the deletion.
    /// Returns NULL on failure to copy.
    virtual SamHeaderRecord* createCopy() const;

private:
    // Copy construction and assignment are declared private (not available
    // to users of the class); use createCopy() instead.
    SamHeaderSQ(const SamHeaderSQ& samHeaderSQ);
    SamHeaderSQ& operator=(const SamHeaderSQ& samHeaderSQ);
};

#endif
libStatGen-1.0.14/bam/SamHeaderTag.cpp000066400000000000000000000041731254730101300173620ustar00rootroot00000000000000/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "SamHeaderTag.h"


// Construct a tag holding the passed in tag name and value.
SamHeaderTag::SamHeaderTag(const char* tag, const char* value)
{
    setTag(tag, value);
}


// Copy constructor: duplicates the tag name and value of oldTag.
SamHeaderTag::SamHeaderTag(const SamHeaderTag& oldTag)
{
    setTag(oldTag.myTag.c_str(), oldTag.myValue.c_str());
}


SamHeaderTag::~SamHeaderTag()
{
}


// Add this tag to the passed in tag string.
// The text appended is a tab, the tag name, ':' and the value.
// Returns false (appending nothing) when the value is blank.
// NOTE: does not clear tagString.
bool SamHeaderTag::getTagString(std::string& tagString)
{
    if(myValue.length() != 0)
    {
        // There is a value associated with this tag, so add it to the string.
        tagString += "\t";
        tagString += myTag;
        tagString += ":";
        tagString += myValue;
        return(true);
    }
    // This tag has no associated value, return false.
    return(false);
}


// Set this tag to the passed in tag and value.
// Both pointers are assigned directly to std::string members, so they must
// not be NULL.  Always returns true.
bool SamHeaderTag::setTag(const char* tag, const char* value)
{
    myTag = tag;
    myValue = value;
    return(true);
}


// Set the value associated with this tag to the passed in value.
// value is assigned directly to a std::string member, so it must not be
// NULL.  Always returns true.
bool SamHeaderTag::setValue(const char* value)
{
    myValue = value;
    return(true);
}


// Return the tag for this tag.
const char* SamHeaderTag::getTag() { return(myTag.c_str()); } // Return the value associated with this tag. const char* SamHeaderTag::getValue() { return(myValue.c_str()); } // Return true if there is a non-blank value associated with this tag. bool SamHeaderTag::hasValue() { return(myValue.size() != 0); } libStatGen-1.0.14/bam/SamHeaderTag.h000066400000000000000000000033651254730101300170310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAMHEADER_TAG_H__ #define __SAMHEADER_TAG_H__ #include class SamHeaderTag { public: SamHeaderTag(const char* tag, const char* value); SamHeaderTag(const SamHeaderTag&); ~SamHeaderTag(); // Add this tag to the passed in tag string. // If the tag value is blank, the tag will not be added to the // passed in string. // NOTE: does not clear tagString. bool getTagString(std::string& tagString); // Set this tag to the passed in tag and value. bool setTag(const char* tag, const char* value); // Set the value associated with this tag to the passed in value. bool setValue(const char* value); // Return the tag for this tag. const char* getTag(); // Return the value associated with this tag. const char* getValue(); // Return true if there is a non-blank value associated with this tag. 
bool hasValue(); private: SamHeaderTag(); SamHeaderTag& operator=(const SamHeaderTag&); std::string myTag; std::string myValue; }; #endif libStatGen-1.0.14/bam/SamHelper.h000066400000000000000000000023751254730101300164240ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_HELPER_H__ #define __SAM_HELPER_H__ #include #ifdef DUPLICATE #undef DUPLICATE #endif /// Class for extracting information from a SAM Flag. class SamHelper { public: /// Helper method that combines the chromosome ID and position into a /// 64bit number by shifting the chromosome ID to the upper bits. static inline uint64_t combineChromPos(int32_t chromID, int32_t position) { return(((uint64_t)chromID << 32) | (position & 0xFFFFFFFF)); } private: SamHelper(); }; #endif libStatGen-1.0.14/bam/SamInterface.cpp000066400000000000000000000305411254730101300174340ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamInterface.h" #include "SamRecordHelper.h" #include #include SamInterface::SamInterface() { } SamInterface::~SamInterface() { } // Read a SAM file's header. bool SamInterface::readHeader(IFILE filePtr, SamFileHeader& header, SamStatus& status) { if(filePtr == NULL) { // File is not open. status.setStatus(SamStatus::FAIL_ORDER, "Cannot read header since the file pointer is null"); return(false); } // Clear the passed in header. header.resetHeader(); int numValid = 0; int numInvalid = 0; std::string errorMessages = ""; do { StringIntHash tags; StringArray values; buffer.ReadLine(filePtr); // Stop reading header lines if at the end of the file or // if the line is not blank and does not start with an @. if ( ifeof(filePtr) || ((buffer.Length() != 0) && (buffer[0] != '@')) ) { break; } // This is a header line, so add it to header. if(header.addHeaderLine(buffer.c_str())) { if(buffer.Length() != 0) { ++numValid; } } else { ++numInvalid; // Failed reading the header. errorMessages += header.getErrorMessage(); // Skip further processing on this line since it was an error. continue; } } while (1); // Store the first record since it was read. myFirstRecord = buffer; if(numInvalid > 0) { if(numValid == 0) { std::cerr << "Failed to parse " << numInvalid << " header lines"; std::cerr << ". No valid header lines.\n"; status.setStatus(SamStatus::FAIL_PARSE, errorMessages.c_str()); return(false); } } // Successfully read. return(true); } bool SamInterface::writeHeader(IFILE filePtr, SamFileHeader& header, SamStatus& status) { if((filePtr == NULL) || (filePtr->isOpen() == false)) { // File is not open, return failure. 
status.setStatus(SamStatus::FAIL_ORDER, "Cannot write header since the file pointer is null"); return(false); } //////////////////////////////// // Write the header to the file. //////////////////////////////// // Construct a string containing the entire header. std::string headerString = ""; header.getHeaderString(headerString); int32_t headerLen = headerString.length(); int numWrite = 0; // Write the header to the file. numWrite = ifwrite(filePtr, headerString.c_str(), headerLen); if(numWrite != headerLen) { status.setStatus(SamStatus::FAIL_IO, "Failed to write the SAM header."); return(false); } return(true); } void SamInterface::readRecord(IFILE filePtr, SamFileHeader& header, SamRecord& record, SamStatus& samStatus) { // Initialize the status to success - will be set to false on failure. samStatus = SamStatus::SUCCESS; if((filePtr == NULL) || (filePtr->isOpen() == false)) { // File is not open. samStatus.addError(SamStatus::FAIL_ORDER, "filePtr does not point to an open file."); return; } // If the first record has been set, use that and clear it, // otherwise read the record from the file. if(myFirstRecord.Length() != 0) { buffer = myFirstRecord; myFirstRecord.Clear(); } else { // Read the next record. buffer.Clear(); buffer.ReadLine(filePtr); // If the end of the file and nothing was read, return false. if ((ifeof(filePtr)) && (buffer.Length() == 0)) { // end of the file and nothing to process. samStatus.addError(SamStatus::NO_MORE_RECS, "No more records in the file."); return; } } tokens.ReplaceColumns(buffer, '\t'); // Error string for reporting a parsing failure. String errorString = ""; if (tokens.Length() < 11) { errorString = "Too few columns ("; errorString += tokens.Length(); errorString += ") in the Record, expected at least 11."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); return; } // Reset the record before setting any fields. 
record.resetRecord(); if(!record.setReadName(tokens[0])) { samStatus.addError(record.getStatus()); } long flagInt = 0; if(!tokens[1].AsInteger(flagInt)) { errorString = "flag, "; errorString += tokens[1].c_str(); errorString += ", is not an integer."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if((flagInt < 0) || (flagInt > UINT16_MAX)) { errorString = "flag, "; errorString += tokens[1].c_str(); errorString += ", is not between 0 and (2^16)-1 = 65535."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if(!record.setFlag(flagInt)) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } if(!record.setReferenceName(header, tokens[2])) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } long posInt = 0; if(!tokens[3].AsInteger(posInt)) { errorString = "position, "; errorString += tokens[3].c_str(); errorString += ", is not an integer."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if((posInt < INT32_MIN) || (posInt > INT32_MAX)) { // If it is not in this range, it cannot fit into a 32 bit int. 
errorString = "position, "; errorString += tokens[3].c_str(); errorString += ", does not fit in a 32 bit signed int."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if(!record.set1BasedPosition(posInt)) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } long mapInt = 0; if(!tokens[4].AsInteger(mapInt)) { errorString = "map quality, "; errorString += tokens[4].c_str(); errorString += ", is not an integer."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if((mapInt < 0) || (mapInt > UINT8_MAX)) { errorString = "map quality, "; errorString += tokens[4].c_str(); errorString += ", is not between 0 and (2^8)-1 = 255."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if(!record.setMapQuality(mapInt)) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } if(!record.setCigar(tokens[5])) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } if(!record.setMateReferenceName(header, tokens[6])) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } long matePosInt = 0; if(!tokens[7].AsInteger(matePosInt)) { errorString = "mate position, "; errorString += tokens[7].c_str(); errorString += ", is not an integer."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if(!record.set1BasedMatePosition(matePosInt)) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } long insertInt = 0; if(!tokens[8].AsInteger(insertInt)) { errorString = "insert size, "; errorString += tokens[8].c_str(); errorString += ", is not an integer."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); } else if(!record.setInsertSize(insertInt)) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } if(!record.setSequence(tokens[9])) { 
samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } if(!record.setQuality(tokens[10])) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } // Clear the tag fields. record.clearTags(); // Add the tags to the record. for (int i = 11; i < tokens.Length(); i++) { String & nugget = tokens[i]; if (nugget.Length() < 6 || nugget[2] != ':' || nugget[4] != ':') { // invalid tag format. errorString = "Invalid Tag Format: "; errorString += nugget.c_str(); errorString += ", should be cc:c:x*."; samStatus.addError(SamStatus::FAIL_PARSE, errorString.c_str()); continue; } // Valid tag format. // Add the tag. if(!record.addTag((const char *)nugget, nugget[3], (const char *)nugget + 5)) { samStatus.addError(record.getStatus().getStatus(), record.getStatus().getStatusMessage()); } } return; } SamStatus::Status SamInterface::writeRecord(IFILE filePtr, SamFileHeader& header, SamRecord& record, SamRecord::SequenceTranslation translation) { // Store all the fields into a string, then write the string. String recordString = record.getReadName(); recordString += "\t"; recordString += record.getFlag(); recordString += "\t"; recordString += record.getReferenceName(); recordString += "\t"; recordString += record.get1BasedPosition(); recordString += "\t"; recordString += record.getMapQuality(); recordString += "\t"; recordString += record.getCigar(); recordString += "\t"; recordString += record.getMateReferenceNameOrEqual(); recordString += "\t"; recordString += record.get1BasedMatePosition(); recordString += "\t"; recordString += record.getInsertSize(); recordString += "\t"; recordString += record.getSequence(translation); recordString += "\t"; recordString += record.getQuality(); // If there are any tags, add a preceding tab. if(record.getTagLength() != 0) { recordString += "\t"; SamRecordHelper::genSamTagsString(record, recordString); } recordString += "\n"; // Write the record. 
ifwrite(filePtr, recordString.c_str(), recordString.Length()); return(SamStatus::SUCCESS); } void SamInterface::ParseHeaderLine(StringIntHash & tags, StringArray & values) { tags.Clear(); values.Clear(); tokens.AddColumns(buffer, '\t'); for (int i = 1; i < tokens.Length(); i++) { tags.Add(tokens[i].Left(2), i - 1); values.Push(tokens[i].SubStr(3)); } } libStatGen-1.0.14/bam/SamInterface.h000066400000000000000000000044161254730101300171030ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_INTERFACE_H__ #define __SAM_INTERFACE_H__ #include "GenericSamInterface.h" class SamInterface : public GenericSamInterface { public: SamInterface(); ~SamInterface(); // Reads the header section from the specified SAM file and stores it in // the passed in header. virtual bool readHeader(IFILE filePtr, SamFileHeader& header, SamStatus& status); // Writes the specified header into the specified SAM file. virtual bool writeHeader(IFILE filePtr, SamFileHeader& header, SamStatus& status); // Reads the next record from the specified SAM file and stores it in // the passed in record. virtual void readRecord(IFILE filePtr, SamFileHeader& header, SamRecord& record, SamStatus& samStatus); // Writes the specified record into the specified SAM file. 
virtual SamStatus::Status writeRecord(IFILE filePtr, SamFileHeader& header, SamRecord& record, SamRecord::SequenceTranslation translation); private: void ParseHeaderLine(StringIntHash & tags, StringArray & values); String buffer; StringArray tokens; // Store the first record as it is read when trying to read the // header so it can be returned when a record is read. // Clear after it has been processed. String myFirstRecord; }; #endif libStatGen-1.0.14/bam/SamQuerySeqWithRefHelper.cpp000066400000000000000000000245621254730101300217510ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "SamQuerySeqWithRefHelper.h" #include "BaseUtilities.h" #include "SamFlag.h" SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record, GenomeSequence& refSequence, bool forward) : myRecord(record), myRefSequence(refSequence), myCigar(NULL), myStartOfReadOnRefIndex(INVALID_GENOME_INDEX), myQueryIndex(0), myForward(forward) { myCigar = myRecord.getCigarInfo(); myStartOfReadOnRefIndex = refSequence.getGenomePosition(myRecord.getReferenceName()); if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX) { // This reference name was found in the reference file, so // add the start position. 
myStartOfReadOnRefIndex += myRecord.get0BasedPosition(); } if(!forward) { myQueryIndex = myRecord.getReadLength() - 1; } } SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter() { } bool SamQuerySeqWithRefIter::reset(bool forward) { myCigar = myRecord.getCigarInfo(); if(myCigar == NULL) { // Failed to get Cigar. return(false); } // Get where the position of where this read starts as mapped to the // reference. myStartOfReadOnRefIndex = myRefSequence.getGenomePosition(myRecord.getReferenceName()); if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX) { // This reference name was found in the reference file, so // add the start position. myStartOfReadOnRefIndex += myRecord.get0BasedPosition(); } myForward = forward; if(myForward) { myQueryIndex = 0; } else { // reverse, so start at the last entry. myQueryIndex = myRecord.getReadLength() - 1; } return(true); } // Returns information for the next position where the query and the // reference match or mismatch. To be a match or mismatch, both the query // and reference must have a base that is not 'N'. // This means: // insertions and deletions are not mismatches or matches. // 'N' bases are not matches or mismatches // Returns true if an entry was found, false if there are no more matches or // mismatches. bool SamQuerySeqWithRefIter::getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo) { // Check whether or not this read is mapped. // If the read is not mapped, return no matches. if(!SamFlag::isMapped(myRecord.getFlag())) { // Not mapped. return(false); } // Check that the Cigar is set. if(myCigar == NULL) { // Error. throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar")); return(false); } // If myStartOfReadOnRefIndex is the default (unset) value, then // the reference was not found, so return false, no matches/mismatches. 
if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX) { // This reference name was not found in the reference file, so just // return no matches/mismatches. return(false); } // Repull the read length from the record to check just in case the // record has changed length. // Loop until a match or mismatch is found as long as query index // is still within the read (Loop is broken by a return). while((myQueryIndex < myRecord.getReadLength()) && (myQueryIndex >= 0)) { // Still more bases, look for a match/mismatch. // Get the reference offset for this read position. int32_t refOffset = myCigar->getRefOffset(myQueryIndex); if(refOffset == Cigar::INDEX_NA) { // This is either a softclip or an insertion // which do not count as a match or a mismatch, so // go to the next index. nextIndex(); continue; } // Both the reference and the read have a base, so get the bases. char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE); char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset]; // If either the read or the reference bases are unknown, then // it does not count as a match or a mismatch. if(BaseUtilities::isAmbiguous(readBase) || BaseUtilities::isAmbiguous(refBase)) { // Either the reference base or the read base are unknown, // so skip this position. nextIndex(); continue; } // Both the read & the reference have a known base, so it is either // a match or a mismatch. matchMismatchInfo.setQueryIndex(myQueryIndex); // Check if they are equal. if(BaseUtilities::areEqual(readBase, refBase)) { // Match. matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH); // Increment the query index to the next position. nextIndex(); return(true); } else { // Mismatch matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH); // Increment the query index to the next position. nextIndex(); return(true); } } // No matches or mismatches were found, so return false. 
return(false); } void SamQuerySeqWithRefIter::nextIndex() { if(myForward) { ++myQueryIndex; } else { --myQueryIndex; } } SamSingleBaseMatchInfo::SamSingleBaseMatchInfo() : myType(UNKNOWN), myQueryIndex(0) { } SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo() { } SamSingleBaseMatchInfo::Type SamSingleBaseMatchInfo::getType() { return(myType); } int32_t SamSingleBaseMatchInfo::getQueryIndex() { return(myQueryIndex); } void SamSingleBaseMatchInfo::setType(Type newType) { myType = newType; } void SamSingleBaseMatchInfo::setQueryIndex(int32_t queryIndex) { myQueryIndex = queryIndex; } /////////////////////////////////////////////////////////////////////////// void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq, int32_t seq0BasedPos, Cigar& cigar, const char* referenceName, const GenomeSequence& refSequence, std::string& updatedSeq) { updatedSeq = currentSeq; int32_t seqLength = updatedSeq.length(); int32_t queryIndex = 0; uint32_t startOfReadOnRefIndex = refSequence.getGenomePosition(referenceName); if(startOfReadOnRefIndex == INVALID_GENOME_INDEX) { // This reference name was not found in the reference file, so just // return. return; } startOfReadOnRefIndex += seq0BasedPos; // Loop until the entire sequence has been updated. while(queryIndex < seqLength) { // Still more bases, look for matches. // Get the reference offset for this read position. int32_t refOffset = cigar.getRefOffset(queryIndex); if(refOffset != Cigar::INDEX_NA) { // Both the reference and the read have a base, so get the bases. char readBase = currentSeq[queryIndex]; char refBase = refSequence[startOfReadOnRefIndex + refOffset]; // If neither base is unknown and they are the same, count it // as a match. if(!BaseUtilities::isAmbiguous(readBase) && !BaseUtilities::isAmbiguous(refBase) && (BaseUtilities::areEqual(readBase, refBase))) { // Match. updatedSeq[queryIndex] = '='; } } // Increment the query index to the next position. 
++queryIndex; continue; } } void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq, int32_t seq0BasedPos, Cigar& cigar, const char* referenceName, const GenomeSequence& refSequence, std::string& updatedSeq) { updatedSeq = currentSeq; int32_t seqLength = updatedSeq.length(); int32_t queryIndex = 0; uint32_t startOfReadOnRefIndex = refSequence.getGenomePosition(referenceName); if(startOfReadOnRefIndex == INVALID_GENOME_INDEX) { // This reference name was not found in the reference file, so just // return. return; } startOfReadOnRefIndex += seq0BasedPos; // Loop until the entire sequence has been updated. while(queryIndex < seqLength) { // Still more bases, look for matches. // Get the reference offset for this read position. int32_t refOffset = cigar.getRefOffset(queryIndex); if(refOffset != Cigar::INDEX_NA) { // Both the reference and the read have a base, so get the bases. char readBase = currentSeq[queryIndex]; char refBase = refSequence[startOfReadOnRefIndex + refOffset]; // If the bases are equal, set the sequence to the reference // base. (Skips the check for ambiguous to catch a case where // ambiguous had been converted to a '=', and if both are ambiguous, // it will still be set to ambiguous.) if(BaseUtilities::areEqual(readBase, refBase)) { // Match. updatedSeq[queryIndex] = refBase; } } // Increment the query index to the next position. ++queryIndex; continue; } } libStatGen-1.0.14/bam/SamQuerySeqWithRefHelper.h000066400000000000000000000126741254730101300214170ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_QUERY_SEQ_WITH_REF_HELPER_H__ #define __SAM_QUERY_SEQ_WITH_REF_HELPER_H__ #include #include "SamRecord.h" #include "GenomeSequence.h" /// This class contains the match/mismatch information /// between the reference and a read for a single base. class SamSingleBaseMatchInfo { public: /// More types can be added later as needed. enum Type {UNKNOWN, MATCH, MISMATCH}; SamSingleBaseMatchInfo(); ~SamSingleBaseMatchInfo(); /// Get the type (match/mismatch/unknown) for this object. Type getType(); /// Get the query index for this object. int32_t getQueryIndex(); /// Set the type (match/mismatch/unkown) for this object. void setType(Type newType); /// Set the query index for this object. void setQueryIndex(int32_t queryIndex); private: Type myType; int32_t myQueryIndex; }; /// Iterates through the query and compare with reference. /// NOTE: References to the GenomeSequence and SamRecord are stored, the objects /// are not copied, so they must remain valid as long as this class is used. class SamQuerySeqWithRefIter { public: SamQuerySeqWithRefIter(SamRecord& record, GenomeSequence& refSequence, bool forward = true); virtual ~SamQuerySeqWithRefIter(); /// Reset to start at the beginning of the record. /// This will re-read values from SamRecord, so can be used if it has /// changed to contain information for a new record. /// \param forward true means to start from the beginning and go to the end; /// false means to start from the end and go to the beginning. /// \return true if successfully reset; false if failed to read the Cigar. 
bool reset(bool forward = true); /// Returns information for the next position where the query and the /// reference match or mismatch. To be a match or mismatch, both the query /// and reference must have a base that is not 'N'. /// This means: /// insertions and deletions are not mismatches or matches. /// 'N' bases are not matches or mismatches /// \param matchMismatchInfo return parameter with the information about /// the matching/mismatching base. /// \return true if there was another match/mismatch /// (matchMismatchInfo was set); false if not. bool getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo); private: SamQuerySeqWithRefIter(); void nextIndex(); SamRecord& myRecord; GenomeSequence& myRefSequence; Cigar* myCigar; uint32_t myStartOfReadOnRefIndex; int32_t myQueryIndex; bool myForward; }; /// Contains methods for converting between the query sequence and reference. class SamQuerySeqWithRef { public: /// Gets the sequence with '=' in any position where the sequence matches /// the reference. /// NOTE: 'N' in both the sequence and the reference is not considered a /// match. /// \param currentSeq sequence that should be converted /// \param seq0BasedPos 0 based start position of currentSeq on the reference. /// \param cigar cigar string for currentSeq (used for determining how the sequence aligns to the reference) /// \param referenceName reference name associated with this sequence /// \param refSequence reference sequence object /// \param updatedSeq return parameter that this method sets to the /// current sequence, replacing any matches to the reference with '='. static void seqWithEquals(const char* currentSeq, int32_t seq0BasedPos, Cigar& cigar, const char* referenceName, const GenomeSequence& refSequence, std::string& updatedSeq); /// Gets the sequence converting '=' to the appropriate base using the /// reference. 
/// \param currentSeq sequence that should be converted /// \param seq0BasedPos 0 based start position of currentSeq on the reference. /// \param cigar cigar string for currentSeq (used for determining how the sequence aligns to the reference) /// \param referenceName reference name associated with this sequence /// \param refSequence reference sequence object /// \param updatedSeq return parameter that this method sets to the /// current sequence, replacing any '=' with the base from the reference. static void seqWithoutEquals(const char* currentSeq, int32_t seq0BasedPos, Cigar& cigar, const char* referenceName, const GenomeSequence& refSequence, std::string& updatedSeq); private: SamQuerySeqWithRef(); }; #endif libStatGen-1.0.14/bam/SamRecord.cpp000066400000000000000000003367001254730101300167600ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include "bam.h" #include "SamRecord.h" #include "SamValidation.h" #include "BaseUtilities.h" #include "SamQuerySeqWithRefHelper.h" const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN"; const char* SamRecord::FIELD_ABSENT_STRING = "="; int SamRecord::myNumWarns = 0; SamRecord::SamRecord() : myStatus(), myRefPtr(NULL), mySequenceTranslation(NONE) { int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t); myRecordPtr = (bamRecordStruct *) malloc(defaultAllocSize); myCigarTempBuffer = NULL; myCigarTempBufferAllocatedSize = 0; allocatedSize = defaultAllocSize; resetRecord(); } SamRecord::SamRecord(ErrorHandler::HandlingType errorHandlingType) : myStatus(errorHandlingType), myRefPtr(NULL), mySequenceTranslation(NONE) { int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t); myRecordPtr = (bamRecordStruct *) malloc(defaultAllocSize); myCigarTempBuffer = NULL; myCigarTempBufferAllocatedSize = 0; allocatedSize = defaultAllocSize; resetRecord(); } SamRecord::~SamRecord() { resetRecord(); if(myRecordPtr != NULL) { free(myRecordPtr); myRecordPtr = NULL; } if(myCigarTempBuffer != NULL) { free(myCigarTempBuffer); myCigarTempBuffer = NULL; myCigarTempBufferAllocatedSize = 0; } } // Resets the fields of the record to a default value. void SamRecord::resetRecord() { myIsBufferSynced = true; myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE; myRecordPtr->myReferenceID = -1; myRecordPtr->myPosition = -1; myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH; myRecordPtr->myMapQuality = 0; myRecordPtr->myBin = DEFAULT_BIN; myRecordPtr->myCigarLength = 0; myRecordPtr->myFlag = 0; myRecordPtr->myReadLength = 0; myRecordPtr->myMateReferenceID = -1; myRecordPtr->myMatePosition = -1; myRecordPtr->myInsertSize = 0; // Set the sam values for the variable length fields. // TODO - one way to speed this up might be to not set to "*" and just // clear them, and write out a '*' for SAM if it is empty. 
myReadName = DEFAULT_READ_NAME; myReferenceName = "*"; myMateReferenceName = "*"; myCigar = "*"; mySequence = "*"; mySeqWithEq.clear(); mySeqWithoutEq.clear(); myQuality = "*"; myNeedToSetTagsFromBuffer = false; myNeedToSetTagsInBuffer = false; // Initialize the calculated alignment info to the uncalculated value. myAlignmentLength = -1; myUnclippedStartOffset = -1; myUnclippedEndOffset = -1; clearTags(); // Set the bam values for the variable length fields. // Only the read name needs to be set, the others are a length of 0. // Set the read name. The min size of myRecordPtr includes the size for // the default read name. memcpy(&(myRecordPtr->myData), myReadName.c_str(), myRecordPtr->myReadNameLength); // Set that the variable length buffer fields are valid. myIsReadNameBufferValid = true; myIsCigarBufferValid = true; myPackedSequence = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int); myIsSequenceBufferValid = true; myBufferSequenceTranslation = NONE; myPackedQuality = myPackedSequence; myIsQualityBufferValid = true; myIsTagsBufferValid = true; myIsBinValid = true; myCigarTempBufferLength = -1; myStatus = SamStatus::SUCCESS; NOT_FOUND_TAG_STRING = ""; NOT_FOUND_TAG_INT = -1; // TODO - deprecate } // Returns whether or not the record is valid. // Header is needed to perform some validation against it. bool SamRecord::isValid(SamFileHeader& header) { myStatus = SamStatus::SUCCESS; SamValidationErrors invalidSamErrors; if(!SamValidator::isValid(header, *this, invalidSamErrors)) { // The record is not valid. std::string errorMessage = ""; invalidSamErrors.getErrorString(errorMessage); myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str()); return(false); } // The record is valid. return(true); } void SamRecord::setReference(GenomeSequence* reference) { myRefPtr = reference; } // Set the type of sequence translation to use when getting // the sequence. 
The default type (if this method is never called) is // NONE (the sequence is left as-is). This is used void SamRecord::setSequenceTranslation(SequenceTranslation translation) { mySequenceTranslation = translation; } bool SamRecord::setReadName(const char* readName) { myReadName = readName; myIsBufferSynced = false; myIsReadNameBufferValid = false; myStatus = SamStatus::SUCCESS; // The read name must at least have some length, otherwise this is a parsing // error. if(myReadName.Length() == 0) { // Invalid - reset ReadName return false. myReadName = DEFAULT_READ_NAME; myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH; myStatus.setStatus(SamStatus::INVALID, "0 length Query Name."); return(false); } return true; } bool SamRecord::setFlag(uint16_t flag) { myStatus = SamStatus::SUCCESS; myRecordPtr->myFlag = flag; return true; } bool SamRecord::setReferenceName(SamFileHeader& header, const char* referenceName) { myStatus = SamStatus::SUCCESS; myReferenceName = referenceName; // If the reference ID does not already exist, add it (pass true) myRecordPtr->myReferenceID = header.getReferenceID(referenceName, true); return true; } bool SamRecord::set1BasedPosition(int32_t position) { return(set0BasedPosition(position - 1)); } bool SamRecord::set0BasedPosition(int32_t position) { myStatus = SamStatus::SUCCESS; myRecordPtr->myPosition = position; myIsBinValid = false; return true; } bool SamRecord::setMapQuality(uint8_t mapQuality) { myStatus = SamStatus::SUCCESS; myRecordPtr->myMapQuality = mapQuality; return true; } bool SamRecord::setCigar(const char* cigar) { myStatus = SamStatus::SUCCESS; myCigar = cigar; myIsBufferSynced = false; myIsCigarBufferValid = false; myCigarTempBufferLength = -1; myIsBinValid = false; // Initialize the calculated alignment info to the uncalculated value. 
myAlignmentLength = -1; myUnclippedStartOffset = -1; myUnclippedEndOffset = -1; return true; } bool SamRecord::setCigar(const Cigar& cigar) { myStatus = SamStatus::SUCCESS; cigar.getCigarString(myCigar); myIsBufferSynced = false; myIsCigarBufferValid = false; myCigarTempBufferLength = -1; myIsBinValid = false; // Initialize the calculated alignment info to the uncalculated value. myAlignmentLength = -1; myUnclippedStartOffset = -1; myUnclippedEndOffset = -1; return true; } bool SamRecord::setMateReferenceName(SamFileHeader& header, const char* mateReferenceName) { myStatus = SamStatus::SUCCESS; // Set the mate reference, if it is "=", set it to be equal // to myReferenceName. This assumes that myReferenceName has already // been called. if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0) { myMateReferenceName = myReferenceName; } else { myMateReferenceName = mateReferenceName; } // Set the Mate Reference ID. // If the reference ID does not already exist, add it (pass true) myRecordPtr->myMateReferenceID = header.getReferenceID(myMateReferenceName, true); return true; } bool SamRecord::set1BasedMatePosition(int32_t matePosition) { return(set0BasedMatePosition(matePosition - 1)); } bool SamRecord::set0BasedMatePosition(int32_t matePosition) { myStatus = SamStatus::SUCCESS; myRecordPtr->myMatePosition = matePosition; return true; } bool SamRecord::setInsertSize(int32_t insertSize) { myStatus = SamStatus::SUCCESS; myRecordPtr->myInsertSize = insertSize; return true; } bool SamRecord::setSequence(const char* seq) { myStatus = SamStatus::SUCCESS; mySequence = seq; mySeqWithEq.clear(); mySeqWithoutEq.clear(); myIsBufferSynced = false; myIsSequenceBufferValid = false; return true; } bool SamRecord::setQuality(const char* quality) { myStatus = SamStatus::SUCCESS; myQuality = quality; myIsBufferSynced = false; myIsQualityBufferValid = false; return true; } //Shift indels to the left bool SamRecord::shiftIndelsLeft() { // Check to see whether or not the Cigar has already 
been // set - this is determined by checking if alignment length // is set since alignment length and the cigar are set // at the same time. if(myAlignmentLength == -1) { // Not been set, so calculate it. parseCigar(); } // Track whether or not there was a shift. bool shifted = false; // Cigar is set, so now myCigarRoller can be used. // Track where in the read we are. uint32_t currentPos = 0; // Since the loop starts at 1 because the first operation can't be shifted, // increment the currentPos past the first operation. if(Cigar::foundInQuery(myCigarRoller[0])) { // This op was found in the read, increment the current position. currentPos += myCigarRoller[0].count; } int numOps = myCigarRoller.size(); // Loop through the cigar operations from the 2nd operation since // the first operation is already on the end and can't shift. for(int currentOp = 1; currentOp < numOps; currentOp++) { if(myCigarRoller[currentOp].operation == Cigar::insert) { // For now, only shift a max of 1 operation. int prevOpIndex = currentOp-1; // Track the next op for seeing if it is the same as the // previous for merging reasons. int nextOpIndex = currentOp+1; if(nextOpIndex == numOps) { // There is no next op, so set it equal to the current one. nextOpIndex = currentOp; } // The start of the previous operation, so we know when we hit it // so we don't shift past it. uint32_t prevOpStart = currentPos - myCigarRoller[prevOpIndex].count; // We can only shift if the previous operation if(!Cigar::isMatchOrMismatch(myCigarRoller[prevOpIndex])) { // TODO - shift past pads // An insert is in the read, so increment the position. currentPos += myCigarRoller[currentOp].count; // Not a match/mismatch, so can't shift into it. continue; } // It is a match or mismatch, so check to see if we can // shift into it. // The end of the insert is calculated by adding the size // of this insert minus 1 to the start of the insert. 
uint32_t insertEndPos = currentPos + myCigarRoller[currentOp].count - 1; // The insert starts at the current position. uint32_t insertStartPos = currentPos; // Loop as long as the position before the insert start // matches the last character in the insert. If they match, // the insert can be shifted one index left because the // implied reference will not change. If they do not match, // we can't shift because the implied reference would change. // Stop loop when insertStartPos = prevOpStart, because we // don't want to move past that. while((insertStartPos > prevOpStart) && (getSequence(insertEndPos,BASES) == getSequence(insertStartPos - 1, BASES))) { // We can shift, so move the insert start & end one left. --insertEndPos; --insertStartPos; } // Determine if a shift has occurred. int shiftLen = currentPos - insertStartPos; if(shiftLen > 0) { // Shift occured, so adjust the cigar if the cigar will // not become more operations. // If the next operation is the same as the previous or // if the insert and the previous operation switch positions // then the cigar has the same number of operations. // If the next operation is different, and the shift splits // the previous operation in 2, then the cigar would // become longer, so we do not want to shift. if(myCigarRoller[nextOpIndex].operation == myCigarRoller[prevOpIndex].operation) { // The operations are the same, so merge them by adding // the length of the shift to the next operation. myCigarRoller.IncrementCount(nextOpIndex, shiftLen); myCigarRoller.IncrementCount(prevOpIndex, -shiftLen); // If the previous op length is 0, just remove that // operation. if(myCigarRoller[prevOpIndex].count == 0) { myCigarRoller.Remove(prevOpIndex); } shifted = true; } else { // Can only shift if the insert shifts past the // entire previous operation, otherwise an operation // would need to be added. if(insertStartPos == prevOpStart) { // Swap the positions of the insert and the // previous operation. 
myCigarRoller.Update(currentOp, myCigarRoller[prevOpIndex].operation, myCigarRoller[prevOpIndex].count); // Size of the previous op is the entire // shift length. myCigarRoller.Update(prevOpIndex, Cigar::insert, shiftLen); shifted = true; } } } // An insert is in the read, so increment the position. currentPos += myCigarRoller[currentOp].count; } else if(Cigar::foundInQuery(myCigarRoller[currentOp])) { // This op was found in the read, increment the current position. currentPos += myCigarRoller[currentOp].count; } } if(shifted) { // TODO - setCigar is currently inefficient because later the cigar // roller will be recalculated, but for now it will work. setCigar(myCigarRoller); } return(shifted); } // Set the BAM record from the passeed in buffer of the specified size. // Note: The size includes the block size. SamStatus::Status SamRecord::setBuffer(const char* fromBuffer, uint32_t fromBufferSize, SamFileHeader& header) { myStatus = SamStatus::SUCCESS; if((fromBuffer == NULL) || (fromBufferSize == 0)) { // Buffer is empty. myStatus.setStatus(SamStatus::FAIL_PARSE, "Cannot parse an empty file."); return(SamStatus::FAIL_PARSE); } // Clear the record. resetRecord(); // allocate space for the record size. if(!allocateRecordStructure(fromBufferSize)) { // Failed to allocate space. return(SamStatus::FAIL_MEM); } memcpy(myRecordPtr, fromBuffer, fromBufferSize); setVariablesForNewBuffer(header); // Return the status of the record. return(SamStatus::SUCCESS); } // Read the BAM record from a file. SamStatus::Status SamRecord::setBufferFromFile(IFILE filePtr, SamFileHeader& header) { myStatus = SamStatus::SUCCESS; if((filePtr == NULL) || (filePtr->isOpen() == false)) { // File is not open, return failure. myStatus.setStatus(SamStatus::FAIL_ORDER, "Can't read from an unopened file."); return(SamStatus::FAIL_ORDER); } // Clear the record. resetRecord(); // read the record size. 
int numBytes = ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t)); // Check to see if the end of the file was hit and no bytes were read. if(ifeof(filePtr) && (numBytes == 0)) { // End of file, nothing was read, no more records. myStatus.setStatus(SamStatus::NO_MORE_RECS, "No more records left to read."); return(SamStatus::NO_MORE_RECS); } if(numBytes != sizeof(int32_t)) { // Failed to read the entire block size. Either the end of the file // was reached early or there was an error. if(ifeof(filePtr)) { // Error: end of the file reached prior to reading the rest of the // record. myStatus.setStatus(SamStatus::FAIL_PARSE, "EOF reached in the middle of a record."); return(SamStatus::FAIL_PARSE); } else { // Error reading. myStatus.setStatus(SamStatus::FAIL_IO, "Failed to read the record size."); return(SamStatus::FAIL_IO); } } // allocate space for the record size. if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t))) { // Failed to allocate space. // Status is set by allocateRecordStructure. return(SamStatus::FAIL_MEM); } // Read the rest of the alignment block, starting at the reference id. if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize) != (unsigned int)myRecordPtr->myBlockSize) { // Error reading the record. Reset it and return failure. resetRecord(); myStatus.setStatus(SamStatus::FAIL_IO, "Failed to read the record"); return(SamStatus::FAIL_IO); } setVariablesForNewBuffer(header); // Return the status of the record. return(SamStatus::SUCCESS); } // Add the specified tag to the record. // Returns true if the tag was successfully added, false otherwise. bool SamRecord::addIntTag(const char* tag, int32_t value) { myStatus = SamStatus::SUCCESS; int key = 0; int index = 0; char bamvtype; int tagBufferSize = 0; // First check to see if the tags need to be synced to the buffer. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read tags from the buffer, so cannot add new ones. 
return(false); } } // Ints come in as int. But it can be represented in fewer bits. // So determine a more specific type that is in line with the // types for BAM files. // First check to see if it is a negative. if(value < 0) { // The int is negative, so it will need to use a signed type. // See if it is greater than the min value for a char. if(value > ((std::numeric_limits::min)())) { // It can be stored in a signed char. bamvtype = 'c'; tagBufferSize += 4; } else if(value > ((std::numeric_limits::min)())) { // It fits in a signed short. bamvtype = 's'; tagBufferSize += 5; } else { // Just store it as a signed int. bamvtype = 'i'; tagBufferSize += 7; } } else { // It is positive, so an unsigned type can be used. if(value < ((std::numeric_limits::max)())) { // It is under the max of an unsigned char. bamvtype = 'C'; tagBufferSize += 4; } else if(value < ((std::numeric_limits::max)())) { // It is under the max of an unsigned short. bamvtype = 'S'; tagBufferSize += 5; } else { // Just store it as an unsigned int. bamvtype = 'I'; tagBufferSize += 7; } } // Check to see if the tag is already there. key = MAKEKEY(tag[0], tag[1], bamvtype); unsigned int hashIndex = extras.Find(key); if(hashIndex != LH_NOTFOUND) { // Tag was already found. index = extras[hashIndex]; // Since the tagBufferSize was already updated with the new value, // subtract the size for the previous tag (even if they are the same). switch(intType[index]) { case 'c': case 'C': case 'A': tagBufferSize -= 4; break; case 's': case 'S': tagBufferSize -= 5; break; case 'i': case 'I': tagBufferSize -= 7; break; default: myStatus.setStatus(SamStatus::INVALID, "unknown tag inttype type found.\n"); return(false); } // Tag already existed, print message about overwriting. // WARN about dropping duplicate tags. 
if(myNumWarns++ < myMaxWarns) { String newVal; String origVal; appendIntArrayValue(index, origVal); appendIntArrayValue(bamvtype, value, newVal); fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n", tag[0], tag[1], intType[index], origVal.c_str(), tag[0], tag[1], bamvtype, newVal.c_str()); if(myNumWarns == myMaxWarns) { fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n"); } } // Update the integer value and type. integers[index] = value; intType[index] = bamvtype; } else { // Tag is not already there, so add it. index = integers.Length(); integers.Push(value); intType.push_back(bamvtype); extras.Add(key, index); } // The buffer tags are now out of sync. myNeedToSetTagsInBuffer = true; myIsTagsBufferValid = false; myIsBufferSynced = false; myTagBufferSize += tagBufferSize; return(true); } // Add the specified tag to the record, replacing it if it is already there and // is different from the previous value. // Returns true if the tag was successfully added (or was already there), false otherwise. bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr) { if(vtype == 'i') { // integer type. Call addIntTag to handle it. int intVal = atoi(valuePtr); return(addIntTag(tag, intVal)); } // Non-int type. myStatus = SamStatus::SUCCESS; bool status = true; // default to successful. int key = 0; int index = 0; int tagBufferSize = 0; // First check to see if the tags need to be synced to the buffer. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read tags from the buffer, so cannot add new ones. return(false); } } // First check to see if the tag is already there. key = MAKEKEY(tag[0], tag[1], vtype); unsigned int hashIndex = extras.Find(key); if(hashIndex != LH_NOTFOUND) { // The key was found in the hash, so get the lookup index. index = extras[hashIndex]; String origTag; char origType = vtype; // Adjust the currently pointed to value to the new setting. 
// (Continuation of addTag(), inside the branch taken when the tag's key was
// found in 'extras'.  'index' is the position of the existing value in the
// value array for its type; tagBufferSize accumulates the net change to
// myTagBufferSize.)
        switch (vtype)
        {
            case 'A' :
                // Single character; stored in 'integers' with its type
                // recorded in 'intType'.
                // First check to see if the value changed.
                if((integers[index] == (const int)*(valuePtr)) &&
                   (intType[index] == vtype))
                {
                    // The value & type has not changed, so do nothing.
                    return(true);
                }
                else
                {
                    // Tag buffer size changes if type changes, so subtract & add.
                    origType = intType[index];
                    appendIntArrayValue(index, origTag);
                    tagBufferSize -= getNumericTagTypeSize(intType[index]);
                    tagBufferSize += getNumericTagTypeSize(vtype);
                    integers[index] = (const int)*(valuePtr);
                    intType[index] = vtype;
                }
                break;
            case 'Z' :
                // String value; stored in 'strings'.
                // First check to see if the value changed.
                if(strings[index] == valuePtr)
                {
                    // The value has not changed, so do nothing.
                    return(true);
                }
                else
                {
                    // Adjust the tagBufferSize by removing the size of the old string.
                    origTag = strings[index];
                    tagBufferSize -= strings[index].Length();
                    strings[index] = valuePtr;
                    // Adjust the tagBufferSize by adding the size of the new string.
                    tagBufferSize += strings[index].Length();
                }
                break;
            case 'B' :
                // Array value; kept in its string form in 'strings', with
                // the buffer size derived by getBtagBufferSize().
                // First check to see if the value changed.
                if(strings[index] == valuePtr)
                {
                    // The value has not changed, so do nothing.
                    return(true);
                }
                else
                {
                    // Adjust the tagBufferSize by removing the size of the old field.
                    origTag = strings[index];
                    tagBufferSize -= getBtagBufferSize(strings[index]);
                    strings[index] = valuePtr;
                    // Adjust the tagBufferSize by adding the size of the new field.
                    tagBufferSize += getBtagBufferSize(strings[index]);
                }
                break;
            case 'f' :
                // Float value; stored in 'floats'.
                // First check to see if the value changed.
                if(floats[index] == (float)atof(valuePtr))
                {
                    // The value has not changed, so do nothing.
                    return(true);
                }
                else
                {
                    // Tag buffer size doesn't change between different 'f' entries.
                    origTag.appendFullFloat(floats[index]);
                    floats[index] = (float)atof(valuePtr);
                }
                break;
            default :
                // Unsupported type: report it and mark the call as failed.
                // Note that the duplicate-tag warning below is still reached.
                fprintf(stderr,
                        "samRecord::addTag() - Unknown custom field of type %c\n",
                        vtype);
                myStatus.setStatus(SamStatus::FAIL_PARSE,
                                   "Unknown custom field in a tag");
                status = false;
                break;
        }

        // Duplicate tag in this record.
        // Tag already existed, print message about overwriting.
        // WARN about dropping duplicate tags.
        // Only the first myMaxWarns warnings are printed.
        if(myNumWarns++ < myMaxWarns)
        {
            fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
                    tag[0], tag[1], origType, origTag.c_str(), tag[0], tag[1], vtype, valuePtr);
            if(myNumWarns == myMaxWarns)
            {
                fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
            }
        }
    }
    else
    {
        // The key was not found in the hash, so add it.
        // 'index' becomes the position of the new value in the array for
        // its type.  The size constants presumably cover the 2 tag bytes
        // and the type byte plus the value (and, for 'Z', a terminating
        // null) - NOTE(review): confirm against the buffer writer.
        switch (vtype)
        {
            case 'A' :
                index = integers.Length();
                integers.Push((const int)*(valuePtr));
                intType.push_back(vtype);
                tagBufferSize += 4;
                break;
            case 'Z' :
                index = strings.Length();
                strings.Push(valuePtr);
                tagBufferSize += 4 + strings.Last().Length();
                break;
            case 'B' :
                index = strings.Length();
                strings.Push(valuePtr);
                tagBufferSize += 3 + getBtagBufferSize(strings[index]);
                break;
            case 'f' :
                index = floats.size();
                floats.push_back((float)atof(valuePtr));
                tagBufferSize += 7;
                break;
            default :
                fprintf(stderr,
                        "samRecord::addTag() - Unknown custom field of type %c\n",
                        vtype);
                myStatus.setStatus(SamStatus::FAIL_PARSE,
                                   "Unknown custom field in a tag");
                status = false;
                break;
        }
        if(status)
        {
            // If successful, add the key to extras.
            extras.Add(key, index);
        }
    }

    // Only add the tag if it has so far been successfully processed.
    if(status)
    {
        // The buffer tags are now out of sync.
        myNeedToSetTagsInBuffer = true;
        myIsTagsBufferValid = false;
        myIsBufferSynced = false;
        myTagBufferSize += tagBufferSize;
    }
    return(status);
}


// Remove every tag from the record: empties the key hash and all value
// arrays, zeroes the tag buffer size and resets the tag iterator.
void SamRecord::clearTags()
{
    if(extras.Entries() != 0)
    {
        extras.Clear();
    }
    strings.Clear();
    integers.Clear();
    intType.clear();
    floats.clear();
    myTagBufferSize = 0;
    resetTagIter();
}


// Remove the tag with the given 2 character name and type.
// (Start of rmTag(); the rest of the function follows this point and is not
// part of this span.)
bool SamRecord::rmTag(const char* tag, char type)
{
    // Check the length of tag.
    if(strlen(tag) != 2)
    {
        // Tag is the wrong length.
// (Continuation of rmTag(), inside the check that rejects a tag name that
// is not exactly 2 characters long.)
        myStatus.setStatus(SamStatus::INVALID,
                           "rmTag called with tag that is not 2 characters\n");
        return(false);
    }

    myStatus = SamStatus::SUCCESS;
    if(myNeedToSetTagsFromBuffer)
    {
        if(!setTagsFromBuffer())
        {
            // Failed to read the tags from the buffer, so cannot
            // get tags.
            return(false);
        }
    }

    // Construct the key.
    int key = MAKEKEY(tag[0], tag[1], type);
    // Look to see if the key exists in the hash.
    int offset = extras.Find(key);

    if(offset < 0)
    {
        // Not found, so return true, successfully removed since
        // it is not in tag.
        return(true);
    }

    // Offset is set, so the key was found.
    // First if it is an integer, determine the actual type of the int.
    char vtype;
    getTypeFromKey(key, vtype);
    if(vtype == 'i')
    {
        vtype = getIntegerType(offset);
    }

    // Offset is set, so recalculate the buffer size without this entry.
    // Do NOT remove from strings, integers, or floats because then
    // extras would need to be updated for all entries with the new indexes
    // into those variables.
    // The sizes mirror the ones added when a tag is stored (4/5/7 for 1/2/4
    // byte values, 4 + length for strings, 3 + array size for 'B').
    int rmBuffSize = 0;
    switch(vtype)
    {
        case 'A':
        case 'c':
        case 'C':
            rmBuffSize = 4;
            break;
        case 's':
        case 'S':
            rmBuffSize = 5;
            break;
        case 'i':
        case 'I':
            rmBuffSize = 7;
            break;
        case 'f':
            rmBuffSize = 7;
            break;
        case 'Z':
            rmBuffSize = 4 + getString(offset).Length();
            break;
        case 'B':
            rmBuffSize = 3 + getBtagBufferSize(getString(offset));
            break;
        default:
            // Unknown type: fail without touching the hash or the sizes.
            myStatus.setStatus(SamStatus::INVALID,
                               "rmTag called with unknown type.\n");
            return(false);
            break;
    };

    // The buffer tags are now out of sync.
    myNeedToSetTagsInBuffer = true;
    myIsTagsBufferValid = false;
    myIsBufferSynced = false;
    myTagBufferSize -= rmBuffSize;

    // Remove from the hash.
    extras.Delete(offset);
    return(true);
}


// Remove all of the tags listed in 'tags'.
// 'tags' is a list of entries of the form XY:Z (2 character tag name, ':',
// 1 character type) separated by ';' or ','.  Entries that are not present
// in the record are skipped.
// Returns false if the tags could not be read from the buffer, if the list
// is improperly formatted (processing stops at the bad entry, entries
// before it have already been removed) or if a stored tag has an unknown
// type; returns true otherwise.
// Note: the tag buffer is marked out of sync whenever the list is
// processed, even if no tag ended up being removed.
bool SamRecord::rmTags(const char* tags)
{
    const char* currentTagPtr = tags;

    myStatus = SamStatus::SUCCESS;
    if(myNeedToSetTagsFromBuffer)
    {
        if(!setTagsFromBuffer())
        {
            // Failed to read the tags from the buffer, so cannot
            // get tags.
            return(false);
        }
    }

    bool returnStatus = true;

    // Total buffer size of all the tags removed by this call.
    int rmBuffSize = 0;
    while(*currentTagPtr != '\0')
    {
        // Tags are formatted as: XY:Z
        // Where X is [A-Za-z], Y is [A-Za-z], and
        // Z is A,i,f,Z,H (cCsSI are also accepted)
        // The checks run left to right and stop at the first failure, so no
        // character past the terminating null is read.
        if((currentTagPtr[0] == '\0') ||
           (currentTagPtr[1] == '\0') ||
           (currentTagPtr[2] != ':') ||
           (currentTagPtr[3] == '\0'))
        {
            myStatus.setStatus(SamStatus::INVALID,
                               "rmTags called with improperly formatted tags.\n");
            returnStatus = false;
            break;
        }

        // Construct the key.
        int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
                          currentTagPtr[3]);
        // Look to see if the key exists in the hash.
        int offset = extras.Find(key);

        if(offset >= 0)
        {
            // Offset is set, so the key was found.
            // First if it is an integer, determine the actual type of the int.
            char vtype;
            getTypeFromKey(key, vtype);
            if(vtype == 'i')
            {
                vtype = getIntegerType(offset);
            }

            // Offset is set, so recalculate the buffer size without this entry.
            // Do NOT remove from strings, integers, or floats because then
            // extras would need to be updated for all entries with the new indexes
            // into those variables.
            switch(vtype)
            {
                case 'A':
                case 'c':
                case 'C':
                    rmBuffSize += 4;
                    break;
                case 's':
                case 'S':
                    rmBuffSize += 5;
                    break;
                case 'i':
                case 'I':
                    rmBuffSize += 7;
                    break;
                case 'f':
                    rmBuffSize += 7;
                    break;
                case 'Z':
                    rmBuffSize += 4 + getString(offset).Length();
                    break;
                case 'B':
                    rmBuffSize += 3 + getBtagBufferSize(getString(offset));
                    break;
                default:
                    // This 'break' only leaves the switch: the entry is
                    // still deleted from the hash below and the loop goes
                    // on, but the call will report failure.
                    myStatus.setStatus(SamStatus::INVALID,
                                       "rmTag called with unknown type.\n");
                    returnStatus = false;
                    break;
            };

            // Remove from the hash.
            extras.Delete(offset);
        }
        // Increment to the next tag.
        if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ','))
        {
            // Increment once more.
            currentTagPtr += 5;
        }
        else if(currentTagPtr[4] != '\0')
        {
            // Invalid tag format.
            myStatus.setStatus(SamStatus::INVALID,
                               "rmTags called with improperly formatted tags.\n");
            returnStatus = false;
            break;
        }
        else
        {
            // Last Tag.
            currentTagPtr += 4;
        }
    }

    // The buffer tags are now out of sync.
    myNeedToSetTagsInBuffer = true;
    myIsTagsBufferValid = false;
    myIsBufferSynced = false;
    myTagBufferSize -= rmBuffSize;

    return(returnStatus);
}


// Get methods for record fields.
// Returns the record buffer using the class's sequence translation setting.
const void* SamRecord::getRecordBuffer()
{
    return(getRecordBuffer(mySequenceTranslation));
}


// Get methods for record fields.
// Returns a pointer to the record buffer after bringing it up to date for
// the requested sequence translation, or NULL if updating the buffer or
// writing the tags into it failed.
const void* SamRecord::getRecordBuffer(SequenceTranslation translation)
{
    myStatus = SamStatus::SUCCESS;
    bool status = true;
    // If the buffer is not synced or the sequence in the buffer is not
    // properly translated, fix the buffer.
    if((myIsBufferSynced == false) ||
       (myBufferSequenceTranslation != translation))
    {
        status &= fixBuffer(translation);
    }
    // If the buffer is synced, check to see if the tags need to be synced.
    if(myNeedToSetTagsInBuffer)
    {
        status &= setTagsInBuffer();
    }
    if(!status)
    {
        return(NULL);
    }
    return (const void *)myRecordPtr;
}


// Write the record as a buffer into the file using the class's
// sequence translation setting.
SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr)
{
    return(writeRecordBuffer(filePtr, mySequenceTranslation));
}


// Write the record as a buffer into the file using the specified translation.
// Returns SamStatus::FAIL_ORDER if the file is not open, the status left by
// fixBuffer() if the buffer could not be brought up to date, and otherwise
// a status based on whether the whole record (block size field included)
// was written.
// (The final statements of this function follow this point and are not part
// of this span.)
SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr,
                                               SequenceTranslation translation)
{
    myStatus = SamStatus::SUCCESS;
    if((filePtr == NULL) || (filePtr->isOpen() == false))
    {
        // File is not open, return failure.
        myStatus.setStatus(SamStatus::FAIL_ORDER,
                           "Can't write to an unopened file.");
        return(SamStatus::FAIL_ORDER);
    }

    if((myIsBufferSynced == false) ||
       (myBufferSequenceTranslation != translation))
    {
        if(!fixBuffer(translation))
        {
            return(myStatus.getStatus());
        }
    }

    // Write the record.
    // The block size field itself is not counted in myBlockSize, so add it.
    unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t);
    unsigned int numBytesWritten =
        ifwrite(filePtr, myRecordPtr, numBytesToWrite);

    // Return status based on if the correct number of bytes were written.
    if(numBytesToWrite == numBytesWritten)
    {
        return(SamStatus::SUCCESS);
    }
    // The correct number of bytes were not written.
myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record."); return(SamStatus::FAIL_IO); } int32_t SamRecord::getBlockSize() { myStatus = SamStatus::SUCCESS; // If the buffer isn't synced, sync the buffer to determine the // block size. if(myIsBufferSynced == false) { // Since this just returns the block size, the translation of // the sequence does not matter, so just use the currently set // value. fixBuffer(myBufferSequenceTranslation); } return myRecordPtr->myBlockSize; } // This method returns the reference name. const char* SamRecord::getReferenceName() { myStatus = SamStatus::SUCCESS; return myReferenceName.c_str(); } int32_t SamRecord::getReferenceID() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myReferenceID; } int32_t SamRecord::get1BasedPosition() { myStatus = SamStatus::SUCCESS; return (myRecordPtr->myPosition + 1); } int32_t SamRecord::get0BasedPosition() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myPosition; } uint8_t SamRecord::getReadNameLength() { myStatus = SamStatus::SUCCESS; // If the buffer is valid, return the size from there, otherwise get the // size from the string length + 1 (ending null). if(myIsReadNameBufferValid) { return(myRecordPtr->myReadNameLength); } return(myReadName.Length() + 1); } uint8_t SamRecord::getMapQuality() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myMapQuality; } uint16_t SamRecord::getBin() { myStatus = SamStatus::SUCCESS; if(!myIsBinValid) { // The bin that is set in the record is not valid, so // reset it. myRecordPtr->myBin = bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd()); myIsBinValid = true; } return(myRecordPtr->myBin); } uint16_t SamRecord::getCigarLength() { myStatus = SamStatus::SUCCESS; // If the cigar buffer is valid // then get the length from there. if(myIsCigarBufferValid) { return myRecordPtr->myCigarLength; } if(myCigarTempBufferLength == -1) { // The cigar buffer is not valid and the cigar temp buffer is not set, // so parse the string. 
parseCigarString(); } // The temp buffer is now set, so return the size. return(myCigarTempBufferLength); } uint16_t SamRecord::getFlag() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myFlag; } int32_t SamRecord::getReadLength() { myStatus = SamStatus::SUCCESS; if(myIsSequenceBufferValid == false) { // If the sequence is "*", then return 0. if((mySequence.Length() == 1) && (mySequence[0] == '*')) { return(0); } // Do not add 1 since it is not null terminated. return(mySequence.Length()); } return(myRecordPtr->myReadLength); } // This method returns the mate reference name. If it is equal to the // reference name, it still returns the reference name. const char* SamRecord::getMateReferenceName() { myStatus = SamStatus::SUCCESS; return myMateReferenceName.c_str(); } // This method returns the mate reference name. If it is equal to the // reference name, it returns "=", unless they are both "*" in which case // "*" is returned. const char* SamRecord::getMateReferenceNameOrEqual() { myStatus = SamStatus::SUCCESS; if(myMateReferenceName == "*") { return(myMateReferenceName); } if(myMateReferenceName == getReferenceName()) { return(FIELD_ABSENT_STRING); } else { return(myMateReferenceName); } } int32_t SamRecord::getMateReferenceID() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myMateReferenceID; } int32_t SamRecord::get1BasedMatePosition() { myStatus = SamStatus::SUCCESS; return (myRecordPtr->myMatePosition + 1); } int32_t SamRecord::get0BasedMatePosition() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myMatePosition; } int32_t SamRecord::getInsertSize() { myStatus = SamStatus::SUCCESS; return myRecordPtr->myInsertSize; } // Returns the inclusive rightmost position of the clipped sequence. int32_t SamRecord::get0BasedAlignmentEnd() { myStatus = SamStatus::SUCCESS; if(myAlignmentLength == -1) { // Alignment end has not been set, so calculate it. parseCigar(); } // If alignment length > 0, subtract 1 from it to get the end. 
if(myAlignmentLength == 0) { // Length is 0, just return the start position. return(myRecordPtr->myPosition); } return(myRecordPtr->myPosition + myAlignmentLength - 1); } // Returns the inclusive rightmost position of the clipped sequence. int32_t SamRecord::get1BasedAlignmentEnd() { return(get0BasedAlignmentEnd() + 1); } // Return the length of the alignment. int32_t SamRecord::getAlignmentLength() { myStatus = SamStatus::SUCCESS; if(myAlignmentLength == -1) { // Alignment end has not been set, so calculate it. parseCigar(); } // Return the alignment length. return(myAlignmentLength); } // Returns the inclusive left-most position adjust for clipped bases. int32_t SamRecord::get0BasedUnclippedStart() { myStatus = SamStatus::SUCCESS; if(myUnclippedStartOffset == -1) { // Unclipped has not yet been calculated, so parse the cigar to get it parseCigar(); } return(myRecordPtr->myPosition - myUnclippedStartOffset); } // Returns the inclusive left-most position adjust for clipped bases. int32_t SamRecord::get1BasedUnclippedStart() { return(get0BasedUnclippedStart() + 1); } // Returns the inclusive right-most position adjust for clipped bases. int32_t SamRecord::get0BasedUnclippedEnd() { // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the // cigar has not yet been parsed, so no need to check it here. return(get0BasedAlignmentEnd() + myUnclippedEndOffset); } // Returns the inclusive right-most position adjust for clipped bases. int32_t SamRecord::get1BasedUnclippedEnd() { return(get0BasedUnclippedEnd() + 1); } // Get the read name. const char* SamRecord::getReadName() { myStatus = SamStatus::SUCCESS; if(myReadName.Length() == 0) { // 0 Length, means that it is in the buffer, but has not yet // been synced to the string, so do the sync. 
myReadName = (char*)&(myRecordPtr->myData);
    }
    return myReadName.c_str();
}


// Returns the cigar as a string.
// Resets the status to SUCCESS.  If the cigar string is currently empty,
// it is first generated from the binary cigar via parseCigarBinary().
const char* SamRecord::getCigar()
{
    myStatus = SamStatus::SUCCESS;
    if(myCigar.Length() == 0)
    {
        // 0 Length, means that it is in the buffer, but has not yet
        // been synced to the string, so do the sync.
        parseCigarBinary();
    }
    return myCigar.c_str();
}


// Returns the sequence string using the translation that is currently
// configured on this record (mySequenceTranslation).
const char* SamRecord::getSequence()
{
    return(getSequence(mySequenceTranslation));
}


// Returns the sequence string using the specified translation.
// Resets the status to SUCCESS.
//  - NONE, or no reference set (myRefPtr == NULL): returns mySequence as is.
//  - EQUAL: returns mySeqWithEq, filling it on first use.
//  - otherwise (BASES): returns mySeqWithoutEq, filling it on first use.
// The translated strings are cached: they are only (re)computed when their
// length is 0.
const char* SamRecord::getSequence(SequenceTranslation translation)
{
    myStatus = SamStatus::SUCCESS;
    if(mySequence.Length() == 0)
    {
        // 0 Length, means that it is in the buffer, but has not yet
        // been synced to the string, so do the sync.
        setSequenceAndQualityFromBuffer();
    }

    // Determine if translation needs to be done.
    if((translation == NONE) || (myRefPtr == NULL))
    {
        return mySequence.c_str();
    }
    else if(translation == EQUAL)
    {
        if(mySeqWithEq.length() == 0)
        {
            // Check to see if the sequence is defined.
            if(mySequence == "*")
            {
                // Sequence is undefined, so no translation necessary.
                mySeqWithEq = '*';
            }
            else
            {
                // Sequence defined, so translate it.
                SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(),
                                                  myRecordPtr->myPosition,
                                                  *(getCigarInfo()),
                                                  getReferenceName(),
                                                  *myRefPtr,
                                                  mySeqWithEq);
            }
        }
        return(mySeqWithEq.c_str());
    }
    else
    {
        // translation == BASES
        if(mySeqWithoutEq.length() == 0)
        {
            if(mySequence == "*")
            {
                // Sequence is undefined, so no translation necessary.
                mySeqWithoutEq = '*';
            }
            else
            {
                // Sequence defined, so translate it.
                SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(),
                                                     myRecordPtr->myPosition,
                                                     *(getCigarInfo()),
                                                     getReferenceName(),
                                                     *myRefPtr,
                                                     mySeqWithoutEq);
            }
        }
        return(mySeqWithoutEq.c_str());
    }
}


// Returns the quality string.
// Resets the status to SUCCESS.  If the quality string is currently empty,
// it is first extracted from the buffer.
const char* SamRecord::getQuality()
{
    myStatus = SamStatus::SUCCESS;
    if(myQuality.Length() == 0)
    {
        // 0 Length, means that it is in the buffer, but has not yet
        // been synced to the string, so do the sync.
        setSequenceAndQualityFromBuffer();
    }
    return myQuality.c_str();
}


// Returns the sequence character at the specified (0-based) index using
// the translation that is currently configured on this record.
char SamRecord::getSequence(int index)
{
    return(getSequence(index, mySequenceTranslation));
}


// Returns the sequence character at the specified (0-based) index using
// the specified translation.
// Throws std::runtime_error if:
//  - the read length is 0,
//  - index is negative or >= the read length, or
//  - no translation is needed, the sequence string is empty, and the
//    sequence buffer is not valid (no sequence has been set).
// When no translation is needed and only the packed buffer is available,
// the base is decoded directly from the 4-bit packed sequence without
// building the full string.
char SamRecord::getSequence(int index, SequenceTranslation translation)
{
    // Lookup table from a 4-bit packed base value to its character.
    static const char * asciiBases = "=AC.G...T......N";

    // Determine the read length.
    int32_t readLen = getReadLength();

    // If the read length is 0, this method should not be called.
    if(readLen == 0)
    {
        String exceptionString = "SamRecord::getSequence(";
        exceptionString += index;
        exceptionString += ") is not allowed since sequence = '*'";
        throw std::runtime_error(exceptionString.c_str());
    }
    else if((index < 0) || (index >= readLen))
    {
        // Only get here if the index was out of range, so throw an exception.
        String exceptionString = "SamRecord::getSequence(";
        exceptionString += index;
        exceptionString += ") is out of range. Index must be between 0 and ";
        exceptionString += (readLen - 1);
        throw std::runtime_error(exceptionString.c_str());
    }

    // Determine if translation needs to be done.
    if((translation == NONE) || (myRefPtr == NULL))
    {
        // No translation needs to be done.
        if(mySequence.Length() == 0)
        {
            // Parse BAM sequence.
            if(myIsSequenceBufferValid)
            {
                // Odd indexes are in the low 4 bits of the byte, even
                // indexes are in the high 4 bits.
                return(index & 1 ?
                       asciiBases[myPackedSequence[index / 2] & 0xF] :
                       asciiBases[myPackedSequence[index / 2] >> 4]);
            }
            else
            {
                String exceptionString = "SamRecord::getSequence(";
                exceptionString += index;
                exceptionString += ") called with no sequence set";
                throw std::runtime_error(exceptionString.c_str());
            }
        }
        // Already have string.
        return(mySequence[index]);
    }
    else
    {
        // Need to translate the sequence either to have '=' or to not
        // have it.
        // First check to see if the sequence has been set.
        if(mySequence.Length() == 0)
        {
            // 0 Length, means that it is in the buffer, but has not yet
            // been synced to the string, so do the sync.
            setSequenceAndQualityFromBuffer();
        }

        // Check the type of translation.
        if(translation == EQUAL)
        {
            // Check whether or not the string has already been
            // retrieved that has the '=' in it.
            if(mySeqWithEq.length() == 0)
            {
                // The string with '=' has not yet been determined,
                // so get the string.
                // Check to see if the sequence is defined.
                if(mySequence == "*")
                {
                    // Sequence is undefined, so no translation necessary.
                    mySeqWithEq = '*';
                }
                else
                {
                    // Sequence defined, so translate it.
                    SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(),
                                                      myRecordPtr->myPosition,
                                                      *(getCigarInfo()),
                                                      getReferenceName(),
                                                      *myRefPtr,
                                                      mySeqWithEq);
                }
            }
            // Sequence is set, so return it.
            return(mySeqWithEq[index]);
        }
        else
        {
            // translation == BASES
            // Check whether or not the string has already been
            // retrieved that does not have the '=' in it.
            if(mySeqWithoutEq.length() == 0)
            {
                // The string without '=' has not yet been determined,
                // so get the string.
                // Check to see if the sequence is defined.
                if(mySequence == "*")
                {
                    // Sequence is undefined, so no translation necessary.
                    mySeqWithoutEq = '*';
                }
                else
                {
                    // Sequence defined, so translate it.
                    SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(),
                                                         myRecordPtr->myPosition,
                                                         *(getCigarInfo()),
                                                         getReferenceName(),
                                                         *myRefPtr,
                                                         mySeqWithoutEq);
                }
            }
            // Sequence is set, so return it.
            return(mySeqWithoutEq[index]);
        }
    }
}


// Returns the quality character at the specified (0-based) index.
// Returns BaseUtilities::UNKNOWN_QUALITY_CHAR if the read length is 0 or
// the quality string is "*".
// Throws std::runtime_error if index is negative, >= the read length, or
// >= the length of an already-set quality string.
char SamRecord::getQuality(int index)
{
    // Determine the read length.
    int32_t readLen = getReadLength();

    // If the read length is 0, return ' ' whose ascii code is below
    // the minimum ascii code for qualities.
    if(readLen == 0)
    {
        return(BaseUtilities::UNKNOWN_QUALITY_CHAR);
    }
    else if((index < 0) || (index >= readLen))
    {
        // Only get here if the index was out of range, so throw an exception.
        String exceptionString = "SamRecord::getQuality(";
        exceptionString += index;
        exceptionString += ") is out of range. Index must be between 0 and ";
        exceptionString += (readLen - 1);
        throw std::runtime_error(exceptionString.c_str());
    }

    if(myQuality.Length() == 0)
    {
        // Parse BAM Quality.
        // Know that myPackedQuality is correct since readLen != 0.
        // The packed value is offset by 33 to get the quality character.
        return(myPackedQuality[index] + 33);
    }
    else
    {
        // Already have string.
        if((myQuality.Length() == 1) && (myQuality[0] == '*'))
        {
            // Return the unknown quality character.
            return(BaseUtilities::UNKNOWN_QUALITY_CHAR);
        }
        else if(index >= myQuality.Length())
        {
            // Only get here if the index was out of range, so throw an exception.
            // Technically the myQuality string is not guaranteed to be the same length
            // as the sequence, so this catches that error.
            String exceptionString = "SamRecord::getQuality(";
            exceptionString += index;
            exceptionString += ") is out of range. Index must be between 0 and ";
            exceptionString += (myQuality.Length() - 1);
            throw std::runtime_error(exceptionString.c_str());
        }
        else
        {
            return(myQuality[index]);
        }
    }
}


// Returns a pointer to the Cigar object (myCigarRoller) for this record,
// parsing the cigar first if that has not yet been done.
Cigar* SamRecord::getCigarInfo()
{
    // Check to see whether or not the Cigar has already been
    // set - this is determined by checking if alignment length
    // is set since alignment length and the cigar are set
    // at the same time.
    if(myAlignmentLength == -1)
    {
        // Not been set, so calculate it.
        parseCigar();
    }
    return(&myCigarRoller);
}


// Return the number of bases in this read that overlap the passed in
// region.  (start & end are 0-based)
uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end)
{
    // Determine whether or not the cigar has been parsed, which sets up
    // the cigar roller.  This is determined by checking the alignment length.
    if(myAlignmentLength == -1)
    {
        parseCigar();
    }
    return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition()));
}


// Returns the values of all fields except the tags.
bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, String& cigar, String& sequence, String& quality) { return(getFields(recStruct, readName, cigar, sequence, quality, mySequenceTranslation)); } // Returns the values of all fields except the tags. bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, String& cigar, String& sequence, String& quality, SequenceTranslation translation) { myStatus = SamStatus::SUCCESS; if(myIsBufferSynced == false) { if(!fixBuffer(translation)) { // failed to set the buffer, return false. return(false); } } memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct)); readName = getReadName(); // Check the status. if(myStatus != SamStatus::SUCCESS) { // Failed to set the fields, return false. return(false); } cigar = getCigar(); // Check the status. if(myStatus != SamStatus::SUCCESS) { // Failed to set the fields, return false. return(false); } sequence = getSequence(translation); // Check the status. if(myStatus != SamStatus::SUCCESS) { // Failed to set the fields, return false. return(false); } quality = getQuality(); // Check the status. if(myStatus != SamStatus::SUCCESS) { // Failed to set the fields, return false. return(false); } return(true); } // Returns the reference pointer. GenomeSequence* SamRecord::getReference() { return(myRefPtr); } uint32_t SamRecord::getTagLength() { myStatus = SamStatus::SUCCESS; if(myNeedToSetTagsFromBuffer) { // Tags are only set in the buffer, so the size of the tags is // the length of the record minus the starting location of the tags. unsigned char * tagStart = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int) + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength; // The non-tags take up from the start of the record to the tag start. // Do not include the block size part of the record since it is not // included in the size. 
uint32_t nonTagSize = tagStart - (unsigned char*)&(myRecordPtr->myReferenceID); // Tags take up the size of the block minus the non-tag section. uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize; return(tagSize); } // Tags are stored outside the buffer, so myTagBufferSize is set. return(myTagBufferSize); } // Returns true if there is another tag and sets tag and vtype to the // appropriate values, and returns a pointer to the value. // Sets the Status to SUCCESS when a tag is successfully returned or // when there are no more tags. Otherwise the status is set to describe // why it failed (parsing, etc). bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value) { myStatus = SamStatus::SUCCESS; if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. return(false); } } // Increment the tag index to start looking at the next tag. // At the beginning, it is set to -1. myLastTagIndex++; int maxTagIndex = extras.Capacity(); if(myLastTagIndex >= maxTagIndex) { // Hit the end of the tags, return false, no more tags. // Status is still success since this is not an error, // it is just the end of the list. return(false); } bool tagFound = false; // Loop until a tag is found or the end of extras is hit. while((tagFound == false) && (myLastTagIndex < maxTagIndex)) { if(extras.SlotInUse(myLastTagIndex)) { // Found a slot to use. int key = extras.GetKey(myLastTagIndex); getTag(key, tag); getTypeFromKey(key, vtype); tagFound = true; // Get the value associated with the key based on the vtype. 
switch (vtype) { case 'f' : *value = getFloatPtr(myLastTagIndex); break; case 'i' : *value = getIntegerPtr(myLastTagIndex, vtype); if(vtype != 'A') { // Convert all int types to 'i' vtype = 'i'; } break; case 'Z' : case 'B' : *value = getStringPtr(myLastTagIndex); break; default: myStatus.setStatus(SamStatus::FAIL_PARSE, "Unknown tag type"); tagFound = false; break; } } if(!tagFound) { // Increment the index since a tag was not found. myLastTagIndex++; } } return(tagFound); } // Reset the tag iterator to the beginning of the tags. void SamRecord::resetTagIter() { myLastTagIndex = -1; } bool SamRecord::isIntegerType(char vtype) { if((vtype == 'c') || (vtype == 'C') || (vtype == 's') || (vtype == 'S') || (vtype == 'i') || (vtype == 'I')) { return(true); } return(false); } bool SamRecord::isFloatType(char vtype) { if(vtype == 'f') { return(true); } return(false); } bool SamRecord::isCharType(char vtype) { if(vtype == 'A') { return(true); } return(false); } bool SamRecord::isStringType(char vtype) { if((vtype == 'Z') || (vtype == 'B')) { return(true); } return(false); } bool SamRecord::getTagsString(const char* tags, String& returnString, char delim) { const char* currentTagPtr = tags; returnString.Clear(); myStatus = SamStatus::SUCCESS; if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. return(false); } } bool returnStatus = true; while(*currentTagPtr != '\0') { // Tags are formatted as: XY:Z // Where X is [A-Za-z], Y is [A-Za-z], and // Z is A,i,f,Z,H (cCsSI are also excepted) if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') || (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0')) { myStatus.setStatus(SamStatus::INVALID, "getTagsString called with improperly formatted tags.\n"); returnStatus = false; break; } // Construct the key. int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1], currentTagPtr[3]); // Look to see if the key exsists in the hash. 
int offset = extras.Find(key); if(offset >= 0) { // Offset is set, so the key was found. if(!returnString.IsEmpty()) { returnString += delim; } returnString += currentTagPtr[0]; returnString += currentTagPtr[1]; returnString += ':'; returnString += currentTagPtr[3]; returnString += ':'; // First if it is an integer, determine the actual type of the int. char vtype; getTypeFromKey(key, vtype); switch(vtype) { case 'i': returnString += *(int*)getIntegerPtr(offset, vtype); break; case 'f': returnString += *(float*)getFloatPtr(offset); break; case 'Z': case 'B': returnString += *(String*)getStringPtr(offset); break; default: myStatus.setStatus(SamStatus::INVALID, "rmTag called with unknown type.\n"); returnStatus = false; break; }; } // Increment to the next tag. if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ',')) { // Increment once more. currentTagPtr += 5; } else if(currentTagPtr[4] != '\0') { // Invalid tag format. myStatus.setStatus(SamStatus::INVALID, "rmTags called with improperly formatted tags.\n"); returnStatus = false; break; } else { // Last Tag. currentTagPtr += 4; } } return(returnStatus); } const String* SamRecord::getStringTag(const char * tag) { // Parse the buffer if necessary. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. setTagsFromBuffer set the errors, // so just return null. return(NULL); } } int key = MAKEKEY(tag[0], tag[1], 'Z'); int offset = extras.Find(key); int value; if (offset < 0) { // Check for 'B' tag. key = MAKEKEY(tag[0], tag[1], 'B'); offset = extras.Find(key); if(offset < 0) { // Tag not found. return(NULL); } } // Offset is valid, so return the tag. value = extras[offset]; return(&(strings[value])); } int* SamRecord::getIntegerTag(const char * tag) { // Init to success. myStatus = SamStatus::SUCCESS; // Parse the buffer if necessary. 
if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. setTagsFromBuffer set the errors, // so just return NULL. return(NULL); } } int key = MAKEKEY(tag[0], tag[1], 'i'); int offset = extras.Find(key); int value; if (offset < 0) { // Failed to find the tag. return(NULL); } else value = extras[offset]; return(&(integers[value])); } bool SamRecord::getIntegerTag(const char * tag, int& tagVal) { // Init to success. myStatus = SamStatus::SUCCESS; // Parse the buffer if necessary. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. setTagsFromBuffer set the errors, // so just return false. return(false); } } int key = MAKEKEY(tag[0], tag[1], 'i'); int offset = extras.Find(key); int value; if (offset < 0) { // Failed to find the tag. return(false); } else value = extras[offset]; tagVal = integers[value]; return(true); } bool SamRecord::getFloatTag(const char * tag, float& tagVal) { // Init to success. myStatus = SamStatus::SUCCESS; // Parse the buffer if necessary. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. setTagsFromBuffer set the errors, // so just return false. return(false); } } int key = MAKEKEY(tag[0], tag[1], 'f'); int offset = extras.Find(key); int value; if (offset < 0) { // Failed to find the tag. return(false); } else value = extras[offset]; tagVal = floats[value]; return(true); } const String & SamRecord::getString(const char * tag) { // Init to success. myStatus = SamStatus::SUCCESS; // Parse the buffer if necessary. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. // TODO - what do we want to do on failure? 
} } int key = MAKEKEY(tag[0], tag[1], 'Z'); int offset = extras.Find(key); int value; if (offset < 0) { key = MAKEKEY(tag[0], tag[1], 'B'); offset = extras.Find(key); if (offset < 0) { // TODO - what do we want to do on failure? return(NOT_FOUND_TAG_STRING); } } value = extras[offset]; return strings[value]; } int & SamRecord::getInteger(const char * tag) { // Init to success. myStatus = SamStatus::SUCCESS; // Parse the buffer if necessary. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. setTagsFromBuffer set the error. // TODO - what do we want to do on failure? } } int key = MAKEKEY(tag[0], tag[1], 'i'); int offset = extras.Find(key); int value; if (offset < 0) { // TODO - what do we want to do on failure? return NOT_FOUND_TAG_INT; } else value = extras[offset]; return integers[value]; } bool SamRecord::checkTag(const char * tag, char type) { // Init to success. myStatus = SamStatus::SUCCESS; // Parse the buffer if necessary. if(myNeedToSetTagsFromBuffer) { if(!setTagsFromBuffer()) { // Failed to read the tags from the buffer, so cannot // get tags. setTagsFromBuffer set the error. return(""); } } int key = MAKEKEY(tag[0], tag[1], type); return (extras.Find(key) != LH_NOTFOUND); } // Return the error after a failed SamRecord call. const SamStatus& SamRecord::getStatus() { return(myStatus); } // Allocate space for the record - does a realloc. // The passed in size is the size of the entire record including the // block size field. bool SamRecord::allocateRecordStructure(int size) { if (allocatedSize < size) { bamRecordStruct* tmpRecordPtr = (bamRecordStruct *)realloc(myRecordPtr, size); if(tmpRecordPtr == NULL) { // FAILED to allocate memory fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation."); return(false); } // Successfully allocated memory, so set myRecordPtr. 
myRecordPtr = tmpRecordPtr; // Reset the pointers into the record. if(myIsSequenceBufferValid) { myPackedSequence = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int); } if(myIsQualityBufferValid) { myPackedQuality = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int) + (myRecordPtr->myReadLength + 1) / 2; } allocatedSize = size; } return(true); } // Index is the index into the strings array. void* SamRecord::getStringPtr(int index) { int value = extras[index]; return &(strings[value]); } void* SamRecord::getIntegerPtr(int offset, char& type) { int value = extras[offset]; type = intType[value]; return &(integers[value]); } void* SamRecord::getFloatPtr(int offset) { int value = extras[offset]; return &(floats[value]); } // Fixes the buffer to match the variable length fields if they are set. bool SamRecord::fixBuffer(SequenceTranslation translation) { // Check to see if the buffer is already synced. if(myIsBufferSynced && (myBufferSequenceTranslation == translation)) { // Already synced, nothing to do. return(true); } // Set the bin if necessary. if(!myIsBinValid) { // The bin that is set in the record is not valid, so // reset it. myRecordPtr->myBin = bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd()); myIsBinValid = true; } // Not synced. bool status = true; // First determine the size the buffer needs to be. uint8_t newReadNameLen = getReadNameLength(); uint16_t newCigarLen = getCigarLength(); int32_t newReadLen = getReadLength(); uint32_t newTagLen = getTagLength(); uint32_t bamSequenceLen = (newReadLen+1)/2; // The buffer size extends from the start of the record to data // plus the length of the variable fields, // Multiply the cigar length by 4 since it is the number of uint32_t fields. 
int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) - (unsigned char*)myRecordPtr) + newReadNameLen + ((newCigarLen)*4) + newReadLen + bamSequenceLen + newTagLen; if(!allocateRecordStructure(newBufferSize)) { // Failed to allocate space. return(false); } // Now that space has been added to the buffer, check to see what if // any fields need to be extracted from the buffer prior to starting to // overwrite it. Fields need to be extracted from the buffer if the // buffer is valid for the field and a previous variable length field has // changed length. bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength); bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength); bool readLenChange = (newReadLen != myRecordPtr->myReadLength); // If the tags are still stored in the buffer and any other fields changed // lengths, they need to be extracted. if(myIsTagsBufferValid && (readNameLenChange | cigarLenChange | readLenChange)) { status &= setTagsFromBuffer(); // The tag buffer will not be valid once the other fields // are written, so set it to not valid. myIsTagsBufferValid = false; } // If the sequence or quality strings are still stored in the buffer, and // any of the previous fields have changed length, extract it from the // current buffer. if((myIsQualityBufferValid | myIsSequenceBufferValid) && (readNameLenChange | cigarLenChange | readLenChange)) { setSequenceAndQualityFromBuffer(); // The quality and sequence buffers will not be valid once the other // fields are written, so set them to not valid. myIsQualityBufferValid = false; myIsSequenceBufferValid = false; } // If the cigar is still stored in the buffer, and any of the // previous fields have changed length, extract it from the current buffer. if((myIsCigarBufferValid) && (readNameLenChange)) { status &= parseCigarBinary(); myIsCigarBufferValid = false; } // Set each value in the buffer if it is not already valid. 
if(!myIsReadNameBufferValid) { memcpy(&(myRecordPtr->myData), myReadName.c_str(), newReadNameLen); // Set the new ReadNameLength. myRecordPtr->myReadNameLength = newReadNameLen; myIsReadNameBufferValid = true; } unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) + myRecordPtr->myReadNameLength; // Set the Cigar. Need to reformat from the string to unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds; if(!myIsCigarBufferValid) { // The cigar was already parsed when it was set, so just copy // data from the temporary buffer. myRecordPtr->myCigarLength = newCigarLen; memcpy(packedCigar, myCigarTempBuffer, myRecordPtr->myCigarLength * sizeof(uint32_t)); myIsCigarBufferValid = true; } unsigned char * packedSequence = readNameEnds + myRecordPtr->myCigarLength * sizeof(int); unsigned char * packedQuality = packedSequence + bamSequenceLen; if(!myIsSequenceBufferValid || !myIsQualityBufferValid || (myBufferSequenceTranslation != translation)) { myRecordPtr->myReadLength = newReadLen; // Determine if the quality needs to be set and is just a * and needs to // be set to 0xFF. bool noQuality = false; if((myQuality.Length() == 1) && (myQuality[0] == '*')) { noQuality = true; } const char* translatedSeq = NULL; // If the sequence is not valid in the buffer or it is not // properly translated, get the properly translated sequence // that needs to be put into the buffer. if((!myIsSequenceBufferValid) || (translation != myBufferSequenceTranslation)) { translatedSeq = getSequence(translation); } for (int i = 0; i < myRecordPtr->myReadLength; i++) { if((!myIsSequenceBufferValid) || (translation != myBufferSequenceTranslation)) { // Sequence buffer is not valid, so set the sequence. 
int seqVal = 0; switch(translatedSeq[i]) { case '=': seqVal = 0; break; case 'A': case 'a': seqVal = 1; break; case 'C': case 'c': seqVal = 2; break; case 'G': case 'g': seqVal = 4; break; case 'T': case 't': seqVal = 8; break; case 'N': case 'n': case '.': seqVal = 15; break; default: myStatus.addError(SamStatus::FAIL_PARSE, "Unknown Sequence character found."); status = false; break; }; if(i & 1) { // Odd number i's go in the lower 4 bits, so OR in the // lower bits packedSequence[i/2] |= seqVal; } else { // Even i's go in the upper 4 bits and are always set first. packedSequence[i/2] = seqVal << 4; } } if(!myIsQualityBufferValid) { // Set the quality. if((noQuality) || (myQuality.Length() <= i)) { // No quality or the quality is smaller than the sequence, // so set it to 0xFF packedQuality[i] = 0xFF; } else { // Copy the quality string. packedQuality[i] = myQuality[i] - 33; } } } myPackedSequence = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int); myPackedQuality = myPackedSequence + (myRecordPtr->myReadLength + 1) / 2; myIsSequenceBufferValid = true; myIsQualityBufferValid = true; myBufferSequenceTranslation = translation; } if(!myIsTagsBufferValid) { status &= setTagsInBuffer(); } // Set the lengths in the buffer. myRecordPtr->myReadNameLength = newReadNameLen; myRecordPtr->myCigarLength = newCigarLen; myRecordPtr->myReadLength = newReadLen; // Set the buffer block size to the size of the buffer minus the // first field. myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t); if(status) { myIsBufferSynced = true; } return(status); } // Sets the Sequence and Quality strings from the buffer. // They are done together in one method because they require the same // loop, so might as well be done at the same time. void SamRecord::setSequenceAndQualityFromBuffer() { // NOTE: If the sequence buffer is not valid, do not set the sequence // string from the buffer. 
// NOTE: If the quality buffer is not valid, do not set the quality string // from the buffer. // Extract the sequence if the buffer is valid and the string's length is 0. bool extractSequence = false; if(myIsSequenceBufferValid && (mySequence.Length() == 0)) { extractSequence = true; } // Extract the quality if the buffer is valid and the string's length is 0. bool extractQuality = false; if(myIsQualityBufferValid && (myQuality.Length() == 0)) { extractQuality = true; } // If neither the quality nor the sequence need to be extracted, // just return. if(!extractSequence && !extractQuality) { return; } // Set the sequence and quality strings.. if(extractSequence) { mySequence.SetLength(myRecordPtr->myReadLength); } if(extractQuality) { myQuality.SetLength(myRecordPtr->myReadLength); } const char * asciiBases = "=AC.G...T......N"; // Flag to see if the quality is specified - the quality contains a value // other than 0xFF. If all values are 0xFF, then there is no quality. bool qualitySpecified = false; for (int i = 0; i < myRecordPtr->myReadLength; i++) { if(extractSequence) { mySequence[i] = i & 1 ? asciiBases[myPackedSequence[i / 2] & 0xF] : asciiBases[myPackedSequence[i / 2] >> 4]; } if(extractQuality) { if(myPackedQuality[i] != 0xFF) { // Quality is specified, so mark the flag. qualitySpecified = true; } myQuality[i] = myPackedQuality[i] + 33; } } // If the read length is 0, then set the sequence and quality to '*' if(myRecordPtr->myReadLength == 0) { if(extractSequence) { mySequence = "*"; } if(extractQuality) { myQuality = "*"; } } else if(extractQuality && !qualitySpecified) { // No quality was specified, so set it to "*" myQuality = "*"; } } // Parse the cigar to calculate the alignment/unclipped end. bool SamRecord::parseCigar() { // Determine if the cigar string or cigar binary needs to be parsed. if(myCigar.Length() == 0) { // The cigar string is not yet set, so parse the binary. 
return(parseCigarBinary()); } return(parseCigarString()); } // Parse the cigar to calculate the alignment/unclipped end. bool SamRecord::parseCigarBinary() { // Only need to parse if the string is not already set. // The length of the cigar string is set to zero when the // record is read from a file into the buffer. if(myCigar.Length() != 0) { // Already parsed. return(true); } unsigned char * readNameEnds = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength; unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds; myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength); myCigarRoller.getCigarString(myCigar); myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount(); myUnclippedStartOffset = myCigarRoller.getNumBeginClips(); myUnclippedEndOffset = myCigarRoller.getNumEndClips(); // if the cigar length is 0, then set the cigar string to "*" if(myRecordPtr->myCigarLength == 0) { myCigar = "*"; return(true); } // Copy the cigar into a temporary buffer. int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t); if(newBufferSize > myCigarTempBufferAllocatedSize) { uint32_t* tempBufferPtr = (uint32_t*)realloc(myCigarTempBuffer, newBufferSize); if(tempBufferPtr == NULL) { // Failed to allocate memory. // Do not parse, just return. fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); myStatus.addError(SamStatus::FAIL_MEM, "Failed to Allocate Memory."); return(false); } myCigarTempBuffer = tempBufferPtr; myCigarTempBufferAllocatedSize = newBufferSize; } memcpy(myCigarTempBuffer, packedCigar, myRecordPtr->myCigarLength * sizeof(uint32_t)); // Set the length of the temp buffer. myCigarTempBufferLength = myRecordPtr->myCigarLength; return(true); } // Parse the cigar string to calculate the cigar length and alignment end. bool SamRecord::parseCigarString() { myCigarTempBufferLength = 0; if(myCigar == "*") { // Cigar is empty, so initialize the variables. 
myAlignmentLength = 0; myUnclippedStartOffset = 0; myUnclippedEndOffset = 0; myCigarRoller.clear(); return(true); } myCigarRoller.Set(myCigar); myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount(); myUnclippedStartOffset = myCigarRoller.getNumBeginClips(); myUnclippedEndOffset = myCigarRoller.getNumEndClips(); // Check to see if the Temporary Cigar Buffer is large enough to contain // this cigar. If we make it the size of the length of the cigar string, // it will be more than large enough. int newBufferSize = myCigar.Length() * sizeof(uint32_t); if(newBufferSize > myCigarTempBufferAllocatedSize) { uint32_t* tempBufferPtr = (uint32_t*)realloc(myCigarTempBuffer, newBufferSize); if(tempBufferPtr == NULL) { // Failed to allocate memory. // Do not parse, just return. fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); myStatus.addError(SamStatus::FAIL_MEM, "Failed to Allocate Memory."); return(false); } myCigarTempBuffer = tempBufferPtr; myCigarTempBufferAllocatedSize = newBufferSize; } // Track if there were any errors. bool status = true; // Track the index into the cigar string that is being parsed. char *cigarOp; const char* cigarEntryStart = myCigar.c_str(); int opLen = 0; int op = 0; unsigned int * packedCigar = myCigarTempBuffer; // TODO - maybe one day make a cigar list... or maybe make a // reference cigar string for ease of lookup.... const char* endCigarString = cigarEntryStart + myCigar.Length(); while(cigarEntryStart < endCigarString) { bool validCigarEntry = true; // Get the opLen from the string. cigarOp will then point to // the operation. opLen = strtol(cigarEntryStart, &cigarOp, 10); // Switch on the type of operation. switch(*cigarOp) { case('M'): op = 0; break; case('I'): // Insert into the reference position, so do not increment the // reference end position. 
op = 1; break; case('D'): op = 2; break; case('N'): op = 3; break; case('S'): op = 4; break; case('H'): op = 5; break; case('P'): op = 6; break; default: fprintf(stderr, "ERROR parsing cigar\n"); validCigarEntry = false; status = false; myStatus.addError(SamStatus::FAIL_PARSE, "Unknown operation found when parsing the Cigar."); break; }; if(validCigarEntry) { // Increment the cigar length. ++myCigarTempBufferLength; *packedCigar = (opLen << 4) | op; packedCigar++; } // The next Entry starts at one past the cigar op, so set the start. cigarEntryStart = ++cigarOp; } // Check clipLength to adjust the end position. return(status); } bool SamRecord::setTagsFromBuffer() { // If the tags do not need to be set from the buffer, return true. if(myNeedToSetTagsFromBuffer == false) { // Already been set from the buffer. return(true); } // Mark false, as they are being set now. myNeedToSetTagsFromBuffer = false; unsigned char * extraPtr = myPackedQuality + myRecordPtr->myReadLength; // Default to success, will be changed to false on failure. bool status = true; // Clear any previously set tags. clearTags(); while (myRecordPtr->myBlockSize + 4 - (extraPtr - (unsigned char *)myRecordPtr) > 0) { int key = 0; int value = 0; void * content = extraPtr + 3; int tagBufferSize = 0; key = MAKEKEY(extraPtr[0], extraPtr[1], extraPtr[2]); // First check if the tag already exists. 
unsigned int location = extras.Find(key); int origIndex = 0; String* duplicate = NULL; String* origTag = NULL; if(location != LH_NOTFOUND) { duplicate = new String; origTag = new String; origIndex = extras[location]; *duplicate = (char)(extraPtr[0]); *duplicate += (char)(extraPtr[1]); *duplicate += ':'; *origTag = *duplicate; *duplicate += (char)(extraPtr[2]); *duplicate += ':'; } switch (extraPtr[2]) { case 'A' : if(duplicate != NULL) { *duplicate += (* (char *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(char *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push(* (char *) content); intType.push_back(extraPtr[2]); tagBufferSize += 4; } extraPtr += 4; break; case 'c' : if(duplicate != NULL) { *duplicate += (* (char *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(char *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push(* (char *) content); intType.push_back(extraPtr[2]); tagBufferSize += 4; } extraPtr += 4; break; case 'C' : if(duplicate != NULL) { *duplicate += (* (unsigned char *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(unsigned char *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push(* (unsigned char *) content); intType.push_back(extraPtr[2]); tagBufferSize += 4; } extraPtr += 4; break; case 's' : if(duplicate != NULL) { *duplicate += (* 
(short *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(short *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push(* (short *) content); intType.push_back(extraPtr[2]); tagBufferSize += 5; } extraPtr += 5; break; case 'S' : if(duplicate != NULL) { *duplicate += (* (unsigned short *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(unsigned short *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push(* (unsigned short *) content); intType.push_back(extraPtr[2]); tagBufferSize += 5; } extraPtr += 5; break; case 'i' : if(duplicate != NULL) { *duplicate += (* (int *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(int *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push(* (int *) content); intType.push_back(extraPtr[2]); tagBufferSize += 7; } extraPtr += 7; break; case 'I' : if(duplicate != NULL) { *duplicate += (* (unsigned int *) content); *origTag += intType[origIndex]; *origTag += ':'; appendIntArrayValue(origIndex, *origTag); tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); integers[origIndex] = *(unsigned int *)content; intType[origIndex] = extraPtr[2]; tagBufferSize += getNumericTagTypeSize(intType[origIndex]); } else { value = integers.Length(); integers.Push((int) * (unsigned int *) content); intType.push_back(extraPtr[2]); tagBufferSize += 7; } 
extraPtr += 7; break; case 'Z' : if(duplicate != NULL) { *duplicate += ((const char *) content); *origTag += 'Z'; *origTag += ':'; *origTag += (char*)(strings[origIndex]); tagBufferSize -= strings[origIndex].Length(); strings[origIndex] = (const char *) content; extraPtr += 4 + strings[origIndex].Length(); tagBufferSize += strings[origIndex].Length(); } else { value = strings.Length(); strings.Push((const char *) content); tagBufferSize += 4 + strings.Last().Length(); extraPtr += 4 + strings.Last().Length(); } break; case 'B' : if(duplicate != NULL) { *origTag += 'B'; *origTag += ':'; *origTag += (char*)(strings[origIndex]); tagBufferSize -= getBtagBufferSize(strings[origIndex]); int bufferSize = getStringFromBtagBuffer((unsigned char*)content, strings[origIndex]); *duplicate += (char *)(strings[origIndex]); tagBufferSize += bufferSize; extraPtr += 3 + bufferSize; } else { value = strings.Length(); String tempBTag; int bufferSize = getStringFromBtagBuffer((unsigned char*)content, tempBTag); strings.Push(tempBTag); tagBufferSize += 3 + bufferSize; extraPtr += 3 + bufferSize; } break; case 'f' : if(duplicate != NULL) { duplicate->appendFullFloat(* (float *) content); *origTag += 'f'; *origTag += ':'; origTag->appendFullFloat(floats[origIndex]); floats[origIndex] = *(float *)content; } else { value = floats.size(); floats.push_back(* (float *) content); tagBufferSize += 7; } extraPtr += 7; break; default : fprintf(stderr, "parsing BAM - Unknown custom field of type %c%c:%c\n", extraPtr[0], extraPtr[1], extraPtr[2]); fprintf(stderr, "BAM Tags: \n"); unsigned char* tagInfo = myPackedQuality + myRecordPtr->myReadLength; fprintf(stderr, "\n\n"); tagInfo = myPackedQuality + myRecordPtr->myReadLength; while(myRecordPtr->myBlockSize + 4 - (tagInfo - (unsigned char *)myRecordPtr) > 0) { fprintf(stderr, "%02x",tagInfo[0]); ++tagInfo; } fprintf(stderr, "\n"); // Failed on read. 
// Increment extraPtr just by the size of the 3 known fields
                extraPtr += 3;
                myStatus.addError(SamStatus::FAIL_PARSE,
                                  "Unknown tag type.");
                status = false;
        }

        if(duplicate != NULL)
        {
            // Duplicate tag in this record.
            // Tag already existed, print message about overwriting.
            // WARN about dropping duplicate tags.
            if(myNumWarns++ < myMaxWarns)
            {
                fprintf(stderr, "WARNING: Duplicate Tags, overwritting %s with %s\n",
                        origTag->c_str(), duplicate->c_str());
                if(myNumWarns == myMaxWarns)
                {
                    fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
                }
            }

            continue;
        }

        // Only add the tag if it has so far been successfully processed.
        if(status)
        {
            // Add the tag.
            extras.Add(key, value);
            myTagBufferSize += tagBufferSize;
        }
    }
    return(status);
}


// Write the tags held in the extras hash (and the strings/integers/floats
// arrays it indexes) into the BAM record buffer, starting directly after the
// quality bytes.
//
// The record is first (re)allocated to the size computed from the offset of
// myData, the read name, the cigar, the packed sequence, the quality and
// myTagBufferSize.
//
// Returns false if the allocation fails, if a tag has an unknown type, or if
// the write position does not end exactly at the computed end of the buffer;
// errors are added to myStatus.  Once the allocation has succeeded, the tag
// buffer is flagged as in sync regardless of the returned status.
bool SamRecord::setTagsInBuffer()
{
    // The buffer size extends from the start of the record to data
    // plus the length of the variable fields,
    // Multiply the cigar length by 4 since it is the number of uint32_t fields.
    int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
    int newBufferSize =
        ((unsigned char*)(&(myRecordPtr->myData)) -
         (unsigned char*)myRecordPtr) +
        myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
        myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;

    // Make sure the buffer is big enough.
    if(!allocateRecordStructure(newBufferSize))
    {
        // Failed to allocate space.
        return(false);
    }

    // The tags start right after the quality, which is myReadLength bytes.
    char * extraPtr = (char*)myPackedQuality + myRecordPtr->myReadLength;

    bool status = true;

    // Set the tags in the buffer.
    if (extras.Entries())
    {
        // Walk every slot of the hash, only slots in use hold a tag.
        for (int i = 0; i < extras.Capacity(); i++)
        {
            if (extras.SlotInUse(i))
            {
                int key = extras.GetKey(i);
                // getTag writes the 2 tag characters followed by a 0; the 0
                // lands on the type byte and is overwritten below.
                getTag(key, extraPtr);
                extraPtr += 2;
                char vtype;
                getTypeFromKey(key, vtype);

                // Keys record every char/integer type as 'i' (see getKeyType),
                // so fetch the type that was actually stored for the value.
                if(vtype == 'i')
                {
                    vtype = getIntegerType(i);
                }

                extraPtr[0] = vtype;

                // increment the pointer to where the value is.
                extraPtr += 1;

                // Write the value and advance by the number of bytes written.
                switch (vtype)
                {
                    case 'A' :
                        *(char*)extraPtr = (char)getInteger(i);
                        // sprintf(extraPtr, "%d", getInteger(i));
                        extraPtr += 1;
                        break;
                    case 'c' :
                        *(int8_t*)extraPtr = (int8_t)getInteger(i);
                        // sprintf(extraPtr, "%.4d", getInteger(i));
                        extraPtr += 1;
                        break;
                    case 'C' :
                        *(uint8_t*)extraPtr = (uint8_t)getInteger(i);
                        // sprintf(extraPtr, "%.4d", getInteger(i));
                        extraPtr += 1;
                        break;
                    case 's' :
                        *(int16_t*)extraPtr = (int16_t)getInteger(i);
                        // sprintf(extraPtr, "%.4d", getInteger(i));
                        extraPtr += 2;
                        break;
                    case 'S' :
                        *(uint16_t*)extraPtr = (uint16_t)getInteger(i);
                        // sprintf(extraPtr, "%.4d", getInteger(i));
                        extraPtr += 2;
                        break;
                    case 'i' :
                        *(int32_t*)extraPtr = (int32_t)getInteger(i);
                        // sprintf(extraPtr, "%.4d", getInteger(i));
                        extraPtr += 4;
                        break;
                    case 'I' :
                        *(uint32_t*)extraPtr = (uint32_t)getInteger(i);
                        // sprintf(extraPtr, "%.4d", getInteger(i));
                        extraPtr += 4;
                        break;
                    case 'Z' :
                        // The string is written with its terminating null,
                        // hence the + 1.
                        sprintf(extraPtr, "%s", getString(i).c_str());
                        extraPtr += getString(i).Length() + 1;
                        break;
                    case 'B' :
                        // setBtagBuffer returns the number of bytes it wrote.
                        extraPtr += setBtagBuffer(getString(i), extraPtr);
                        //--TODO-- Set buffer with correct B tag
                        //sprintf(extraPtr, "%s", getString(i).c_str());
                        // extraPtr += getBtagBufferSize(getString(i));
                        break;
                    case 'f' :
                        *(float*)extraPtr = getFloat(i);
                        extraPtr += 4;
                        break;
                    default :
                        myStatus.addError(SamStatus::FAIL_PARSE,
                                          "Unknown tag type.");
                        status = false;
                        break;
                }
            }
        }
    }

    // Validate that the extra pointer is at the end of the allocated buffer.
    // If not then there was a problem.
    if(extraPtr != (char*)myRecordPtr + newBufferSize)
    {
        fprintf(stderr, "ERROR updating the buffer. Incorrect size.");
        myStatus.addError(SamStatus::FAIL_PARSE,
                          "ERROR updating the buffer. Incorrect size.");
        status = false;
    }

    // The buffer tags are now in sync.
    myNeedToSetTagsInBuffer = false;
    myIsTagsBufferValid = true;
    return(status);
}


// Reset the variables for a newly set buffer. The buffer must be set first
// since this looks up the reference ids in the buffer to set the reference
// names.
void SamRecord::setVariablesForNewBuffer(SamFileHeader& header) { // Lookup the reference name & mate reference name associated with this // record. myReferenceName = header.getReferenceLabel(myRecordPtr->myReferenceID); myMateReferenceName = header.getReferenceLabel(myRecordPtr->myMateReferenceID); // Clear the SAM Strings that are now not in-sync with the buffer. myReadName.SetLength(0); myCigar.SetLength(0); mySequence.SetLength(0); mySeqWithEq.clear(); mySeqWithoutEq.clear(); myQuality.SetLength(0); myNeedToSetTagsFromBuffer = true; myNeedToSetTagsInBuffer = false; //Set that the buffer is valid. myIsBufferSynced = true; // Set that the variable length buffer fields are valid. myIsReadNameBufferValid = true; myIsCigarBufferValid = true; myPackedSequence = (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int); myIsSequenceBufferValid = true; myBufferSequenceTranslation = NONE; myPackedQuality = myPackedSequence + (myRecordPtr->myReadLength + 1) / 2; myIsQualityBufferValid = true; myIsTagsBufferValid = true; myIsBinValid = true; } // Extract the vtype from the key. void SamRecord::getTypeFromKey(int key, char& type) const { // Extract the type from the key. type = (key >> 16) & 0xFF; } // Extract the tag from the key. void SamRecord::getTag(int key, char* tag) const { // Extract the tag from the key. tag[0] = key & 0xFF; tag[1] = (key >> 8) & 0xFF; tag[2] = 0; } // Index is the index into the strings array. 
String & SamRecord::getString(int index) { int value = extras[index]; return strings[value]; } int & SamRecord::getInteger(int offset) { int value = extras[offset]; return integers[value]; } const char & SamRecord::getIntegerType(int offset) const { int value = extras[offset]; return intType[value]; } float & SamRecord::getFloat(int offset) { int value = extras[offset]; return floats[value]; } void SamRecord::appendIntArrayValue(char type, int value, String& strVal) const { switch(type) { case 'A': strVal += (char)value; break; case 'c': case 's': case 'i': case 'C': case 'S': case 'I': strVal += value; break; default: // Do nothing. ; } } int SamRecord::getBtagBufferSize(String& tagStr) { if(tagStr.Length() < 1) { // ERROR, needs at least the type. myStatus.addError(SamStatus::FAIL_PARSE, "SamRecord::getBtagBufferSize no tag subtype specified"); return(0); } char type = tagStr[0]; int elementSize = getNumericTagTypeSize(type); if(elementSize <= 0) { // ERROR, 'B' tag subtype must be numeric, so should be non-zero String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, "; errorMsg += type; myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str()); return(0); } // Separated by ',', so count the number of commas. int numElements = 0; int index = tagStr.FastFindChar(',', 0); while(index > 0) { ++numElements; index = tagStr.FastFindChar(',', index+1); } // Add 5 bytes: type & numElements return(numElements * elementSize + 5); } int SamRecord::setBtagBuffer(String& tagStr, char* extraPtr) { if(tagStr.Length() < 1) { // ERROR, needs at least the type. 
myStatus.addError(SamStatus::FAIL_PARSE, "SamRecord::getBtagBufferSize no tag subtype specified"); return(0); } char type = tagStr[0]; int elementSize = getNumericTagTypeSize(type); if(elementSize <= 0) { // ERROR, 'B' tag subtype must be numeric, so should be non-zero String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, "; errorMsg += type; myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str()); return(0); } int totalInc = 0; // Write the type. *(char*)extraPtr = type; ++extraPtr; ++totalInc; // Get the number of elements by counting ','s uint32_t numElements = 0; int index = tagStr.FastFindChar(',', 0); while(index > 0) { ++numElements; index = tagStr.FastFindChar(',', index+1); } *(uint32_t*)extraPtr = numElements; extraPtr += 4; totalInc += 4; const char* stringPtr = tagStr.c_str(); const char* endPtr = stringPtr + tagStr.Length(); // increment past the subtype and ','. stringPtr += 2; char* newPtr = NULL; while(stringPtr < endPtr) { switch(type) { case 'f': *(float*)extraPtr = (float)(strtod(stringPtr, &newPtr)); break; case 'c': *(int8_t*)extraPtr = (int8_t)strtol(stringPtr, &newPtr, 0); break; case 's': *(int16_t*)extraPtr = (int16_t)strtol(stringPtr, &newPtr, 0); break; case 'i': *(int32_t*)extraPtr = (int32_t)strtol(stringPtr, &newPtr, 0); break; case 'C': *(uint8_t*)extraPtr = (uint8_t)strtoul(stringPtr, &newPtr, 0); break; case 'S': *(uint16_t*)extraPtr = (uint16_t)strtoul(stringPtr, &newPtr, 0); break; case 'I': *(uint32_t*)extraPtr = (uint32_t)strtoul(stringPtr, &newPtr, 0); break; default : myStatus.addError(SamStatus::FAIL_PARSE, "Unknown 'B' tag subtype."); break; } extraPtr += elementSize; totalInc += elementSize; stringPtr = newPtr + 1; // skip the ',' } return(totalInc); } int SamRecord::getStringFromBtagBuffer(unsigned char* buffer, String& tagStr) { tagStr.Clear(); int bufferSize = 0; // 1st byte is the type. 
char type = *buffer; ++buffer; ++bufferSize; tagStr = type; // 2nd-5th bytes are the size unsigned int numEntries = *(unsigned int *)buffer; buffer += 4; bufferSize += 4; // Num Entries is not included in the string. int subtypeSize = getNumericTagTypeSize(type); for(unsigned int i = 0; i < numEntries; i++) { tagStr += ','; switch(type) { case 'f': tagStr.appendFullFloat(*(float *)buffer); break; case 'c': tagStr += *(int8_t *)buffer; break; case 's': tagStr += *(int16_t *)buffer; break; case 'i': tagStr += *(int32_t *)buffer; break; case 'C': tagStr += *(uint8_t *)buffer; break; case 'S': tagStr += *(uint16_t *)buffer; break; case 'I': tagStr += *(uint32_t *)buffer; break; default : myStatus.addError(SamStatus::FAIL_PARSE, "Unknown 'B' tag subtype."); break; } buffer += subtypeSize; bufferSize += subtypeSize; } return(bufferSize); } libStatGen-1.0.14/bam/SamRecord.h000066400000000000000000001017541254730101300164240ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_RECORD_H__ #define __SAM_RECORD_H__ #include #include "GenomeSequence.h" #include "SamStatus.h" #include "LongHash.h" #include "MathVector.h" #include "StringArray.h" #include "IntArray.h" #include "SamFileHeader.h" #include "CigarRoller.h" /// Structure of a BAM record. 
struct bamRecordStruct
{
public:
    // BAM block size of the record.
    // NOTE(review): appears to exclude this 4 byte field itself, the tag dump
    // in setTagsFromBuffer uses myBlockSize + 4 as the record length - confirm.
    int32_t      myBlockSize;
    // Reference sequence id (BAM rid), looked up in the header to get RNAME.
    int32_t      myReferenceID;
    // 0-based (BAM) leftmost position.
    int32_t      myPosition;
    // Length of the read name including the null, the mapping quality (MAPQ),
    // and the BAM bin packed into one 32-bit word.
    uint32_t     myReadNameLength : 8, myMapQuality : 8, myBin : 16;
    // Number of uint32_t cigar operations and the bitwise FLAG packed into
    // one 32-bit word.
    uint32_t     myCigarLength : 16, myFlag : 16;
    // Length of the read.
    int32_t      myReadLength;
    // Mate/next fragment's reference sequence id.
    int32_t      myMateReferenceID;
    // 0-based (BAM) leftmost position of the mate/next fragment.
    int32_t      myMatePosition;
    int32_t      myInsertSize;             // Outer fragment length
    // Start of the variable length data: the read name, the cigar, the packed
    // sequence ((myReadLength + 1)/2 bytes), the quality (myReadLength bytes),
    // and then the tags.
    char  myData[1];
};


/// Class providing an easy to use interface to get/set/operate on the
/// fields in a SAM/BAM record. 
class SamRecord
{
public:
    /// Enum containing the settings on how to translate the sequence if a
    /// reference is available.  If no reference is available, no translation
    /// is done.
    enum SequenceTranslation { 
        NONE,   ///< Leave the sequence as is.
        EQUAL,  ///< Translate bases that match the reference to '='
        BASES,  ///< Translate '=' to the actual base.
    };

    /// Default Constructor.
    SamRecord();

    /// Constructor that sets the error handling type.
    /// \param errorHandlingType how to handle errors.
    SamRecord(ErrorHandler::HandlingType errorHandlingType);

    /// Destructor
    ~SamRecord();

    /// Reset the fields of the record to a default value.
    /// This is not necessary when you are reading a SAM/BAM file, 
    /// but if you are setting fields, it is a good idea to clean
    /// out a record before reusing it. Clearing it allows you to 
    /// not have to set any empty fields.
    void resetRecord();

    /// Returns whether or not the record is valid, setting the status to
    /// indicate success or failure.
    /// \param header SAM Header associated with the record.  Used to perform
    /// some validation against the header.
    /// \return true if the record is valid, false if not.
    bool isValid(SamFileHeader& header);

    /// Set the reference to the specified genome sequence object.
    /// \param reference pointer to the GenomeSequence object.
    void setReference(GenomeSequence* reference);

    /// Set the type of sequence translation to use when getting
    /// the sequence.  The default type (if this method is never called) is
    /// NONE (the sequence is left as-is).
Can be over-ridden by using /// the accessors that take a SequenceTranslation parameter. /// \param translation type of sequence translation to use. void setSequenceTranslation(SequenceTranslation translation); /////////////////////// /// @name Set Alignment Data /// Set methods for record fields. All of the "set" methods set the /// status to indicate success or the failure reason. //@{ /// Set QNAME to the passed in name. /// \param readName the readname to set the QNAME to. /// \return true if successfully set, false if not. bool setReadName(const char* readName); /// Set the bitwise FLAG to the specified value. /// \param flag integer flag to use. /// \return true if successfully set, false if not. bool setFlag(uint16_t flag); /// Set the reference sequence name (RNAME) to the specified name, using /// the header to determine the reference id. /// \param header SAM/BAM header to use to determine the reference id. /// \param referenceName reference name to use. /// \return true if successfully set, false if not bool setReferenceName(SamFileHeader& header, const char* referenceName); /// Set the leftmost position (POS) using the specified 1-based (SAM format) /// value. /// Internal processing handles the switching between SAM/BAM formats /// when read/written. /// \param position 1-based start position /// \return true if successfully set, false if not. bool set1BasedPosition(int32_t position); /// Set the leftmost position using the specified 0-based (BAM format) /// value. /// Internal processing handles the switching between SAM/BAM formats /// when read/written. /// \param position 0-based start position /// \return true if successfully set, false if not. bool set0BasedPosition(int32_t position); /// Set the mapping quality (MAPQ). /// \param mapQuality map quality to set in the record. /// \return true if successfully set, false if not. bool setMapQuality(uint8_t mapQuality); /// Set the CIGAR to the specified SAM formatted cigar string. 
/// Internal processing handles the switching between SAM/BAM formats /// when read/written. /// \param cigar string containing the SAM formatted cigar. /// \return true if successfully set, false if not. bool setCigar(const char* cigar); /// Set the CIGAR to the specified Cigar object. /// Internal processing handles the switching between SAM/BAM formats /// when read/written. /// \param cigar object to set this record's cigar to have. /// \return true if successfully set, false if not. bool setCigar(const Cigar& cigar); /// Set the mate/next fragment's reference sequence name (RNEXT) to the /// specified name, using the header to determine the mate reference id. /// \param header SAM/BAM header to use to determine the mate reference id. /// \param referenceName mate reference name to use. /// \return true if successfully set, false if not bool setMateReferenceName(SamFileHeader& header, const char* mateReferenceName); /// Set the mate/next fragment's leftmost position (PNEXT) using the /// specified 1-based (SAM format) value. /// Internal processing handles the switching between SAM/BAM formats /// when read/written. /// \param position 1-based start position /// \return true if successfully set, false if not. bool set1BasedMatePosition(int32_t matePosition); /// Set the mate/next fragment's leftmost position using the specified /// 0-based (BAM format) value. /// Internal processing handles the switching between SAM/BAM formats /// when read/written. /// \param position 0-based start position /// \return true if successfully set, false if not. bool set0BasedMatePosition(int32_t matePosition); /// Sets the inferred insert size (ISIZE)/observed template length (TLEN). /// \param insertSize inferred insert size/observed template length. /// \return true if successfully set, false if not. bool setInsertSize(int32_t insertSize); /// Sets the sequence (SEQ) to the specified SAM formatted sequence string. 
/// Internal processing handles switching between SAM/BAM formats when /// read/written. /// \param seq SAM sequence string. May contain '='. /// \return true if successfully set, false if not. bool setSequence(const char* seq); /// Sets the quality (QUAL) to the specified SAM formatted quality string. /// Internal processing handles switching between SAM/BAM formats when /// read/written. /// \param quality SAM quality string. /// \return true if successfully set, false if not. bool setQuality(const char* quality); /// Shift the indels (if any) to the left by updating the CIGAR. /// \return true if the cigar was shifted, false if not. bool shiftIndelsLeft(); /// Sets the SamRecord to contain the information in the BAM formatted /// fromBuffer. /// \param fromBuffer buffer to read the BAM record from. /// \param fromBufferSize size of the buffer containing the BAM record. /// \param header BAM header for the record. /// \return status of reading the BAM record from the buffer. SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize, SamFileHeader& header); /// Read the BAM record from a file. /// \param filePtr file to read the buffer from. /// \param header BAM header for the record. /// \return status of the reading the BAM record from the file. SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header); //@} /////////////////////// /// @name Set Tag Data /// Set methods for tags. //@{ /// Add the specified integer tag to the record. Internal processing /// handles switching between SAM/BAM formats when read/written and /// determining the type for BAM format. If the tag is already there /// this code will replace it if the specified value is different. /// \param tag two character tag to be added to the SAM/BAM record. /// \param value value for the specified tag. /// \return true if the tag was successfully added, false otherwise. 
bool addIntTag(const char* tag, int32_t value); /// Add the specified tag,vtype,value to the record. Vtype can be SAM/BAM /// format. Internal processing handles switching between SAM/BAM formats /// when read/written. If the tag is already there this code will replace /// it if the specified value is different. /// \param tag two character tag to be added to the SAM/BAM record. /// \param vtype vtype of the specified value - either SAM/BAM vtypes. /// \param value value as a string for the specified tag. /// \return true if the tag was successfully added, false otherwise. bool addTag(const char* tag, char vtype, const char* value); /// Clear the tags in this record. /// Does not set SamStatus. void clearTags(); /// Remove a tag. /// \param tag tag to remove. /// \param type of the tag to be removed. /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record). bool rmTag(const char* tag, char type); /// Remove tags. /// The delimiter between the tags is ',' or ';'. ',' was added since /// the original delimiter, ';', requires the string to be quoted on the /// command-line. /// \param tags tags to remove, formatted as Tag:Type,Tag:Type,Tag:Type... /// \return true if all tags no longer exist in the record, false if any could not be removed /// (Returns true if the tags were not found in the record). /// SamStatus is set to INVALID if the tags are incorrectly formatted. bool rmTags(const char* tags); //@} /////////////////////// /// @name Get Alignment Data /// Get methods for record fields. All of the "get" methods set the /// status to indicate success or the failure reason. //@{ /// Get a const pointer to the buffer that contains the BAM representation /// of the record. /// \return const pointer to the buffer that contains the BAM representation /// of the record. 
const void* getRecordBuffer(); /// Get a const pointer to the buffer that contains the BAM representation /// of the record using the specified translation on the sequence. /// \param translation type of sequence translation to use. /// \return const pointer to the buffer that contains the BAM representation /// of the record. const void* getRecordBuffer(SequenceTranslation translation); /// Write the record as a BAM into the specified already opened file. /// \param filePtr file to write the BAM record into. /// \return status of the write. SamStatus::Status writeRecordBuffer(IFILE filePtr); /// Write the record as a BAM into the specified already opened file using /// the specified translation on the sequence. /// \param filePtr file to write the BAM record into. /// \param translation type of sequence translation to use. /// \return status of the write. SamStatus::Status writeRecordBuffer(IFILE filePtr, SequenceTranslation translation); /// Get the block size of the record (BAM format). /// \return BAM block size of the record. int32_t getBlockSize(); /// Get the reference sequence name (RNAME) of the record. /// \return reference sequence name const char* getReferenceName(); /// Get the reference sequence id of the record (BAM format rid). /// \return reference sequence id int32_t getReferenceID(); /// Get the 1-based(SAM) leftmost position (POS) of the record. /// \return 1-based leftmost position. int32_t get1BasedPosition(); /// Get the 0-based(BAM) leftmost position of the record. /// \return 0-based leftmost position. int32_t get0BasedPosition(); /// Get the length of the readname (QNAME) including the null. /// \return length of the read name (including null). uint8_t getReadNameLength(); /// Get the mapping quality (MAPQ) of the record. /// \return map quality. uint8_t getMapQuality(); /// Get the BAM bin for the record. /// \return BAM bin uint16_t getBin(); /// Get the length of the BAM formatted CIGAR. /// \return length of BAM formatted cigar. 
uint16_t getCigarLength(); /// Get the flag (FLAG). /// \return flag. uint16_t getFlag(); /// Get the length of the read. /// \return read length. int32_t getReadLength(); /// Get the mate/next fragment's reference sequence name (RNEXT). If it /// is equal to the reference name, it still returns the reference name. /// \return reference sequence name const char* getMateReferenceName(); /// Get the mate/next fragment's reference sequence name (RNEXT), /// returning "=" if it is the same as the reference name, unless /// they are both "*" in which case "*" is returned. /// \return reference sequence name or '=' const char* getMateReferenceNameOrEqual(); /// Get the mate reference id of the record /// (BAM format: mate_rid/next_refID). /// \return reference id int32_t getMateReferenceID(); /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT). /// \return 1-based leftmost position. int32_t get1BasedMatePosition(); /// Get the 0-based(BAM) leftmost mate/next fragment's position. /// \return 0-based leftmost position. int32_t get0BasedMatePosition(); /// Get the inferred insert size of the read pair (ISIZE) or /// observed template length (TLEN). /// \return inferred insert size or observed template length. int32_t getInsertSize(); /// Returns the 0-based inclusive rightmost position of the /// clipped sequence. /// \return 0-based inclusive rightmost position int32_t get0BasedAlignmentEnd(); /// Returns the 1-based inclusive rightmost position of the /// clipped sequence. /// \return 1-based inclusive rightmost position int32_t get1BasedAlignmentEnd(); /// Returns the length of the clipped sequence, returning 0 if the cigar /// is '*'. /// \return length of the clipped sequence. int32_t getAlignmentLength(); /// Returns the 0-based inclusive left-most position adjusted for /// clipped bases. /// \return 0-based inclusive leftmost position including clips. 
int32_t get0BasedUnclippedStart(); /// Returns the 1-based inclusive left-most position adjusted for /// clipped bases. /// \return 1-based inclusive leftmost position including clips. int32_t get1BasedUnclippedStart(); /// Returns the 0-based inclusive right-most position adjusted for /// clipped bases. /// \return 0-based inclusive rightmost position including clips. int32_t get0BasedUnclippedEnd(); /// Returns the 1-based inclusive right-most position adjusted for /// clipped bases. /// \return 1-based inclusive rightmost position including clips. int32_t get1BasedUnclippedEnd(); /// Returns the SAM formatted Read Name (QNAME). /// \return read name. const char* getReadName(); /// Returns the SAM formatted CIGAR string. /// \return cigar string. const char* getCigar(); /// Returns the SAM formatted sequence string (SEQ), translating the base as /// specified by setSequenceTranslation. /// \return sequence string. const char* getSequence(); /// Returns the SAM formatted sequence string (SEQ) performing the specified /// sequence translation. /// \param translation type of sequence translation to use. /// \return sequence string. const char* getSequence(SequenceTranslation translation); /// Returns the SAM formatted quality string (QUAL). /// \return quality string. const char* getQuality(); /// Get the sequence base at the specified index into this sequence 0 to /// readLength - 1, translating the base as specified by /// setSequenceTranslation. Throws an exception if index is out of range. /// \param index index into the sequence string (0 to readLength-1). /// \return the sequence base at the specified index into the sequence. char getSequence(int index); /// Get the sequence base at the specified index into this sequence 0 to /// readLength - 1 performing the specified sequence translation. /// Throws an exception if index is out of range. /// \param index index into the sequence string (0 to readLength-1). 
/// \param translation type of sequence translation to use. /// \return the sequence base at the specified index into the sequence. char getSequence(int index, SequenceTranslation translation); /// Get the quality character at the specified index into the quality 0 to /// readLength - 1. Throws an exception if index is out of range. /// \param index index into the quality string (0 to readLength-1). /// \return the quality character at the specified index into the quality. char getQuality(int index); /// Returns a pointer to the Cigar object associated with this record. /// The object is essentially read-only, only allowing modifications /// due to lazy evaluations. /// \return pointer to the Cigar object. Cigar* getCigarInfo(); /// Return the number of bases in this read that overlap the passed in /// region. Matches & mismatches between the read and the reference /// are counted as overlaps, but insertions, deletions, skips, clips, and /// pads are not counted. /// \param start inclusive 0-based start position (reference position) of /// the region to check for overlaps in. /// (-1 indicates to start at the beginning of the reference.) /// \param end exclusive 0-based end position (reference position) of the /// region to check for overlaps in. /// (-1 indicates to go to the end of the reference.) /// \return number of overlapping bases uint32_t getNumOverlaps(int32_t start, int32_t end); /// Returns the values of all fields except the tags. /// \param recStruct structure containing the contents of all /// non-variable length fields. /// \param readName read name from the record (return param) /// \param cigar cigar string from the record (return param) /// \param sequence sequence string from the record (return param) /// \param quality quality string from the record (return param) /// \return true if all fields were successfully set, false otherwise. 
bool getFields(bamRecordStruct& recStruct, String& readName, String& cigar, String& sequence, String& quality); /// Returns the values of all fields except the tags using the specified /// sequence translation. /// \param recStruct structure containing the contents of all /// non-variable length fields. /// \param readName read name from the record (return param) /// \param cigar cigar string from the record (return param) /// \param sequence sequence string from the record (return param) /// \param quality quality string from the record (return param) /// \param translation type of sequence translation to use. /// \return true if all fields were successfully set, false otherwise. bool getFields(bamRecordStruct& recStruct, String& readName, String& cigar, String& sequence, String& quality, SequenceTranslation translation); /// Returns a pointer to the genome sequence object associated with this /// record if it was set (NULL if it was not set). /// \return pointer to the GenomeSequence object or NULL if there isn't one. GenomeSequence* getReference(); //@} /////////////////////// /// @name Get Tag Methods /// Get methods for obtaining information on tags. //@{ /// Returns the length of the BAM formatted tags. /// \return length of the BAM formatted tags. uint32_t getTagLength(); /// Get the next tag from the record. /// Sets the Status to SUCCESS when a tag is successfully returned or /// when there are no more tags. Otherwise the status is set to describe /// why it failed (parsing, etc). /// \param tag set to the tag when a tag is read. /// \param vtype set to the vtype when a tag is read. /// \param value pointer to the value of the tag (will need to cast /// to int, float, char, or string based on vtype). /// \return true if a tag was read, false if there are no more tags. bool getNextSamTag(char* tag, char& vtype, void** value); /// Reset the tag iterator to the beginning of the tags. 
void resetTagIter(); /// Returns whether or not the specified vtype is an integer type. /// Does not set SamStatus. /// \param vtype value type to check. /// \return true if the passed in vtype is an integer ('c', 'C', 's', /// 'S', 'i', 'I'), false otherwise. static bool isIntegerType(char vtype); /// Returns whether or not the specified vtype is a float type. /// Does not set SamStatus. /// \param vtype value type to check. /// \return true if the passed in vtype is a float ('f'), false otherwise. static bool isFloatType(char vtype); /// Returns whether or not the specified vtype is a char type. /// Does not set SamStatus. /// \param vtype value type to check. /// \return true if the passed in vtype is a char ('A'), false otherwise. static bool isCharType(char vtype); /// Returns whether or not the specified vtype is a string type. /// Does not set SamStatus. /// \param vtype value type to check. /// \return true if the passed in vtype is a string ('Z'/'B'), false othwerise. static bool isStringType(char vtype); /// Get the string representation of the tags from the record, formatted /// as TAG:TYPE:VALUETAG:TYPE:VALUE... /// Sets the Status to SUCCESS when the tags are successfully returned or /// the tags were not found. If a different error occured, the status is /// set appropriately. /// The delimiter between the tags to retrieve is ',' or ';'. ',' was added /// since the original delimiter, ';', requires the string to be quoted on /// the command-line. /// \param tags the tags to retrieve, formatted as TAG:TYPE,TAG:TYPE... /// \param returnString the String to set (this method first clears returnString) /// to TAG:TYPE:VALUETAG:TYPE:VALUE... /// \param delim delimiter to use to separate two tags, default is a tab. /// \return true if there were not any errors even if no tags were found. bool getTagsString(const char* tags, String& returnString, char delim = '\t'); /// Get the string value for the specified tag. 
/// \param tag tag to retrieve /// \param pointer to the tag's string value if found, NULL if not found. const String* getStringTag(const char * tag); /// Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure). /// \param tag tag to retrieve /// \retun pointer to the tag's integer value if found, NULL if not found. int* getIntegerTag(const char * tag); /// Get the integer value for the specified tag. /// \param tag tag to retrieve /// \param tagVal return parameter with integer value for the tag /// \retun bool true if Integer tag was found and tagVal was set, /// false if not. bool getIntegerTag(const char * tag, int& tagVal); /// Get the float value for the specified tag. /// \param tag tag to retrieve /// \param tagVal return parameter with integer value for the tag /// \return bool true if Float tag was found and tagVal was set, /// false if not. bool getFloatTag(const char * tag, float& tagVal); /// Get the string value for the specified tag. const String & getString(const char * tag); /// Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool. int & getInteger(const char * tag); /// Check if the specified tag contains a string. /// Does not set SamStatus. /// \param tag SAM tag to check contents of. /// \return true if the value associated with the tag is a string. bool checkString(const char * tag) { return(checkTag(tag, 'Z') || checkTag(tag, 'B')); } /// Check if the specified tag contains an integer. /// Does not set SamStatus. /// \param tag SAM tag to check contents of. /// \return true if the value associated with the tag is a string. bool checkInteger(const char * tag) { return checkTag(tag, 'i'); } /// Check if the specified tag contains a string. /// Does not set SamStatus. /// \param tag SAM tag to check contents of. /// \return true if the value associated with the tag is a string. 
bool checkFloat(const char * tag) { return checkTag(tag, 'f'); } /// Check if the specified tag contains a value of the specified vtype. /// Does not set SamStatus. /// \param tag SAM tag to check contents of. /// \param type value type to check if the SAM tag matches. /// \return true if the value associated with the tag is a string. bool checkTag(const char * tag, char type); //@} /// Returns the status associated with the last method that sets the status. /// \return SamStatus of the last command that sets status. const SamStatus& getStatus(); private: static int MAKEKEY(char ch1, char ch2, char type) { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; } static char getKeyType(char type) { switch(type) { // For any char/integer type, return 'i' case 'A' : case 'c' : case 'C' : case 's' : case 'S' : case 'i' : case 'I' : return('i'); break; default: // For all other types, return the actual type. return(type); }; } static inline int getNumericTagTypeSize(char type) { switch(type) { case 'A': case 'c': case 'C': return(1); break; case 's': case 'S': return(2); break; case 'i': case 'I': case 'f': return(4); default: // Not a numeric type. return(0); } } // Allocate space for the record - does a realloc. // The passed in size is the size of the entire record including the // block size field. // Adds any errors to myStatus. bool allocateRecordStructure(int size); void* getStringPtr(int offset); void* getIntegerPtr(int offset, char& vtype); void* getFloatPtr(int offset); // Fixes the buffer to match the variable length fields. // Adds any errors to myStatus. bool fixBuffer(SequenceTranslation translation); // Sets the Sequence and Quality strings from the buffer. // They are done together in one method because they require the same // loop, so might as well be done at the same time. // Adds any errors to myStatus. void setSequenceAndQualityFromBuffer(); // Parse the cigar to calculate the alignment/unclipped ends and convert // to SAM/BAM format. 
// Adds any errors to myStatus. bool parseCigar(); // Parse the cigar string to calculate the cigar length and alignment end // and convert to SAM format. // Adds any errors to myStatus. bool parseCigarBinary(); // Parse the cigar string to calculate the cigar length and alignment end // and convert to BAM format. // Adds any errors to myStatus. bool parseCigarString(); // Set the tags from the buffer. // Adds any errors to myStatus. bool setTagsFromBuffer(); // Set the tags in the buffer. // Adds any errors to myStatus. bool setTagsInBuffer(); void setVariablesForNewBuffer(SamFileHeader& header); void getTypeFromKey(int key, char& type) const; void getTag(int key, char* tag) const; String & getString(int offset); int & getInteger(int offset); const char & getIntegerType(int offset) const; float & getFloat(int offset); // Append the string representation of the value at the specified index // of the int array. inline void appendIntArrayValue(int index, String& strVal) const { appendIntArrayValue(intType[index], integers[index], strVal); } void appendIntArrayValue(char type, int value, String& strVal) const; int getBtagBufferSize(String& tagStr); int setBtagBuffer(String& tagStr, char* extraPtr); int getStringFromBtagBuffer(unsigned char* buffer, String& tagStr); static const int DEFAULT_BLOCK_SIZE = 40; static const int DEFAULT_BIN = 4680; static const int DEFAULT_READ_NAME_LENGTH = 8; static const char* DEFAULT_READ_NAME; static const char* FIELD_ABSENT_STRING; bamRecordStruct * myRecordPtr; int allocatedSize; // Pointer to a temporary cigar buffer that can be used during string // parsing before it is ready to be copied into the actual record. uint32_t* myCigarTempBuffer; // Size of the currently allocated temporary cigar buffer. int myCigarTempBufferAllocatedSize; // Length of the cigar currently contained in the temporary buffer. int myCigarTempBufferLength; // Track if the buffer is in sync with the Strings/Tags. 
// Set to false if any of the variable length fields are modified. // Set to true when the buffer is updated to match the variable length // fields. bool myIsBufferSynced; // Track if the tags need to be set from the buffer. bool myNeedToSetTagsFromBuffer; // Trag if the tags need to be set in the buffer. // Allows you to set just the tags if they are the only thing that changed // in the buffer. bool myNeedToSetTagsInBuffer; int myTagBufferSize; int myLastTagIndex; String myReadName; String myReferenceName; String myMateReferenceName; String myCigar; String mySequence; String myQuality; std::string mySeqWithEq; std::string mySeqWithoutEq; // The length of the alignment. int32_t myAlignmentLength; // Unclipped alignment positions. int32_t myUnclippedStartOffset; int32_t myUnclippedEndOffset; CigarRoller myCigarRoller; LongHash extras; // Note: not all values in strings, integers, and floats are always // in extras. They will not be if the tags were removed. Removed // tags are removed from extras, but not from strings, integers, or floats // since if one was removed from these arrays, all other entries would // need their indices updated in extras. StringArray strings; IntArray integers; std::vector intType; // contains the type of int at same position in integers. std::vector floats; // Track whether or not the buffer values are correct for // each setting. bool myIsReadNameBufferValid; bool myIsCigarBufferValid; bool myIsSequenceBufferValid; bool myIsQualityBufferValid; bool myIsTagsBufferValid; bool myIsBinValid; unsigned char* myPackedSequence; unsigned char* myPackedQuality; SamStatus myStatus; // The current translation of the sequence as it occurs in the buffer. // Only applicable if myIsSequenceBufferValid == true. SequenceTranslation myBufferSequenceTranslation; // Track the Reference. GenomeSequence* myRefPtr; // The type of translation to do when getting a sequence. 
SequenceTranslation mySequenceTranslation; String NOT_FOUND_TAG_STRING; int NOT_FOUND_TAG_INT; static const int myMaxWarns = 5; static int myNumWarns; }; #endif libStatGen-1.0.14/bam/SamRecordHelper.cpp000066400000000000000000000063021254730101300201100ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamRecordHelper.h" #include int SamRecordHelper::checkSequence(SamRecord& record, int32_t pos0Based, const char* sequence) { const char* readSeq = record.getSequence(); // Get the cigar. Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { throw std::runtime_error("Failed to get Cigar."); } int32_t readStartIndex = cigar->getQueryIndex(pos0Based, record.get0BasedPosition()); // if the read start is negative, this position was deleted, so // return false, it doesn't match. if(readStartIndex == Cigar::INDEX_NA) { return(false); } // Increment the readSeq start to where this position is found. readSeq += readStartIndex; if(strncmp(readSeq, sequence, strlen(sequence)) == 0) { // Match, so return the readStartIndex (cycle). return(readStartIndex); } // Did not match. return(-1); } bool SamRecordHelper::genSamTagsString(SamRecord& record, String& returnString, char delim) { char tag[3]; char vtype; void* value; // Reset the tag iterator to ensure that all the tags are written. 
record.resetTagIter(); // While there are more tags, write them to the recordString. bool firstEntry = true; bool returnStatus = true; while(record.getNextSamTag(tag, vtype, &value) != false) { if(!firstEntry) { returnString += delim; } else { firstEntry = false; } returnStatus &= genSamTagString(tag, vtype, value, returnString); } return(returnStatus); } bool SamRecordHelper::genSamTagString(const char* tag, char vtype, void* value, String& returnString) { returnString += tag; returnString += ":"; returnString += vtype; returnString += ":"; if(SamRecord::isIntegerType(vtype)) { returnString += (int)*(int*)value; } else if(SamRecord::isFloatType(vtype)) { returnString.appendFullFloat(*(float*)value); } else if(SamRecord::isCharType(vtype)) { returnString += (char)*(char*)value; } else if(SamRecord::isStringType(vtype)) { // String type. returnString += (String)*(String*)value; } else { // Could not determine the type. return(false); } return(true); } libStatGen-1.0.14/bam/SamRecordHelper.h000066400000000000000000000046521254730101300175630ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_RECORD_HELPER_H__ #define __SAM_RECORD_HELPER_H__ #include "SamRecord.h" /// Class for extracting information from a SAM Flag. 
class SamRecordHelper
{
public:
    /// Helper method that checks if the record's read sequence starting
    /// at the specified 0-based reference position matches the passed in
    /// sequence.
    /// \param record record whose read sequence is checked.
    /// \param pos0Based 0-based reference position at which to start
    /// comparing.
    /// \param sequence null-terminated sequence to compare against the read.
    /// \return returns -1 if it does not match,
    /// returns the cycle (read position) of pos0Based if it does match.
    static int checkSequence(SamRecord& record, int32_t pos0Based,
                             const char* sequence);

    /// Helper to append the SAM string representation of all the tags to
    /// the specified string.  Does NOT add a preceding delimiter before the
    /// first tag.
    /// \param record record whose tags to append.
    /// \param returnString string to append the tags to.
    /// \param delim delimiter to use to separate different tags.
    /// \return true on success, false on failure/partial generation.
    static bool genSamTagsString(SamRecord& record,
                                 String& returnString,
                                 char delim = '\t');

    /// Helper to append the SAM string representation of the specified tag to
    /// the specified string.
    /// \param tag the tag name.
    /// \param vtype the vtype.
    /// \param value pointer to the value of the tag (will be cast
    /// to int, float, char, or String based on vtype).
    /// \param returnString string to append the tag to.
    /// \return true on success, false on failure/partial generation.
    static bool genSamTagString(const char* tag, char vtype,
                                void* value, String& returnString);

private:
    // Private constructor: every method of this class is static.
    // NOTE(review): presumably left private to prevent instantiation - confirm.
    SamRecordHelper();
};

#endif
libStatGen-1.0.14/bam/SamRecordPool.cpp000066400000000000000000000050241254730101300176020ustar00rootroot00000000000000
/*
 * Copyright (C) 2011 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "SamRecordPool.h" SamRecordPool::SamRecordPool() : myFreeSamRecords(), myMaxAllowedRecs(-1), myAllocatedRecs(0) { } SamRecordPool::SamRecordPool(int maxNumRecs) : myFreeSamRecords(), myMaxAllowedRecs(maxNumRecs), myAllocatedRecs(0) { } SamRecordPool::~SamRecordPool() { // Loop through the stack deleting the free records. while (!myFreeSamRecords.empty()) { delete(myFreeSamRecords.front()); myFreeSamRecords.pop(); } } SamRecord* SamRecordPool::getRecord() { // Get new samRecord. SamRecord* returnSam = NULL; if(!myFreeSamRecords.empty()) { // have free already allocated records, so get one of those. returnSam = myFreeSamRecords.front(); myFreeSamRecords.pop(); } else if((myMaxAllowedRecs == -1) || (myAllocatedRecs < myMaxAllowedRecs)) { // There were no free records, but either there is no max or // there is still room to allocate more. returnSam = new SamRecord(); ++myAllocatedRecs; if(returnSam == NULL) { // Failed allocation. throw(std::runtime_error("Failed to allocate SamRecord")); } } else { // There are no more free ones and we have already hit the // max number allowed to be allocated, so return NULL. // The user will have to release some or update the max. returnSam = NULL; } return(returnSam); } void SamRecordPool::releaseRecord(SamRecord* record) { if(record == NULL) { // Nothing to release, so just return. return; } // Release the samRecord to be reused. 
myFreeSamRecords.push(record); } void SamRecordPool::setMaxAllocatedRecs(int maxNumRecs) { myMaxAllowedRecs = maxNumRecs; } libStatGen-1.0.14/bam/SamRecordPool.h000066400000000000000000000051641254730101300172540ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_RECORD_POOL_H__ #define __SAM_RECORD_POOL_H__ #include #include "SamRecord.h" class SamRecordPool { public: /// Constructor that sets there to be no max number of allocated records. SamRecordPool(); /// Constructor that sets the maximum number of allocated records /// \param maxNumRecs maximum number of allocated records (-1 means no max) SamRecordPool(int maxNumRecs); /// Destructor. Any records that were allocated without calling "releaseRecord" /// will not get cleaned up and the user will need to delete them. ~SamRecordPool(); /// Get a SamRecord. If records are already allocated and free use those, if not /// and there are still more that are allowed to be allocated, allocate a new one. /// If no more records are allowed to be allocated, NULL is returned. /// NOTE: The user should call releaseRecord when done using the record. /// If the user deletes the record instead, it still counts as allocated when /// comparing against the maxNumRecs but cannot be reused. 
/// \return pointer to a SamRecord available for use, or NULL if no more records /// are allowed to be allocated. SamRecord* getRecord(); /// If record is not NULL, adds it back to the free list. /// If record is NULL, nothing is done. /// \param record pointer to a record that is no longer being used /// and is available for reuse. void releaseRecord(SamRecord* record); /// Set the maximum number of records allowed to be allocated. /// If more than the new value have already been allocated, /// it does not deallocate any, and will continue to reuse /// the already allocated records, but it will not allocate /// any additional records. void setMaxAllocatedRecs(int maxNumRecs); private: std::queue myFreeSamRecords; int myMaxAllowedRecs; int myAllocatedRecs; }; #endif libStatGen-1.0.14/bam/SamReferenceInfo.cpp000066400000000000000000000073451254730101300202540ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamReferenceInfo.h" SamReferenceInfo::SamReferenceInfo() : myReferenceContigs(), myReferenceHash(), myReferenceLengths() { clear(); } SamReferenceInfo::~SamReferenceInfo() { clear(); } // Add reference sequence name and reference sequence length. 
void SamReferenceInfo::add(const char* referenceSequenceName, int32_t referenceSequenceLength) { myReferenceHash.Add(referenceSequenceName, myReferenceContigs.Length()); myReferenceContigs.Push(referenceSequenceName); myReferenceLengths.Push(referenceSequenceLength); } int SamReferenceInfo::getReferenceID(const String & referenceName, bool addID) { if (referenceName == "*") return -1; int id = myReferenceHash.Find(referenceName); if (id >= 0) return myReferenceHash.Integer(id); if(!addID) { // Don't add the id, so return NO_REF_ID return(NO_REF_ID); } id = myReferenceContigs.Length(); myReferenceContigs.Push(referenceName); myReferenceLengths.Push(0); myReferenceHash.Add(referenceName, id); return id; } int SamReferenceInfo::getReferenceID(const char* referenceName, bool addID) { String referenceNameString = referenceName; return(getReferenceID(referenceNameString, addID)); } const String & SamReferenceInfo::getReferenceLabel(int id) const { static String noname("*"); if ((id < 0) || (id >= myReferenceContigs.Length())) { return noname; } return myReferenceContigs[id]; } int32_t SamReferenceInfo::getNumEntries() const { // The number of entries is the size of referenceLengths. return(myReferenceLengths.Length()); } const char* SamReferenceInfo::getReferenceName(int index) const { if((index >= 0) && (index < getNumEntries())) { return(myReferenceContigs[index].c_str()); } // Out of range, return blank return(""); } int32_t SamReferenceInfo::getReferenceLength(int index) const { if((index >= 0) && (index < getNumEntries())) { return(myReferenceLengths[index]); } // Out of bounds, return 0 return(0); } void SamReferenceInfo::clear() { myReferenceContigs.Clear(); myReferenceHash.Clear(); myReferenceLengths.Clear(); } SamReferenceInfo& SamReferenceInfo::operator = (const SamReferenceInfo &newInfo) { clear(); // Copy Reference contigs, hash, lengths. 
myReferenceContigs = newInfo.myReferenceContigs; myReferenceHash = newInfo.myReferenceHash; myReferenceLengths = newInfo.myReferenceLengths; return(*this); } bool SamReferenceInfo::operator== (const SamReferenceInfo& rhs) const { // Hash may be different, but if Contigs are the same, the hashes will // contain the same basic info (maybe just at different indices. return((myReferenceContigs == rhs.myReferenceContigs) && (myReferenceLengths == rhs.myReferenceLengths)); } libStatGen-1.0.14/bam/SamReferenceInfo.h000066400000000000000000000062061254730101300177140ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_REFERENCE_INFO_H__ #define __SAM_REFERENCE_INFO_H__ #include "StringArray.h" #include "StringHash.h" #include "IntArray.h" /// Class for tracking the reference information mapping between the /// reference ids and the reference names. class SamReferenceInfo { public: /// Constructor. SamReferenceInfo(); /// Destructor. ~SamReferenceInfo(); /// Add reference sequence name and reference sequence length. 
void add(const char* referenceSequenceName,
             int32_t referenceSequenceLength);

    /// Get the reference ID for the specified name, if addID is set to true,
    /// a reference id will be created for the referenceName if one does not
    /// already exist, while if addID is set to false (default), it will return
    /// NO_REF_ID if the reference name does not exist.
    /// The name "*" always returns -1 and is never added.
    int getReferenceID(const String & referenceName, bool addID = false);
    /// Get the reference ID for the specified name, if addID is set to true,
    /// a reference id will be created for the referenceName if one does not
    /// already exist, while if addID is set to false (default), it will return
    /// NO_REF_ID if the reference name does not exist.
    /// The name "*" always returns -1 and is never added.
    int getReferenceID(const char* referenceName, bool addID = false);

    /// Get the reference name for the specified id, if the id is not found,
    /// return "*".
    const String & getReferenceLabel(int id) const;

    /// Get the number of entries contained here.
    int32_t getNumEntries() const;

    /// Return the reference name at the specified index, returning "" if the
    /// index is out of bounds.
    const char* getReferenceName(int index) const;

    /// Return the reference length at the specified index, returning 0 if the
    /// index is out of bounds.
    int32_t getReferenceLength(int index) const;

    /// Reset this reference info.
    void clear();

    /// Copy the reference information.
    SamReferenceInfo & operator = (const SamReferenceInfo & rhs);

    /// Two objects are equal when their reference names and reference
    /// lengths are equal; the name-to-id hash is not compared.
    bool operator== (const SamReferenceInfo& rhs) const;
    /// Negation of operator==.
    bool operator!= (const SamReferenceInfo& rhs) const
    {
        return(!operator==(rhs));
    }

    /// Constant for the value returned if a reference id does not exist
    /// for a queried reference name.
static const int NO_REF_ID = -3; private: // Reference Name information StringArray myReferenceContigs; StringIntHash myReferenceHash; IntArray myReferenceLengths; }; #endif libStatGen-1.0.14/bam/SamStatistics.cpp000066400000000000000000000075721254730101300176760ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamStatistics.h" #include #include "SamFlag.h" SamStatistics::SamStatistics() { reset(); } SamStatistics::~SamStatistics() { reset(); } void SamStatistics::reset() { myReadCount = 0; myMappedReadCount = 0; myPairedReadCount = 0; myProperPairedReadCount = 0; myBaseCount = 0; myMappedReadBases = 0; myDupReadCount = 0; myQCFailureReadCount = 0; } bool SamStatistics::updateStatistics(SamRecord& samRecord) { // Each record has one read, so update the read count. ++myReadCount; int32_t readLen = samRecord.getReadLength(); // Get the flag to determine the type or // read (mapped, paired, proper paired). 
uint16_t flag = samRecord.getFlag(); // If the read is mapped, update the mapped c if(SamFlag::isMapped(flag)) { ++myMappedReadCount; myMappedReadBases += readLen; } if(SamFlag::isPaired(flag)) { ++myPairedReadCount; if(SamFlag::isProperPair(flag)) { ++myProperPairedReadCount; } } if(SamFlag::isDuplicate(flag)) { ++myDupReadCount; } if(SamFlag::isQCFailure(flag)) { ++myQCFailureReadCount; } // Increment the total number of bases. myBaseCount += readLen; return(true); } void SamStatistics::print() { double DIVIDE_UNITS = 1000000; std::string units = "(e6)"; std::cerr << std::fixed << std::setprecision(2); // If total reads is less than DIVIDE_UNITS, just show the straight number. if(myReadCount < DIVIDE_UNITS) { DIVIDE_UNITS = 1; units.clear(); } // Read Counts std::cerr << "TotalReads" << units << "\t" << myReadCount/DIVIDE_UNITS << std::endl; std::cerr << "MappedReads" << units << "\t" << myMappedReadCount/DIVIDE_UNITS << std::endl; std::cerr << "PairedReads" << units << "\t" << myPairedReadCount/DIVIDE_UNITS << std::endl; std::cerr << "ProperPair" << units << "\t" << myProperPairedReadCount/DIVIDE_UNITS << std::endl; std::cerr << "DuplicateReads" << units << "\t" << myDupReadCount/DIVIDE_UNITS << std::endl; std::cerr << "QCFailureReads" << units << "\t" << myQCFailureReadCount/DIVIDE_UNITS << std::endl; std::cerr << std::endl; // Read Percentages std::cerr << "MappingRate(%)\t" << 100 * myMappedReadCount/(double)myReadCount << std::endl; std::cerr << "PairedReads(%)\t" << 100 * myPairedReadCount/(double)myReadCount << std::endl; std::cerr << "ProperPair(%)\t" << 100 * myProperPairedReadCount/(double)myReadCount << std::endl; std::cerr << "DupRate(%)\t" << 100 * myDupReadCount/(double)myReadCount << std::endl; std::cerr << "QCFailRate(%)\t" << 100 * myQCFailureReadCount/(double)myReadCount << std::endl; std::cerr << std::endl; // Base Counts std::cerr << "TotalBases" << units << "\t" << myBaseCount/DIVIDE_UNITS << std::endl; std::cerr << "BasesInMappedReads" << 
units << "\t" << myMappedReadBases/DIVIDE_UNITS << std::endl; } libStatGen-1.0.14/bam/SamStatistics.h000066400000000000000000000040411254730101300173270ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_STATISTICS_H__ #define __SAM_STATISTICS_H__ #include #include "SamRecord.h" class SamStatistics { public: SamStatistics(); ~SamStatistics(); // Reset the statistics - clear them for processing a new file. void reset(); // Method to update the statistics to include the passed in record. bool updateStatistics(SamRecord& samRecord); void print(); private: /////////////////////////////////////////////////////// // Read Counts /// The number of reads (records) that were processed. uint64_t myReadCount; /// The number of mapped reads (records). uint64_t myMappedReadCount; /// The number of paired reads (records). uint64_t myPairedReadCount; /// The number of proper paired reads (records). uint64_t myProperPairedReadCount; /// The number of duplicate reads (based on the flag). uint64_t myDupReadCount; /// The number of QC failure reads (based on the flag). 
uint64_t myQCFailureReadCount;

    ///////////////////////////////////////////////////////
    // Base Counts
    /// The total number of bases in the reads in the file (sum of read lengths)
    uint64_t myBaseCount;
    /// The total number of bases in mapped reads (sum of read lengths for mapped reads).
    uint64_t myMappedReadBases;
};

#endif
libStatGen-1.0.14/bam/SamStatus.h000066400000000000000000000015331254730101300164630ustar00rootroot00000000000000
/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __SAM_STATUS_H__
#define __SAM_STATUS_H__

#include "StatGenStatus.h"

// SamStatus is simply another name for StatGenStatus.
typedef StatGenStatus SamStatus;

#endif
libStatGen-1.0.14/bam/SamTags.cpp000066400000000000000000000127571254730101300164430ustar00rootroot00000000000000
/*
 * Copyright (C) 2011 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.
If not, see . */ #include "SamTags.h" #include "BaseUtilities.h" const char* SamTags::BQ_TAG = "BQ"; const char SamTags::BQ_TAG_TYPE = 'Z'; const char* SamTags::MD_TAG = "MD"; const char SamTags::MD_TAG_TYPE = 'Z'; const char* SamTags::ORIG_POS_TAG = "OP"; const char SamTags::ORIG_POS_TAG_TYPE = 'i'; const char* SamTags::ORIG_CIGAR_TAG = "OC"; const char SamTags::ORIG_CIGAR_TAG_TYPE = 'Z'; const char* SamTags::ORIG_QUAL_TAG = "OQ"; const char SamTags::ORIG_QUAL_TAG_TYPE = 'Z'; // Create the MD tag for the specified input record and the genome. bool SamTags::createMDTag(String& outputMDtag, SamRecord& inputRec, GenomeSequence& genome) { outputMDtag.Clear(); // Get the cigar to use for determing alignment. Cigar* cigarInfo = inputRec.getCigarInfo(); if(cigarInfo == NULL) { throw(std::runtime_error("Cannot createMDTag - failed to get the cigar")); return(false); } int32_t queryIndex = Cigar::INDEX_NA; // get where this read starts on the reference. uint32_t startOfReadOnRefIndex = genome.getGenomePosition(inputRec.getReferenceName()); if(startOfReadOnRefIndex == (uint32_t)INVALID_CHROMOSOME_INDEX) { // Failed to find the reference for this chromosome, so return false. return(false); } startOfReadOnRefIndex += inputRec.get0BasedPosition(); // Track the number of consecutive matches. int32_t matchCount = 0; // Track if it is currently in a deletion so it knows when not to add // a '^'. bool currentDeletion = false; // Loop through the Reference indices (ignores insertions/pads/clips). for(int refOffset = 0; refOffset < cigarInfo->getExpectedReferenceBaseCount(); ++refOffset) { // Get the query index for this reference position.. queryIndex = cigarInfo->getQueryIndex(refOffset); char refBase = genome[startOfReadOnRefIndex + refOffset]; if(queryIndex != Cigar::INDEX_NA) { // Both the reference and the read have a base, so get the bases. 
char readBase = inputRec.getSequence(queryIndex); currentDeletion = false; // If neither base is unknown and they are the same, count it // as a match. if(!BaseUtilities::isAmbiguous(readBase) && !BaseUtilities::isAmbiguous(refBase) && (BaseUtilities::areEqual(readBase, refBase))) { // Match, so update counter. ++matchCount; } else { // Mismatch, so output the number of matches if any. if(matchCount != 0) { outputMDtag += matchCount; matchCount = 0; } outputMDtag += refBase; } } else { // This reference position is not in the query, so it is a deletion. // Deletion, so output the number of matches if any. if(matchCount != 0) { outputMDtag += matchCount; matchCount = 0; } if(!currentDeletion) { // Not currently in a deletion, so add the ^ outputMDtag += '^'; } // Add the deleted base. outputMDtag += refBase; currentDeletion = true; } } // output the match count at the end. outputMDtag += matchCount; return(true); } // Check to see if the MD tag in the record is accurate. bool SamTags::isMDTagCorrect(SamRecord& inputRec, GenomeSequence& genome) { String calcMDtag; if(!createMDTag(calcMDtag, inputRec, genome)) { // Could not generate the MD tag, so just return that it is incorrect. return(false); } const String* origMDtag = inputRec.getStringTag(MD_TAG); if(origMDtag == NULL) { // There was no tag. // if there is not a new tag, then they are the same and true // should be returned. If there is a new tag, then the old one was // wrong so false should be returned. So just return the result of // IsEmpty. return(calcMDtag.IsEmpty()); } else { // origMDtag is not NULL, so just compare the two tags. return(calcMDtag == *origMDtag); } } // Update/Add the MD tag in the inputRec. bool SamTags::updateMDTag(SamRecord& inputRec, GenomeSequence& genome) { // Calculate the new MD tag. String calcMDtag; createMDTag(calcMDtag, inputRec, genome); // Add the MD tag. If it is already there and is different it will // replace it. 
If it is already there and it is the same, it won't // do anything. return(inputRec.addTag(MD_TAG, MD_TAG_TYPE, calcMDtag.c_str())); } libStatGen-1.0.14/bam/SamTags.h000066400000000000000000000036611254730101300161020ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_TAGS_H__ #define __SAM_TAGS_H__ #include #include #include "SamRecord.h" /// Class for parsing/creating/operating on SAM/BAM record tags. class SamTags { public: /////////////////////// /// @name Constants for parsing tags. //@{ static const char* BQ_TAG; static const char BQ_TAG_TYPE; static const char* MD_TAG; static const char MD_TAG_TYPE; static const char* ORIG_POS_TAG; static const char ORIG_POS_TAG_TYPE; static const char* ORIG_CIGAR_TAG; static const char ORIG_CIGAR_TAG_TYPE; static const char* ORIG_QUAL_TAG; static const char ORIG_QUAL_TAG_TYPE; //@} /// Create the MD tag for the specified input record and the genome. /// \return returns true if an MD tag was created, false if one could not /// be created. static bool createMDTag(String& outputMDtag, SamRecord& inputRec, GenomeSequence& genome); /// Check to see if the MD tag in the record is accurate. static bool isMDTagCorrect(SamRecord& inputRec, GenomeSequence& genome); // Update/Add the MD tag in the inputRec. 
static bool updateMDTag(SamRecord& inputRec, GenomeSequence& genome); private: SamTags(); }; #endif libStatGen-1.0.14/bam/SamValidation.cpp000066400000000000000000000552041254730101300176310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "SamValidation.h" #include "CigarRoller.h" #include "SamTags.h" const char* SamValidationError::enumSeverityString[] = { "WARNING", "ERROR"}; const char* SamValidationError::enumTypeString[] = { "INVALID_QNAME", "INVALID_REF_ID", "INVALID_RNAME", "INVALID_POS", "INVALID_MAPQ", "INVALID_CIGAR", "INVALID_MRNM", "INVALID_QUAL", "INVALID_TAG" }; const char* SamValidationError::getTypeString(Type type) { return(enumTypeString[type]); } SamValidationError::SamValidationError(Type type, Severity severity, std::string message) { myType = type; mySeverity = severity; myMessage = message; } SamValidationError::Type SamValidationError::getType() const { return(myType); } SamValidationError::Severity SamValidationError::getSeverity() const { return(mySeverity); } const char* SamValidationError::getMessage() const { return(myMessage.c_str()); } const char* SamValidationError::getTypeString() const { return(enumTypeString[myType]); } const char* SamValidationError::getSeverityString() const { return(enumSeverityString[mySeverity]); } void SamValidationError::getErrorString(std::string& 
errorString) const { errorString = getTypeString(); errorString += " ("; errorString += getSeverityString(); errorString += ") : "; errorString += getMessage(); errorString += "\n"; } void SamValidationError::printError() const { std::cerr << this; } // Constructor. SamValidationErrors::SamValidationErrors() : myValidationErrors() { myErrorIter = myValidationErrors.begin(); } // Destructor SamValidationErrors::~SamValidationErrors() { clear(); } void SamValidationErrors::clear() { // Clear the errors. std::list::iterator errorIter; for(errorIter = myValidationErrors.begin(); errorIter != myValidationErrors.end(); ++errorIter) { delete *errorIter; *errorIter = NULL; } myValidationErrors.clear(); myErrorIter = myValidationErrors.end(); } void SamValidationErrors::addError(SamValidationError::Type newType, SamValidationError::Severity newSeverity, const char* newMessage) { myValidationErrors.push_back(new SamValidationError(newType, newSeverity, newMessage)); // If this is the first element in the list, set the iterator. if(myValidationErrors.size() == 1) { // set the iterator to the first element. myErrorIter = myValidationErrors.begin(); } } // Return the number of validation errors that are contained in this object. unsigned int SamValidationErrors::numErrors() { return(myValidationErrors.size()); } // Return a pointer to the next error. It does not remove it from the list. // Returns null once all errors have been retrieved until resetErrorIter // is called. const SamValidationError* SamValidationErrors::getNextError() { if(myErrorIter == myValidationErrors.end()) { // at the end of the list, return null. return(NULL); } // Not at the end of the list, return the last element and increment. return(*myErrorIter++); } // Resets the iterator to the begining of the errors. void SamValidationErrors::resetErrorIter() { myErrorIter = myValidationErrors.begin(); } // Appends the error messages to the passed in string. 
void SamValidationErrors::getErrorString(std::string& errorString) const { for(std::list:: const_iterator validationErrorIter = myValidationErrors.begin(); validationErrorIter != myValidationErrors.end(); validationErrorIter++) { std::string error = ""; (*validationErrorIter)->getErrorString(error); errorString += error; } } bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord, SamValidationErrors& validationErrors) { bool status = true; status &= isValidQname(samRecord.getReadName(), samRecord.getReadNameLength(), validationErrors); status &= isValidFlag(samRecord.getFlag(), validationErrors); // Validate the RName including validating it against the header. status &= isValidRname(samHeader, samRecord.getReferenceName(), validationErrors); status &= isValidRefID(samRecord.getReferenceID(), samHeader.getReferenceInfo(), validationErrors); status &= isValid1BasedPos(samRecord.get1BasedPosition(), validationErrors); status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors); status &= isValidSequence(samRecord, validationErrors); status &= isValidCigar(samRecord, validationErrors); status &= isValidQuality(samRecord, validationErrors); status &= isValidTags(samRecord, validationErrors); return(status); } // qname is the query (read) name - result of SamRecord::getReadName(). // readNameLen is the length of the read name including the null (the result // of SamRecord::getReadNameLength()). // For some invalid records, the getReadNameLength may be different than the // length of qname. // NOTE: Query Name and Read Name both refer to the same field. bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen, SamValidationErrors& validationErrors) { // Validation for QNAME is: // a) length of the qname string is the same as the read name length // b) length is between 1 and 254. // c) [ \t\n\r] are not allowed in the name. bool status = true; // Get the length of the qname string. 
int32_t qnameLenNull = strlen(qname) + 1; //////////////////////////////////// // a) length of the qname string is the same as the read name length if(qnameLenNull != readNameLen) { // This results from a poorly formatted bam file, where the null // terminated read_name field is not the same length as specified by // read_name_len. String message = "Invalid Query Name - the string length ("; message += qnameLenNull; message += ") does not match the specified query name length ("; message += readNameLen; message += ")."; validationErrors.addError(SamValidationError::INVALID_QNAME, SamValidationError::ERROR, message.c_str()); status = false; } //////////////////////////////////// // b) length is between 1 and 254 // The length with the terminating null must be between 2 & 255, if((qnameLenNull < 2) || (qnameLenNull > 255)) { String message = "Invalid Query Name (QNAME) length: "; message += qnameLenNull; message += ". Length with the terminating null must be between 2 & 255."; validationErrors.addError(SamValidationError::INVALID_QNAME, SamValidationError::WARNING, message.c_str()); status = false; } //////////////////////////////////// // Loop through and validate they all characters are valid. // c) [ \t\n\r] are not allowed in the name. String message; for(int i = 0; i < qnameLenNull; ++i) { switch(qname[i]) { case ' ': // Invalid character. message = "Invalid character in the Query Name (QNAME): ' ' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_QNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '\t': // Invalid character. message = "Invalid character in the Query Name (QNAME): '\t' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_QNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '\n': // Invalid character. 
message = "Invalid character in the Query Name (QNAME): '\n' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_QNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '\r': // Invalid character. message = "Invalid character in the Query Name (QNAME): '\r' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_QNAME, SamValidationError::WARNING, message.c_str()); status = false; break; } } return(status); } bool SamValidator::isValidFlag(uint16_t flag, SamValidationErrors& validationErrors) { // All values in a uint16_t are valid, so return true. return(true); } bool SamValidator::isValidRname(SamFileHeader& samHeader, const char* rname, SamValidationErrors& validationErrors) { bool status = true; // Cross validate the rname and the header. // If the rname is not '*' // AND there are any SQ records in the header, // Then the rname must be in one of them. if((strcmp(rname, "*") != 0) && (samHeader.getNumSQs() != 0) && (samHeader.getSQ(rname) == NULL)) { // There are SQ fields, but the ref name is not in it. status = false; std::string message = "RNAME, "; message += rname; message += ", was not found in a SAM Header SQ record"; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); } status &= isValidRname(rname, validationErrors); return(status); } bool SamValidator::isValidRname(const char* rname, SamValidationErrors& validationErrors) { // Validation for RNAME is: // a) cannot be 0 length. // b) [ \t\n\r@=] are not allowed in the name. bool status = true; // Get the length of the rname string. 
int32_t rnameLen = strlen(rname); String message; if(rnameLen == 0) { validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, "Reference Sequence Name (RNAME) cannot have 0 length."); status = false; } //////////////////////////////////// //////////////////////////////////// // Loop through and validate they all characters are valid. // b) [ \t\n\r] are not allowed in the name. for(int i = 0; i < rnameLen; ++i) { switch(rname[i]) { case ' ': // Invalid character. message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '\t': // Invalid character. message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '\n': // Invalid character. message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '\r': // Invalid character. message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '@': // Invalid character. message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); status = false; break; case '=': // Invalid character. 
message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position "; message += i; message += "."; validationErrors.addError(SamValidationError::INVALID_RNAME, SamValidationError::WARNING, message.c_str()); status = false; break; default: // Allowed character. break; } } return(status); } bool SamValidator::isValidRefID(int32_t refID, const SamReferenceInfo& refInfo, SamValidationErrors& validationErrors) { // Validation for rID is: // a) must be between -1 and the number of refInfo. // -1 is allowed, and otherwise it must properly index into the array. bool status = true; if((refID < -1) || (refID >= refInfo.getNumEntries())) { // Reference ID is too large or too small. String message = "Invalid Reference ID, out of range ("; message += refID; message += ") must be between -1 and "; message += refInfo.getNumEntries() - 1; message += "."; validationErrors.addError(SamValidationError::INVALID_REF_ID, SamValidationError::WARNING, message.c_str()); status = false; } return(status); } bool SamValidator::isValid1BasedPos(int32_t pos, SamValidationErrors& validationErrors) { // Validation for pos is: // a) must be between 0 and (2^29)-1. bool status = true; if((pos < 0) || (pos > 536870911)) { String message = "POS out of range ("; message += pos; message += ") must be between 0 and (2^29)-1."; validationErrors.addError(SamValidationError::INVALID_POS, SamValidationError::WARNING, message.c_str()); status = false; } return(status); } bool SamValidator::isValidMapQuality(uint8_t mapQuality, SamValidationErrors& validationErrors) { // All values in a uint8_t are valid, so return true. 
return(true); } bool SamValidator::isValidSequence(SamRecord& samRecord, SamValidationErrors& validationErrors) { return(true); } bool SamValidator::isValidCigar(SamRecord& samRecord, SamValidationErrors& validationErrors) { return(isValidCigar(samRecord.getCigar(), samRecord.getReadLength(), validationErrors)); } bool SamValidator::isValidCigar(const char* cigar, const char* sequence, SamValidationErrors& validationErrors) { return(isValidCigar(cigar, strlen(sequence), validationErrors)); } bool SamValidator::isValidCigar(const char* cigar, int seqLen, SamValidationErrors& validationErrors) { // Validation for CIGAR is: // a) cannot be 0 length. // if not "*", validate the following: // b) must have an integer length for each operator (if not "*"). TODO // c) all operators must be valid (if not "*"). TODO // d) evaluates to the same read length as the sequence string. bool status = true; String message; int32_t cigarLen = strlen(cigar); // a) cannot be 0 length. if(cigarLen == 0) { validationErrors.addError(SamValidationError::INVALID_CIGAR, SamValidationError::WARNING, "Cigar must not be blank."); status = false; } if(strcmp(cigar, "*") != 0) { // The cigar is not "*", so validate it. CigarRoller cigarRoller(cigar); // b) must have an integer length for each operator. // TODO // c) all operators must be valid. // TODO // d) is the same length as the sequence string. 
int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount(); if(cigarSeqLen != seqLen) { message = "CIGAR does not evaluate to the same length as SEQ, ("; message += cigarSeqLen; message += " != "; message += seqLen; message += ")."; validationErrors.addError(SamValidationError::INVALID_CIGAR, SamValidationError::WARNING, message.c_str()); status = false; } } return(status); } bool SamValidator::isValidQuality(SamRecord& samRecord, SamValidationErrors& validationErrors) { return(isValidQuality(samRecord.getQuality(), samRecord.getReadLength(), validationErrors)); } bool SamValidator::isValidQuality(const char* quality, const char* sequence, SamValidationErrors& validationErrors) { // Determine the length of the sequence. int seqLen = strlen(sequence); // Check if the sequence is '*' since then the seqLength is 0. if(strcmp(sequence, "*") == 0) { seqLen = 0; } return(isValidQuality(quality, seqLen, validationErrors)); } bool SamValidator::isValidQuality(const char* quality, int seqLength, SamValidationErrors& validationErrors) { bool status = true; // If the quality or the sequence are non-"*", validate that the quality // and sequence have the same length. if((seqLength != 0) && (strcmp(quality, "*") != 0)) { int qualLen = strlen(quality); // Both the sequence and the quality are not "*", so validate // that they are the same length. if(seqLength != qualLen) { // Both fields are specified but are different lengths. String message = "QUAL is not the same length as SEQ, ("; message += qualLen; message += " != "; message += seqLength; message += ")."; validationErrors.addError(SamValidationError::INVALID_QUAL, SamValidationError::WARNING, message.c_str()); status = false; } } return(status); } bool SamValidator::isValidTags(SamRecord& samRecord, SamValidationErrors& validationErrors) { bool status = true; GenomeSequence* reference = samRecord.getReference(); // If the reference is not null, check the MD tag. 
if(reference != NULL) { const String* recordMD = samRecord.getStringTag(SamTags::MD_TAG); if(recordMD != NULL) { // The record has an MD tag so check to see if it is correct. if(!SamTags::isMDTagCorrect(samRecord, *reference)) { // Invalid MD tags. String correctMD; if(!SamTags::createMDTag(correctMD, samRecord, *reference)) { // Failed to get the MD tag, so indicate that it is unknown. correctMD = "UNKNOWN"; } String message = "Incorrect MD Tag, "; message += *recordMD; message += ", should be "; message += correctMD; message += "."; validationErrors.addError(SamValidationError::INVALID_TAG, SamValidationError::WARNING, message.c_str()); status = false; } } } return(status); } libStatGen-1.0.14/bam/SamValidation.h000066400000000000000000000411011254730101300172650ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SAM_VALIDATION_H__ #define __SAM_VALIDATION_H__ #include "SamFile.h" #include // On windows, ERROR and WARNING are pre-defined macros, so undefine them. #ifdef WARNING #undef WARNING #endif #ifdef ERROR #undef ERROR #endif /// The SamValidationError class describes a validation error that occured, /// containing the error type, severity, and textual error message. class SamValidationError { public: /// Severity of the error. enum Severity { WARNING, ///< Warning is used if it is just an invalid value. 
ERROR ///< Error is used if parsing could not succeed. }; /// Type of the error. /// TODO: NOT ALL INVALID TYPES HAVE BEEN ADDED SINCE NOT ALL VALIDATION /// IS COMPLETE YET enum Type { INVALID_QNAME, ///< Invalid read/query name INVALID_REF_ID, ///< Invalid reference id INVALID_RNAME, ///< Invalid reference name INVALID_POS, ///< Invalid position INVALID_MAPQ, ///< Invalid mapping quality INVALID_CIGAR, ///< Invalid CIGAR INVALID_MRNM, ///< Invalid mate/next fragment reference name INVALID_QUAL, ///< Invalid base quality INVALID_TAG ///< Invalid tag }; /// Get the string representing the specified type of validation error. static const char* getTypeString(Type type); /// Constructor that sets the type, severity, and message for the /// validation error. SamValidationError(Type type, Severity severity, std::string Message); /// Return the type enum of this validation error object. Type getType() const; /// Return the severity enum of this validation error object. Severity getSeverity() const; /// Return the error message of this validation error object. const char* getMessage() const; /// Return the string representing this object's type of validation error. const char* getTypeString() const; /// Return the string representing this object's severity of validation /// error. const char* getSeverityString() const; /// Get the error string representing this object's error. void getErrorString(std::string& errorString) const; /// Print a formatted output of the error to cerr. 
void printError() const; private: SamValidationError(); static const char* enumTypeString[]; static const char* enumSeverityString[]; Type myType; Severity mySeverity; std::string myMessage; }; /// stream output for validation failure information inline std::ostream &operator << (std::ostream &stream, const SamValidationError &error) { std::string errorMessage; error.getErrorString(errorMessage); stream << errorMessage; return stream; } /// The SamValidationErrors class is a container class that holds /// SamValidationError Objects, allowing a validation method to return all /// of the invalid errors rather than just one. class SamValidationErrors { public: /// Constructor. SamValidationErrors(); /// Destructor ~SamValidationErrors(); /// Remove all the errors from the container. void clear(); /// Add the specified error to this container. void addError(SamValidationError::Type newType, SamValidationError::Severity newSeverity, const char* newMessage); /// Return the number of validation errors contained in this object. unsigned int numErrors(); /// Return a pointer to the next error without removing it from the /// container, and returning null once all errors have been retrieved /// until resetErrorIter is called. const SamValidationError* getNextError(); /// Reset the iterator to the begining of the errors. void resetErrorIter(); /// Append the error messages contained in this container to the passed /// in string. void getErrorString(std::string& errorString) const; private: std::list myValidationErrors; std::list::const_iterator myErrorIter; }; /// stream output for all validation failures information inline std::ostream& operator << (std::ostream& stream, const SamValidationErrors& errors) { std::string errorString = ""; errors.getErrorString(errorString); stream << errorString; return stream; } /// The SamValidator class contains static methods for validating the SAM/BAM /// Record and each of its fields. 
The generic isValid method performs all of /// the other validations. The SamValidator methods return whether or not what /// is being validated is valid. True means it is valid, false means it is not. /// The specifics of the invalid value(s) are contained in the /// SamValidationErrors object that is passed in (by reference) to the method. /// The specific errors can be pulled out of that object. /// TODO: VALIDATION METHODS STILL NEED TO BE ADDED, and isValid does not yet /// validate all fields!!! class SamValidator { public: /// Validates whether or not the specified SamRecord is valid, calling /// all of the other validations. /// TODO: more validation needs to be added. /// \param samHeader header associated with the record to be validated. /// \param samRecord record to be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValid(SamFileHeader& samHeader, SamRecord& samRecord, SamValidationErrors& validationErrors); /// Determines whether or not the specified qname is valid. /// Validation for QNAME is: /// a) length of the qname string is the same as the read name length /// b) length is between 1 and 254. /// c) [ \t\n\r] are not allowed in the name. /// \param qname the read/query name. /// \param qnameLen length of the read including the null (result of /// SamRecord::getReadNameLength(). /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidQname(const char* qname, uint8_t qnameLen, SamValidationErrors& validationErrors); /// Determines whether or not the flag is valid. /// TODO: currently no validation is done on the flag. /// \param flag flag to be validated. /// \param validationErrors status to append any errors too. 
/// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidFlag(uint16_t flag, SamValidationErrors& validationErrors); /// Validate the reference name including validating against the header. /// 1) Cross validate the rname and the header. /// 2) perform the validation in the method that doesn't take the header. /// \param samHeader header associated with the rname to be validated. /// \param rname reference name to be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidRname(SamFileHeader& samHeader, const char* rname, SamValidationErrors& validationErrors); /// Validate the rname without validating against the header. /// Validation for RNAME is: /// a) cannot be 0 length. /// b) [ \t\n\r@=] are not allowed in the name. /// \param rname reference name to be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidRname(const char* rname, SamValidationErrors& validationErrors); /// Validate whether or not the specified reference id is valid. /// Validation for rID is: /// a) must be between -1 and the number of refInfo. /// -1 is allowed, and otherwise it must properly index into the array. /// \param refID reference id to be validated. /// \param refInfo sam reference information containing the mapping /// from reference id to reference name for this refID. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidRefID(int32_t refID, const SamReferenceInfo& refInfo, SamValidationErrors& validationErrors); /// Validate the refeference position. /// Validation for pos is: /// a) must be between 0 and (2^29)-1. /// \param pos position to be validated. 
/// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValid1BasedPos(int32_t pos, SamValidationErrors& validationErrors); /// Validate the mapping quality. /// TODO: currently no validation is done on the mapping quality. /// \param mapQuality mapping quality to be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidMapQuality(uint8_t mapQuality, SamValidationErrors& validationErrors); /// Validate the sequence, but not against the cigar or quality string. /// Validation against cigar is done in isValidCigar. /// Validation against the quality string is done in isValidQuality. /// TODO: currently no validation is done in this method. /// \param samRecord record whose sequence should be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidSequence(SamRecord& samRecord, SamValidationErrors& validationErrors); /// Validate the cigar. Cigar validation depends on sequence. /// Validation for CIGAR is: /// a) cannot be 0 length. /// if not "*", validate the following: /// b) must have an integer length for each operator (if not "*"). TODO /// c) all operators must be valid (if not "*"). TODO /// d) evaluates to the same read length as the sequence string. /// \param samRecord record whose cigar should be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidCigar(SamRecord& samRecord, SamValidationErrors& validationErrors); /// Validate the cigar. Cigar validation depends on sequence. /// Validation for CIGAR is: /// a) cannot be 0 length. 
/// if not "*", validate the following: /// b) must have an integer length for each operator (if not "*"). TODO /// c) all operators must be valid (if not "*"). TODO /// d) evaluates to the same read length as the sequence string. /// \param cigar cigar string to be validated. /// \param sequence sequence to check the cigar against. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidCigar(const char* cigar, const char* sequence, SamValidationErrors& validationErrors); /// Validate the cigar. Cigar validation depends on sequence. /// Validation for CIGAR is: /// a) cannot be 0 length. /// if not "*", validate the following: /// b) TODO: must have an integer length for each operator (if not "*"). /// c) TODO: all operators must be valid (if not "*"). /// d) evaluates to the same read length as the sequence string. /// \param cigar cigar string to be validated. /// \param seqLen sequence length to check the cigar against. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidCigar(const char* cigar, int seqLen, SamValidationErrors& validationErrors); /// TODO: validate the mate/next fragment's reference name. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidMrnm(); /// TODO: validate the mate/next fragment's position. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidMpos(); /// TODO: validate the insertion size/observed template length. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidIsize(); /// TODO, validate the sequence. 
/// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidSeq(); /// Validate the base quality. /// Quality validation depends on sequence. /// Validation for quality is: /// a) quality & sequence are the same length if both are specified. /// TODO: more validation. /// \param samRecord record whose quality should be validated. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidQuality(SamRecord& samRecord, SamValidationErrors& validationErrors); /// Validate the base quality. /// Quality validation depends on sequence. /// Validation for quality is: /// a) quality & sequence are the same length if both are specified. /// TODO: more validation. /// \param quality quality string to be validated. /// \param seqLen sequence length to check the quality against. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidQuality(const char* quality, const char* sequence, SamValidationErrors& validationErrors); /// Validate the base quality. /// Quality validation depends on sequence. /// Validation for quality is: /// a) quality & sequence are the same length if both are specified. /// TODO: more validation. /// \param quality quality string to be validated. /// \param seqLen sequence length to check the quality against. /// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not bool static isValidQuality(const char* quality, int seqLength, SamValidationErrors& validationErrors); /// Validate the tags. /// Validation for tags is: /// a) check that the "MD" tag is correct if it is present. /// TODO: more validation. /// \param samRecord record whose tags should be validated. 
/// \param validationErrors status to append any errors too. /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidTags(SamRecord& samRecord, SamValidationErrors& validationErrors); /// TODO validate the tag vtype /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidVtype(); /// TODO validate the tag vtype /// \return true if it is valid, false and appends to SamValidationErrors /// if it is not static bool isValidValue(); }; #endif libStatGen-1.0.14/bam/examples/000077500000000000000000000000001254730101300162025ustar00rootroot00000000000000libStatGen-1.0.14/bam/examples/Makefile000066400000000000000000000010101254730101300176320ustar00rootroot00000000000000PATH_TO_BASE=../../.. include ../../Makefiles/Makefile.include SUBDIRS = $(wildcard */) # Build in all subdirectories. # # see http://www.gnu.org/software/make/manual/make.html#Phony-Targets # for a way of improving the following: # .PHONY : clean $(SUBDIRS) all: $(SUBDIRS) $(SUBDIRS): (echo "building in directory $@"; $(MAKE) $(PARALLEL_MAKE) OPTFLAG="$(OPTFLAG)" --no-print-directory -C $@ ${RECURSIVE_TARGET}) test: RECURSIVE_TARGET = test test: $(SUBDIRS) clean: RECURSIVE_TARGET = clean clean: $(SUBDIRS) libStatGen-1.0.14/bam/examples/pileup/000077500000000000000000000000001254730101300175005ustar00rootroot00000000000000libStatGen-1.0.14/bam/examples/pileup/.gitignore000066400000000000000000000000071254730101300214650ustar00rootroot00000000000000pileup libStatGen-1.0.14/bam/examples/pileup/Main.cpp000066400000000000000000000070741254730101300211000ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" #include "Pileup.h" #include "PileupElementBaseQual.h" void newAnalyze(PileupElementBaseQual& element) { std::cout << "newAnalyze: "; element.analyze(); } class AnalyzeClass { public: AnalyzeClass() { myCounter = 33; } bool operator() (PileupElementBaseQual& element) { std::cout << "Class Analyze: Counter = " << myCounter << ": "; element.analyze(); ++myCounter; return(true); } int myCounter; private: }; int main(int argc, char ** argv) { const char* fileName = "../../test/testFiles/sortedBam.bam"; const char* indexName = "../../test/testFiles/sortedBam.bam.bai"; printf("\nPileup on entire file: %s\n", fileName); Pileup pileup(1024); pileup.processFile(fileName); printf("\nPileup on entire file: %s\n", fileName); Pileup pileup1(1024); pileup1.processFile(fileName); printf("\nPileup on a section of file: %s\n", fileName); // Read a sorted & indexed BAM file. Pileup pileup2(100); SamFile samIn; SamFileHeader header; SamRecord record; if(!samIn.OpenForRead(fileName)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading. 
if(!samIn.ReadBamIndex(indexName)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } if(!samIn.ReadHeader(header)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } const char* refName = "1"; int start = 1000; int end = 1500; if(!samIn.SetReadSection(refName, start, end)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Iterate over all records while (samIn.ReadRecord(header, record)) { pileup2.processAlignment(record); } pileup2.flushPileup(); int returnValue = 0; if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed to read a record. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnValue = samIn.GetStatus(); } printf("\nPileup on entire file, newAnalyze: %s\n", fileName); void (*fnPtr)(PileupElementBaseQual&) = newAnalyze; Pileup pileup3(1024, fnPtr); pileup3.processFile(fileName); printf("\nPileup on entire file, newAnalyze: %s\n", fileName); AnalyzeClass myAnalyzeClass; myAnalyzeClass.myCounter = 2; Pileup pileup4(1024, myAnalyzeClass); pileup4.processFile(fileName); return(0); } libStatGen-1.0.14/bam/examples/pileup/Makefile000066400000000000000000000001421254730101300211350ustar00rootroot00000000000000EXE = pileup SRCONLY = Main.cpp TEST_COMMAND = ./pileup include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/bam/examples/printRefPositions/000077500000000000000000000000001254730101300217035ustar00rootroot00000000000000libStatGen-1.0.14/bam/examples/printRefPositions/.gitignore000066400000000000000000000000321254730101300236660ustar00rootroot00000000000000printRefPositions results/libStatGen-1.0.14/bam/examples/printRefPositions/Main.cpp000066400000000000000000000025321254730101300232750ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software 
Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include "PrintRefPositions.h" int main(int argc, char ** argv) { std::string inFile = "../../test/testFiles/sortedBam.bam"; std::string indexFile = "../../test/testFiles/sortedBam.bam.bai"; std::string rname = "1"; int startPosition = 1013; int endPosition = 1751; if(argc == 6) { inFile = argv[1]; indexFile = argv[2]; rname = argv[3]; startPosition = atoi(argv[4]); endPosition = atoi(argv[5]); } printRefPositions(inFile, indexFile, rname, startPosition, endPosition); return(0); } libStatGen-1.0.14/bam/examples/printRefPositions/Makefile000066400000000000000000000003441254730101300233440ustar00rootroot00000000000000EXE = printRefPositions SRCONLY = Main.cpp TOOLBASE = PrintRefPositions TEST_COMMAND = mkdir -p results; ./printRefPositions > results/test.txt; diff results/test.txt expected/test.txt include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/bam/examples/printRefPositions/PrintRefPositions.cpp000066400000000000000000000036741254730101300260620ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ ////////////////////////////////////////////////////////////////////////// #include "SamFile.h" void printRefPositions(std::string inFile, std::string indexFile, std::string rname, int startPosition, int endPosition) { SamFileHeader header; // Open the bam file for reading and read the header. SamFile samIn(inFile.c_str(), SamFile::READ, &header); // Open the bam index file for reading. samIn.ReadBamIndex(indexFile.c_str()); // Set the section to be read. samIn.SetReadSection(rname.c_str(), startPosition, endPosition); SamRecord record; // Keep reading BAM records until they aren't anymore. while(samIn.ReadRecord(header, record)) { // Print the reference positions associated with this read. std::cout << "Read " << samIn.GetCurrentRecordCount() << ":"; Cigar* cigar = record.getCigarInfo(); for(int i = 0; i < record.getReadLength(); i++) { int refPos = cigar->getRefPosition(i, record.get1BasedPosition()); if(refPos != Cigar::INDEX_NA) { std::cout << " " << refPos; } } std::cout << "\n"; } } libStatGen-1.0.14/bam/examples/printRefPositions/PrintRefPositions.h000066400000000000000000000017301254730101300255160ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ ////////////////////////////////////////////////////////////////////////// void printRefPositions(std::string inFile, std::string indexFile, std::string rname, int startPosition, int endPosition); libStatGen-1.0.14/bam/examples/printRefPositions/expected/000077500000000000000000000000001254730101300235045ustar00rootroot00000000000000libStatGen-1.0.14/bam/examples/printRefPositions/expected/test.txt000066400000000000000000000001141254730101300252200ustar00rootroot00000000000000Read 1: 1011 1012 1013 1014 1015 Read 2: 1751 1752 1753 1754 1755 libStatGen-1.0.14/bam/test/000077500000000000000000000000001254730101300153435ustar00rootroot00000000000000libStatGen-1.0.14/bam/test/.gitignore000066400000000000000000000000101254730101300173220ustar00rootroot00000000000000samTest libStatGen-1.0.14/bam/test/BamIndexTest.cpp000066400000000000000000000266151254730101300204100ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "BamIndex.h" #include "TestValidate.h" #include "BamIndexTest.h" #include void testIndex(BamIndex& bamIndex) { #ifdef __ZLIB_AVAILABLE__ assert(bamIndex.getNumMappedReads(1) == 2); assert(bamIndex.getNumUnMappedReads(1) == 0); assert(bamIndex.getNumMappedReads(0) == 4); assert(bamIndex.getNumUnMappedReads(0) == 1); assert(bamIndex.getNumMappedReads(23) == -1); assert(bamIndex.getNumUnMappedReads(23) == -1); assert(bamIndex.getNumMappedReads(-1) == 0); assert(bamIndex.getNumUnMappedReads(-1) == 2); assert(bamIndex.getNumMappedReads(-2) == -1); assert(bamIndex.getNumUnMappedReads(-2) == -1); assert(bamIndex.getNumMappedReads(22) == 0); assert(bamIndex.getNumUnMappedReads(22) == 0); // Get the chunks for reference id 1. Chunk testChunk; SortedChunkList chunkList; assert(bamIndex.getChunksForRegion(1, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x4e7); assert(testChunk.chunk_end == 0x599); // Get the chunks for reference id 0. assert(bamIndex.getChunksForRegion(0, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x360); assert(testChunk.chunk_end == 0x4e7); // Get the chunks for reference id 2. assert(bamIndex.getChunksForRegion(2, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x599); assert(testChunk.chunk_end == 0x5ea); // Get the chunks for reference id 3. // There isn't one for this ref id, but still successfully read the file, // so it should return true, but the list should be empty. assert(bamIndex.getChunksForRegion(3, -1, -1, chunkList) == true); assert(chunkList.empty()); // Test reading an indexed bam file. 
SamFile inFile; assert(inFile.OpenForRead("testFiles/sortedBam.bam")); inFile.setSortedValidation(SamFile::COORDINATE); assert(inFile.ReadBamIndex("testFiles/sortedBam.bam.bai")); SamFileHeader samHeader; assert(inFile.ReadHeader(samHeader)); SamRecord samRecord; // Test getting num mapped/unmapped reads. assert(inFile.getNumMappedReadsFromIndex(1) == 2); assert(inFile.getNumUnMappedReadsFromIndex(1) == 0); assert(inFile.getNumMappedReadsFromIndex(0) == 4); assert(inFile.getNumUnMappedReadsFromIndex(0) == 1); assert(inFile.getNumMappedReadsFromIndex(23) == -1); assert(inFile.getNumUnMappedReadsFromIndex(23) == -1); assert(inFile.getNumMappedReadsFromIndex(-1) == 0); assert(inFile.getNumUnMappedReadsFromIndex(-1) == 2); assert(inFile.getNumMappedReadsFromIndex(-2) == -1); assert(inFile.getNumUnMappedReadsFromIndex(-2) == -1); assert(inFile.getNumMappedReadsFromIndex(22) == 0); assert(inFile.getNumUnMappedReadsFromIndex(22) == 0); assert(inFile.getNumMappedReadsFromIndex("2", samHeader) == 2); assert(inFile.getNumUnMappedReadsFromIndex("2", samHeader) == 0); assert(inFile.getNumMappedReadsFromIndex("1", samHeader) == 4); assert(inFile.getNumUnMappedReadsFromIndex("1", samHeader) == 1); assert(inFile.getNumMappedReadsFromIndex("22", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("22", samHeader) == 0); assert(inFile.getNumMappedReadsFromIndex("", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("*", samHeader) == 2); assert(inFile.getNumMappedReadsFromIndex("unknown", samHeader) == -1); assert(inFile.getNumUnMappedReadsFromIndex("unknown", samHeader) == -1); assert(inFile.getNumMappedReadsFromIndex("X", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("X", samHeader) == 0); // Set the read section saying the reads must be fully enclosed in the section. 
assert(inFile.SetReadSection("1", 1010, 1011, false)); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1011, 1012, false)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section -1 = Ref *: 2 records (8 & 10 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(-1)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead8(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead10(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 2 = Ref 3: 1 records (9 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(2)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead9(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 0 = Ref 1: 5 records (3, 4, 1, 2, & 6 from testSam.sam that is // reflected in the validation. assert(inFile.SetReadSection(0)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead3(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead4(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead6(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 1 = Ref 2: 2 records (5 & 7 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(1)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead5(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead7(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 3 to 22 (ref 4 - 23): 0 records. 
for(int i = 3; i < 23; i++) { assert(inFile.SetReadSection(i)); assert(inFile.ReadRecord(samHeader, samRecord) == false); } // Set the read section. assert(inFile.SetReadSection("1", 1010, 1012)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 2); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1010, 1020)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 5); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1010, 1011)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 1); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); 
assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1011, 1012)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 1); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); #endif } void testBamIndex() { // BAM indexes are compressed, so can't be tested without zlib. #ifdef __ZLIB_AVAILABLE__ // Create a bam index. BamIndex bamIndex; bamIndex.readIndex("testFiles/sortedBam.bam.bai"); testIndex(bamIndex); BamIndexFileTest test1; bool caughtException = false; try { // Try reading the bam index without specifying a // filename and before opening a bam file. assert(test1.ReadBamIndex() == false); } catch (std::exception& e) { caughtException = true; assert(strcmp(e.what(), "FAIL_ORDER: Failed to read the bam Index file - the BAM file needs to be read first in order to determine the index filename.") == 0); } // Should have failed and thrown an exception. assert(caughtException); // Read the bam index with a specified name. assert(test1.ReadBamIndex("testFiles/sortedBam.bam.bai")); BamIndex* index = test1.getBamIndex(); assert(index != NULL); testIndex(*index); // Open the bam file so the index can be opened. assert(test1.OpenForRead("testFiles/sortedBam.bam")); // Try reading the bam index without specifying a // filename after opening a bam file. 
assert(test1.ReadBamIndex() == true); index = test1.getBamIndex(); assert(index != NULL); testIndex(*index); // Open the bam file so the index can be opened. // This time the index file does not have .bam in it. assert(test1.OpenForRead("testFiles/sortedBam2.bam")); // Try reading the bam index without specifying a // filename after opening a bam file. assert(test1.ReadBamIndex() == true); index = test1.getBamIndex(); assert(index != NULL); testIndex(*index); #endif } libStatGen-1.0.14/bam/test/BamIndexTest.h000066400000000000000000000016111254730101300200420ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not,x see . */ #include "SamFile.h" class BamIndexFileTest : public SamFile { public: BamIndex* getBamIndex() { return(myBamIndex); } }; void testBamIndex(); libStatGen-1.0.14/bam/test/Main.cpp000066400000000000000000000046701254730101300167420ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "ReadFiles.h" #include "WriteFiles.h" #include "ValidationTest.h" #include "BamIndexTest.h" #include "ModifyVar.h" #include "Modify.h" #include "SamFileTest.h" #include "TestEquals.h" #include "TestFilter.h" #include "ShiftIndels.h" #include "TestPileup.h" #include "TestPosList.h" #include "TestCigarHelper.h" #include "TestSamRecordPool.h" #include "TestSamCoordOutput.h" #include "TestSamRecordHelper.h" int main(int argc, char ** argv) { if(argc == 1) { testReadSam(); #ifdef __ZLIB_AVAILABLE__ testReadBam(); testReadBam(); #endif testAddHeaderAndTagToFile("testFiles/testSam.sam", "results/addedTagToSam.bam"); testAddHeaderAndTagToFile("testFiles/testSam.sam", "results/addedTagToSam.sam"); // Can't read bams without zlib #ifdef __ZLIB_AVAILABLE__ testAddHeaderAndTagToFile("testFiles/testBam.bam", "results/addedTagToBam.sam"); testAddHeaderAndTagToFile("testFiles/testBam.bam", "results/addedTagToBam.bam"); #endif testValidateSortedRead(); testWrite(); testSamQNAME(); testBamRID(); testEmptyQual(); // Can't read bams without zlib #ifdef __ZLIB_AVAILABLE__ testBamIndex(); #endif testModifyVar(); testModify(); testSamFile(); testSeqEquals(); testFilter(); testShiftIndels(); testPileup(); testPosList(); testCigarHelper(); testSamRecordPool(); testSamCoordOutput(); testSamRecordHelper(); } else { modifyFirstBaseLong(); } } libStatGen-1.0.14/bam/test/Makefile000066400000000000000000000006131254730101300170030ustar00rootroot00000000000000EXE = samTest TOOLBASE = WriteFiles ValidationTest ReadFiles BamIndexTest ModifyVar Modify SamFileTest TestValidate TestEquals TestFilter ShiftIndels TestPileup TestPosList 
TestCigarHelper TestSamRecordPool TestSamCoordOutput TestSamRecordHelper SRCONLY = Main.cpp ifeq ($(ZLIB_AVAIL), 0) TEST_COMMAND = ./test.sh noZlib else TEST_COMMAND = ./test.sh endif include ../../Makefiles/Makefile.testlibStatGen-1.0.14/bam/test/Modify.cpp000066400000000000000000000073231254730101300173030ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" #include "SamFlag.h" #include "Modify.h" void testModify() { modify modTest; modTest.testModify("testFiles/testSam.sam"); #ifdef __ZLIB_AVAILABLE__ modTest.testModify("testFiles/testBam.bam"); #endif } void modify::testModify(const char* filename) { myFilename = filename; modifyPosition(); modifyCigar(); modifyFlag(); modifyTags(); } void modify::modifyPosition() { openAndRead1Rec(); // Verify the initial bin. assert(samRecord.getBin() == 4681); // Change the position and verify that the bin is updated. assert(samRecord.set0BasedPosition(33768)); // Verify the bin was updated. assert(samRecord.getBin() == 4683); assert(samRecord.get0BasedPosition() == 33768); } void modify::modifyCigar() { openAndRead1Rec(); // Verify the initial bin. assert(samRecord.getBin() == 4681); // Change the Cigar such that it modifies the bin. assert(samRecord.setCigar("33768M")); // Verify the bin was updated. 
assert(samRecord.getBin() == 585); } void modify::modifyFlag() { openAndRead1Rec(); // Verify the initial bin. uint16_t flag = 73; assert(samRecord.getFlag() == flag); SamFlag::setDuplicate(flag); assert(flag == 1097); assert(samRecord.setFlag(flag)); assert(samRecord.getFlag() == 1097); SamFlag::setNotDuplicate(flag); assert(flag == 73); assert(samRecord.setFlag(flag)); assert(samRecord.getFlag() == 73); } void modify::openAndRead1Rec() { // Open the file for reading. assert(samIn.OpenForRead(myFilename.c_str())); // Read the sam header. assert(samIn.ReadHeader(samHeader)); // Read the first record. assert(samIn.ReadRecord(samHeader, samRecord)); } void modify::modifyTags() { assert(samIn.OpenForRead(myFilename.c_str())); // Read the sam header. assert(samIn.ReadHeader(samHeader)); SamFile samOut; SamFile bamOut; std::string inputType = myFilename.substr(myFilename.find_last_of('.')); std::string outFileBase = "results/updateTagFrom"; if(inputType == ".bam") { outFileBase += "Bam"; } else { outFileBase += "Sam"; } std::string outFile = outFileBase + ".sam"; assert(samOut.OpenForWrite(outFile.c_str())); outFile = outFileBase + ".bam"; assert(bamOut.OpenForWrite(outFile.c_str())); assert(samOut.WriteHeader(samHeader)); assert(bamOut.WriteHeader(samHeader)); int count = 0; // Read the records. 
while(samIn.ReadRecord(samHeader, samRecord)) { if(count == 0) { assert(samRecord.rmTag("MD", 'Z')); } else if(count == 2) { assert(samRecord.rmTags("XT:A;MD:Z;AB:c;NM:i")); } else if(count == 4) { assert(samRecord.rmTags("MD:Z,AB:c,NM:i")); } assert(bamOut.WriteRecord(samHeader, samRecord)); assert(samOut.WriteRecord(samHeader, samRecord)); ++count; } } libStatGen-1.0.14/bam/test/Modify.h000066400000000000000000000022771254730101300167530ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ void testModify(); class modify { public: void testModify(const char* filename); private: void modifyPosition(); void modifyCigar(); void modifyFlag(); // Open and read the first record. void openAndRead1Rec(); void modifyTags(); // Variables. std::string myFilename; // Rather than passing around all these variables, just store them in the class. SamFile samIn; SamFileHeader samHeader; SamRecord samRecord; }; libStatGen-1.0.14/bam/test/ModifyVar.cpp000066400000000000000000000424131254730101300177530ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" #include "ModifyVar.h" #include "SamValidation.h" void modifyFirstBase() { SamFile samIn; // Open the file for reading. assert(samIn.OpenForRead("testFiles/testVar.bam")); SamFile samOut; // Open the file for writing. assert(samOut.OpenForWrite("results/updateVar.bam")); // Read the sam header. SamFileHeader samHeader; assert(samIn.ReadHeader(samHeader)); assert(samOut.WriteHeader(samHeader)); // Read the sam records. SamRecord samRecord; // Keep reading records until the end of the file is reached. while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so check to see // if it is valid. SamValidationErrors samValidationErrors; assert(SamValidator::isValid(samHeader, samRecord, samValidationErrors)); // Get the sequence. const char* sequence = samRecord.getSequence(); assert(strcmp(sequence, "") != 0); std::string upSeq = sequence; upSeq[0] = 'N'; assert(samRecord.setSequence(upSeq.c_str())); // write the sequence. assert(samOut.WriteRecord(samHeader, samRecord)); } // Should have exited only when done reading. assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS); } void modifyFirstBaseLong() { SamFile samIn; // Open the file for reading. assert(samIn.OpenForRead("statusTests/HalfG.bam")); SamFile samOut; // Open the file for writing. assert(samOut.OpenForWrite("results/updateSeq.bam")); // Read the sam header. SamFileHeader samHeader; assert(samIn.ReadHeader(samHeader)); assert(samOut.WriteHeader(samHeader)); // Read the sam records. SamRecord samRecord; // Keep reading records until the end of the file is reached. 
while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so check to see // if it is valid. SamValidationErrors samValidationErrors; assert(SamValidator::isValid(samHeader, samRecord, samValidationErrors)); // Get the sequence. const char* sequence = samRecord.getSequence(); assert(strcmp(sequence, "") != 0); std::string upSeq = sequence; upSeq[0] = 'N'; assert(samRecord.setSequence(upSeq.c_str())); // write the sequence. assert(samOut.WriteRecord(samHeader, samRecord)); } // Should have exited only when done reading. assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS); } void testModifyVar() { #ifdef __ZLIB_AVAILABLE__ modifyFirstBase(); #endif modifyVar modTest; modTest.testModifyVar("testFiles/testSam.sam", true); modTest.testModifyVar("testFiles/testSam.sam", false); #ifdef __ZLIB_AVAILABLE__ modTest.testModifyVar("testFiles/testBam.bam", true); modTest.testModifyVar("testFiles/testBam.bam", false); #endif } void modifyVar::testModifyVar(const char* filename, bool valBufFirst) { myFilename = filename; myValBufFirst = valBufFirst; testModifyReadNameOnlySameLength(); testModifyCigarOnlySameLength(); testModifySequenceOnlySameLength(); testModifyQualityOnlySameLength(); testRemoveQuality(); testShortenQuality(); testLengthenQuality(); testShortenReadName(); testShortenCigar(); testShortenSequence(); testLengthenReadName(); testLengthenCigar(); testLengthenSequence(); testRemoveCigar(); testRemoveSequence(); testLengthenSequenceAndQuality(); } void modifyVar::testModifyReadNameOnlySameLength() { resetExpected(); openAndRead1Rec(); // Set the Read Name - same length, just different name. expectedReadNameString = "1:1011:G:255+17M15D20M"; samRecord.setReadName(expectedReadNameString.c_str()); validate(); } void modifyVar::testModifyCigarOnlySameLength() { resetExpected(); openAndRead1Rec(); // Set the Cigar - same length, just different values. 
expectedCigarString = "3M2I"; samRecord.setCigar(expectedCigarString.c_str()); // The new Cigar for record 1 is 3M2I // 3M = 3 << 4 | 0 = 0x30 // 2I = 2 << 4 | 1 = 0x21 expectedCigarBufLen = 2; expectedCigarBuffer[0] = 0x30; expectedCigarBuffer[1] = 0x21; validate(); } void modifyVar::testModifySequenceOnlySameLength() { resetExpected(); openAndRead1Rec(); // Set the Sequence - same length, just different values. expectedSequenceString = "NCGAN"; samRecord.setSequence(expectedSequenceString.c_str()); // NCGAN = NC GA N = 0xF2 0x41 0xF0 expectedSequenceBuffer[0] = 0xF2; expectedSequenceBuffer[1] = 0x41; expectedSequenceBuffer[2] = 0xF0; validate(); } void modifyVar::testModifyQualityOnlySameLength() { resetExpected(); openAndRead1Rec(); // Set the Quality - same length, just different values. expectedQualityString = "!>6+!"; samRecord.setQuality(expectedQualityString.c_str()); validate(); } void modifyVar::testRemoveQuality() { resetExpected(); openAndRead1Rec(); // Set the Quality - to "*" - does not affect the length since the // sequence field drives the length. expectedQualityString = "*"; samRecord.setQuality(expectedQualityString.c_str()); validate(); } void modifyVar::testShortenQuality() { resetExpected(); openAndRead1Rec(); // Set the Quality - shorten, but doesn't affect the length since // the sequence field drives the length. expectedQualityString = "!!"; samRecord.setQuality(expectedQualityString.c_str()); validate(); } void modifyVar::testLengthenQuality() { resetExpected(); openAndRead1Rec(); // Set the Quality - lengthen, but doesn't affect the length since // the sequence field drives the length. 
expectedQualityString = "!!@@##"; samRecord.setQuality(expectedQualityString.c_str()); validate(); } void modifyVar::testShortenReadName() { resetExpected(); openAndRead1Rec(); // Set the Read Name - shorter length expectedReadNameString = "1:1011:G:255"; samRecord.setReadName(expectedReadNameString.c_str()); validate(); } void modifyVar::testShortenCigar() { resetExpected(); openAndRead1Rec(); // Set the Cigar - shorter length expectedCigarString = "5M"; samRecord.setCigar(expectedCigarString.c_str()); // The new Cigar for record 1 is 5M // 5M = 5 << 4 | 0 = 0x50 expectedCigarBufLen = 1; expectedCigarBuffer[0] = 0x50; validate(); } void modifyVar::testShortenSequence() { resetExpected(); openAndRead1Rec(); // Set the Sequence - shorter length. expectedSequenceString = "CCGA"; samRecord.setSequence(expectedSequenceString.c_str()); // CCGA = CC GA = 0x22 0x41 expectedSequenceBuffer[0] = 0x22; expectedSequenceBuffer[1] = 0x41; validate(); } void modifyVar::testLengthenReadName() { resetExpected(); openAndRead1Rec(); // Set the Read Name - longer. expectedReadNameString = "1:1011:G:255+17M15D20M:1111111"; samRecord.setReadName(expectedReadNameString.c_str()); validate(); } void modifyVar::testLengthenCigar() { resetExpected(); openAndRead1Rec(); // Set the Cigar - longer length. expectedCigarString = "3M2D2I"; samRecord.setCigar(expectedCigarString.c_str()); // The new Cigar for record 1 is 3M2I // 3M = 3 << 4 | 0 = 0x30 // 2D = 2 << 2 | 1 = 0x22 // 2I = 2 << 4 | 1 = 0x21 expectedCigarBufLen = 3; expectedCigarBuffer[0] = 0x30; expectedCigarBuffer[1] = 0x22; expectedCigarBuffer[2] = 0x21; validate(); } void modifyVar::testLengthenSequence() { resetExpected(); openAndRead1Rec(); // Set the Sequence - longer length. 
expectedSequenceString = "CCGAATT"; samRecord.setSequence(expectedSequenceString.c_str()); // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80 expectedSequenceBuffer[0] = 0x22; expectedSequenceBuffer[1] = 0x41; expectedSequenceBuffer[2] = 0x18; expectedSequenceBuffer[3] = 0x80; validate(); } void modifyVar::testRemoveCigar() { resetExpected(); openAndRead1Rec(); // Set the Cigar - same length, just different values. expectedCigarString = "*"; expectedCigarBufLen = 0; samRecord.setCigar(expectedCigarString.c_str()); validate(); } void modifyVar::testRemoveSequence() { resetExpected(); openAndRead1Rec(); // Set the Sequence - shorter length. expectedSequenceString = "*"; samRecord.setSequence(expectedSequenceString.c_str()); validate(); } void modifyVar::testLengthenSequenceAndQuality() { resetExpected(); openAndRead1Rec(); // Set the Sequence & quality - longer. expectedSequenceString = "CCGAATT"; expectedQualityString = "!@#$%^&"; samRecord.setSequence(expectedSequenceString.c_str()); samRecord.setQuality(expectedQualityString.c_str()); // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80 expectedSequenceBuffer[0] = 0x22; expectedSequenceBuffer[1] = 0x41; expectedSequenceBuffer[2] = 0x18; expectedSequenceBuffer[3] = 0x80; validate(); } void modifyVar::validate() { if(myValBufFirst) { // get the record. const bamRecordStruct* recordBuffer = (const bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffer. validateReadName(recordBuffer); validateCigar(recordBuffer); validateSequence(recordBuffer); validateQuality(recordBuffer); validateTags(recordBuffer); // Validate the strings. validateReadNameString(); validateCigarString(); validateSequenceString(); validateQualityString(); validateTagsString(); } else { // get the record. const bamRecordStruct* recordBuffer = (const bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffer. 
validateReadName(recordBuffer); validateCigar(recordBuffer); validateSequence(recordBuffer); validateQuality(recordBuffer); validateTags(recordBuffer); // Validate the strings. validateReadNameString(); validateCigarString(); validateSequenceString(); validateQualityString(); validateTagsString(); } } void modifyVar::validateReadName(const bamRecordStruct* recordBuffer) { const char* varPtr = (const char*)&(recordBuffer->myData); unsigned int len = expectedReadNameString.length(); for(unsigned int i = 0; i < len; i++) { assert(varPtr[i] == expectedReadNameString[i]); } // Verify ending null. assert(varPtr[len] == 0); // verify the length - add one for the terminating null. assert(recordBuffer->myReadNameLength == expectedReadNameString.length() + 1); } void modifyVar::validateCigar(const bamRecordStruct* recordBuffer) { const unsigned char* cigarStart = (const unsigned char*)&(recordBuffer->myData) + recordBuffer->myReadNameLength; unsigned int* varPtr = (unsigned int*)cigarStart; for(int i = 0; i < expectedCigarBufLen; i++) { assert(varPtr[i] == expectedCigarBuffer[i]); } assert(recordBuffer->myCigarLength == expectedCigarBufLen); } void modifyVar::validateSequence(const bamRecordStruct* recordBuffer) { // Calculate the sequence length. 
int expectedReadLen = expectedSequenceString.length(); int seqLen = (expectedReadLen + 1)/2; if(expectedSequenceString == "*") { expectedReadLen = 0; seqLen = 0; } const unsigned char* sequencePtr = (const unsigned char*)&(recordBuffer->myData) + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4); for(int i = 0; i < seqLen; i++) { assert(sequencePtr[i] == expectedSequenceBuffer[i]); } assert(recordBuffer->myReadLength == expectedReadLen); } void modifyVar::validateQuality(const bamRecordStruct* recordBuffer) { int expectedReadLen = expectedSequenceString.length(); int seqLen = (expectedReadLen + 1)/2; if(expectedSequenceString == "*") { expectedReadLen = 0; seqLen = 0; } const uint8_t* qualityPtr = (const unsigned char*)&(recordBuffer->myData) + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4) + seqLen; int qualityLen = expectedQualityString.length(); for(int i = 0; i < expectedReadLen; i++) { if(expectedQualityString == "*") { // no quality, so check for 0xFF. assert(qualityPtr[i] == 0xFF); } else if(i >= qualityLen) { // Quality is shorter than the sequence, so should be padded with // 0xFF. assert(qualityPtr[i] == 0xFF); } else { assert(qualityPtr[i] == (expectedQualityString[i] - 33)); } } assert(recordBuffer->myReadLength == expectedReadLen); } void modifyVar::validateTags(const bamRecordStruct* recordBuffer) { const unsigned char* tagsPtr = (const unsigned char*)&(recordBuffer->myData) + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4) + (recordBuffer->myReadLength + 1)/2 + recordBuffer->myReadLength; for(int i = 0; i < expectedTagsLen; i++) { assert(tagsPtr[i] == expectedTagsBuffer[i]); } // Calculate expected block size - from the start of the buffer to the // start of the tags plus the tags length - minus the size of the blocksize // field. 
int32_t expectedBlockSize = tagsPtr - (const unsigned char*)(recordBuffer) + expectedTagsLen - 4; assert(recordBuffer->myBlockSize == expectedBlockSize); } void modifyVar::validateTagsString() { char tag[3]; char type; void* value; assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'A'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'M'); assert(tag[1] == 'D'); assert(type == 'Z'); assert(*(String*)value == "37"); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'N'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'X'); assert(tag[1] == 'T'); assert(type == 'A'); assert(*(char*)value == 'R'); // No more tags, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); } void modifyVar::validateReadNameString() { assert(samRecord.getReadName() == expectedReadNameString); } void modifyVar::validateCigarString() { assert(samRecord.getCigar() == expectedCigarString); } void modifyVar::validateSequenceString() { assert(samRecord.getSequence() == expectedSequenceString); } void modifyVar::validateQualityString() { assert(samRecord.getQuality() == expectedQualityString); } void modifyVar::resetExpected() { expectedReadNameString = "1:1011:F:255+17M15D20M"; expectedCigarString = "5M2D"; expectedSequenceString = "CCGAA"; expectedQualityString = "6>6+4"; // The default Cigar for record 1 is 5M2D // 5M = 5 << 4 | 0 = 0x50 // 2D = 2 << 4 | 2 = 0x22 expectedCigarBufLen = 2; expectedCigarBuffer[0] = 0x50; expectedCigarBuffer[1] = 0x22; // CCGAA = CC GA A = 0x22 0x41 0x10 expectedSequenceBuffer[0] = 0x22; expectedSequenceBuffer[1] = 0x41; expectedSequenceBuffer[2] = 0x10; expectedTagsLen = 18; expectedTagsBuffer[0] = 'A'; expectedTagsBuffer[1] 
// Opens myFilename with samIn and leaves the file's first record in
// samRecord (and its header in samHeader) for the calling test to modify.
// Each step is asserted, so a file that cannot be opened or read aborts the
// test run.
// NOTE(review): this is called once per test case on the same samIn object
// without an explicit Close() in between - presumably OpenForRead() resets
// an already-open SamFile; confirm against SamFile.
void modifyVar::openAndRead1Rec()
{
    // Open the file for reading.
    assert(samIn.OpenForRead(myFilename));

    // Read the sam header.
    assert(samIn.ReadHeader(samHeader));

    // Read the first record.
    assert(samIn.ReadRecord(samHeader, samRecord));
}
*/ void modifyFirstBase(); void modifyFirstBaseLong(); void testModifyVar(); class modifyVar { public: void testModifyVar(const char* filename, bool valBufFirst); private: void testModifyReadNameOnlySameLength(); void testModifyCigarOnlySameLength(); void testModifySequenceOnlySameLength(); void testModifyQualityOnlySameLength(); void testRemoveQuality(); void testShortenQuality(); void testLengthenQuality(); void testShortenReadName(); void testShortenCigar(); void testShortenSequence(); void testLengthenReadName(); void testLengthenCigar(); void testLengthenSequence(); void testRemoveCigar(); void testRemoveSequence(); void testLengthenSequenceAndQuality(); void validate(); void validateReadName(const bamRecordStruct* recordBuffer); void validateCigar(const bamRecordStruct* recordBuffer); void validateSequence(const bamRecordStruct* recordBuffer); void validateQuality(const bamRecordStruct* recordBuffer); void validateTags(const bamRecordStruct* recordBuffer); void validateReadNameString(); void validateCigarString(); void validateSequenceString(); void validateQualityString(); void validateTagsString(); // Open and read the first record. void openAndRead1Rec(); void resetExpected(); // Variables. const char* myFilename; bool myValBufFirst; // Rather than passing around all these variables, just store them in the class. SamFile samIn; SamFileHeader samHeader; SamRecord samRecord; const bamRecordStruct* recordBuffer; // Expected values. int expectedCigarBufLen; unsigned int expectedCigarBuffer[100]; unsigned char expectedSequenceBuffer[100]; int expectedTagsLen; unsigned char expectedTagsBuffer[100]; // Expected values for the strings. 
std::string expectedReadNameString; std::string expectedCigarString; std::string expectedSequenceString; std::string expectedQualityString; }; libStatGen-1.0.14/bam/test/ReadFiles.cpp000066400000000000000000000777461254730101300177320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "ReadFiles.h" #include "TestValidate.h" #include "SamTags.h" #include void testReadSam() { SamFile inSam; assert(inSam.OpenForRead("testFiles/testSam.sam")); // Call generic test which since the sam and bam are identical, should // contain the same results. testRead(inSam); inSam.Close(); testFlagRead("testFiles/testSam.sam"); } void testReadBam() { SamFile inSam; assert(inSam.OpenForRead("testFiles/testBam.bam")); // Call generic test which since the sam and bam are identical, should // contain the same results. testRead(inSam); inSam.Close(); testFlagRead("testFiles/testBam.bam"); } void testRead(SamFile &inSam) { // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); testCopyHeader(samHeader); testModHeader(samHeader); SamRecord samRecord; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Set a new quality and get the buffer. 
samRecord.setQuality("ABCDE"); validateRead1ModQuality(samRecord); // void* buffer = samRecord.getRecordBuffer(); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead3(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead4(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead6(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead9(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); } void testAddHeaderAndTagToFile(const char* inputName, const char* outputName) { SamFile inSam, outSam; assert(inSam.OpenForRead(inputName)); assert(outSam.OpenForWrite(outputName)); // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); // Add a header line. assert(samHeader.addHeaderLine("@RG\tID:myID\tSM:mySM") == false); assert(samHeader.addHeaderLine("@RG\tID:myID3\tSM:mySM") == true); // Write Header assert(outSam.WriteHeader(samHeader)); SamRecord samRecord; assert(inSam.ReadRecord(samHeader, samRecord)); // validateRead1(samRecord); // Add two tags. assert(samRecord.addIntTag("XA", 123)); assert(samRecord.addIntTag("XA", 456)); assert(samRecord.addTag("RR", 'Z', "myID1")); assert(samRecord.addTag("RR", 'Z', "myID2")); // Write as Sam. assert(outSam.WriteRecord(samHeader, samRecord)); // TODO, add test to verify it was written correctly. // Read a couple of records to make sure it properly can read them even // if they are bigger than the original. 
assert(inSam.ReadRecord(samHeader, samRecord)); assert(inSam.ReadRecord(samHeader, samRecord)); // Check the MD tag, which requires the reference. GenomeSequence reference("testFiles/chr1_partial.fa"); assert(SamTags::isMDTagCorrect(samRecord, reference) == false); String newMDTag; SamTags::createMDTag(newMDTag, samRecord, reference); assert(newMDTag == "2T1N0"); assert(SamTags::updateMDTag(samRecord, reference)); // Write as Sam. assert(outSam.WriteRecord(samHeader, samRecord)); } // Test reading a file, validating it is sorted. void testValidateSortedRead() { // Open a file for reading. SamFile inSam(ErrorHandler::RETURN); assert(inSam.OpenForRead("testFiles/testSam.sam")); // Set the validation to COORDINATE. inSam.setSortedValidation(SamFile::COORDINATE); // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); SamRecord samRecord; // Succeed, first record. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Succeed, higher coordinate. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); // Failed sort order - due to coord. assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead3(samRecord); // Failed sort order - due to coord. assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead4(samRecord); // Succeed, new reference id assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); // Fail, previous reference id. assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead6(samRecord); // Succeed, same reference id, higher coord. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); // Succeed, *, new reference id. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); // Fail, reference id is not * assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead9(samRecord); // Succeed, valid reference id, and no coordinate. 
assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); //////////////////////////////////////////// // Reopen the file for reading assert(inSam.OpenForRead("testFiles/testSam.sam")); // Set the validation to QUERY_NAME. inSam.setSortedValidation(SamFile::QUERY_NAME); // Read the SAM Header. assert(inSam.ReadHeader(samHeader)); // Succeed, first record. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Succeed, same name. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead3(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead4(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead6(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); // Succeed - std sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); // Succeed - numeric sort (Y<18) assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead9(samRecord); // Succeed - std sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); //////////////////////////////////////////// // Reopen the file for reading assert(inSam.OpenForRead("testFiles/testSam.sam")); // Set the validation to the SO Flag. Not set, so it is UNSORTED, so // all reads should pass. inSam.setSortedValidation(SamFile::FLAG); // Read the SAM Header. 
assert(inSam.ReadHeader(samHeader)); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead3(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead4(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead6(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead9(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); //////////////////////////////////////////// // Reopen for reading SO FLAG set to coordinate. assert(inSam.OpenForRead("testFiles/testSamSOcoord.sam")); // Set the validation to SO FLAG which is set to coordinate. inSam.setSortedValidation(SamFile::FLAG); // Read the SAM Header. assert(inSam.ReadHeader(samHeader)); // Succeed, first record. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Succeed, higher coordinate. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); // Failed sort order - due to coord. assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead3(samRecord); // Failed sort order - due to coord. assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead4(samRecord); // Succeed, new reference id assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); // Fail, previous reference id. assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead6(samRecord); // Succeed, same reference id, higher coord. 
assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); // Succeed, *, new reference id. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); // Fail, reference id is not * assert(inSam.ReadRecord(samHeader, samRecord) == false); validateRead9(samRecord); // Succeed, valid reference id, and no coordinate. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); //////////////////////////////////////////// // Reopen the file for reading assert(inSam.OpenForRead("testFiles/testSamSOquery.sam")); // Set the validation to FLAG, SO set to queryname. inSam.setSortedValidation(SamFile::FLAG); // Read the SAM Header. assert(inSam.ReadHeader(samHeader)); // Succeed, first record. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Succeed, same name. assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead3(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead4(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead6(samRecord); // Succeeds - numeric sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); // Succeed - std sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); // Succeed - numeric sort (Y<18) assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead9(samRecord); // Succeed - std sort assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); //////////////////////////////////////////// // Reopen the file for reading, SO flag set to junk. 
assert(inSam.OpenForRead("testFiles/testSamSOinvalid.sam")); // Set the validation to the SO Flag. Not set to anything valid, // so it is considered UNSORTED, so all reads should pass. inSam.setSortedValidation(SamFile::FLAG); // Read the SAM Header. assert(inSam.ReadHeader(samHeader)); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead3(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead4(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead6(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead9(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); } void validateRead1ModQuality(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 1 // Create record structure for validating. 
int expectedBlockSize = 89; const char* expectedReferenceName = "1"; const char* expectedMateReferenceName = "1"; const char* expectedMateReferenceNameOrEqual = "="; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 0; expectedRecordPtr->myPosition = 1010; expectedRecordPtr->myReadNameLength = 23; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 2; expectedRecordPtr->myFlag = 73; expectedRecordPtr->myReadLength = 5; expectedRecordPtr->myMateReferenceID = 0; expectedRecordPtr->myMatePosition = 1010; expectedRecordPtr->myInsertSize = 0; // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == 1016); assert(samRecord.get1BasedAlignmentEnd() == 1017); assert(samRecord.getAlignmentLength() == 7); assert(samRecord.get0BasedUnclippedStart() == 1010); assert(samRecord.get1BasedUnclippedStart() == 1011); assert(samRecord.get0BasedUnclippedEnd() == 1016); assert(samRecord.get1BasedUnclippedEnd() == 1017); // Check the accessors. 
assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0); assert(strcmp(samRecord.getCigar(), "5M2D") == 0); assert(strcmp(samRecord.getSequence(), "CCGAA") == 0); assert(strcmp(samRecord.getQuality(), "ABCDE") == 0); assert(samRecord.getNumOverlaps(1010, 1017) == 5); assert(samRecord.getNumOverlaps(1010, 1016) == 5); assert(samRecord.getNumOverlaps(1012, 1017) == 3); assert(samRecord.getNumOverlaps(1015, 1017) == 0); assert(samRecord.getNumOverlaps(1017, 1010) == 0); assert(samRecord.getNumOverlaps(1013, 1011) == 0); assert(samRecord.getNumOverlaps(-1, 1017) == 5); // Reset the tag iter, since the tags have already been read. samRecord.resetTagIter(); // Check the tags. 
assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'A'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'M'); assert(tag[1] == 'D'); assert(type == 'Z'); assert(*(String*)value == "37"); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'N'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'X'); assert(tag[1] == 'T'); assert(type == 'A'); assert(*(char*)value == 'R'); // No more tags, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. 
for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The First cigar is 5M which is 5 << 4 | 0 = 80 assert(*(unsigned int*)varPtr == 80); // Increment the varptr the size of an int. varPtr += 4; // The 2nd cigar is 2D which is 2 << 4 | 2 = 34 assert(*(unsigned int*)varPtr == 34); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. // CC = 0x22 assert(*varPtr == 0x22); varPtr++; // GA = 0x41 assert(*varPtr == 0x41); varPtr++; // A = 0x10 assert(*varPtr == 0x10); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } // Validate the tags. assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'D'); varPtr++; assert(*varPtr == 'Z'); varPtr++; assert(*varPtr == '3'); varPtr++; assert(*varPtr == '7'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'N'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'X'); varPtr++; assert(*varPtr == 'T'); varPtr++; assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'R'); varPtr++; } void testModHeader(SamFileHeader& samHeader) { // Check the header line. 
std::string headerString = ""; assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\tLB:library2\n@CO\tComment 1\n@CO\tComment 2\n"); // Remove a tag - by setting it to "". assert(samHeader.setRGTag("LB", "", "myID2") == true); // Check the header line. assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@CO\tComment 1\n@CO\tComment 2\n"); // Add an HD tag. 
SamHeaderHD* hd = new SamHeaderHD(); assert(hd->setTag("VN", "1.3") == true); assert(samHeader.addHD(hd) == true); assert(strcmp(samHeader.getHDTagValue("VN"), "1.3") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@HD\tVN:1.3\n@CO\tComment 1\n@CO\tComment 2\n"); // Try adding another HD tag. SamHeaderHD* hd2 = new SamHeaderHD(); assert(hd2->setTag("VN", "1.4") == true); assert(samHeader.addHD(hd2) == false); assert(strcmp(samHeader.getHDTagValue("VN"), "1.4") != 0); assert(strcmp(samHeader.getHDTagValue("VN"), "1.3") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@HD\tVN:1.3\n@CO\tComment 1\n@CO\tComment 2\n"); // 
Remove the entire HD Tag. assert(samHeader.removeHD() == true); assert(strcmp(samHeader.getHDTagValue("VN"), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@CO\tComment 1\n@CO\tComment 2\n"); // Remove an entire SQ Tag. assert(strcmp(samHeader.getSQTagValue("LN", "11"), "134452384") == 0); assert(samHeader.removeSQ("11") == true); assert(strcmp(samHeader.getSQTagValue("LN", "11"), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@CO\tComment 1\n@CO\tComment 2\n"); // Try adding a null HD tag. 
hd = NULL; assert(samHeader.addHD(hd) == false); assert(strcmp(samHeader.getHDTagValue("VN"), "") == 0); assert(strcmp(samHeader.getHDTagValue("VN"), "1.4") != 0); assert(strcmp(samHeader.getHDTagValue("VN"), "1.3") != 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@CO\tComment 1\n@CO\tComment 2\n"); // Try adding a null SQ tag. SamHeaderSQ* sq = NULL; assert(samHeader.addSQ(sq) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@CO\tComment 1\n@CO\tComment 2\n"); // Try adding an HD tag again. 
assert(samHeader.addHD(hd2) == true); assert(strcmp(samHeader.getHDTagValue("VN"), "1.4") == 0); assert(strcmp(samHeader.getHDTagValue("VN"), "1.3") != 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\n@HD\tVN:1.4\n@CO\tComment 1\n@CO\tComment 2\n"); // TODO Get the comments. } void testFlagRead(const char* fileName) { SamFile inSam; SamFileHeader samHeader; SamRecord samRecord; //////////////////////////////////////////////////////////// // Required flag 0x48 (only flag 73 matches) // Exclude nothing assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x48, 0x0); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // No required flags. // Exclude 0x48. This leaves just the one read with flag 133. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x0, 0x48); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x40 // Exclude 0x48. 
// This will not find any records since the exclude and required conflict. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x40, 0x48); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x4 // Exclude 0x8. // Only finds flag 133. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x4, 0x8); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x4 // Exclude nothing // Finds flags 133 & 141. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x4, 0x0); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); } void testCopyHeader(SamFileHeader& samHeader) { // Copy the header. SamFileHeader samHeader2; SamHeaderRecord* recPtr = samHeader.getNextHeaderRecord(); while(recPtr != NULL) { samHeader2.addRecordCopy(*recPtr); recPtr = samHeader.getNextHeaderRecord(); } // Add the comments. std::string nextComment = samHeader.getNextComment(); while(nextComment != SamFileHeader::EMPTY_RETURN) { samHeader2.addComment(nextComment.c_str()); nextComment = samHeader.getNextComment(); } // Validate the header. 
validateHeader(samHeader2); } libStatGen-1.0.14/bam/test/ReadFiles.h000066400000000000000000000021551254730101300173550ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" void testReadSam(); void testReadBam(); void testRead(SamFile &inSam); void testAddHeaderAndTagToFile(const char* inputName, const char* outputName); void testValidateSortedRead(); void validateRead1ModQuality(SamRecord& samRecord); void testModHeader(SamFileHeader& samHeader); void testFlagRead(const char* fileName); void testCopyHeader(SamFileHeader& samHeader); libStatGen-1.0.14/bam/test/SamFileTest.cpp000066400000000000000000000052161254730101300202330ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "SamFileTest.h" #include "SamFile.h" void testSamFile() { SamFileHeader header; // Test open for read via the constructor with return. SamFile samInConstructorReadDefault("testFiles/testSam.sam", SamFile::READ, ErrorHandler::RETURN); assert(samInConstructorReadDefault.WriteHeader(header) == false); assert(samInConstructorReadDefault.ReadHeader(header) == true); // Test open for write via the constructor. SamFile samInConstructorWrite("results/newWrite.sam", SamFile::WRITE, ErrorHandler::RETURN); assert(samInConstructorWrite.ReadHeader(header) == false); assert(samInConstructorWrite.WriteHeader(header) == true); // Test open for read via the constructor SamFile samInConstructorRead("testFiles/testSam.sam", SamFile::READ); bool caughtException = false; try { assert(samInConstructorRead.WriteHeader(header) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(samInConstructorRead.ReadHeader(header) == true); // Test open for write via child class. SamFileWriter samWriteConstructor("results/newWrite1.sam"); caughtException = false; try { assert(samWriteConstructor.ReadHeader(header) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(samWriteConstructor.WriteHeader(header) == true); // Test open for read via child class. 
SamFileReader samReadConstructor("testFiles/testSam.sam"); caughtException = false; try { assert(samReadConstructor.WriteHeader(header) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(samReadConstructor.ReadHeader(header) == true); } libStatGen-1.0.14/bam/test/SamFileTest.h000066400000000000000000000013671254730101300177030ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ void testSamFile(); libStatGen-1.0.14/bam/test/ShiftIndels.cpp000066400000000000000000000036311254730101300202660ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "ShiftIndels.h" #include "SamFile.h" void testShiftIndels() { ShiftIndelsTest::testShift("testFiles/testShift.sam", "results/testShift.sam"); #ifdef __ZLIB_AVAILABLE__ ShiftIndelsTest::testShift("testFiles/testShift.bam", "results/testShift.bam"); ShiftIndelsTest::testShift("testFiles/testShift.bam", "results/testShiftFromBam.sam"); #endif ShiftIndelsTest::testShift("testFiles/testShift.sam", "results/testShiftFromSam.bam"); } void ShiftIndelsTest::testShift(const char* input, const char* output) { SamFile inSam, outSam; assert(inSam.OpenForRead(input)); assert(outSam.OpenForWrite(output)); // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); assert(outSam.WriteHeader(samHeader)); SamRecord samRecord; int readNum = 1; bool shiftResult = true; while(inSam.ReadRecord(samHeader, samRecord)) { if((readNum == 3)|| (readNum == 5)) { shiftResult = false; } else { shiftResult = true; } ++readNum; assert(samRecord.shiftIndelsLeft() == shiftResult); assert(outSam.WriteRecord(samHeader, samRecord)); } } libStatGen-1.0.14/bam/test/ShiftIndels.h000066400000000000000000000015571254730101300177400ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

// Entry point for the indel-shifting tests; runs testShift() over the
// test input/output file pairings.
void testShiftIndels();

// Holder for the indel-shifting test helper.
class ShiftIndelsTest
{
public:
    // Reads every record of `input`, calls shiftIndelsLeft() on it, asserts
    // the expected return value for that record, and writes it to `output`.
    static void testShift(const char* input, const char* output);

private:
};
libStatGen-1.0.14/bam/test/TestCigarHelper.cpp000066400000000000000000001062301254730101300210760ustar00rootroot00000000000000/*
 * Copyright (C) 2011 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see .
 */

#include "TestCigarHelper.h"
#include "TestValidate.h"
#include "CigarHelper.h"
// NOTE(review): the header name of the next include is missing in this copy
// (presumably an <...> name lost during extraction) - restore before building.
#include 

// Free-function entry point: forwards to the CigarHelperTest suite.
void testCigarHelper()
{
    // Call generic test.
    CigarHelperTest::testCigarHelper();
}


// Runs the CigarHelper soft-clip tests: clipping from the beginning first,
// then clipping from the end.
void CigarHelperTest::testCigarHelper()
{
    testSoftClipBeginByRefPos();
    testSoftClipEndByRefPos();
}


void CigarHelperTest::testSoftClipBeginByRefPos()
{
    SamRecord record;
    CigarRoller newCigar;
    std::string newCigarString;
    int32_t newPos = 0;

    // Setup the current Cigar.
    // Each column is one cigar operation; read positions are shown under the
    // operations that consume read bases (S, M, I) and reference positions
    // under those that consume reference bases (M, D).
    // Cigar:   HHHSSSMMMDDDMMMIIIMMMPPPMMMDDDMMMSSSHHH
    // ReadPos:    000000   000011111   111   112222
    // ReadPos:    012345   678901234   567   890123
    // RefPos:        111111111   122   222222223
    // RefPos:        012345678   901   234567890
    const char* origCigar = "3H3S3M3D3M3I3M3P3M3D3M3S3H";
    record.setCigar(origCigar);
    record.set0BasedPosition(10);
    record.setSequence("gggAAATTTCCCTTTGGGAAAggg");

    ////////////////////////////////////////////////////////
    // Clip outside of the range (after).  Everything should be clipped.
assert(CigarHelper::softClipBeginByRefPos(record, 10000, newCigar, newPos) == 23); newCigar.getCigarString(newCigarString); assert(strcmp(newCigarString.c_str(), "3H24S3H") == 0); //////////////////////////////////////////////////////// // Clip outside of the range (before). Nothing should change. assert(CigarHelper::softClipBeginByRefPos(record, 1, newCigar, newPos) == CigarHelper::NO_CLIP); newCigar.getCigarString(newCigarString); assert(strcmp(newCigarString.c_str(), origCigar) == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position of the read. //////////////////////////////////////////////////////// // Clip at the first position. assert(CigarHelper::softClipBeginByRefPos(record, 10, newCigar, newPos) == 3); assert(newPos == 11); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H4S2M3D3M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// // Clip in the middle of the first Match. 
assert(CigarHelper::softClipBeginByRefPos(record, 11, newCigar, newPos) == 4); assert(newPos == 12); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H5S1M3D3M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 12, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H6S3M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 13, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H6S3M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 14, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H6S3M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 15, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H6S3M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 16, newCigar, newPos) == 6); assert(newPos == 17); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H7S2M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 17, newCigar, 
newPos) == 7); assert(newPos == 18); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H8S1M3I3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 18, newCigar, newPos) == 11); assert(newPos == 19); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H12S3M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 19, newCigar, newPos) == 12); assert(newPos == 20); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H13S2M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 20, newCigar, newPos) == 13); assert(newPos == 21); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H14S1M3P3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 21, newCigar, newPos) == 14); assert(newPos == 22); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H15S3M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 22, newCigar, newPos) == 15); assert(newPos == 23); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H16S2M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 23, newCigar, newPos) == 16); assert(newPos == 24); newCigar.getCigarString(newCigarString); 
//std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H17S1M3D3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 24, newCigar, newPos) == 17); assert(newPos == 28); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H18S3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 25, newCigar, newPos) == 17); assert(newPos == 28); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H18S3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 26, newCigar, newPos) == 17); assert(newPos == 28); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H18S3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 27, newCigar, newPos) == 17); assert(newPos == 28); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H18S3M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 28, newCigar, newPos) == 18); assert(newPos == 29); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H19S2M3S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 29, newCigar, newPos) == 19); assert(newPos == 30); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H20S1M3S3H") == 0); 
//////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 30, newCigar, newPos) == 23); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H24S3H") == 0); //////////////////////////////////////////////////////// assert(CigarHelper::softClipBeginByRefPos(record, 31, newCigar, newPos) == 23); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H24S3H") == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position when insertions & deletions // are next to each other. origCigar = "3M3D3I3M"; record.setCigar(origCigar); record.setSequence("GGGAAAGGG"); // Cigar: MMMDDDIIIMMM // ReadPos: 000 000000 // ReadPos: 012 345678 // RefPos: 111111 111 // RefPos: 012345 678 record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 9, newCigar, newPos) == CigarHelper::NO_CLIP); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 10, newCigar, newPos) == 0); assert(newPos == 11); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "1S2M3D3I3M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 11, newCigar, newPos) == 1); assert(newPos == 12); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "2S1M3D3I3M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 12, newCigar, newPos) == 5); assert(newPos == 
16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "6S3M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 13, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "6S3M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 14, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "6S3M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 15, newCigar, newPos) == 5); assert(newPos == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "6S3M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 16, newCigar, newPos) == 6); assert(newPos == 17); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "7S2M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 17, newCigar, newPos) == 7); assert(newPos == 18); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "8S1M") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 18, newCigar, newPos) == 8); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "9S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 19, newCigar, newPos) == 8); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << 
newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "9S") == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position when first non-clip instruction is delete. origCigar = "3H3S3D3M3S3H"; record.setCigar(origCigar); record.setSequence("gggAAAggg"); // Cigar: HHHSSSDDDMMMSSSHHH // ReadPos: 000 000000 // ReadPos: 012 345678 // RefPos: 111111 // RefPos: 012345 record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 9, newCigar, newPos) == CigarHelper::NO_CLIP); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 10, newCigar, newPos) == 2); assert(newPos == 13); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 11, newCigar, newPos) == 2); assert(newPos == 13); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 12, newCigar, newPos) == 2); assert(newPos == 13); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 13, newCigar, newPos) == 3); assert(newPos == 14); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H4S2M3S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 14, newCigar, newPos) 
== 4); assert(newPos == 15); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H5S1M3S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 15, newCigar, newPos) == 8); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 16, newCigar, newPos) == 8); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position when first non-clip instruction is insert. origCigar = "3H3S3I3M3S3H"; record.setCigar(origCigar); record.setSequence("gggAAATTTggg"); // Cigar: HHHSSSIIIMMMSSSHHH // ReadPos: 000000000011 // ReadPos: 012345678901 // RefPos: 111 // RefPos: 012 record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 9, newCigar, newPos) == CigarHelper::NO_CLIP); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 10, newCigar, newPos) == 6); assert(newPos == 11); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H7S2M3S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 11, newCigar, newPos) == 7); assert(newPos == 12); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H8S1M3S3H") == 0); 
record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 12, newCigar, newPos) == 11); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H12S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipBeginByRefPos(record, 13, newCigar, newPos) == 11); assert(newPos == 10); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H12S3H") == 0); } void CigarHelperTest::testSoftClipEndByRefPos() { SamRecord record; CigarRoller newCigar; std::string newCigarString; // Setup the current Cigar. // Cigar: HHHSSSMMMDDDMMMIIIMMMPPPMMMDDDMMMSSSHHH // ReadPos: 000000 000011111 111 112222 // ReadPos: 012345 678901234 567 890123 // RefPos: 111111111 122 222222223 // RefPos: 012345678 901 234567890 const char* origCigar = "3H3S3M3D3M3I3M3P3M3D3M3S3H"; record.setCigar(origCigar); record.set0BasedPosition(10); record.setSequence("gggAAATTTCCCTTTGGGAAAggg"); //////////////////////////////////////////////////////// // Clip outside of the range (after). Nothing should change. assert(CigarHelper::softClipEndByRefPos(record, 10000, newCigar) == CigarHelper::NO_CLIP); newCigar.getCigarString(newCigarString); assert(strcmp(newCigarString.c_str(), origCigar) == 0); //////////////////////////////////////////////////////// // Clip outside of the range (before). Everything should be clipped. assert(CigarHelper::softClipEndByRefPos(record, 1, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H24S3H") == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position of the read. //////////////////////////////////////////////////////// // Clip at the first position. 
assert(CigarHelper::softClipEndByRefPos(record, 10, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H24S3H") == 0); //////////////////////////////////////////////////////// // Clip in the middle of the first Match. assert(CigarHelper::softClipEndByRefPos(record, 11, newCigar) == 4); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S1M20S3H") == 0); //////////////////////////////////////////////////////// // Clip just before the first deletion. assert(CigarHelper::softClipEndByRefPos(record, 12, newCigar) == 5); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S2M19S3H") == 0); //////////////////////////////////////////////////////// // Clip at the first deletion. assert(CigarHelper::softClipEndByRefPos(record, 13, newCigar) == 6); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M18S3H") == 0); //////////////////////////////////////////////////////// // Clip in the middle of the first deletion. assert(CigarHelper::softClipEndByRefPos(record, 14, newCigar) == 6); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M18S3H") == 0); //////////////////////////////////////////////////////// // Clip in the end of the first deletion. assert(CigarHelper::softClipEndByRefPos(record, 15, newCigar) == 6); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M18S3H") == 0); //////////////////////////////////////////////////////// // Clip just after the first deletion (should remove the deletion). 
assert(CigarHelper::softClipEndByRefPos(record, 16, newCigar) == 6); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M18S3H") == 0); //////////////////////////////////////////////////////// // Clip in middle of read after 1st deletion. assert(CigarHelper::softClipEndByRefPos(record, 17, newCigar) == 7); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D1M17S3H") == 0); //////////////////////////////////////////////////////// // Clip in middle of read after 1st deletion. assert(CigarHelper::softClipEndByRefPos(record, 18, newCigar) == 8); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D2M16S3H") == 0); //////////////////////////////////////////////////////// // Clip just after the 1st insertion. assert(CigarHelper::softClipEndByRefPos(record, 19, newCigar) == 12); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I12S3H") == 0); //////////////////////////////////////////////////////// // Clip in middle of the match after 1st insertion. assert(CigarHelper::softClipEndByRefPos(record, 20, newCigar) == 13); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I1M11S3H") == 0); //////////////////////////////////////////////////////// // Clip in middle of the match after 1st insertion. 
assert(CigarHelper::softClipEndByRefPos(record, 21, newCigar) == 14); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I2M10S3H") == 0); //////////////////////////////////////////////////////// // Clip right after the pad assert(CigarHelper::softClipEndByRefPos(record, 22, newCigar) == 15); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M9S3H") == 0); //////////////////////////////////////////////////////// // Clip middle of read after the pad assert(CigarHelper::softClipEndByRefPos(record, 23, newCigar) == 16); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P1M8S3H") == 0); //////////////////////////////////////////////////////// // Clip end of read after the pad before deletion assert(CigarHelper::softClipEndByRefPos(record, 24, newCigar) == 17); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P2M7S3H") == 0); //////////////////////////////////////////////////////// // Clip at start of 2nd deletion. assert(CigarHelper::softClipEndByRefPos(record, 25, newCigar) == 18); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P3M6S3H") == 0); //////////////////////////////////////////////////////// // Clip in 2nd deletion. assert(CigarHelper::softClipEndByRefPos(record, 26, newCigar) == 18); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P3M6S3H") == 0); //////////////////////////////////////////////////////// // Clip in 2nd deletion. 
assert(CigarHelper::softClipEndByRefPos(record, 27, newCigar) == 18); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P3M6S3H") == 0); //////////////////////////////////////////////////////// // Clip right after 2nd deletion. assert(CigarHelper::softClipEndByRefPos(record, 28, newCigar) == 18); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P3M6S3H") == 0); //////////////////////////////////////////////////////// // Clip in middle of last match. assert(CigarHelper::softClipEndByRefPos(record, 29, newCigar) == 19); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P3M3D1M5S3H") == 0); //////////////////////////////////////////////////////// // Clip in middle of last match. assert(CigarHelper::softClipEndByRefPos(record, 30, newCigar) == 20); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3M3D3M3I3M3P3M3D2M4S3H") == 0); //////////////////////////////////////////////////////// // Clip right after the read (no change). assert(CigarHelper::softClipEndByRefPos(record, 31, newCigar) == CigarHelper::NO_CLIP); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position when insertions & deletions // are next to each other. 
origCigar = "3M3D3I3M"; record.setCigar(origCigar); record.setSequence("GGGAAAGGG"); // Cigar: MMMDDDIIIMMM // ReadPos: 000 000000 // ReadPos: 012 345678 // RefPos: 111111 111 // RefPos: 012345 678 record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 9, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "9S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 10, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "9S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 11, newCigar) == 1); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "1M8S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 12, newCigar) == 2); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "2M7S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 13, newCigar) == 3); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3M6S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 14, newCigar) == 3); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3M6S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 15, newCigar) == 3); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3M6S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 16, newCigar) == 6); 
newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3M3D3I3S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 17, newCigar) == 7); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3M3D3I1M2S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 18, newCigar) == 8); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3M3D3I2M1S") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 19, newCigar) == CigarHelper::NO_CLIP); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position when first non-clip instruction is delete. 
origCigar = "3H3S3D3M3S3H"; record.setCigar(origCigar); record.setSequence("gggAAAggg"); // Cigar: HHHSSSDDDMMMSSSHHH // ReadPos: 000 000000 // ReadPos: 012 345678 // RefPos: 111111 // RefPos: 012345 record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 9, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 10, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 11, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 12, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 13, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H9S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 14, newCigar) == 4); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3D1M5S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 15, newCigar) == 5); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3D2M4S3H") == 0); record.setCigar(origCigar); 
assert(CigarHelper::softClipEndByRefPos(record, 16, newCigar) == CigarHelper::NO_CLIP); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// // Test clipping at every position when first non-clip instruction is insert. origCigar = "3H3S3I3M3S3H"; record.setCigar(origCigar); record.setSequence("gggAAATTTggg"); // Cigar: HHHSSSIIIMMMSSSHHH // ReadPos: 000000000011 // ReadPos: 012345678901 // RefPos: 111 // RefPos: 012 record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 9, newCigar) == 0); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H12S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 10, newCigar) == 6); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3I6S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 11, newCigar) == 7); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3I1M5S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 12, newCigar) == 8); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), "3H3S3I2M4S3H") == 0); record.setCigar(origCigar); assert(CigarHelper::softClipEndByRefPos(record, 13, newCigar) == CigarHelper::NO_CLIP); newCigar.getCigarString(newCigarString); //std::cout << newCigarString.c_str() << std::endl; assert(strcmp(newCigarString.c_str(), origCigar) == 0); } 
libStatGen-1.0.14/bam/test/TestCigarHelper.h000066400000000000000000000016711254730101300205460ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" void testCigarHelper(); class CigarHelperTest { public: static void testCigarHelper(); private: static void testSoftClipBeginByRefPos(); static void testSoftClipEndByRefPos(); }; libStatGen-1.0.14/bam/test/TestEquals.cpp000066400000000000000000000567161254730101300201600ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TestEquals.h" #include "SamQuerySeqWithRefHelper.h" #include void testSeqEquals() { // Call generic test which since the sam and bam are identical, should // contain the same results. 
EqualsTest::testEq(EqualsTest::SAM); #ifdef __ZLIB_AVAILABLE__ EqualsTest::testEq(EqualsTest::BAM); #endif } const char* EqualsTest::READ_NAMES[] = {"01:====", "02:===X", "03:==X=", "04:==XX", "05:=X==", "06:=X=X", "07:=XX=", "08:=XXX", "09:X===", "10:X==X", "11:X=X=", "12:X=XX", "13:XX==", "14:XX=X", "15:XXX=", "16:XXXX", "Read:GGCCTA;Ref:CCTA", "Read:CCTA;Ref:CCTA", "Read:CCGTxxxC;Ref:CCxTAACC", "Read:CCxxAC;Ref:CCTAACC", "chromNotInRef", "chromNotInRef1"}; const char* EqualsTest::READ_SEQS_BASES[] = {"CCTA", "CCTT", "CCAA", "CCAT", "CTTA", "CTTT", "CTAA", "CTAT", "TCTA", "TCTT", "TCAA", "TCAT", "TTTA", "TTTT", "TTAA", "TTAT", "GGCCTA", "CCTA", "CCGTC", "CCAC", "CCTA", "CC=A"}; const char* EqualsTest::READ_SEQS_EQUALS[] = {"====", "===T", "==A=", "==AT", "=T==", "=T=T", "=TA=", "=TAT", "T===", "T==T", "T=A=", "T=AT", "TT==", "TT=T", "TTA=", "TTAT", "GG====", "====", "==G==", "====", "CCTA", "CC=A"}; const char* EqualsTest::READ_SEQS_MIXED[] = {"C===", "=C=T", "==AA", "==AT", "=TTA", "CT=T", "=TAA", "=TAT", "T=TA", "TC=T", "TCA=", "TCAT", "TT=A", "TT=T", "TTA=", "TTAT", "GGC=T=", "C=T=", "C=GT=", "C=A=", "CCTA", "CC=A"}; const char* EqualsTest::expectedReferenceName; const char* EqualsTest::expectedMateReferenceName; const char* EqualsTest::expectedMateReferenceNameOrEqual; const char* EqualsTest::expectedCigar; const char* EqualsTest::expectedQuality; std::vector EqualsTest::expectedCigarHex; int EqualsTest::expected0BasedAlignmentEnd; int EqualsTest::expected1BasedAlignmentEnd; int EqualsTest::expectedAlignmentLength; int EqualsTest::expected0BasedUnclippedStart; int EqualsTest::expected1BasedUnclippedStart; int EqualsTest::expected0BasedUnclippedEnd; int EqualsTest::expected1BasedUnclippedEnd; bamRecordStruct EqualsTest::expectedRecord; void EqualsTest::testEq(FileType inputType) { reset(); SamFile inSam; std::string outputBase = "results/out"; if(inputType == SAM) { assert(inSam.OpenForRead("testFiles/testEq.sam")); outputBase += "SamEq"; } else { 
assert(inSam.OpenForRead("testFiles/testEq.bam")); outputBase += "BamEq"; } // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); std::string outputName = outputBase + "Bases.sam"; SamFile outBasesSam( outputName.c_str(), SamFile::WRITE); outputName = outputBase + "Equals.sam"; SamFile outEqualsSam(outputName.c_str(), SamFile::WRITE); outputName = outputBase + "Orig.sam"; SamFile outOrigSam( outputName.c_str(), SamFile::WRITE); outputName = outputBase + "Bases.bam"; SamFile outBasesBam( outputName.c_str(), SamFile::WRITE); outputName = outputBase + "Equals.bam"; SamFile outEqualsBam(outputName.c_str(), SamFile::WRITE); outputName = outputBase + "Orig.bam"; SamFile outOrigBam( outputName.c_str(), SamFile::WRITE); assert(outBasesSam.WriteHeader(samHeader)); assert(outEqualsSam.WriteHeader(samHeader)); assert(outOrigSam.WriteHeader(samHeader)); assert(outBasesBam.WriteHeader(samHeader)); assert(outEqualsBam.WriteHeader(samHeader)); assert(outOrigBam.WriteHeader(samHeader)); outBasesSam.SetWriteSequenceTranslation(SamRecord::BASES); outEqualsSam.SetWriteSequenceTranslation(SamRecord::EQUAL); outOrigSam.SetWriteSequenceTranslation(SamRecord::NONE); outBasesBam.SetWriteSequenceTranslation(SamRecord::BASES); outEqualsBam.SetWriteSequenceTranslation(SamRecord::EQUAL); outOrigBam.SetWriteSequenceTranslation(SamRecord::NONE); GenomeSequence reference("testFiles/chr1_partial.fa"); inSam.SetReference(&reference); SamRecord samRecord; // The set of 16 variations are repeated 3 times: once with all charcters // 1) Matches have the actual bases in them. // 2) Matches have '=' // 3) Matches are mixed between bases and '=' // Since Sequences are 4 characters long, there are 16 variations // of match/mismatch. 
for(int j = 0; j < 16; j++) { assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, j, READ_SEQS_BASES[j]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); } for(int j = 0; j < 16; j++) { assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, j, READ_SEQS_EQUALS[j]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); } for(int j = 0; j < 16; j++) { assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, j, READ_SEQS_MIXED[j]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); } expectedCigar = "2S4M"; expectedCigarHex.clear(); expectedCigarHex.push_back(0x24); expectedCigarHex.push_back(0x40); expected0BasedUnclippedStart = expectedRecord.myPosition-2; expected1BasedUnclippedStart = expected0BasedUnclippedStart + 1; expectedRecord.myBlockSize = 70; expectedRecord.myReadNameLength = 21; expectedRecord.myCigarLength = 2; expectedRecord.myReadLength = 6; expectedQuality = "??I00?"; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, 16, READ_SEQS_MIXED[16]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); 
assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); expectedCigar = "4M4H"; expectedCigarHex.clear(); expectedCigarHex.push_back(0x40); expectedCigarHex.push_back(0x45); expected0BasedUnclippedStart = expectedRecord.myPosition; expected1BasedUnclippedStart = expected0BasedUnclippedStart + 1; expected0BasedUnclippedEnd = expectedRecord.myPosition + 7; expected1BasedUnclippedEnd = expected0BasedUnclippedEnd + 1; expectedRecord.myBlockSize = 65; expectedRecord.myReadNameLength = 19; expectedRecord.myCigarLength = 2; expectedRecord.myReadLength = 4; expectedQuality = "I00?"; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, 17, READ_SEQS_MIXED[17]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); expectedCigar = "1M1P1M1I1M3D1M"; expectedCigarHex.clear(); expectedCigarHex.push_back(0x10); expectedCigarHex.push_back(0x16); expectedCigarHex.push_back(0x10); expectedCigarHex.push_back(0x11); expectedCigarHex.push_back(0x10); expectedCigarHex.push_back(0x32); expectedCigarHex.push_back(0x10); expected0BasedAlignmentEnd = expectedRecord.myPosition + 6; expected1BasedAlignmentEnd = expected0BasedAlignmentEnd + 1; expectedAlignmentLength = 7; expected0BasedUnclippedStart = expectedRecord.myPosition; expected1BasedUnclippedStart = expected0BasedUnclippedStart + 1; expected0BasedUnclippedEnd = expected0BasedAlignmentEnd; expected1BasedUnclippedEnd = expected0BasedUnclippedEnd + 1; expectedRecord.myBlockSize = 95; expectedRecord.myReadNameLength = 27; 
expectedRecord.myCigarLength = 7; expectedRecord.myReadLength = 5; expectedQuality = "I00??"; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, 18, READ_SEQS_MIXED[18]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); expectedCigar = "2M2N2M"; expectedCigarHex.clear(); expectedCigarHex.push_back(0x20); expectedCigarHex.push_back(0x23); expectedCigarHex.push_back(0x20); expected0BasedAlignmentEnd = expectedRecord.myPosition + 5; expected1BasedAlignmentEnd = expected0BasedAlignmentEnd + 1; expectedAlignmentLength = 6; expected0BasedUnclippedStart = expectedRecord.myPosition; expected1BasedUnclippedStart = expected0BasedUnclippedStart + 1; expected0BasedUnclippedEnd = expected0BasedAlignmentEnd; expected1BasedUnclippedEnd = expected0BasedUnclippedEnd + 1; expectedRecord.myBlockSize = 74; expectedRecord.myReadNameLength = 24; expectedRecord.myCigarLength = 3; expectedRecord.myReadLength = 4; expectedQuality = "I00?"; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, 19, READ_SEQS_MIXED[19]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); // Test getNextMatchMismatch. 
SamSingleBaseMatchInfo matchTest; SamQuerySeqWithRefIter queryIter(samRecord, reference, true); assert(queryIter.getNextMatchMismatch(matchTest) == true); assert(matchTest.getType() == SamSingleBaseMatchInfo::MATCH); assert(matchTest.getQueryIndex() == 0); assert(queryIter.getNextMatchMismatch(matchTest) == true); assert(matchTest.getType() == SamSingleBaseMatchInfo::MATCH); assert(matchTest.getQueryIndex() == 1); assert(queryIter.getNextMatchMismatch(matchTest) == true); assert(matchTest.getType() == SamSingleBaseMatchInfo::MATCH); assert(matchTest.getQueryIndex() == 2); assert(queryIter.getNextMatchMismatch(matchTest) == true); assert(matchTest.getType() == SamSingleBaseMatchInfo::MATCH); assert(matchTest.getQueryIndex() == 3); assert(queryIter.getNextMatchMismatch(matchTest) == false); // Check the read that is on a different chormosome not // found in the reference. reset(); expectedRecord.myBlockSize = 56; expectedRecord.myReadNameLength = 14; expectedRecord.myReferenceID = 1; expectedReferenceName = "2"; expectedRecord.myMateReferenceID = 1; expectedMateReferenceName = "2"; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, 20, READ_SEQS_MIXED[20]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); // Check the read that is on a different chormosome and // has '=', but the chromosome is not found in the reference. 
reset(); expectedRecord.myBlockSize = 57; expectedRecord.myReadNameLength = 15; expectedRecord.myReferenceID = 1; expectedReferenceName = "2"; expectedRecord.myMateReferenceID = 1; expectedMateReferenceName = "2"; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateEqRead(samRecord, 21, READ_SEQS_MIXED[21]); assert(outBasesSam.WriteRecord(samHeader, samRecord)); assert(outEqualsSam.WriteRecord(samHeader, samRecord)); assert(outOrigSam.WriteRecord(samHeader, samRecord)); assert(outBasesBam.WriteRecord(samHeader, samRecord)); assert(outEqualsBam.WriteRecord(samHeader, samRecord)); assert(outOrigBam.WriteRecord(samHeader, samRecord)); SamQuerySeqWithRefIter queryIter2(samRecord, reference, true); assert(queryIter2.getNextMatchMismatch(matchTest) == false); } void EqualsTest::reset() { expectedReferenceName = "1"; expectedMateReferenceName = "1"; expectedMateReferenceNameOrEqual = "="; expectedCigar = "4M"; expectedQuality = "I00?"; // The First cigar is 4M which is 4 << 4 | 0 = 0x40 = 64 expectedCigarHex.clear(); expectedCigarHex.push_back(0x40); expectedRecord.myBlockSize = 50; expectedRecord.myReferenceID = 0; expectedRecord.myPosition = 10010; expectedRecord.myReadNameLength = 8; expectedRecord.myMapQuality = 0; expectedRecord.myBin = 4681; expectedRecord.myCigarLength = 1; expectedRecord.myFlag = 73; expectedRecord.myReadLength = 4; expectedRecord.myMateReferenceID = 0; expectedRecord.myMatePosition = 10008; expectedRecord.myInsertSize = 0; expected0BasedAlignmentEnd = 10013; expected1BasedAlignmentEnd = expected0BasedAlignmentEnd + 1; expectedAlignmentLength = 4; expected0BasedUnclippedStart = expectedRecord.myPosition; expected1BasedUnclippedStart = expected0BasedUnclippedStart + 1; expected0BasedUnclippedEnd = expected0BasedAlignmentEnd; expected1BasedUnclippedEnd = expected1BasedAlignmentEnd; } void EqualsTest::validateEqRead(SamRecord& samRecord, int readIndex, const char* actualExpectedSequence) { char tag[3]; char type; void* value; 
////////////////////////////////////////// // Validate Record 1 // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == expected0BasedAlignmentEnd); assert(samRecord.get1BasedAlignmentEnd() == expected1BasedAlignmentEnd); assert(samRecord.getAlignmentLength() == expectedAlignmentLength); assert(samRecord.get0BasedUnclippedStart() == expected0BasedUnclippedStart); assert(samRecord.get1BasedUnclippedStart() == expected1BasedUnclippedStart); assert(samRecord.get0BasedUnclippedEnd() == expected0BasedUnclippedEnd); assert(samRecord.get1BasedUnclippedEnd() == expected1BasedUnclippedEnd); // Check the accessors. assert(samRecord.getBlockSize() == expectedRecord.myBlockSize); assert(samRecord.getReferenceID() == expectedRecord.myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecord.myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecord.myPosition); assert(samRecord.getReadNameLength() == expectedRecord.myReadNameLength); assert(samRecord.getMapQuality() == expectedRecord.myMapQuality); assert(samRecord.getBin() == expectedRecord.myBin); assert(samRecord.getCigarLength() == expectedRecord.myCigarLength); assert(samRecord.getFlag() == expectedRecord.myFlag); assert(samRecord.getReadLength() == expectedRecord.myReadLength); assert(samRecord.getMateReferenceID() == expectedRecord.myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecord.myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecord.myMatePosition); assert(samRecord.getInsertSize() == expectedRecord.myInsertSize); assert(strcmp(samRecord.getReadName(), READ_NAMES[readIndex]) == 0); assert(strcmp(samRecord.getCigar(), expectedCigar) == 0); 
samRecord.setSequenceTranslation(SamRecord::BASES); assert(strcmp(samRecord.getSequence(), READ_SEQS_BASES[readIndex]) == 0); assert(strcmp(samRecord.getQuality(), expectedQuality) == 0); assert(samRecord.getSequence(0) == READ_SEQS_BASES[readIndex][0]); assert(samRecord.getQuality(0) == expectedQuality[0]); assert(samRecord.getSequence(1)== READ_SEQS_BASES[readIndex][1]); assert(samRecord.getQuality(1) == expectedQuality[1]); assert(samRecord.getSequence(2) == READ_SEQS_BASES[readIndex][2]); assert(samRecord.getQuality(2) == expectedQuality[2]); assert(samRecord.getSequence(3) == READ_SEQS_BASES[readIndex][3]); assert(samRecord.getQuality(3) == expectedQuality[3]); assert(strcmp(samRecord.getSequence(SamRecord::EQUAL), READ_SEQS_EQUALS[readIndex]) == 0); assert(samRecord.getSequence(0, SamRecord::EQUAL) == READ_SEQS_EQUALS[readIndex][0]); assert(samRecord.getQuality(0) == expectedQuality[0]); assert(samRecord.getSequence(1, SamRecord::EQUAL) == READ_SEQS_EQUALS[readIndex][1]); assert(samRecord.getQuality(1) == expectedQuality[1]); assert(samRecord.getSequence(2, SamRecord::EQUAL) == READ_SEQS_EQUALS[readIndex][2]); assert(samRecord.getQuality(2) == expectedQuality[2]); assert(samRecord.getSequence(3, SamRecord::EQUAL) == READ_SEQS_EQUALS[readIndex][3]); assert(samRecord.getQuality(3) == expectedQuality[3]); assert(strcmp(samRecord.getSequence(SamRecord::NONE), actualExpectedSequence) == 0); assert(samRecord.getSequence(0, SamRecord::NONE) == actualExpectedSequence[0]); assert(samRecord.getQuality(0) == expectedQuality[0]); assert(samRecord.getSequence(1, SamRecord::NONE) == actualExpectedSequence[1]); assert(samRecord.getQuality(1) == expectedQuality[1]); assert(samRecord.getSequence(2, SamRecord::NONE) == actualExpectedSequence[2]); assert(samRecord.getQuality(2) == expectedQuality[2]); assert(samRecord.getSequence(3, SamRecord::NONE) == actualExpectedSequence[3]); assert(samRecord.getQuality(3) == expectedQuality[3]); 
samRecord.setSequenceTranslation(SamRecord::NONE); assert(strcmp(samRecord.getSequence(), actualExpectedSequence) == 0); assert(samRecord.getSequence(0) == actualExpectedSequence[0]); assert(samRecord.getQuality(0) == expectedQuality[0]); assert(samRecord.getSequence(1) == actualExpectedSequence[1]); assert(samRecord.getQuality(1) == expectedQuality[1]); assert(samRecord.getSequence(2) == actualExpectedSequence[2]); assert(samRecord.getQuality(2) == expectedQuality[2]); assert(samRecord.getSequence(3) == actualExpectedSequence[3]); assert(samRecord.getQuality(3) == expectedQuality[3]); // No tags, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. samRecord.setSequenceTranslation(SamRecord::BASES); validateEqReadBuffer(samRecord, READ_SEQS_BASES[readIndex]); samRecord.setSequenceTranslation(SamRecord::NONE); validateEqReadBuffer(samRecord, actualExpectedSequence); samRecord.setSequenceTranslation(SamRecord::EQUAL); validateEqReadBuffer(samRecord, READ_SEQS_EQUALS[readIndex]); } void EqualsTest::validateEqReadBuffer(SamRecord& samRecord, const char* expectedSequence) { const bamRecordStruct* bufferPtr; unsigned char* varPtr; bufferPtr = (const bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. 
assert(bufferPtr->myBlockSize == expectedRecord.myBlockSize); assert(bufferPtr->myReferenceID == expectedRecord.myReferenceID); assert(bufferPtr->myPosition == expectedRecord.myPosition); assert(bufferPtr->myReadNameLength == expectedRecord.myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecord.myMapQuality); assert(bufferPtr->myBin == expectedRecord.myBin); assert(bufferPtr->myCigarLength == expectedRecord.myCigarLength); assert(bufferPtr->myFlag == expectedRecord.myFlag); assert(bufferPtr->myReadLength == expectedRecord.myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecord.myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecord.myMatePosition); assert(bufferPtr->myInsertSize == expectedRecord.myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecord.myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. for(int i = 0; i < expectedRecord.myCigarLength; i++) { assert(*(unsigned int*)varPtr == expectedCigarHex[i]); // Increment the varptr the size of an int. varPtr += 4; } // Validate the sequence. int expectedSeqHex = 0; for(int i = 0; i < expectedRecord.myReadLength; i++) { int hexChar = 0x0; switch(expectedSequence[i]) { case '=': hexChar = 0x0; break; case 'A': case 'a': hexChar = 0x1; break; case 'C': case 'c': hexChar = 0x2; break; case 'G': case 'g': hexChar = 0x4; break; case 'T': case 't': hexChar = 0x8; break; case 'N': case 'n': hexChar = 0xF; break; } if(i%2 == 0) { expectedSeqHex = hexChar << 4; } else { expectedSeqHex |= hexChar; assert(*varPtr == expectedSeqHex); varPtr++; } } if((expectedRecord.myReadLength%2) != 0) { // Odd number of sequences, so need to check the last one. 
assert(*varPtr == expectedSeqHex); varPtr++; } // Validate the Quality for(int i = 0; i < expectedRecord.myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } } libStatGen-1.0.14/bam/test/TestEquals.h000066400000000000000000000037461254730101300176200ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" void testSeqEquals(); class EqualsTest { public: enum FileType{SAM, BAM}; static void testEq(FileType inputType); private: static void reset(); static void validateEqRead(SamRecord& samRecord, int readIndex, const char* actualExpectedSequence); static void validateEqReadBuffer(SamRecord& samRecord, const char* expectedSequence); static const char* READ_NAMES[]; static const char* READ_SEQS_BASES[]; static const char* READ_SEQS_EQUALS[]; static const char* READ_SEQS_MIXED[]; static const char* expectedReferenceName; static const char* expectedMateReferenceName; static const char* expectedMateReferenceNameOrEqual; static const char* expectedCigar; static const char* expectedQuality; static std::vector expectedCigarHex; static int expected0BasedAlignmentEnd; static int expected1BasedAlignmentEnd; static int expectedAlignmentLength; static int expected0BasedUnclippedStart; static int expected1BasedUnclippedStart; static int expected0BasedUnclippedEnd; static int 
expected1BasedUnclippedEnd; static bamRecordStruct expectedRecord; }; libStatGen-1.0.14/bam/test/TestFilter.cpp000066400000000000000000000114001254730101300201300ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TestFilter.h" #include "TestValidate.h" #include "SamFilter.h" #include void testFilter() { // Call generic test which since the sam and bam are identical, should // contain the same results. FilterTest::testFilter(FilterTest::SAM); #ifdef __ZLIB_AVAILABLE__ FilterTest::testFilter(FilterTest::BAM); #endif } void FilterTest::testFilter(FileType inputType) { SamFile inSam; if(inputType == SAM) { assert(inSam.OpenForRead("testFiles/testSam.sam")); } else { assert(inSam.OpenForRead("testFiles/testBam.bam")); } // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); SamRecord samRecord; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Clip the read, 2 from the front and 2 from the back, which causes 2D to // be dropped. 
assert(SamFilter::softClip(samRecord, 2, 2) == SamFilter::CLIPPED); assert(samRecord.get0BasedPosition() == TestValidate::READ1_POS + 2); std::string expectedCigar = "2S1M2S"; assert(samRecord.getCigar() == expectedCigar); assert(samRecord.getSequence() == TestValidate::READ1_SEQ); assert(samRecord.getQuality() == TestValidate::READ1_QUAL); // Only 1 base, so the end is the same as start assert(samRecord.get0BasedAlignmentEnd() == TestValidate::READ1_POS + 2); assert(samRecord.getAlignmentLength() == 1); assert(samRecord.get0BasedUnclippedStart() == TestValidate::READ1_UNCLIP_START); // The new unclipped end is not the same as the original end because the // 2 deletions are lost. assert(samRecord.get0BasedUnclippedEnd() == TestValidate::READ1_UNCLIP_END - 2); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead3(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead4(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead5(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead6(samRecord); // Clip the read 2 more from the front and 2 from the back. 
assert(SamFilter::softClip(samRecord, 5, 2) == SamFilter::CLIPPED); assert(samRecord.get0BasedPosition() == TestValidate::READ6_POS + 2); expectedCigar = "2H5S1M2S"; assert(samRecord.getCigar() == expectedCigar); assert(samRecord.getSequence() == TestValidate::READ6_SEQ); assert(samRecord.getQuality() == TestValidate::READ6_QUAL); // Only 1 base, so the end is the same as start assert(samRecord.get0BasedAlignmentEnd() == TestValidate::READ6_POS + 2); assert(samRecord.getAlignmentLength() == 1); assert(samRecord.get0BasedUnclippedStart() == TestValidate::READ6_UNCLIP_START); assert(samRecord.get0BasedUnclippedEnd() == TestValidate::READ6_UNCLIP_END); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead7(samRecord); // Clip the read 2 more from the front and 2 morefrom the back. assert(SamFilter::softClip(samRecord, 5, 3) == SamFilter::CLIPPED); assert(samRecord.get0BasedPosition() == TestValidate::READ7_POS + 2); expectedCigar = "5S1M3S3H"; assert(samRecord.getCigar() == expectedCigar); assert(samRecord.getSequence() == TestValidate::READ7_SEQ); assert(samRecord.getQuality() == TestValidate::READ7_QUAL); // Only 1 base, so the end is the same as start assert(samRecord.get0BasedAlignmentEnd() == TestValidate::READ7_POS + 2); assert(samRecord.getAlignmentLength() == 1); assert(samRecord.get0BasedUnclippedStart() == TestValidate::READ7_UNCLIP_START); assert(samRecord.get0BasedUnclippedEnd() == TestValidate::READ7_UNCLIP_END); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead9(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); } libStatGen-1.0.14/bam/test/TestFilter.h000066400000000000000000000016061254730101300176040ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU 
// Test driver for SamFilter; testFilter runs the checks against either the
// SAM or the BAM version of the test input.
class FilterTest
{
public:
    // Selects which test input file testFilter opens.
    enum FileType
    {
        SAM,
        BAM
    };

    // Run the filter test against the input file of the given type.
    static void testFilter(FileType inputType);

private:
};
*/ #include "TestPileup.h" void testPileup() { TestPileup pileupTest; pileupTest.testPileupPosition(); } void TestPileupElement::analyze() { assert(strcmp(getChromosome(), "") == 0); assert(getRefPosition() == 14000); } void TestPileup::testPileupPosition() { assert(pileupPosition(14000) == 0); assert(pileupHead == 14000); assert(pileupStart == 14000); assert(pileupTail == 14000); bool caught = false; try { pileupPosition(13999); } catch(std::exception& e) { caught = true; assert(strcmp(e.what(), "Overflow on the pileup buffer: specifiedPosition = 13999, pileup buffer start position: 14000, pileup buffer end position: 15024") == 0); } assert(caught); caught = false; try { pileupPosition(15025); } catch(std::exception& e) { caught = true; assert(strcmp(e.what(), "Overflow on the pileup buffer: specifiedPosition = 15025, pileup buffer start position: 14000, pileup buffer end position: 15024") == 0); } assert(caught); } libStatGen-1.0.14/bam/test/TestPileup.h000066400000000000000000000020401254730101300176060ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Pileup.h" #include "PileupElementBaseQual.h" void testPileup(); class TestPileupElement : public PileupElementBaseQual { public: // Overwrite to validate result. 
virtual void analyze(); }; class TestPileup : public Pileup { public: void testPileupPosition(); private: }; libStatGen-1.0.14/bam/test/TestPosList.cpp000066400000000000000000000063711254730101300203130ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TestPosList.h" #include #include void testPosList() { TestPosList posListTest; posListTest.testPosList(); } TestPosList::TestPosList() { } TestPosList::~TestPosList() { } void TestPosList::testPosList() { assert(myPosList.size() == 24); for(int i = 0; i < 24; i++) { assert(myPosList.at(i).size() == 100); } bool caught = false; try { myPosList.at(24); } catch(std::out_of_range& oor) { caught = true; } assert(caught == true); ////////////////////////////// // Test accessing for(int i = 0; i < 24; i++) { for(int j = 0; j < 100; j++) { assert(!hasPosition(i, j)); } } ////////////////////////////// // Test setting all for(int i = 0; i < 24; i++) { for(int j = 0; j < 100; j++) { addPosition(i, j); } } for(int i = 0; i < 24; i++) { for(int j = 0; j < 100; j++) { assert(hasPosition(i, j)); } } ////////////////////////////// // Test accessing out of range assert(!hasPosition(-1, 0)); assert(!hasPosition(0, -1)); assert(!hasPosition(100, 0)); assert(!hasPosition(0, 1000)); ////////////////////////////// // Test adding more to ref 4, // but skipping positions. 
for(int j = 300; j < 350; j++) { addPosition(4, j); } for(int j = 0; j < 100; j++) { assert(hasPosition(4, j)); } for(int j = 100; j < 300; j++) { assert(!hasPosition(4, j)); } for(int j = 300; j < 350; j++) { assert(hasPosition(4, j)); } // Test adding a new reference, 30, // position 16. addPosition(30, 16); // Check the size now. assert(myPosList.size() == 31); for(int i = 0; i < 24; i++) { if(i != 4) { assert(myPosList.at(i).size() == 100); } else { assert(myPosList.at(i).size() == 350); } } for(int i = 24; i < 31; i++) { assert(myPosList.at(i).size() == 350); } ////////////////////////////// // Test accessing for(int i = 24; i < 30; i++) { for(int j = 0; j < 350; j++) { assert(!hasPosition(i, j)); } } for(int j = 0; j < 350; j++) { if(j != 16) { assert(!hasPosition(30, j)); } else { assert(hasPosition(30, 16)); } } } libStatGen-1.0.14/bam/test/TestPosList.h000066400000000000000000000016041254730101300177520ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "PosList.h" void testPosList(); class TestPosList : public PosList { public: TestPosList(); ~TestPosList(); void testPosList(); }; libStatGen-1.0.14/bam/test/TestSamCoordOutput.cpp000066400000000000000000000111161254730101300216370ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TestSamCoordOutput.h" #include "TestValidate.h" #include "SamCoordOutput.h" #include "SamRecordPool.h" #include void testSamCoordOutput() { // Call generic test. SamCoordOutputTest::testSamCoordOutput(); } void SamCoordOutputTest::testSamCoordOutput() { SamRecordPool pool(3); SamCoordOutput outputBuffer(pool); SamFile inSam; SamFile outSam; SamFileHeader samHeader; SamRecord* rec1 = NULL; SamRecord* rec2 = NULL; SamRecord* rec3 = NULL; // Open input file and read the header. #ifdef __ZLIB_AVAILABLE__ assert(inSam.OpenForRead("testFiles/testBam.bam")); #else assert(inSam.OpenForRead("testFiles/testSam.sam")); #endif assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); // Check failed to add empty record. assert(!outputBuffer.add(rec1)); // Read the first 3 records from the input file. 
rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead1(*rec1); rec2 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec2) == true); validateRead2(*rec2); rec3 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec3) == true); validateRead3(*rec3); assert(pool.getRecord() == NULL); // Add the first 3 records to the output buffer. // Sorted order is rec 3, 1, 2 assert(outputBuffer.add(rec1)); assert(outputBuffer.add(rec2)); assert(outputBuffer.add(rec3)); // Test writing to the output buffer without having set it. // Should flush just rec3 out. assert(!outputBuffer.flush(0, 100)); // Set the output buffer. outputBuffer.setOutputFile(&outSam, &samHeader); // Open output file and write the header. assert(outSam.OpenForWrite("results/TestSamCoordOutput.sam")); assert(outSam.WriteHeader(samHeader)); // Read another 1 record (reuse record pointers). rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead4(*rec1); assert(outputBuffer.add(rec1)); rec1 = pool.getRecord(); assert(rec1 == NULL); // Flush out just the reads before this position. assert(outputBuffer.flush(0, 1011)); // Read 2 more records. rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead5(*rec1); assert(outputBuffer.add(rec1)); rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead6(*rec1); assert(outputBuffer.add(rec1)); // Can get another record (tests to make sure flushes up to and // including the specified position). If it did not // flush the specified position, there would not be // another record available. rec1 = pool.getRecord(); assert(rec1 != NULL); // Flush out just the reads before this position. assert(outputBuffer.flush(0, 1012)); // Read another record. 
assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead7(*rec1); assert(outputBuffer.add(rec1)); assert(pool.getRecord() == NULL); // Flush out just the reads on chrom 1 (chrom id 0). assert(outputBuffer.flush(0, -1)); // Read another record. rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead8(*rec1); assert(outputBuffer.add(rec1)); assert(pool.getRecord() == NULL); // Flush out the chrom 2 (chrom id 1) reads. assert(outputBuffer.flush(2, 0)); // Read the rest of the records. rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead9(*rec1); assert(outputBuffer.add(rec1)); rec1 = pool.getRecord(); assert(inSam.ReadRecord(samHeader, *rec1) == true); validateRead10(*rec1); assert(outputBuffer.add(rec1)); assert(pool.getRecord() == NULL); // Flush the rest by passing in -1, -1 assert(outputBuffer.flush(-1, -1)); } libStatGen-1.0.14/bam/test/TestSamCoordOutput.h000066400000000000000000000016461254730101300213130ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __TEST_SAM_COORD_OUTPUT_H__ #define __TEST_SAM_COORD_OUTPUT_H__ void testSamCoordOutput(); class SamCoordOutputTest { public: static void testSamCoordOutput(); private: }; #endif libStatGen-1.0.14/bam/test/TestSamRecordHelper.cpp000066400000000000000000000043311254730101300217270ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TestSamRecordHelper.h" #include "TestValidate.h" #include "SamRecordHelper.h" #include void testSamRecordHelper() { // Call generic test. SamRecordHelperTest::testSamRecordHelper("testFiles/testSam.sam"); // SamRecordHelperTest::testSamRecordHelper("testFiles/testBam.bam"); } void SamRecordHelperTest::testSamRecordHelper(const char* fileName) { SamFile inSam; assert(inSam.OpenForRead(fileName)); SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); SamRecord samRecord; assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); // Validate the entire sequence matches. assert(SamRecordHelper::checkSequence(samRecord, TestValidate::READ1_POS, TestValidate::READ1_SEQ.c_str()) == 0); // The read start position is 1010. // The sequence is CCGAA. assert(SamRecordHelper::checkSequence(samRecord, 1010, "CCGAA") == 0); // Test not matching. 
assert(SamRecordHelper::checkSequence(samRecord, 1010, "NNNNN") == -1); // Test match, but not at the start. assert(SamRecordHelper::checkSequence(samRecord, 1011, "CGA") == 1); // Test match not at the start, but to the end. assert(SamRecordHelper::checkSequence(samRecord, 1011, "CGAA") == 1); // Test run over the end. assert(SamRecordHelper::checkSequence(samRecord, 1011, "CGAAC") == -1); } libStatGen-1.0.14/bam/test/TestSamRecordHelper.h000066400000000000000000000016001254730101300213700ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamFile.h" void testSamRecordHelper(); class SamRecordHelperTest { public: static void testSamRecordHelper(const char* fileName); private: }; libStatGen-1.0.14/bam/test/TestSamRecordPool.cpp000066400000000000000000000056661254730101300214350ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TestSamRecordPool.h" #include "SamRecordPool.h" #include void testSamRecordPool() { // Call generic test. SamRecordPoolTest::testSamRecordPool(); } void SamRecordPoolTest::testSamRecordPool() { // Attempt to allocate with max size 0, // fails to get a record. SamRecordPool pool(0); assert(pool.getRecord() == NULL); // Up the max size to 3. pool.setMaxAllocatedRecs(3); // Successfully get 1st record. SamRecord* rec1 = pool.getRecord(); assert(rec1 != NULL); // Successfully get 2nd record. SamRecord* rec2 = pool.getRecord(); assert(rec2 != NULL); assert(rec2 != rec1); // Successfully get 3rd record. SamRecord* rec3 = pool.getRecord(); assert(rec3 != NULL); assert((rec3 != rec1) && (rec3 != rec2)); // Fail to get a 4th record. assert(pool.getRecord() == NULL); // Release a record and confirm its reuse. pool.releaseRecord(rec2); SamRecord* rec = pool.getRecord(); assert(rec == rec2); // Release multiple records and check reuse. pool.releaseRecord(rec3); pool.releaseRecord(rec1); pool.releaseRecord(rec); SamRecord* release1 = pool.getRecord(); SamRecord* release2 = pool.getRecord(); SamRecord* release3 = pool.getRecord(); assert(release1 == rec3); assert(release2 == rec1); assert(release3 == rec); assert(pool.getRecord() == NULL); // Up the max allocated size but don't allocate any, then // reduce the max allocated size and release all the records // but the already allocated records will still be used. pool.setMaxAllocatedRecs(4); pool.setMaxAllocatedRecs(0); pool.releaseRecord(release3); pool.releaseRecord(release1); pool.releaseRecord(release2); rec1 = pool.getRecord(); rec2 = pool.getRecord(); rec3 = pool.getRecord(); assert(rec1 == release3); assert(rec2 == release1); assert(rec3 == release2); assert(pool.getRecord() == NULL); // Up the max allocated size and allocate another record. 
pool.setMaxAllocatedRecs(4); rec = pool.getRecord(); assert(rec != NULL); assert(rec != rec1); assert(rec != rec2); assert(rec != rec3); assert(pool.getRecord() == NULL); } libStatGen-1.0.14/bam/test/TestSamRecordPool.h000066400000000000000000000016411254730101300210670ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __TEST_SAM_RECORD_POOL_H__ #define __TEST_SAM_RECORD_POOL_H__ void testSamRecordPool(); class SamRecordPoolTest { public: static void testSamRecordPool(); private: }; #endif libStatGen-1.0.14/bam/test/TestValidate.cpp000066400000000000000000002234301254730101300204440ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "TestValidate.h" #include "BaseUtilities.h" const std::string TestValidate::READ1_CIGAR = "5M2D"; const std::string TestValidate::READ1_SEQ = "CCGAA"; const std::string TestValidate::READ1_QUAL = "6>6+4"; const std::string TestValidate::READ6_CIGAR = "3S2H5M"; const std::string TestValidate::READ6_SEQ = "TGCACGTN"; const std::string TestValidate::READ6_QUAL = "453;>>>>"; const std::string TestValidate::READ7_CIGAR = "3S5M1S3H"; const std::string TestValidate::READ7_SEQ = "TGCACGTNG"; const std::string TestValidate::READ7_QUAL = "453;>>>>5"; void validateRead1(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 1 // Create record structure for validating. int expectedBlockSize = 89; const char* expectedReferenceName = "1"; const char* expectedMateReferenceName = "1"; const char* expectedMateReferenceNameOrEqual = "="; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; const bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 0; expectedRecordPtr->myPosition = TestValidate::READ1_POS; expectedRecordPtr->myReadNameLength = 23; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 2; expectedRecordPtr->myFlag = 73; expectedRecordPtr->myReadLength = 5; expectedRecordPtr->myMateReferenceID = 0; expectedRecordPtr->myMatePosition = 1010; expectedRecordPtr->myInsertSize = 0; assert(samRecord.getString("MD") == "37"); assert(samRecord.getString("YZ") == ""); assert(samRecord.getInteger("YZ") == -1); float tmpFloat = -1; assert(samRecord.getFloatTag("YZ", tmpFloat) == false); // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == TestValidate::READ1_ALIGN_END); assert(samRecord.get1BasedAlignmentEnd() == (TestValidate::READ1_ALIGN_END + 1)); assert(samRecord.getAlignmentLength() == 
TestValidate::READ1_ALIGN_LEN); assert(samRecord.get1BasedUnclippedStart() == (TestValidate::READ1_UNCLIP_START + 1)); assert(samRecord.get0BasedUnclippedStart() == TestValidate::READ1_UNCLIP_START); assert(samRecord.get1BasedUnclippedEnd() == (TestValidate::READ1_UNCLIP_END + 1)); assert(samRecord.get0BasedUnclippedEnd() == TestValidate::READ1_UNCLIP_END); // Check the accessors. assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0); assert(samRecord.getCigar() == TestValidate::READ1_CIGAR); assert(samRecord.getSequence() == TestValidate::READ1_SEQ); assert(samRecord.getQuality() == TestValidate::READ1_QUAL); assert(samRecord.getSequence(0) == 'C'); assert(samRecord.getQuality(0) == '6'); 
assert(samRecord.getSequence(1) == 'C'); assert(samRecord.getQuality(1) == '>'); assert(samRecord.getSequence(2) == 'G'); assert(samRecord.getQuality(2) == '6'); assert(samRecord.getSequence(3) == 'A'); assert(samRecord.getQuality(3) == '+'); assert(samRecord.getSequence(4) == 'A'); assert(samRecord.getQuality(4) == '4'); bool caught = false; try { samRecord.getSequence(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(-1) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); caught = false; try { samRecord.getQuality(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getQuality(-1) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); caught = false; try { samRecord.getSequence(5); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(5) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); caught = false; try { samRecord.getQuality(5); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getQuality(5) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); assert(samRecord.getNumOverlaps(1010, 1017) == 5); assert(samRecord.getNumOverlaps(1010, 1016) == 5); assert(samRecord.getNumOverlaps(1012, 1017) == 3); assert(samRecord.getNumOverlaps(1015, 1017) == 0); assert(samRecord.getNumOverlaps(1017, 1010) == 0); assert(samRecord.getNumOverlaps(1013, 1011) == 0); assert(samRecord.getNumOverlaps(-1, 1017) == 5); assert(samRecord.getNumOverlaps(1010, -1) == 5); // Check the tags. 
assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'A'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'M'); assert(tag[1] == 'D'); assert(type == 'Z'); assert(*(String*)value == "37"); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'N'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'X'); assert(tag[1] == 'T'); assert(type == 'A'); assert(*(char*)value == 'R'); // No more tags, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (const bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. 
for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The First cigar is 5M which is 5 << 4 | 0 = 80 assert(*(unsigned int*)varPtr == 80); // Increment the varptr the size of an int. varPtr += 4; // The 2nd cigar is 2D which is 2 << 4 | 2 = 34 assert(*(unsigned int*)varPtr == 34); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. // CC = 0x22 assert(*varPtr == 0x22); varPtr++; // GA = 0x41 assert(*varPtr == 0x41); varPtr++; // A = 0x10 assert(*varPtr == 0x10); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } // Validate the tags. assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'D'); varPtr++; assert(*varPtr == 'Z'); varPtr++; assert(*varPtr == '3'); varPtr++; assert(*varPtr == '7'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'N'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'X'); varPtr++; assert(*varPtr == 'T'); varPtr++; assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'R'); varPtr++; } void validateRead2(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 2 // Create record structure for validating. 
int expectedBlockSize = 61; const char* expectedReferenceName = "1"; const char* expectedMateReferenceName = "1"; const char* expectedMateReferenceNameOrEqual = "="; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 0; expectedRecordPtr->myPosition = TestValidate::READ2_POS; expectedRecordPtr->myReadNameLength = 23; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 0; expectedRecordPtr->myFlag = 133; expectedRecordPtr->myReadLength = 4; expectedRecordPtr->myMateReferenceID = 0; expectedRecordPtr->myMatePosition = 1010; expectedRecordPtr->myInsertSize = 0; // Check the fields. bamRecordStruct retrieveRecord; String retrieveReadName; String retrieveCigar; String retrieveSequence; String retrieveQuality; assert(samRecord.getFields(retrieveRecord, retrieveReadName, retrieveCigar, retrieveSequence, retrieveQuality) == true); assert(retrieveRecord.myBlockSize == expectedRecordPtr->myBlockSize); assert(retrieveRecord.myReferenceID == expectedRecordPtr->myReferenceID); assert(retrieveRecord.myPosition == expectedRecordPtr->myPosition); assert(retrieveRecord.myReadNameLength == expectedRecordPtr->myReadNameLength); assert(retrieveRecord.myMapQuality == expectedRecordPtr->myMapQuality); assert(retrieveRecord.myBin == expectedRecordPtr->myBin); assert(retrieveRecord.myCigarLength == expectedRecordPtr->myCigarLength); assert(retrieveRecord.myFlag == expectedRecordPtr->myFlag); assert(retrieveRecord.myReadLength == expectedRecordPtr->myReadLength); assert(retrieveRecord.myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(retrieveRecord.myMatePosition == expectedRecordPtr->myMatePosition); assert(retrieveRecord.myInsertSize == expectedRecordPtr->myInsertSize); // Check the alignment end 
assert(samRecord.getAlignmentLength() == 0); assert(samRecord.get0BasedAlignmentEnd() == 1011); assert(samRecord.get1BasedAlignmentEnd() == 1012); assert(samRecord.get0BasedUnclippedStart() == 1011); assert(samRecord.get1BasedUnclippedStart() == 1012); assert(samRecord.get0BasedUnclippedEnd() == 1011); assert(samRecord.get1BasedUnclippedEnd() == 1012); // Check the accessors. assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0); assert(strcmp(samRecord.getCigar(), "*") == 0); assert(strcmp(samRecord.getSequence(), "CTGT") == 0); assert(strcmp(samRecord.getQuality(), ">>9>") == 0); assert(samRecord.getSequence(0) == 'C'); assert(samRecord.getQuality(0) == '>'); assert(samRecord.getSequence(1) == 'T'); 
assert(samRecord.getQuality(1) == '>'); assert(samRecord.getSequence(2) == 'G'); assert(samRecord.getQuality(2) == '9'); assert(samRecord.getSequence(3) == 'T'); assert(samRecord.getQuality(3) == '>'); bool caught = false; try { samRecord.getSequence(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(-1) is out of range. Index must be between 0 and 3") == 0); } assert(caught == true); caught = false; try { samRecord.getQuality(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getQuality(-1) is out of range. Index must be between 0 and 3") == 0); } assert(caught == true); caught = false; try { samRecord.getSequence(4); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(4) is out of range. Index must be between 0 and 3") == 0); } assert(caught == true); caught = false; try { samRecord.getQuality(4); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getQuality(4) is out of range. Index must be between 0 and 3") == 0); } assert(caught == true); assert(samRecord.getNumOverlaps(1011, 1017) == 0); assert(samRecord.getNumOverlaps(0, 1116) == 0); // No Tags to check, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. 
assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // No cigar to validate. // Validate the sequence. // CT = 0x28 assert(*varPtr == 0x28); varPtr++; // GT = 0x48 assert(*varPtr == 0x48); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } // No tags. } void validateRead3(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 3 // Create record structure for validating. 
int expectedBlockSize = 87; const char* expectedReferenceName = "1"; const char* expectedMateReferenceName = "18"; const char* expectedMateReferenceNameOrEqual = "18"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 0; expectedRecordPtr->myPosition = 74; expectedRecordPtr->myReadNameLength = 21; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 1; expectedRecordPtr->myFlag = 97; expectedRecordPtr->myReadLength = 5; expectedRecordPtr->myMateReferenceID = 17; expectedRecordPtr->myMatePosition = 756; expectedRecordPtr->myInsertSize = 0; // Check the accessors. assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == 
expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "18:462+29M5I3M:F:295") == 0); assert(strcmp(samRecord.getCigar(), "5M") == 0); assert(strcmp(samRecord.getSequence(), "ACGTN") == 0); assert(strcmp(samRecord.getQuality(), ";>>>>") == 0); assert(samRecord.getNumOverlaps(74, 79) == 5); assert(samRecord.getNumOverlaps(74, 78) == 4); assert(samRecord.getNumOverlaps(73, 79) == 5); assert(samRecord.getNumOverlaps(75, 79) == 4); assert(samRecord.getNumOverlaps(0, 179) == 5); assert(samRecord.getNumOverlaps(0, 19) == 0); // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == 78); assert(samRecord.get1BasedAlignmentEnd() == 79); assert(samRecord.getAlignmentLength() == 5); assert(samRecord.get0BasedUnclippedStart() == 74); assert(samRecord.get1BasedUnclippedStart() == 75); assert(samRecord.get0BasedUnclippedEnd() == 78); assert(samRecord.get1BasedUnclippedEnd() == 79); // Check the tags. assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'A'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'M'); assert(tag[1] == 'D'); assert(type == 'Z'); assert(*(String*)value == "30A0C5"); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'N'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 2); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'X'); assert(tag[1] == 'T'); assert(type == 'A'); assert(*(char*)value == 'R'); // No more tags, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. 
assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The cigar is 5M which is 5 << 4 | 0 = 80 assert(*(unsigned int*)varPtr == 80); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. // AC = 0x12 assert(*varPtr == 0x12); varPtr++; // GT = 0x48 assert(*varPtr == 0x48); varPtr++; // N = 0xF0 assert(*varPtr == 0xF0); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } // Validate the tags. 
assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'D'); varPtr++; assert(*varPtr == 'Z'); varPtr++; assert(*varPtr == '3'); varPtr++; assert(*varPtr == '0'); varPtr++; assert(*varPtr == 'A'); varPtr++; assert(*varPtr == '0'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == '5'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'N'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 2); varPtr++; assert(*varPtr == 'X'); varPtr++; assert(*varPtr == 'T'); varPtr++; assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'R'); varPtr++; } void validateRead4(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 4 // Create record structure for validating. int expectedBlockSize = 57; const char* expectedReferenceName = "1"; const char* expectedMateReferenceName = "18"; const char* expectedMateReferenceNameOrEqual = "18"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 0; expectedRecordPtr->myPosition = 74; expectedRecordPtr->myReadNameLength = 21; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 0; expectedRecordPtr->myFlag = 97; expectedRecordPtr->myReadLength = 0; expectedRecordPtr->myMateReferenceID = 17; expectedRecordPtr->myMatePosition = 756; expectedRecordPtr->myInsertSize = 0; // Check the alignment end assert(samRecord.get1BasedUnclippedEnd() == 75); assert(samRecord.get0BasedUnclippedEnd() == 74); assert(samRecord.get0BasedUnclippedStart() == 74); assert(samRecord.get1BasedUnclippedStart() == 75); assert(samRecord.get1BasedAlignmentEnd() == 75); 
assert(samRecord.get0BasedAlignmentEnd() == 74); assert(samRecord.getAlignmentLength() == 0); // Check the accessors. assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "18:462+29M5I3M:F:295") == 0); assert(strcmp(samRecord.getCigar(), "*") == 0); assert(strcmp(samRecord.getSequence(), "*") == 0); assert(strcmp(samRecord.getQuality(), "*") == 0); bool caught = false; try { samRecord.getSequence(0); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(0) is not allowed since sequence = '*'") == 0); } assert(caught == true); caught = false; try { assert(samRecord.getQuality(0) == BaseUtilities::UNKNOWN_QUALITY_CHAR); } catch (std::exception& e) { caught = true; } assert(caught == false); try { 
samRecord.getSequence(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(-1) is not allowed since sequence = '*'") == 0); } assert(caught == true); caught = false; try { assert(samRecord.getQuality(-1) == BaseUtilities::UNKNOWN_QUALITY_CHAR); } catch (std::exception& e) { caught = true; } assert(caught == false); caught = false; try { samRecord.getSequence(5); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(5) is not allowed since sequence = '*'") == 0); } assert(caught == true); caught = false; try { assert(samRecord.getQuality(5) == BaseUtilities::UNKNOWN_QUALITY_CHAR); } catch (std::exception& e) { caught = true; } assert(caught == false); assert(samRecord.getNumOverlaps(74, 79) == 0); assert(samRecord.getNumOverlaps(74, 78) == 0); assert(samRecord.getNumOverlaps(73, 79) == 0); assert(samRecord.getNumOverlaps(75, 79) == 0); assert(samRecord.getNumOverlaps(0, 179) == 0); assert(samRecord.getNumOverlaps(0, 19) == 0); // Check the tag. assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'A'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); // No more Tags to check, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. 
assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // No cigar to validate. // Validate the sequence. // No sequence. // No Quality. // Validate the tags. 
assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; } void validateRead5(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 5 int expectedBlockSize = 87; const char* expectedReferenceName = "2"; const char* expectedMateReferenceName = "18"; const char* expectedMateReferenceNameOrEqual = "18"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 1; expectedRecordPtr->myPosition = 74; expectedRecordPtr->myReadNameLength = 21; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 1; expectedRecordPtr->myFlag = 97; expectedRecordPtr->myReadLength = 5; expectedRecordPtr->myMateReferenceID = 17; expectedRecordPtr->myMatePosition = 756; expectedRecordPtr->myInsertSize = 0; // Check the accessors. 
assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "18:462+29M5I3M:F:295") == 0); assert(strcmp(samRecord.getCigar(), "5M") == 0); assert(strcmp(samRecord.getSequence(), "ACGTN") == 0); assert(strcmp(samRecord.getQuality(), "*") == 0); assert(samRecord.getNumOverlaps(74, 79) == 5); assert(samRecord.getNumOverlaps(74, 78) == 4); assert(samRecord.getNumOverlaps(73, 79) == 5); assert(samRecord.getNumOverlaps(75, 79) == 4); assert(samRecord.getNumOverlaps(0, 179) == 5); assert(samRecord.getNumOverlaps(0, 19) == 0); assert(samRecord.getSequence(0) == 'A'); char expChar = BaseUtilities::UNKNOWN_QUALITY_CHAR; assert(samRecord.getQuality(0) == expChar); assert(samRecord.getSequence(1) == 'C'); assert(samRecord.getQuality(1) == expChar); 
assert(samRecord.getSequence(2) == 'G'); assert(samRecord.getQuality(2) == expChar); assert(samRecord.getSequence(3) == 'T'); assert(samRecord.getQuality(3) == expChar); assert(samRecord.getSequence(4) == 'N'); assert(samRecord.getQuality(4) == expChar); bool caught = false; try { samRecord.getSequence(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(-1) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); caught = false; try { samRecord.getQuality(-1); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getQuality(-1) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); caught = false; try { samRecord.getSequence(5); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getSequence(5) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); caught = false; try { samRecord.getQuality(5); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "SamRecord::getQuality(5) is out of range. Index must be between 0 and 4") == 0); } assert(caught == true); // Check the tags. assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'A'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 0); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'M'); assert(tag[1] == 'D'); assert(type == 'Z'); assert(*(String*)value == "30A0C5"); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'N'); assert(tag[1] == 'M'); assert(type == 'i'); assert(*(char*)value == 2); assert(samRecord.getNextSamTag(tag, type, &value) == true); assert(tag[0] == 'X'); assert(tag[1] == 'T'); assert(type == 'A'); assert(*(char*)value == 'R'); // No more tags, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. 
bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The cigar is 5M which is 5 << 4 | 0 = 80 assert(*(unsigned int*)varPtr == 80); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. // AC = 0x12 assert(*varPtr == 0x12); varPtr++; // GT = 0x48 assert(*varPtr == 0x48); varPtr++; // N = 0xF0 assert(*varPtr == 0xF0); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == 0xFF); varPtr++; } // Validate the tags. 
assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'D'); varPtr++; assert(*varPtr == 'Z'); varPtr++; assert(*varPtr == '3'); varPtr++; assert(*varPtr == '0'); varPtr++; assert(*varPtr == 'A'); varPtr++; assert(*varPtr == '0'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == '5'); varPtr++; assert(*varPtr == 0); varPtr++; assert(*varPtr == 'N'); varPtr++; assert(*varPtr == 'M'); varPtr++; assert(*varPtr == 'C'); varPtr++; assert(*varPtr == 2); varPtr++; assert(*varPtr == 'X'); varPtr++; assert(*varPtr == 'T'); varPtr++; assert(*varPtr == 'A'); varPtr++; assert(*varPtr == 'R'); varPtr++; } void validateRead6(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 6 // Create record structure for validating. int expectedBlockSize = 77; const char* expectedReferenceName = "1"; const char* expectedMateReferenceName = "18"; const char* expectedMateReferenceNameOrEqual = "18"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 0; expectedRecordPtr->myPosition = TestValidate::READ6_POS; expectedRecordPtr->myReadNameLength = 21; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 3; expectedRecordPtr->myFlag = 97; expectedRecordPtr->myReadLength = 8; expectedRecordPtr->myMateReferenceID = 17; expectedRecordPtr->myMatePosition = 756; expectedRecordPtr->myInsertSize = 0; // Check the accessors. 
assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "18:462+29M5I3M:F:296") == 0); assert(samRecord.getCigar() == TestValidate::READ6_CIGAR); assert(samRecord.getSequence() == TestValidate::READ6_SEQ); assert(samRecord.getQuality() == TestValidate::READ6_QUAL); assert(samRecord.getNumOverlaps(1750, 1755) == 5); assert(samRecord.getNumOverlaps(1750, 1754) == 4); assert(samRecord.getNumOverlaps(0, 2000) == 5); assert(samRecord.getNumOverlaps(1749, 1755) == 5); assert(samRecord.getNumOverlaps(1751, 1755) == 4); assert(samRecord.getNumOverlaps(0, 1752) == 2); assert(samRecord.getNumOverlaps(0, 19) == 0); // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == TestValidate::READ6_ALIGN_END); assert(samRecord.get1BasedAlignmentEnd() == 
(TestValidate::READ6_ALIGN_END + 1)); assert(samRecord.getAlignmentLength() == TestValidate::READ6_ALIGN_LEN); assert(samRecord.get0BasedUnclippedStart() == TestValidate::READ6_UNCLIP_START); assert(samRecord.get1BasedUnclippedStart() == (TestValidate::READ6_UNCLIP_START + 1)); assert(samRecord.get0BasedUnclippedEnd() == TestValidate::READ6_UNCLIP_END); assert(samRecord.get1BasedUnclippedEnd() == (TestValidate::READ6_UNCLIP_END + 1)); // No tags. assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The cigar is 3S2H5M which is: // 3S: 3 << 4 | 4 = 0x34 assert(*(unsigned int*)varPtr == 0x34); // Increment the varptr the size of an int. 
varPtr += 4; // 2H: 2 << 4 | 5 = 0x25 assert(*(unsigned int*)varPtr == 0x25); // Increment the varptr the size of an int. varPtr += 4; // 5M: 5 << 4 | 0 = 0x50 assert(*(unsigned int*)varPtr == 0x50); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. // TG = 0x84 assert(*varPtr == 0x84); varPtr++; // CA = 0x21 assert(*varPtr == 0x21); varPtr++; // CG = 0x24 assert(*varPtr == 0x24); varPtr++; // TN = 0x8F assert(*varPtr == 0x8F); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } } void validateRead7(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 7 // Create record structure for validating. int expectedBlockSize = 83; const char* expectedReferenceName = "2"; const char* expectedMateReferenceName = "18"; const char* expectedMateReferenceNameOrEqual = "18"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 1; expectedRecordPtr->myPosition = TestValidate::READ7_POS; expectedRecordPtr->myReadNameLength = 21; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 4; expectedRecordPtr->myFlag = 97; expectedRecordPtr->myReadLength = 9; expectedRecordPtr->myMateReferenceID = 17; expectedRecordPtr->myMatePosition = 756; expectedRecordPtr->myInsertSize = 0; // Check the accessors. 
assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "18:462+29M5I3M:F:297") == 0); assert(samRecord.getCigar() == TestValidate::READ7_CIGAR); assert(samRecord.getSequence() == TestValidate::READ7_SEQ); assert(samRecord.getQuality() == TestValidate::READ7_QUAL); assert(samRecord.getNumOverlaps(1750, 1755) == 5); assert(samRecord.getNumOverlaps(1750, 1754) == 4); assert(samRecord.getNumOverlaps(0, 2000) == 5); assert(samRecord.getNumOverlaps(1749, 1755) == 5); assert(samRecord.getNumOverlaps(1751, 1755) == 4); assert(samRecord.getNumOverlaps(0, 1752) == 2); assert(samRecord.getNumOverlaps(0, 19) == 0); // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == TestValidate::READ7_ALIGN_END); assert(samRecord.get1BasedAlignmentEnd() == 
(TestValidate::READ7_ALIGN_END + 1)); assert(samRecord.getAlignmentLength() == TestValidate::READ7_ALIGN_LEN); assert(samRecord.get0BasedUnclippedStart() == TestValidate::READ7_UNCLIP_START); assert(samRecord.get1BasedUnclippedStart() == (TestValidate::READ7_UNCLIP_START + 1)); assert(samRecord.get0BasedUnclippedEnd() == TestValidate::READ7_UNCLIP_END); assert(samRecord.get1BasedUnclippedEnd() == (TestValidate::READ7_UNCLIP_END + 1)); // No tags. assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The cigar is 3S5M1S3H which is: // 3S: 3 << 4 | 4 = 0x34 assert(*(unsigned int*)varPtr == 0x34); // Increment the varptr the size of an int. 
varPtr += 4; // 5M: 5 << 4 | 0 = 0x50 assert(*(unsigned int*)varPtr == 0x50); // Increment the varptr the size of an int. varPtr += 4; // 1S: 1 << 4 | 4 = 0x14 assert(*(unsigned int*)varPtr == 0x14); // Increment the varptr the size of an int. varPtr += 4; // 3H: 3 << 4 | 5 = 0x35 assert(*(unsigned int*)varPtr == 0x35); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. // TG = 0x84 assert(*varPtr == 0x84); varPtr++; // CA = 0x21 assert(*varPtr == 0x21); varPtr++; // CG = 0x24 assert(*varPtr == 0x24); varPtr++; // TN = 0x8F assert(*varPtr == 0x8F); varPtr++; // G = 0x40 assert(*varPtr == 0x40); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } } void validateRead8(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 8 // Create record structure for validating. int expectedBlockSize = 65; const char* expectedReferenceName = "*"; const char* expectedMateReferenceName = "*"; const char* expectedMateReferenceNameOrEqual = "*"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = -1; expectedRecordPtr->myPosition = -1; expectedRecordPtr->myReadNameLength = 27; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4680; expectedRecordPtr->myCigarLength = 0; expectedRecordPtr->myFlag = 141; expectedRecordPtr->myReadLength = 4; expectedRecordPtr->myMateReferenceID = -1; expectedRecordPtr->myMatePosition = -1; expectedRecordPtr->myInsertSize = 0; // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == -1); assert(samRecord.get1BasedAlignmentEnd() == 0); assert(samRecord.getAlignmentLength() == 0); assert(samRecord.get0BasedUnclippedStart() == -1); 
assert(samRecord.get1BasedUnclippedStart() == 0); assert(samRecord.get0BasedUnclippedEnd() == -1); assert(samRecord.get1BasedUnclippedEnd() == 0); // Check the accessors. assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "Y:16597235+13M13I11M:F:181") == 0); assert(strcmp(samRecord.getCigar(), "*") == 0); assert(strcmp(samRecord.getSequence(), "AACT") == 0); assert(strcmp(samRecord.getQuality(), "==;;") == 0); assert(samRecord.getNumOverlaps(1750, 1755) == 0); assert(samRecord.getNumOverlaps(1750, 1754) == 0); assert(samRecord.getNumOverlaps(0, 2000) == 0); assert(samRecord.getNumOverlaps(1749, 1755) == 0); assert(samRecord.getNumOverlaps(1751, 1755) == 0); assert(samRecord.getNumOverlaps(0, 1752) == 0); assert(samRecord.getNumOverlaps(0, 19) 
== 0); assert(samRecord.getNumOverlaps(-1, 4) == 0); // No Tags to check, should return false. assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // No cigar to validate. // Validate the sequence. // AA = 0x11 assert(*varPtr == 0x11); varPtr++; // CT = 0x28 assert(*varPtr == 0x28); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } // No tags. } void validateRead9(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 9 // Create record structure for validating. 
int expectedBlockSize = 77; const char* expectedReferenceName = "3"; const char* expectedMateReferenceName = "18"; const char* expectedMateReferenceNameOrEqual = "18"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = 2; expectedRecordPtr->myPosition = 74; expectedRecordPtr->myReadNameLength = 21; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4681; expectedRecordPtr->myCigarLength = 3; expectedRecordPtr->myFlag = 97; expectedRecordPtr->myReadLength = 8; expectedRecordPtr->myMateReferenceID = 17; expectedRecordPtr->myMatePosition = 756; expectedRecordPtr->myInsertSize = 0; // Check the accessors. assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getStatus() == SamStatus::SUCCESS); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition 
+ 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "18:462+29M5I3M:F:298") == 0); assert(strcmp(samRecord.getCigar(), "3S5M4H") == 0); assert((strcmp(samRecord.getSequence(), "TGCACGTN") == 0) || (strcmp(samRecord.getSequence(), "tgcacgtn") == 0)); assert(strcmp(samRecord.getQuality(), "453;>>>>") == 0); assert(samRecord.getNumOverlaps(74, 79) == 5); assert(samRecord.getNumOverlaps(73, 79) == 5); assert(samRecord.getNumOverlaps(75, 78) == 3); assert(samRecord.getNumOverlaps(0, 1017) == 5); assert(samRecord.getNumOverlaps(79, 85) == 0); assert(samRecord.getNumOverlaps(78, 85) == 1); assert(samRecord.getNumOverlaps(-1, 1017) == 5); // Check the alignment end assert(samRecord.get0BasedAlignmentEnd() == 78); assert(samRecord.get1BasedAlignmentEnd() == 79); assert(samRecord.getAlignmentLength() == 5); assert(samRecord.get0BasedUnclippedStart() == 71); assert(samRecord.get1BasedUnclippedStart() == 72); assert(samRecord.get0BasedUnclippedEnd() == 82); assert(samRecord.get1BasedUnclippedEnd() == 83); // No tags. assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); assert(bufferPtr != NULL); // Validate the buffers match. 
assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // Validate the cigar. // The cigar is 3S5M1S3H which is: // 3S: 3 << 4 | 4 = 0x34 assert(*(unsigned int*)varPtr == 0x34); // Increment the varptr the size of an int. varPtr += 4; // 5M: 5 << 4 | 0 = 0x50 assert(*(unsigned int*)varPtr == 0x50); // Increment the varptr the size of an int. varPtr += 4; // 4H: 4 << 4 | 5 = 0x45 assert(*(unsigned int*)varPtr == 0x45); // Increment the varptr the size of an int. varPtr += 4; // Validate the sequence. 
// TG = 0x84 assert(*varPtr == 0x84); varPtr++; // CA = 0x21 assert(*varPtr == 0x21); varPtr++; // CG = 0x24 assert(*varPtr == 0x24); varPtr++; // TN = 0x8F assert(*varPtr == 0x8F); varPtr++; // Validate the Quality for(int i = 0; i < expectedRecordPtr->myReadLength; i++) { assert(*varPtr == samRecord.getQuality()[i] - 33); varPtr++; } } void validateRead10(SamRecord& samRecord) { ////////////////////////////////////////// // Validate Record 10 // Create record structure for validating. int expectedBlockSize = 59; const char* expectedReferenceName = "*"; const char* expectedMateReferenceName = "*"; const char* expectedMateReferenceNameOrEqual = "*"; bamRecordStruct* expectedRecordPtr = (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int)); char tag[3]; char type; void* value; bamRecordStruct* bufferPtr; unsigned char* varPtr; expectedRecordPtr->myBlockSize = expectedBlockSize; expectedRecordPtr->myReferenceID = -1; expectedRecordPtr->myPosition = -1; expectedRecordPtr->myReadNameLength = 27; expectedRecordPtr->myMapQuality = 0; expectedRecordPtr->myBin = 4680; expectedRecordPtr->myCigarLength = 0; expectedRecordPtr->myFlag = 141; expectedRecordPtr->myReadLength = 0; expectedRecordPtr->myMateReferenceID = -1; expectedRecordPtr->myMatePosition = -1; expectedRecordPtr->myInsertSize = 0; // Check the alignment end assert(samRecord.get0BasedUnclippedStart() == -1); assert(samRecord.get1BasedUnclippedStart() == 0); assert(samRecord.get0BasedUnclippedEnd() == -1); assert(samRecord.get1BasedUnclippedEnd() == 0); assert(samRecord.get1BasedAlignmentEnd() == 0); assert(samRecord.get0BasedAlignmentEnd() == -1); assert(samRecord.getAlignmentLength() == 0); // Check the accessors. 
assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize); assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID); assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0); assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1); assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition); assert(samRecord.getReadNameLength() == expectedRecordPtr->myReadNameLength); assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality); assert(samRecord.getBin() == expectedRecordPtr->myBin); assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength); assert(samRecord.getFlag() == expectedRecordPtr->myFlag); assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength); assert(samRecord.getMateReferenceID() == expectedRecordPtr->myMateReferenceID); assert(strcmp(samRecord.getMateReferenceName(), expectedMateReferenceName) == 0); assert(strcmp(samRecord.getMateReferenceNameOrEqual(), expectedMateReferenceNameOrEqual) == 0); assert(samRecord.get1BasedMatePosition() == expectedRecordPtr->myMatePosition + 1); assert(samRecord.get0BasedMatePosition() == expectedRecordPtr->myMatePosition); assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize); assert(strcmp(samRecord.getReadName(), "Y:16597235+13M13I11M:F:181") == 0); assert(strcmp(samRecord.getCigar(), "*") == 0); assert(strcmp(samRecord.getSequence(), "*") == 0); assert(strcmp(samRecord.getQuality(), "*") == 0); assert(samRecord.getNumOverlaps(74, 79) == 0); assert(samRecord.getNumOverlaps(73, 79) == 0); assert(samRecord.getNumOverlaps(75, 78) == 0); assert(samRecord.getNumOverlaps(0, 1017) == 0); assert(samRecord.getNumOverlaps(79, 85) == 0); assert(samRecord.getNumOverlaps(78, 85) == 0); assert(samRecord.getNumOverlaps(-1, 1017) == 0); // No Tags to check, should return false. 
assert(samRecord.getNextSamTag(tag, type, &value) == false); assert(samRecord.getNextSamTag(tag, type, &value) == false); // Get the record ptr. bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer(); // Validate the buffers match. assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize); assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID); assert(bufferPtr->myPosition == expectedRecordPtr->myPosition); assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength); assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality); assert(bufferPtr->myBin == expectedRecordPtr->myBin); assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength); assert(bufferPtr->myFlag == expectedRecordPtr->myFlag); assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength); assert(bufferPtr->myMateReferenceID == expectedRecordPtr->myMateReferenceID); assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition); assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize); // Validate the variable length fields in the buffer. // Set the pointer to the start of the variable fields. varPtr = (unsigned char*)(&(bufferPtr->myData[0])); // Validate the readname. for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++) { assert(*varPtr == samRecord.getReadName()[i]); varPtr++; } // No cigar to validate. // No sequence. // No Quality. // No Tags. } void validateHeader(SamFileHeader& samHeader) { validateHeaderFields(samHeader); validateHeaderString(samHeader); } void validateHeaderFields(SamFileHeader& samHeader) { const char* value; //////////////////////////////////////////////////////// // Test getting a specific HD Tag value from the header // that does not exist. value = samHeader.getHDTagValue("GO"); assert(strcmp(value, "") == 0); //////////////////////////////////////////////////////// // Test getting a specific PG Tag value from the header // that does not exist. 
value = samHeader.getPGTagValue("CL", "1"); assert(strcmp(value, "") == 0); //////////////////////////////////////////////////////// // Test getting a specific SQ Tag value from the header value = samHeader.getSQTagValue("LN", "1"); assert(value != NULL); assert(strcmp(value, "247249719") == 0); value = samHeader.getSQTagValue("LN", "22"); assert(value != NULL); assert(strcmp(value, "49691432") == 0); //////////////////////////////////////////////////////// // Test getting a specific SQ Tag value from the header // that does not exist. value = samHeader.getSQTagValue("LN", "1000"); assert(strcmp(value, "") == 0); //////////////////////////////////////////////////////// // Test getting a specific SQ Tag value from the header // that does not exist - sq exists, but not with that tag. value = samHeader.getSQTagValue("AS", "1"); assert(strcmp(value, "") == 0); //////////////////////////////////////////////////////// // Test getting a specific RG Tag value from the header value = samHeader.getRGTagValue("LB", "myID2"); assert(value != NULL); assert(strcmp(value, "library2") == 0); value = samHeader.getRGTagValue("LB", "myID"); assert(value != NULL); assert(strcmp(value, "library") == 0); //////////////////////////////////////////////////////// // Test getting a specific SQ from the header // Then pulling the tags out of it. SamHeaderSQ* sq = samHeader.getSQ("10"); assert(strcmp(sq->getTagValue("SN"), "10") == 0); assert(strcmp(sq->getTagValue("LN"), "135374737") == 0); // Test pulling a tag that does not exist. assert(strcmp(sq->getTagValue("DD"), "") == 0); //////////////////////////////////////////////////////// // Test getting a specific RG from the header // Then pulling the tags out of it. const SamHeaderRG* rg = samHeader.getRG("myID"); assert(strcmp(rg->getTagValue("ID"), "myID") == 0); assert(strcmp(rg->getTagValue("SM"), "sample") == 0); assert(strcmp(rg->getTagValue("LB"), "library") == 0); // Test pulling a tag that does not exist. 
assert(strcmp(rg->getTagValue("DD"), "") == 0); //////////////////////////////////////////////////////// // Test getting a specific RG from the header that does not exist. rg = samHeader.getRG("noExist"); assert(rg == NULL); //////////////////////////////////////////////////////// // Test getting a specific SQ from the header that does not exist. sq = samHeader.getSQ("noExist"); assert(sq == NULL); //////////////////////////////////////////////////////// // Test getting the reference ID. assert(samHeader.getReferenceID("2") == 1); std::string refIDStdString = "X"; assert(samHeader.getReferenceID(refIDStdString.c_str()) == 22); String refIDString = "22"; assert(samHeader.getReferenceID(refIDString) == 21); assert(samHeader.getReferenceID(refIDString.c_str()) == 21); assert(samHeader.getReferenceID("Z") == SamReferenceInfo::NO_REF_ID); assert(samHeader.getReferenceID("Z", true) == 23); assert(samHeader.getReferenceID("*") == -1); refIDString = "*"; assert(samHeader.getReferenceID(refIDString) == -1); assert(samHeader.getReferenceID(refIDString.c_str()) == -1); } void validateHeaderString(SamFileHeader& samHeader) { // Check the header line. 
std::string headerString = ""; assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\tLB:library2\n@CO\tComment 1\n@CO\tComment 2\n"); } libStatGen-1.0.14/bam/test/TestValidate.h000066400000000000000000000046121254730101300201100ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "SamFile.h" void validateRead1(SamRecord& samRecord); void validateRead2(SamRecord& samRecord); void validateRead3(SamRecord& samRecord); void validateRead4(SamRecord& samRecord); void validateRead5(SamRecord& samRecord); void validateRead6(SamRecord& samRecord); void validateRead7(SamRecord& samRecord); void validateRead8(SamRecord& samRecord); void validateRead9(SamRecord& samRecord); void validateRead10(SamRecord& samRecord); void validateHeader(SamFileHeader& samHeader); void validateHeaderFields(SamFileHeader& samHeader); void validateHeaderString(SamFileHeader& samHeader); class TestValidate { public: static const int READ1_POS = 1010; static const int READ1_ALIGN_END = 1016; static const int READ1_UNCLIP_START = 1010; static const int READ1_UNCLIP_END = 1016; static const int READ1_ALIGN_LEN = 7; static const std::string READ1_CIGAR; static const std::string READ1_SEQ; static const std::string READ1_QUAL; static const int READ2_POS = 1011; static const int READ6_POS = 1750; static const int READ6_ALIGN_END = 1754; static const int READ6_UNCLIP_START = 1745; static const int READ6_UNCLIP_END = 1754; static const int READ6_ALIGN_LEN = 5; static const std::string READ6_CIGAR; static const std::string READ6_SEQ; static const std::string READ6_QUAL; static const int READ7_POS = 1750; static const int READ7_ALIGN_END = 1754; static const int READ7_UNCLIP_START = 1747; static const int READ7_UNCLIP_END = 1758; static const int READ7_ALIGN_LEN = 5; static const std::string READ7_CIGAR; static const std::string READ7_SEQ; static const std::string READ7_QUAL; }; libStatGen-1.0.14/bam/test/ValidationTest.cpp000066400000000000000000000177511254730101300210140ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * 
(at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "SamRecord.h" #include "SamValidation.h" #include "ValidationTest.h" #include void testSamQNAME() { // This method tests: // QNAME.Length() > 0 and <= 254 // QNAME does not contain [ \t\n\r] char qname[256]; SamFileHeader samHeader; SamRecord testRecord(ErrorHandler::RETURN); // Error list SamValidationErrors errorList; // Test Length == 0 by setting qname[0] to 0 (end of char*) qname[0] = 0; // It fails, because it is a required field. assert(testRecord.setReadName(qname) == false); assert(strcmp(testRecord.getReadName(), "UNKNOWN") == 0); // It was reset to the default which is valid. assert(SamValidator::isValid(samHeader, testRecord, errorList) == true); assert(errorList.numErrors() == 0); assert(errorList.getNextError() == NULL); // Test too long of a read name. memset(qname, '.', 255); qname[255] = 0; assert(testRecord.setReadName(qname) == true); assert(strcmp(testRecord.getReadName(), qname) == 0); assert(SamValidator::isValid(samHeader, testRecord, errorList) == false); // 2 errors - 1 since the qname is longer than 254 (it is 255). // and the qname length including the null is 256, but the // read name length is only 8 bits, so that is a 1. assert(errorList.numErrors() == 2); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_QNAME); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_QNAME); assert(errorList.getNextError() == NULL); // Clear the error list errorList.clear(); // Setup a buffer to set the record to. 
int bufferBlockSize = 32; bamRecordStruct* bufferRecordPtr = (bamRecordStruct *) malloc(bufferBlockSize + sizeof(int)); bufferRecordPtr->myBlockSize = bufferBlockSize; bufferRecordPtr->myReferenceID = -1; bufferRecordPtr->myPosition = 1010; // Set the read name length to 0. bufferRecordPtr->myReadNameLength = 0; bufferRecordPtr->myMapQuality = 0; bufferRecordPtr->myBin = 4681; bufferRecordPtr->myCigarLength = 0; bufferRecordPtr->myFlag = 73; bufferRecordPtr->myReadLength = 0; bufferRecordPtr->myMateReferenceID = -1; bufferRecordPtr->myMatePosition = 1010; bufferRecordPtr->myInsertSize = 0; assert(testRecord.setBuffer((const char*)bufferRecordPtr, bufferBlockSize + sizeof(int), samHeader) == SamStatus::SUCCESS); // 1 error - the read name length is 0. assert(SamValidator::isValid(samHeader, testRecord, errorList) == false); assert(errorList.numErrors() == 1); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_QNAME); assert(errorList.getNextError() == NULL); // Clear the error list errorList.clear(); // Test a buffer that has a read name, but the length specified is // longer than the first null. bufferBlockSize = 40; bufferRecordPtr->myBlockSize = bufferBlockSize; // Set the read name length to 8 - longer than 3 - "HI\0". bufferRecordPtr->myReadNameLength = 8; bufferRecordPtr->myData[0] = 'H'; bufferRecordPtr->myData[1] = 'I'; bufferRecordPtr->myData[2] = 0; assert(testRecord.setBuffer((const char*)bufferRecordPtr, bufferBlockSize + sizeof(int), samHeader) == SamStatus::SUCCESS); // 1 error - the read name length in the buffer does not match the // length of the read name to the first null. 
assert(SamValidator::isValid(samHeader, testRecord, errorList) == false); assert(errorList.numErrors() == 1); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_QNAME); assert(errorList.getNextError() == NULL); // Clear the error list errorList.clear(); // Test a buffer that has a read name, but the length specified is // shorter than the first null. bufferBlockSize = 34; bufferRecordPtr->myBlockSize = bufferBlockSize; // Set the read name length to 2 - longer than 3 - "HI\0".. bufferRecordPtr->myReadNameLength = 2; bufferRecordPtr->myData[0] = 'H'; bufferRecordPtr->myData[1] = 'I'; bufferRecordPtr->myData[2] = 0; assert(testRecord.setBuffer((const char*)bufferRecordPtr, bufferBlockSize + sizeof(int), samHeader) == SamStatus::SUCCESS); // 1 error - the read name length in the buffer does not match // the length of the read name to the first null. assert(SamValidator::isValid(samHeader, testRecord, errorList) == false); assert(errorList.numErrors() == 1); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_QNAME); assert(errorList.getNextError() == NULL); // Clear the error list errorList.clear(); } void testBamRID() { // BAM SamRecord testRecord(ErrorHandler::RETURN); // Error list SamValidationErrors errorList; SamFileHeader samHeader; // Clear the error list errorList.clear(); // Setup a buffer to set the record to. int bufferBlockSize = 35; bamRecordStruct* bufferRecordPtr = (bamRecordStruct *) malloc(bufferBlockSize + sizeof(int)); bufferRecordPtr->myBlockSize = bufferBlockSize; bufferRecordPtr->myPosition = 1010; bufferRecordPtr->myReferenceID = -1; // Set the read name length to 0. 
bufferRecordPtr->myReadNameLength = 3; bufferRecordPtr->myMapQuality = 0; bufferRecordPtr->myBin = 4681; bufferRecordPtr->myCigarLength = 0; bufferRecordPtr->myFlag = 73; bufferRecordPtr->myReadLength = 0; bufferRecordPtr->myMateReferenceID = -1; bufferRecordPtr->myMatePosition = 1010; bufferRecordPtr->myInsertSize = 0; bufferRecordPtr->myData[0] = 'H'; bufferRecordPtr->myData[1] = 'I'; bufferRecordPtr->myData[2] = 0; //////////////////////////////////////////// // Test out of range reference sequence id. bufferRecordPtr->myReferenceID = 100; assert(testRecord.setBuffer((const char*)bufferRecordPtr, bufferBlockSize + sizeof(int), samHeader) == SamStatus::SUCCESS); // 1 error - the read name length is 0. assert(SamValidator::isValid(samHeader, testRecord, errorList) == false); assert(errorList.numErrors() == 1); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_REF_ID); assert(errorList.getNextError() == NULL); // Clear the error list errorList.clear(); //////////////////////////////////////////// // Test out of range reference sequence id. bufferRecordPtr->myReferenceID = -100; assert(testRecord.setBuffer((const char*)bufferRecordPtr, bufferBlockSize + sizeof(int), samHeader) == SamStatus::SUCCESS); // 1 error - the read name length is 0. 
assert(SamValidator::isValid(samHeader, testRecord, errorList) == false); assert(errorList.numErrors() == 1); assert(errorList.getNextError()->getType() == SamValidationError::INVALID_REF_ID); assert(errorList.getNextError() == NULL); // Clear the error list errorList.clear(); } void testEmptyQual() { } libStatGen-1.0.14/bam/test/ValidationTest.h000066400000000000000000000014421254730101300204470ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ void testSamQNAME(); void testBamRID(); void testEmptyQual(); libStatGen-1.0.14/bam/test/WriteFiles.cpp000066400000000000000000001327301254730101300201320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "SamFile.h" #include "WriteFiles.h" #include "TestValidate.h" #include #include void testWrite() { testHeaderWrite(); testWriteCopiedHeader("testFiles/testSam.sam"); #ifdef __ZLIB_AVAILABLE__ testWriteCopiedHeader("testFiles/testBam.bam"); #endif } void testHeaderWrite() { SamFile samOut; samOut.OpenForWrite("results/MyTestOut.sam"); // Create a sam header. SamFileHeader samHeader; std::string headerString = ""; // Test getting HD & PG and the HD-SO tag when they do not exist. assert(samHeader.getHD() == NULL); assert(samHeader.getPG("1") == NULL); assert(strcmp(samHeader.getTagSO(), "") == 0); // Test removing the HD tag that does not exist. assert(samHeader.removeHD() == true); assert(samHeader.getHD() == NULL); assert(strcmp(samHeader.getHDTagValue("VN"), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == ""); char type[3] = "HD"; char tag[3] = "VN"; // Verify it has not yet been added to the parsed header. assert(strcmp(samHeader.getHDTagValue("VN"), "") == 0); assert(samHeader.addHeaderLine(type, tag, "1.0") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getHDTagValue("VN"), "1.0") == 0); type[0] = 'S'; type[1] = 'Q'; tag[0] = 'L'; tag[1] = 'N'; // Cannot add SQ LN tag without adding the SN tag also. assert(samHeader.addHeaderLine(type, tag, "123") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n"); // Has not yet been added, so returns blank. assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "") == 0); // Can't add the SQ type without a LN. std::string line = "@SQ\tSN:123"; assert(samHeader.addHeaderLine(line.c_str()) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n"); // Successfully add a SQ line. 
line = "@SQ\tLN:123\tSN:chr20"; assert(samHeader.addHeaderLine(line.c_str()) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "123") == 0); // Test to make sure nothing changes if try to copy into self. samHeader = samHeader; assert(samHeader.addHeaderLine(line.c_str()) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "123") == 0); samHeader.copy(samHeader); assert(samHeader.addHeaderLine(line.c_str()) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "123") == 0); // Test adding an HD that is already there. assert(samHeader.addHeaderLine("@HD\tVN:1.1") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getHDTagValue("VN"), "1.0") == 0); // Test copying the header. SamFileHeader newHeader = samHeader; assert(newHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was added to the parsed header. assert(strcmp(newHeader.getSQTagValue("LN", "chr20"), "123") == 0); // Modify one of the tags. assert(samHeader.setHDTag("VN", "1.1") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.1\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was modified. assert(strcmp(samHeader.getHDTagValue("VN"), "1.1") == 0); // Remove the version. 
assert(samHeader.setHDTag("VN", "") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:123\tSN:chr20\n"); // Verify it was removed. assert(strcmp(samHeader.getHDTagValue("VN"), "") == 0); // Remove the SN from the SQ type - fails because SN is the key. assert(samHeader.setSQTag("SN", "", "chr20") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:123\tSN:chr20\n"); // Verify it was not removed. assert(strcmp(samHeader.getSQTagValue("SN", "chr20"), "chr20") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "123") == 0); // Can't remove the LN from the SQ type assert(samHeader.setSQTag("LN", "", "chr20") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:123\tSN:chr20\n"); // Verify it was not removed. assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "123") == 0); assert(strcmp(samHeader.getSQTagValue("SN", "chr20"), "chr20") == 0); // Delete the SQ line. assert(samHeader.removeSQ("chr20") == true); // There is no header string. assert(samHeader.getHeaderString(headerString) == true); assert(headerString == ""); // Verify it was removed. assert(strcmp(samHeader.getSQTagValue("SN", "chr20"), "") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "") == 0); // Create an SQ record and add it back in. SamHeaderSQ* sq = new SamHeaderSQ(); assert(sq->setTag("LN", "123") == true); assert(sq->setTag("SN", "chr20") == true); assert(samHeader.addSQ(sq) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:123\tSN:chr20\n"); // Verify it was added. assert(strcmp(samHeader.getSQTagValue("SN", "chr20"), "chr20") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "123") == 0); // Modify a tag. 
assert(sq->setTag("LN", "222") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\n"); // Verify it was modified. assert(strcmp(samHeader.getSQTagValue("SN", "chr20"), "chr20") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "222") == 0); // Test adding another SQ with the same key. SamHeaderSQ* sq2 = new SamHeaderSQ(); assert(sq2->setTag("LN", "333") == true); assert(sq2->setTag("SN", "chr20") == true); assert(samHeader.addSQ(sq2) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\n"); // Verify it was not added. assert(strcmp(samHeader.getSQTagValue("SN", "chr20"), "chr20") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr20"), "222") == 0); delete sq2; // Add a new tag to the SQ tag. assert(samHeader.setSQTag("AS", "HG18", "chr20") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\tAS:HG18\n"); // Verify it was added. assert(strcmp(samHeader.getSQTagValue("AS", "chr20"), "HG18") == 0); // Modify the AS tag. assert(samHeader.setSQTag("AS", "HG19", "chr20") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\tAS:HG19\n"); // Verify it was added. assert(strcmp(samHeader.getSQTagValue("AS", "chr20"), "HG19") == 0); // Add a new tag . sq2 = new SamHeaderSQ(); assert(sq2->setTag("LN", "333") == true); assert(sq2->setTag("SN", "chr1") == true); assert(samHeader.addSQ(sq2) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\tAS:HG19\n@SQ\tLN:333\tSN:chr1\n"); // Verify it was added. assert(strcmp(samHeader.getSQTagValue("SN", "chr1"), "chr1") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr1"), "333") == 0); // Test removing an SQ tag that does not exist. 
assert(samHeader.removeSQ("chr100") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\tAS:HG19\n@SQ\tLN:333\tSN:chr1\n"); // Remove the newly added sq2 by resetting it. sq2->reset(); // Verify it was removed. assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@SQ\tLN:222\tSN:chr20\tAS:HG19\n"); assert(strcmp(samHeader.getSQTagValue("SN", "chr1"), "") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "chr1"), "") == 0); // Test getting HD which does exist since it was set before. Even // though the VN was removed so it doesn't appear in the header string, // it was never actually removed. SamHeaderHD* hd = samHeader.getHD(); assert(hd != NULL); // Blank since the sort order was never set. assert(strcmp(samHeader.getTagSO(), "") == 0); // Set the version number. assert(hd->setTag("VN", "2.1") == true); // Verify it was added. assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:2.1\n@SQ\tLN:222\tSN:chr20\tAS:HG19\n"); assert(strcmp(samHeader.getHDTagValue("VN"), "2.1") == 0); // Set the SO assert(hd->setTag("SO", "coordinate") == true); // Verify it was added. assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:2.1\tSO:coordinate\n@SQ\tLN:222\tSN:chr20\tAS:HG19\n"); assert(strcmp(samHeader.getHDTagValue("SO"), "coordinate") == 0); // Reset the header. samHeader.resetHeader(); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == ""); // Add a new HD tag. assert(samHeader.setHDTag("SO", "queryname") == true); assert(strcmp(samHeader.getHDTagValue("SO"), "queryname") == 0); assert(samHeader.getHeaderString(headerString) == true); // Blank since missing VN. assert(headerString == ""); // Set the VN. 
assert(samHeader.setHDTag("VN", "3.1") == true); assert(strcmp(samHeader.getHDTagValue("SO"), "queryname") == 0); assert(strcmp(samHeader.getHDTagValue("VN"), "3.1") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n"); ////////////////////////////////////////////////////////////// // Test removing a non-existent PG. assert(samHeader.removePG("1") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n"); // Test adding a null PG. SamHeaderPG* pg = NULL; assert(samHeader.addPG(pg) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n"); // Add a PG tag. assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "") == 0); assert(samHeader.setPGTag("ID", "pid", "pid") == true); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\n"); // Verify can't modify the key. assert(samHeader.setPGTag("ID", "pid1", "pid") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\n"); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); // Test adding a null PG. pg = NULL; assert(samHeader.addPG(pg) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\n"); // Test adding a PG header when it already exists. pg = new SamHeaderPG(); assert(pg->setTag("ID", "pid") == true); assert(samHeader.addPG(pg) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\n"); delete pg; // Get a PG that does not exist. pg = samHeader.getPG("pid1"); assert(pg == NULL); // Get a PG tag that does not exist. 
assert(strcmp(samHeader.getPGTagValue("CL", "pid"), "") == 0); // Get the PG tag. pg = samHeader.getPG("pid"); assert(pg != NULL); assert(strcmp(pg->getTagValue("ID"), "pid") == 0); // Add a tag to the PG. assert(pg->setTag("VN", "pg1") == true); assert(strcmp(samHeader.getPGTagValue("VN", "pid"), "pg1") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\tVN:pg1\n"); // Test modifying the key tag - fails. assert(pg->setTag("ID", "pid1") == false); assert(strcmp(samHeader.getPGTagValue("VN", "pid"), "pg1") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\tVN:pg1\n"); // Test modifying the VN tag. assert(samHeader.setPGTag("VN", "pg", "pid") == true); assert(strcmp(samHeader.getPGTagValue("VN", "pid"), "pg") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\tVN:pg\n"); // Test removing the VN tag. assert(pg->setTag("VN", "") == true); assert(strcmp(samHeader.getPGTagValue("VN", "pid"), "") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\n"); // Test removing a PG that does not exist. assert(samHeader.removePG("pid1") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:pid\n"); assert(strcmp(samHeader.getPGTagValue("VN", "pid"), "") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "pid") == 0); // Test removing the PG. 
assert(samHeader.removePG("pid") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n"); assert(strcmp(samHeader.getPGTagValue("VN", "pid"), "") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "pid"), "") == 0); // Test adding a PG header. pg = new SamHeaderPG(); assert(pg->setTag("ID", "newID") == true); assert(samHeader.addPG(pg) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n"); // Test adding a PG that is already there. assert(samHeader.addHeaderLine("@PG\tID:newID") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getPGTagValue("ID", "newID"), "newID") == 0); // Test adding another PG header. pg = new SamHeaderPG(); assert(pg->setTag("ID", "newID1") == true); assert(samHeader.addPG(pg) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@PG\tID:newID1\n"); // Test adding another PG header. pg = new SamHeaderPG(); assert(pg->setTag("ID", "pid") == true); assert(samHeader.addPG(pg) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@PG\tID:newID1\n@PG\tID:pid\n"); // Test removing the new pg. assert(samHeader.removePG("newID1") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@PG\tID:pid\n"); // Test removing the other new pg. 
assert(samHeader.removePG("pid") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n"); // Test adding a tag assert(samHeader.setPGTag("VN", "1.0", "newID") == true); assert(strcmp(samHeader.getPGTagValue("VN", "newID"), "1.0") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "newID"), "newID") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\tVN:1.0\n"); // Test removing a tag assert(samHeader.setPGTag("VN", "", "newID") == true); assert(strcmp(samHeader.getPGTagValue("VN", "newID"), "") == 0); assert(strcmp(samHeader.getPGTagValue("ID", "newID"), "newID") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n"); //////////////////////////////////////////////////////////////////// // Add an SQ, but fail since LN is not specified. assert(samHeader.setSQTag("AS", "HG18", "newName") == false); assert(samHeader.getHeaderString(headerString) == true); // SQ does not show up since it is missing the LN field. assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n"); // Add the SQ's SN, but fail since LN is not specified. assert(samHeader.setSQTag("SN", "newName", "newName") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n"); sq = samHeader.getSQ("newName"); assert(sq == NULL); // Add the SQ with the LN tag. assert(samHeader.setSQTag("LN", "111", "newName") == true); assert(strcmp(samHeader.getSQTagValue("SN", "newName"), "newName") == 0); assert(strcmp(samHeader.getSQTagValue("AS", "newName"), "") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "newName"), "111") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\n"); // Add the AS. 
assert(samHeader.setSQTag("AS", "HG18", "newName") == true); assert(strcmp(samHeader.getSQTagValue("AS", "newName"), "HG18") == 0); assert(strcmp(samHeader.getSQTagValue("SN", "newName"), "newName") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "newName"), "111") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\n"); // Get the SQ. sq = samHeader.getSQ("newName"); assert(sq != NULL); // Modify the SQ assert(sq->setTag("SP", "species") == true); assert(strcmp(samHeader.getSQTagValue("SN", "newName"), "newName") == 0); assert(strcmp(samHeader.getSQTagValue("AS", "newName"), "HG18") == 0); assert(strcmp(samHeader.getSQTagValue("LN", "newName"), "111") == 0); assert(strcmp(samHeader.getSQTagValue("SP", "newName"), "species") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n"); ////////////////////////////////////////////////////////////////////// // Add a new RG Tag assert(samHeader.setRGTag("ID", "rgID", "rgID") == true); assert(samHeader.getHeaderString(headerString) == true); // New RG does not show up since it is still missing a required field. assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n"); assert(strcmp(samHeader.getRGTagValue("ID", "rgID"), "rgID") == 0); // Add the missing SM field. assert(samHeader.setRGTag("SM", "sm1", "rgID") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n"); assert(strcmp(samHeader.getRGTagValue("ID", "rgID"), "rgID") == 0); // Verify can't modify the key. 
assert(samHeader.setRGTag("ID", "rgID1", "rgID") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n"); assert(strcmp(samHeader.getRGTagValue("ID", "rgID"), "rgID") == 0); // Verify that the copied header did not change. assert(newHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.0\n@SQ\tLN:123\tSN:chr20\n"); // Verify it was added to the parsed header. assert(strcmp(newHeader.getSQTagValue("LN", "chr20"), "123") == 0); // Add a new RG Tag assert(samHeader.setRGTag("SM", "sample1", "rgID1") == true); assert(samHeader.getHeaderString(headerString) == true); // String does not show the tag until all required fields are there. assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n"); assert(strcmp(samHeader.getRGTagValue("ID", "rgID1"), "rgID1") == 0); assert(strcmp(samHeader.getRGTagValue("SM", "rgID1"), "sample1") == 0); // Modify an RG tag. assert(samHeader.setRGTag("SM", "sample", "rgID1") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); assert(strcmp(samHeader.getRGTagValue("ID", "rgID1"), "rgID1") == 0); assert(strcmp(samHeader.getRGTagValue("SM", "rgID1"), "sample") == 0); // Test removing an rg that does not exist. assert(samHeader.removeRG("rgID2") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Create a new RG, set some values and add it. SamHeaderRG* rg = new SamHeaderRG(); // Try adding it without a key. 
assert(samHeader.addRG(rg) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Set some values in rg. assert(rg->setTag("ID", "rgID2") == true); assert(rg->setTag("SM", "sm2") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Add the new RG. assert(samHeader.addRG(rg) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n@RG\tID:rgID2\tSM:sm2\n"); assert(strcmp(samHeader.getRGTagValue("ID", "rgID2"), "rgID2") == 0); // Test trying to add another one with the same key. rg = new SamHeaderRG(); assert(rg->setTag("ID", "rgID2") == true); assert(samHeader.addRG(rg) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n@RG\tID:rgID2\tSM:sm2\n"); // Test removing the rg again. 
assert(samHeader.removeRG("rgID2") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Test getting an rg tag that doesn't exist assert(strcmp(samHeader.getRGTagValue("DS", "rgID"), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Test getting an rg tag from a removed key assert(strcmp(samHeader.getRGTagValue("ID", "rgID2"), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Test getting an rg tag from an key that doesn't exist assert(strcmp(samHeader.getRGTagValue("ID", "rgID22"), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Test adding a null header. rg = NULL; assert(samHeader.addRG(rg) == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n"); // Test adding the deleted header back in. 
rg = new SamHeaderRG(); assert(rg->setTag("ID", "rgID2") == true); assert(rg->setTag("SM", "sm2") == true); assert(samHeader.addRG(rg) == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n@RG\tID:rgID2\tSM:sm2\n"); // Test adding an RG that is already there. assert(samHeader.addHeaderLine("@RG\tID:rgID\tSM:sm5") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample\n@RG\tID:rgID2\tSM:sm2\n"); // Verify it was added to the parsed header. assert(strcmp(samHeader.getRGTagValue("SM", "rgID"), "sm1") == 0); // Get an RG record then modify it. rg = samHeader.getRG("rgID1"); assert(rg != NULL); assert(rg->setTag("SM", "sample1") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n"); // Try to modify the key. assert(rg->setTag("ID", "rgID111") == false); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n"); //////////////////////////////////////////////////////////////////////////// // Test getting a comment when there aren't any. 
assert(strcmp(samHeader.getNextComment(), "") == 0); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n"); // Test getting each headerline when there are no comments. const char* hdrlinechar; std::string hdrline; assert(samHeader.getNextHeaderLine(hdrline)); hdrlinechar = hdrline.c_str(); // Test to make sure there is not memory corruption. std::string tmpString = "@SQ\tSN:queryname\tVN:3.1\n"; assert(hdrline == "@HD\tSO:queryname\tVN:3.1\n"); assert(strcmp(hdrlinechar, "@HD\tSO:queryname\tVN:3.1\n") == 0); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@PG\tID:newID\n"); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n"); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@RG\tID:rgID\tSM:sm1\n"); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@RG\tID:rgID1\tSM:sample1\n"); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@RG\tID:rgID2\tSM:sm2\n"); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n"); // Verify that getHeaderRecord returns nothing. assert(samHeader.getNextHeaderRecord() == NULL); // Reset the header record iter. samHeader.resetHeaderRecordIter(); // Test getting each headerrecord when there are no comments. 
SamHeaderRecord* hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "HD") == 0); assert(hdrRec->getType() == SamHeaderRecord::HD); assert(strcmp(hdrRec->getTagValue("SO"), "queryname") == 0); assert(strcmp(hdrRec->getTagValue("VN"), "3.1") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "PG") == 0); assert(hdrRec->getType() == SamHeaderRecord::PG); assert(strcmp(hdrRec->getTagValue("ID"), "newID") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "SQ") == 0); assert(hdrRec->getType() == SamHeaderRecord::SQ); assert(strcmp(hdrRec->getTagValue("SN"), "newName") == 0); assert(strcmp(hdrRec->getTagValue("AS"), "HG18") == 0); assert(strcmp(hdrRec->getTagValue("LN"), "111") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sm1") == 0); // Get the SQ Header Record (should have no affect on the general // getNextHeaderRecord calls). hdrRec = samHeader.getNextSQRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "SQ") == 0); assert(hdrRec->getType() == SamHeaderRecord::SQ); assert(strcmp(hdrRec->getTagValue("SN"), "newName") == 0); assert(strcmp(hdrRec->getTagValue("AS"), "HG18") == 0); assert(strcmp(hdrRec->getTagValue("LN"), "111") == 0); // Only one SQ Header Record. hdrRec = samHeader.getNextSQRecord(); assert(hdrRec == NULL); // Get the RG/PG Header Records (should have no affect on the general // getNextHeaderRecord calls). 
hdrRec = samHeader.getNextRGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sm1") == 0); // Get the next RG record. hdrRec = samHeader.getNextRGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID1") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sample1") == 0); // Get the PG record. hdrRec = samHeader.getNextPGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "PG") == 0); assert(hdrRec->getType() == SamHeaderRecord::PG); assert(strcmp(hdrRec->getTagValue("ID"), "newID") == 0); // Get the last RG record. hdrRec = samHeader.getNextRGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID2") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sm2") == 0); // Already got all RG Records. hdrRec = samHeader.getNextRGRecord(); assert(hdrRec == NULL); // Reset the RG record. samHeader.resetRGRecordIter(); // Get the RG record. hdrRec = samHeader.getNextRGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sm1") == 0); // No more PG records. hdrRec = samHeader.getNextPGRecord(); assert(hdrRec == NULL); // No more SQ records. hdrRec = samHeader.getNextSQRecord(); assert(hdrRec == NULL); // Reset the SQ record iterator. samHeader.resetSQRecordIter(); // No more PG records. hdrRec = samHeader.getNextPGRecord(); assert(hdrRec == NULL); // Get the now reset SQ record. 
hdrRec = samHeader.getNextSQRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "SQ") == 0); assert(hdrRec->getType() == SamHeaderRecord::SQ); assert(strcmp(hdrRec->getTagValue("SN"), "newName") == 0); assert(strcmp(hdrRec->getTagValue("AS"), "HG18") == 0); assert(strcmp(hdrRec->getTagValue("LN"), "111") == 0); // Only one SQ Header Record. hdrRec = samHeader.getNextSQRecord(); assert(hdrRec == NULL); // Reset the PG record iterator. samHeader.resetPGRecordIter(); // No more SQ records. hdrRec = samHeader.getNextSQRecord(); assert(hdrRec == NULL); // Get the next RG record. hdrRec = samHeader.getNextRGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID1") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sample1") == 0); // Get the PG record. hdrRec = samHeader.getNextPGRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "PG") == 0); assert(hdrRec->getType() == SamHeaderRecord::PG); assert(strcmp(hdrRec->getTagValue("ID"), "newID") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID1") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sample1") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID2") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sm2") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec == NULL); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec == NULL); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); 
assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n"); // Add some comments. assert(samHeader.addComment("My Comment") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n@CO\tMy Comment\n"); // Call getNextHeaderRecord - still nothing. hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec == NULL); // Call getNextHeaderLine - should return the comment. assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@CO\tMy Comment\n"); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); // Call getNextCommentLine - should return the comment. assert(strcmp(samHeader.getNextComment(), "My Comment") == 0); assert(strcmp(samHeader.getNextComment(), "") == 0); assert(strcmp(samHeader.getNextComment(), "") == 0); // Add another comment. assert(samHeader.addComment("My Comment2") == true); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n@CO\tMy Comment\n@CO\tMy Comment2\n"); newHeader = samHeader; assert(newHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n@CO\tMy Comment\n@CO\tMy Comment2\n"); // Call getNextHeaderLine - should return the comment. 
assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@CO\tMy Comment2\n"); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); // Call getNextCommentLine - should return the comment. assert(strcmp(samHeader.getNextComment(), "My Comment2") == 0); assert(strcmp(samHeader.getNextComment(), "") == 0); assert(strcmp(samHeader.getNextComment(), "") == 0); // Reset the header record iter. samHeader.resetHeaderRecordIter(); // Recall getNextCommentLine - should not return anything. assert(strcmp(samHeader.getNextComment(), "") == 0); assert(strcmp(samHeader.getNextComment(), "") == 0); // Reset the next comment iter. samHeader.resetCommentIter(); // Call the get next headerLine, record, comment interspersed with // each other. hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "HD") == 0); assert(hdrRec->getType() == SamHeaderRecord::HD); assert(strcmp(hdrRec->getTagValue("SO"), "queryname") == 0); assert(strcmp(hdrRec->getTagValue("VN"), "3.1") == 0); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@PG\tID:newID\n"); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n"); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID") == 0); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec != NULL); assert(strcmp(samHeader.getNextComment(), "My Comment") == 0); assert(strcmp(hdrRec->getTypeString(), "RG") == 0); assert(hdrRec->getType() == SamHeaderRecord::RG); assert(strcmp(hdrRec->getTagValue("ID"), "rgID1") == 0); assert(strcmp(hdrRec->getTagValue("SM"), "sample1") == 0); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@RG\tID:rgID2\tSM:sm2\n"); hdrRec = 
samHeader.getNextHeaderRecord(); assert(hdrRec == NULL); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@CO\tMy Comment\n"); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec == NULL); assert(samHeader.getNextHeaderLine(hdrline)); assert(hdrline == "@CO\tMy Comment2\n"); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(strcmp(samHeader.getNextComment(), "My Comment2") == 0); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); hdrRec = samHeader.getNextHeaderRecord(); assert(hdrRec == NULL); assert(strcmp(samHeader.getNextComment(), "") == 0); assert(strcmp(samHeader.getNextComment(), "") == 0); samOut.WriteHeader(samHeader); // Reset the header. samHeader.resetHeader(); assert(samHeader.getHeaderString(headerString) == true); assert(headerString == ""); assert(!samHeader.getNextHeaderLine(hdrline)); assert(hdrline == ""); assert(strcmp(samHeader.getHDTagValue("SO"), "") == 0); assert(strcmp(samHeader.getHDTagValue("VN"), "") == 0); // Try adding a key to the HD tag. hd = new SamHeaderHD(); assert(hd->addKey("3.1") == false); assert(strcmp(hd->getTagValue("VN"), "") == 0); assert(hd->isActiveHeaderRecord() == false); assert(hd->setTag("VN", "3.1") == true); assert(hd->isActiveHeaderRecord() == true); assert(strcmp(hd->getTagValue("VN"), "3.1") == 0); // Verify the copied header did not change. assert(newHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tSO:queryname\tVN:3.1\n@PG\tID:newID\n@SQ\tSN:newName\tLN:111\tAS:HG18\tSP:species\n@RG\tID:rgID\tSM:sm1\n@RG\tID:rgID1\tSM:sample1\n@RG\tID:rgID2\tSM:sm2\n@CO\tMy Comment\n@CO\tMy Comment2\n"); // Verify it was added to the parsed header. 
assert(strcmp(newHeader.getSQTagValue("LN", "chr20"), "") == 0); } void testWriteCopiedHeader(const char* fileName) { SamFile samIn; assert(samIn.OpenForRead(fileName)); SamFile samOut; assert(samOut.OpenForWrite("results/MyTestOut2.bam")); SamFile samOut2; assert(samOut2.OpenForWrite("results/MyTestOut2.sam")); // Read the sam header. SamFileHeader samHeader; assert(samIn.ReadHeader(samHeader)); validateHeader(samHeader); SamFileHeader newHeader; std::string hdrLine; assert(samHeader.getNextHeaderLine(hdrLine)); newHeader.addHeaderLine("@HD\tVN:1.02"); bool hdrStatus = true; while(hdrStatus) { newHeader.addHeaderLine(hdrLine.c_str()); hdrStatus = samHeader.getNextHeaderLine(hdrLine); } // Write the sam header. assert(samOut.WriteHeader(newHeader)); assert(samOut2.WriteHeader(newHeader)); SamRecord samRecord; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so write it. assert(samOut.WriteRecord(newHeader, samRecord)); assert(samOut2.WriteRecord(newHeader, samRecord)); } assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS); // Close the output files. samOut.Close(); samOut2.Close(); SamFileReader bamRead("results/MyTestOut2.bam"); SamFileReader samRead("results/MyTestOut2.sam"); // Read and check the header. 
assert(samRead.ReadHeader(samHeader)); validateHeaderFields(samHeader); std::string headerString = ""; assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.02\n@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\tLB:library2\n@CO\tComment 1\n@CO\tComment 2\n"); assert(bamRead.ReadHeader(samHeader)); validateHeaderFields(samHeader); headerString = ""; assert(samHeader.getHeaderString(headerString) == true); assert(headerString == "@HD\tVN:1.02\n@SQ\tSN:1\tLN:247249719\n@SQ\tSN:2\tLN:242951149\n@SQ\tSN:3\tLN:199501827\n@SQ\tSN:4\tLN:191273063\n@SQ\tSN:5\tLN:180857866\n@SQ\tSN:6\tLN:170899992\n@SQ\tSN:7\tLN:158821424\n@SQ\tSN:8\tLN:146274826\n@SQ\tSN:9\tLN:140273252\n@SQ\tSN:10\tLN:135374737\n@SQ\tSN:11\tLN:134452384\n@SQ\tSN:12\tLN:132349534\n@SQ\tSN:13\tLN:114142980\n@SQ\tSN:14\tLN:106368585\n@SQ\tSN:15\tLN:100338915\n@SQ\tSN:16\tLN:88827254\n@SQ\tSN:17\tLN:78774742\n@SQ\tSN:18\tLN:76117153\n@SQ\tSN:19\tLN:63811651\n@SQ\tSN:20\tLN:62435964\n@SQ\tSN:21\tLN:46944323\n@SQ\tSN:22\tLN:49691432\n@SQ\tSN:X\tLN:154913754\n@RG\tID:myID\tLB:library\tSM:sample\n@RG\tID:myID2\tSM:sample2\tLB:library2\n@CO\tComment 1\n@CO\tComment 2\n"); assert(samHeader.getNextHeaderLine(hdrLine)); std::string expectedString = "@HD\tVN:1.02\n"; assert(expectedString == hdrLine); // TODO - validate reading these written records back in. 
} libStatGen-1.0.14/bam/test/WriteFiles.h000066400000000000000000000014771254730101300176020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ void testWrite(); void testHeaderWrite(); void testWriteCopiedHeader(const char* fileName); libStatGen-1.0.14/bam/test/expected/000077500000000000000000000000001254730101300171445ustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/TestSamCoordOutput.sam000066400000000000000000000022601254730101300234360ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @CO Comment 1 @CO Comment 2 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 AM:i:0 MD:Z:37 NM:i:0 XT:A:R 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 
TGCACGTN 453;>>>> 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * AM:i:0 MD:Z:30A0C5 NM:i:2 XT:A:R 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 TGCACGTN 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/expected/addedTagToBam.bam000066400000000000000000000012221254730101300222420ustar00rootroot00000000000000BCuU=hSQOVRlTR=~K5k""4*T(88"(]uWЮuDAG/\߻sJV*̵.$FF&a5ڴk)ZQ{Jtϊh4g 0#QgUVC(H V(P^F\7ydAsɵ< 1j!$qWFu̇Ma u )#uLb_e$CDmOB(/!QIHHHpiU^B@.MnC>aE{ȵy:ʝz-Y/߸4ppҹzYy {t''Pv'$/>y{2~7d zēWFJ2G""<}\Hp^ (>z9*Z9Q×P@6FP}[H )P]\ H0=kOӽx06+4 AM:i:0 MD:Z:37 NM:i:0 RR:Z:myID2 XT:A:R XA:i:456 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> AM:i:0 MD:Z:2T1N0 NM:i:2 XT:A:R libStatGen-1.0.14/bam/test/expected/addedTagToSam.bam000066400000000000000000000012221254730101300222630ustar00rootroot00000000000000BCuU=hSQOVRlTR=~K5k""4*T(88"(]uWЮuDAG/\߻sJV*̵.$FF&a5ڴk)ZQ{Jtϊh4g 0#QgUVC(H V(P^F\7ydAsɵ< 1j!$qWFu̇Ma u )#uLb_e$CDmOB(/!QIHHHpiU^B@.MnC>aE{ȵy:ʝz-Y/߸4ppҹzYy {t''Pv'$/>y{2~7d zēWFJ2G""<}\Hp^ (>z9*Z9Q×P@6FP}[H )P]\ H0=kOӽx06+4 AM:i:0 MD:Z:37 NM:i:0 RR:Z:myID2 XT:A:R XA:i:456 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> AM:i:0 MD:Z:2T1N0 NM:i:2 XT:A:R libStatGen-1.0.14/bam/test/expected/noZlib/000077500000000000000000000000001254730101300204015ustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/TestSamCoordOutput.sam000066400000000000000000000022601254730101300246730ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ 
SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @CO Comment 1 @CO Comment 2 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 AM:i:0 MD:Z:37 NM:i:0 XT:A:R 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * AM:i:0 MD:Z:30A0C5 NM:i:2 XT:A:R 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 tgcacgtn 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/expected/noZlib/addedTagToBam.bam000066400000000000000000000020721254730101300235030ustar00rootroot00000000000000BAMv@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @RG ID:myID3 SM:mySM @CO Comment 1 @CO Comment 2 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; gII1:1011:F:255+17M15D20MP""A 
AMCMDZ37NMCRRZmyID2XTARXASVJIa18:462+29M5I3M:F:295PHAMCMDZ2T1N0NMCXTARlibStatGen-1.0.14/bam/test/expected/noZlib/addedTagToBam.sam000077700000000000000000000000001254730101300270532../addedTagToBam.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/addedTagToSam.bam000066400000000000000000000020721254730101300235240ustar00rootroot00000000000000BAMv@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @RG ID:myID3 SM:mySM @CO Comment 1 @CO Comment 2 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; gII1:1011:F:255+17M15D20MP""A AMCMDZ37NMCRRZmyID2XTARXASVJIa18:462+29M5I3M:F:295PHAMCMDZ2T1N0NMCXTARlibStatGen-1.0.14/bam/test/expected/noZlib/addedTagToSam.sam000077700000000000000000000000001254730101300271152../addedTagToSam.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/samTest.log000066400000000000000000000031451254730101300225270ustar00rootroot000000000000001 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. ERROR: Missing required tag: SN. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failed to store SQ record. ERROR: Missing required tag: LN. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failed to store SQ record. 
1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: multiple HD records. Can't modify the key tag, SN from chr20 to ERROR: Missing required tag: LN. ERROR: Missing required tag: VN. Can't modify the key tag, ID from pid to pid1 Can't modify the key tag, ID from pid to pid1 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping PG line that has a repeated ID field. ERROR: Missing required tag: LN. ERROR: Missing required tag: LN. ERROR: Missing required tag: SM. Can't modify the key tag, ID from rgID to rgID1 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. Can't modify the key tag, ID from rgID1 to rgID111 SamCoordOutput::flush, no output file/header is set, so records removed without being written libStatGen-1.0.14/bam/test/expected/noZlib/testEqWithBases.bam000066400000000000000000000073301254730101300241440ustar00rootroot00000000000000BAM@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; 
2'II'01:====@"(2'II'02:===X@"(2'II'03:==X=@"(2'II'04:==XX@"(2'II'05:=X==@((2'II'06:=X=X@((2'II'07:=XX=@((2'II'08:=XXX@((2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(2'II'01:====@"(2'II'02:===X@"(2'II'03:==X=@"(2'II'04:==XX@"(2'II'05:=X==@((2'II'06:=X=X@((2'II'07:=XX=@((2'II'08:=XXX@((2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(2'II'01:====@"(2'II'02:===X@"(2'II'03:==X=@"(2'II'04:==XX@"(2'II'05:=X==@((2'II'06:=X=X@((2'II'07:=XX=@((2'II'08:=XXX@((2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(F'II'Read:GGCCTA;Ref:CCTA$@D"(A'II'Read:CCTA;Ref:CCTA@E"(_'II'Read:CCGTxxxC;Ref:CCxTAACC2"H (J'II'Read:CCxxAC;Ref:CCTAACC # "(8'II'chromNotInRef@"(9'II'chromNotInRef1@"(libStatGen-1.0.14/bam/test/expected/noZlib/testEqWithBases.sam000077700000000000000000000000001254730101300301512../testEqWithBases.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/testEqWithEq.bam000066400000000000000000000073301254730101300234540ustar00rootroot00000000000000BAM@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; 
2'II'01:====@(2'II'02:===X@(2'II'03:==X=@(2'II'04:==XX@(2'II'05:=X==@(2'II'06:=X=X@(2'II'07:=XX=@(2'II'08:=XXX@(2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(2'II'01:====@(2'II'02:===X@(2'II'03:==X=@(2'II'04:==XX@(2'II'05:=X==@(2'II'06:=X=X@(2'II'07:=XX=@(2'II'08:=XXX@(2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(2'II'01:====@(2'II'02:===X@(2'II'03:==X=@(2'II'04:==XX@(2'II'05:=X==@(2'II'06:=X=X@(2'II'07:=XX=@(2'II'08:=XXX@(2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(F'II'Read:GGCCTA;Ref:CCTA$@D(A'II'Read:CCTA;Ref:CCTA@E(_'II'Read:CCGTxxxC;Ref:CCxTAACC2@(J'II'Read:CCxxAC;Ref:CCTAACC # (8'II'chromNotInRef@"(9'II'chromNotInRef1@"(libStatGen-1.0.14/bam/test/expected/noZlib/testEqWithEq.sam000077700000000000000000000000001254730101300267712../testEqWithEq.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/testEqWithOrig.bam000066400000000000000000000073301254730101300240070ustar00rootroot00000000000000BAM@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; 
2'II'01:====@"(2'II'02:===X@"(2'II'03:==X=@"(2'II'04:==XX@"(2'II'05:=X==@((2'II'06:=X=X@((2'II'07:=XX=@((2'II'08:=XXX@((2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(2'II'01:====@(2'II'02:===X@(2'II'03:==X=@(2'II'04:==XX@(2'II'05:=X==@(2'II'06:=X=X@(2'II'07:=XX=@(2'II'08:=XXX@(2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(2'II'01:====@ (2'II'02:===X@(2'II'03:==X=@(2'II'04:==XX@(2'II'05:=X==@(2'II'06:=X=X@((2'II'07:=XX=@(2'II'08:=XXX@(2'II'09:X===@(2'II'10:X==X@(2'II'11:X=X=@(2'II'12:X=XX@(2'II'13:XX==@(2'II'14:XX=X@(2'II'15:XXX=@(2'II'16:XXXX@(F'II'Read:GGCCTA;Ref:CCTA$@D (A'II'Read:CCTA;Ref:CCTA@E (_'II'Read:CCGTxxxC;Ref:CCxTAACC2 H(J'II'Read:CCxxAC;Ref:CCTAACC # (8'II'chromNotInRef@"(9'II'chromNotInRef1@"(libStatGen-1.0.14/bam/test/expected/noZlib/testEqWithOrig.sam000077700000000000000000000000001254730101300276572../testEqWithOrig.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/testShift.bam000066400000000000000000000024041254730101300230370ustar00rootroot00000000000000BAM@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; G'II'Read1`A@B"BH'II 'Read2APBBFZBabc;'II'Read3@B BBG'II'Read4 AD""""G'II'Read5 AD""""G'II'Read6$AD""""J'II'Read701H((HB&33"3""3"3&3&" 
libStatGen-1.0.14/bam/test/expected/noZlib/testShift.sam000077700000000000000000000000001254730101300257452../testShift.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/updateTag.bam000066400000000000000000000031101254730101300227730ustar00rootroot00000000000000BAMa@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @CO Comment 1 @CO Comment 2 172#{3( 4gf 5 6/ 70lw 8 9df\101112^~1314IW15 16veK1718t19C20|21CP22(;X; SII1:1011:F:255+17M15D20MP""A AMCNMCXTAR=I1:1011:F:255+17M15D20M(HEJIa18:462+29M5I3M:F:295PHAMC9JIa18:462+29M5I3M:F:295AMCIJIa18:462+29M5I3M:F:295PHAMCXTARMIa18:462+29M5I3M:F:2964%P!$SIa 18:462+29M5I3M:F:2974P5!$@AHY:16597235+13M13I11M:F:181(MJIa18:462+29M5I3M:F:2984PE!$;HY:16597235+13M13I11M:F:181libStatGen-1.0.14/bam/test/expected/noZlib/updateTagFromBam.sam000077700000000000000000000000001254730101300303572../updateTagFromBam.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/noZlib/updateTagFromSam.sam000077700000000000000000000000001254730101300304212../updateTagFromSam.samustar00rootroot00000000000000libStatGen-1.0.14/bam/test/expected/samTest.log000066400000000000000000000041561254730101300212750ustar00rootroot000000000000001 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. 
WARNING: Duplicate Tags, overwritting XA:C:123 with XA:S:456 WARNING: Duplicate Tags, overwritting RR:Z:myID1 with RR:Z:myID2 WARNING: Duplicate Tags, overwritting MD:Z:30A0C5 with MD:Z:2T1N0 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. WARNING: Duplicate Tags, overwritting XA:C:123 with XA:S:456 WARNING: Duplicate Tags, overwritting RR:Z:myID1 with RR:Z:myID2 Suppressing rest of Duplicate Tag warnings. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. ERROR: Missing required tag: SN. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failed to store SQ record. ERROR: Missing required tag: LN. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failed to store SQ record. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field. 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: multiple HD records. Can't modify the key tag, SN from chr20 to ERROR: Missing required tag: VN. Can't modify the key tag, ID from pid to pid1 Can't modify the key tag, ID from pid to pid1 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping PG line that has a repeated ID field. ERROR: Missing required tag: SM. Can't modify the key tag, ID from rgID to rgID1 1 invalid SAM/BAM Header lines were skipped due to: SAM/BAM Header line failure: Skipping RG line that has a repeated ID field. 
Can't modify the key tag, ID from rgID1 to rgID111 SamCoordOutput::flush, no output file/header is set, so records removed without being written libStatGen-1.0.14/bam/test/expected/testEqWithBases.bam000066400000000000000000000014571254730101300227130ustar00rootroot00000000000000BCT=hSQ/QZ[QB(l1j?EۂDh@)N.EQwZWW]?<qq{r~.y ŅiijDsDԲ5P̳H$MXX). ++#EV:+EʱLIg#Fxꀑ)Ii8*uFiBK~ZE) nBqS7"J.**E޵vf<֝UQ"O*D1" r-R_ 23mRcZh4t¶[f4?@? [D}(vY8^~,Y$`$D7-RPf" K+ 9/F=Dx!jkArAwx(KnAyz`![?$P"XsSv\0cpe:2RRjyU'XEgUY*^3Q>KXϪFRVgҵV6{adU>k#QŽ6{!8+Z+^^̪|V'7W'8+{ zۯUo_ P:Q3Q P/_]6啚p<6+ugN;ӟ>${%crȘ .Fl^ c7=GgԄM(D¿ltR?aM۞1k nÙ˸]qgZKv$D^d@&BClibStatGen-1.0.14/bam/test/expected/testEqWithBases.sam000066400000000000000000000056521254730101300227350ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 01:==== 73 1 10011 0 4M = 10009 0 CCTA I00? 02:===X 73 1 10011 0 4M = 10009 0 CCTT I00? 03:==X= 73 1 10011 0 4M = 10009 0 CCAA I00? 04:==XX 73 1 10011 0 4M = 10009 0 CCAT I00? 05:=X== 73 1 10011 0 4M = 10009 0 CTTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CTTT I00? 07:=XX= 73 1 10011 0 4M = 10009 0 CTAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 CTAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 TCTA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TCTT I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCAA I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TTTA I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TTTT I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTAA I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 
01:==== 73 1 10011 0 4M = 10009 0 CCTA I00? 02:===X 73 1 10011 0 4M = 10009 0 CCTT I00? 03:==X= 73 1 10011 0 4M = 10009 0 CCAA I00? 04:==XX 73 1 10011 0 4M = 10009 0 CCAT I00? 05:=X== 73 1 10011 0 4M = 10009 0 CTTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CTTT I00? 07:=XX= 73 1 10011 0 4M = 10009 0 CTAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 CTAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 TCTA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TCTT I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCAA I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TTTA I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TTTT I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTAA I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 CCTA I00? 02:===X 73 1 10011 0 4M = 10009 0 CCTT I00? 03:==X= 73 1 10011 0 4M = 10009 0 CCAA I00? 04:==XX 73 1 10011 0 4M = 10009 0 CCAT I00? 05:=X== 73 1 10011 0 4M = 10009 0 CTTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CTTT I00? 07:=XX= 73 1 10011 0 4M = 10009 0 CTAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 CTAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 TCTA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TCTT I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCAA I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TTTA I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TTTT I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTAA I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? Read:GGCCTA;Ref:CCTA 73 1 10011 0 2S4M = 10009 0 GGCCTA ??I00? Read:CCTA;Ref:CCTA 73 1 10011 0 4M4H = 10009 0 CCTA I00? Read:CCGTxxxC;Ref:CCxTAACC 73 1 10011 0 1M1P1M1I1M3D1M = 10009 0 CCGTC I00?? Read:CCxxAC;Ref:CCTAACC 73 1 10011 0 2M2N2M = 10009 0 CCAC I00? chromNotInRef 73 2 10011 0 4M = 10009 0 CCTA I00? chromNotInRef1 73 2 10011 0 4M = 10009 0 CC=A I00? 
libStatGen-1.0.14/bam/test/expected/testEqWithEq.bam000066400000000000000000000014541254730101300222200ustar00rootroot00000000000000BCS;hQ*(ꢰ#$sMO$IPtARFP(joi-lckk33 ܁=s=o+@`zBe~6٘0I)KPCfd!;_Y (CsOЗr F3L_ WUԨZiHYUU VDrx/aU>KUګKYUKǭւ{adU>k-Qւ{!4+Ҹzx/fU>k=7zx/aUiVyW~z zwT~R3Q P-]]gfd15[Ğ9P[Figsc=NMkr14&7YhIj^h4F~u4]I$P?Q3n7|R?f;ZH݆zڍ;+fWV+|.p`TaBClibStatGen-1.0.14/bam/test/expected/testEqWithEq.sam000066400000000000000000000056521254730101300222450ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 01:==== 73 1 10011 0 4M = 10009 0 ==== I00? 02:===X 73 1 10011 0 4M = 10009 0 ===T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==A= I00? 04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =T== I00? 06:=X=X 73 1 10011 0 4M = 10009 0 =T=T I00? 07:=XX= 73 1 10011 0 4M = 10009 0 =TA= I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=== I00? 10:X==X 73 1 10011 0 4M = 10009 0 T==T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 T=A= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 T=AT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT== I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 ==== I00? 02:===X 73 1 10011 0 4M = 10009 0 ===T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==A= I00? 04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =T== I00? 06:=X=X 73 1 10011 0 4M = 10009 0 =T=T I00? 
07:=XX= 73 1 10011 0 4M = 10009 0 =TA= I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=== I00? 10:X==X 73 1 10011 0 4M = 10009 0 T==T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 T=A= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 T=AT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT== I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 ==== I00? 02:===X 73 1 10011 0 4M = 10009 0 ===T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==A= I00? 04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =T== I00? 06:=X=X 73 1 10011 0 4M = 10009 0 =T=T I00? 07:=XX= 73 1 10011 0 4M = 10009 0 =TA= I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=== I00? 10:X==X 73 1 10011 0 4M = 10009 0 T==T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 T=A= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 T=AT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT== I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? Read:GGCCTA;Ref:CCTA 73 1 10011 0 2S4M = 10009 0 GG==== ??I00? Read:CCTA;Ref:CCTA 73 1 10011 0 4M4H = 10009 0 ==== I00? Read:CCGTxxxC;Ref:CCxTAACC 73 1 10011 0 1M1P1M1I1M3D1M = 10009 0 ==G== I00?? Read:CCxxAC;Ref:CCTAACC 73 1 10011 0 2M2N2M = 10009 0 ==== I00? chromNotInRef 73 2 10011 0 4M = 10009 0 CCTA I00? chromNotInRef1 73 2 10011 0 4M = 10009 0 CC=A I00? 
libStatGen-1.0.14/bam/test/expected/testEqWithOrig.bam000066400000000000000000000016131254730101300225500ustar00rootroot00000000000000BCnV=hA.K(jP ^~?4 x(=PLKR+AQBZjkk0V.,x{߼{h(A`btO7iz0gg戬`eQk"2"4rǪLq±2S\Jr2zr,D2E L#pєS$-i8pFiBtUhh /h0(UWiQ,UkQ*iVW"J(_]DAXK(\—LhLu c+3Tm?@o YDQԿnG q|~"o9$!rk"?^U-pqLb3xtC+8{Kwi9|snP§> {0SЗs{3~sՇǢTĪ^FgDUFUaFY\ªz|.iTyz|.eU\Rt.̬Z΅hTy\H* iۯ\̪\ܨ\U+~A5_)_ԯj|_\A~UՍ+W7:WدnZs+>?ԯrZί~_iJ:v~%k)_W@ըF~jfd5_hX{ ksNiKvsz2wkM&;I1YgɹNʍ:sfmϑIg0v41jFTN*VjkneoՕAm(ׯN__fuѢ/#MX/BClibStatGen-1.0.14/bam/test/expected/testEqWithOrig.sam000066400000000000000000000056521254730101300226000ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 01:==== 73 1 10011 0 4M = 10009 0 CCTA I00? 02:===X 73 1 10011 0 4M = 10009 0 CCTT I00? 03:==X= 73 1 10011 0 4M = 10009 0 CCAA I00? 04:==XX 73 1 10011 0 4M = 10009 0 CCAT I00? 05:=X== 73 1 10011 0 4M = 10009 0 CTTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CTTT I00? 07:=XX= 73 1 10011 0 4M = 10009 0 CTAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 CTAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 TCTA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TCTT I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCAA I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TTTA I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TTTT I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTAA I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 ==== I00? 02:===X 73 1 10011 0 4M = 10009 0 ===T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==A= I00? 
04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =T== I00? 06:=X=X 73 1 10011 0 4M = 10009 0 =T=T I00? 07:=XX= 73 1 10011 0 4M = 10009 0 =TA= I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=== I00? 10:X==X 73 1 10011 0 4M = 10009 0 T==T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 T=A= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 T=AT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT== I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 C=== I00? 02:===X 73 1 10011 0 4M = 10009 0 =C=T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==AA I00? 04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =TTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CT=T I00? 07:=XX= 73 1 10011 0 4M = 10009 0 =TAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=TA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TC=T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCA= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT=A I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? Read:GGCCTA;Ref:CCTA 73 1 10011 0 2S4M = 10009 0 GGC=T= ??I00? Read:CCTA;Ref:CCTA 73 1 10011 0 4M4H = 10009 0 C=T= I00? Read:CCGTxxxC;Ref:CCxTAACC 73 1 10011 0 1M1P1M1I1M3D1M = 10009 0 C=GT= I00?? Read:CCxxAC;Ref:CCTAACC 73 1 10011 0 2M2N2M = 10009 0 C=A= I00? chromNotInRef 73 2 10011 0 4M = 10009 0 CCTA I00? chromNotInRef1 73 2 10011 0 4M = 10009 0 CC=A I00? 
libStatGen-1.0.14/bam/test/expected/testShift.bam000066400000000000000000000011511254730101300216000ustar00rootroot00000000000000BCL=hQO(bq)i6ܿfiSğR`)"Vۺ8H: ur.;hW׺:wc;xCț~KcP]\]ZŅNhkCˣV"R[V24\疢ES"2XÌFܪ`53[4#qʭ dxJ2k\dA )4iVCHraR2 +H~Hn kXұ릌42ؐ1!ő뢹EhB(ӓPQ $$4T*P%rm *Nv9>e(?|{@ʝG<~HBէ'SOܯz2Pv“keFDg?;PJ@AjD {G 4=rwED_R ȡY-÷94@2 >tS GsnfVcwbD 8L}T+[-ȪdV&U95+QE)8QQFnNh ̽E[VZBClibStatGen-1.0.14/bam/test/expected/testShift.sam000066400000000000000000000017041254730101300216250ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 Read1 73 1 10011 0 6M4I4M = 10009 0 GCAACCTTAATTGC ?????????????? Read2 73 1 10011 0 1M4I5M = 10009 0 GCACACACGC ?????????? FZ:Z:abc Read3 73 1 10011 0 4M4D2M = 10009 0 GCACGC ?????? Read4 73 1 10011 0 2M4I8M = 10009 0 GGCCAACCAACCCC ?????????????? Read5 73 1 10011 0 2M4I8M = 10009 0 GGCCAACCAACCCC ?????????????? Read6 73 1 10011 0 2S4I8M = 10009 0 GGCCAACCAACCCC ?????????????? 
Read7 73 1 10011 0 3M3I10M = 10009 0 GTTCTCCTCTGTGCAA GTTCTCCTCTGTGCAA libStatGen-1.0.14/bam/test/expected/updateTag.bam000066400000000000000000000014261254730101300215460ustar00rootroot00000000000000BC;hAfO EB́Opؙ rf5cN0'&"TbPb`aQBӦ *Zy H9ԲՓ^ud0A KKGdKJq%˜DP?˭+}Ʌ +| +˥$K\i]$I ʹ5S֦ &h ٌqBeJFlBa~t %vu@rS5}J†m+JE@ۊB 5Rۊ"@ӝC[11mF R.YmRc*PĝqRaMR})&FJ/jTV_id]9U볗_H=]QiF1x* FUQ ^֜!YO$pv+)_lAvIk;XN X,答t */WrFn-v21k%Yl>}7aGBh ("ď`J/3],GsTypcB9x[>gdfs#Kd42 m}E=L" ̗̀Cf=$,a[}iwxaUt 0@aeﱃ'Ԯvݷgx{Fsp%'G#!Dh)ٹ&4ئCHͧ<HBClibStatGen-1.0.14/bam/test/expected/updateTagFromBam.sam000066400000000000000000000023231254730101300230300ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @CO Comment 1 @CO Comment 2 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 AM:i:0 NM:i:0 XT:A:R 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> AM:i:0 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * AM:i:0 XT:A:R 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 TGCACGTN 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * 
libStatGen-1.0.14/bam/test/expected/updateTagFromSam.sam000066400000000000000000000023231254730101300230510ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @CO Comment 1 @CO Comment 2 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 AM:i:0 NM:i:0 XT:A:R 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> AM:i:0 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * AM:i:0 XT:A:R 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 tgcacgtn 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/results/000077500000000000000000000000001254730101300170445ustar00rootroot00000000000000libStatGen-1.0.14/bam/test/results/.gitignore000066400000000000000000000000211254730101300210250ustar00rootroot00000000000000*.sam *.bam *.loglibStatGen-1.0.14/bam/test/test.sh000077500000000000000000000052251254730101300166650ustar00rootroot00000000000000expected="expected/" myDiffs="" if [ $# -eq 1 ] then expected="$expected$1" ./samTest 2> results/samTest.log && \ diff $expected/testEqWithBases.sam results/outSamEqBases.sam && \ diff $expected/testEqWithEq.sam results/outSamEqEquals.sam && \ diff 
$expected/testEqWithOrig.sam results/outSamEqOrig.sam && \ diff $expected/testEqWithBases.bam results/outSamEqBases.bam && \ diff $expected/testEqWithEq.bam results/outSamEqEquals.bam && \ diff $expected/testEqWithOrig.bam results/outSamEqOrig.bam && \ diff $expected/updateTagFromSam.sam results/updateTagFromSam.sam && \ diff $expected/updateTag.bam results/updateTagFromSam.bam && \ diff $expected/samTest.log results/samTest.log && \ diff $expected/addedTagToSam.bam results/addedTagToSam.bam && \ diff $expected/addedTagToSam.sam results/addedTagToSam.sam && \ diff $expected/testShift.sam results/testShift.sam && \ diff $expected/testShift.bam results/testShiftFromSam.bam && \ diff $expected/TestSamCoordOutput.sam results/TestSamCoordOutput.sam else ./samTest 2> results/samTest.log && \ diff $expected/testEqWithBases.sam results/outSamEqBases.sam && \ diff $expected/testEqWithEq.sam results/outSamEqEquals.sam && \ diff $expected/testEqWithOrig.sam results/outSamEqOrig.sam && \ diff $expected/testEqWithBases.bam results/outSamEqBases.bam && \ diff $expected/testEqWithEq.bam results/outSamEqEquals.bam && \ diff $expected/testEqWithOrig.bam results/outSamEqOrig.bam && \ diff $expected/testEqWithBases.sam results/outBamEqBases.sam && \ diff $expected/testEqWithEq.sam results/outBamEqEquals.sam && \ diff $expected/testEqWithOrig.sam results/outBamEqOrig.sam && \ diff $expected/testEqWithBases.bam results/outBamEqBases.bam && \ diff $expected/testEqWithEq.bam results/outBamEqEquals.bam && \ diff $expected/testEqWithOrig.bam results/outBamEqOrig.bam && \ diff $expected/updateTagFromBam.sam results/updateTagFromBam.sam && \ diff $expected/updateTagFromSam.sam results/updateTagFromSam.sam && \ diff $expected/updateTag.bam results/updateTagFromBam.bam && \ diff $expected/updateTag.bam results/updateTagFromSam.bam && \ diff $expected/samTest.log results/samTest.log && \ diff $expected/addedTagToSam.bam results/addedTagToSam.bam && \ diff $expected/addedTagToSam.sam 
results/addedTagToSam.sam && \ diff $expected/addedTagToBam.sam results/addedTagToBam.sam && \ diff $expected/addedTagToBam.bam results/addedTagToBam.bam && \ diff $expected/testShift.sam results/testShift.sam && \ diff $expected/testShift.bam results/testShift.bam && \ diff $expected/testShift.sam results/testShiftFromBam.sam && \ diff $expected/testShift.bam results/testShiftFromSam.bam && \ diff $expected/TestSamCoordOutput.sam results/TestSamCoordOutput.sam filibStatGen-1.0.14/bam/test/testFiles/000077500000000000000000000000001254730101300173055ustar00rootroot00000000000000libStatGen-1.0.14/bam/test/testFiles/chr1_partial-bs.umfa000066400000000000000000000144741254730101300231440ustar00rootroot000000000000003y12ɐMmktrostmktrost-laptopKARMA--2b57eeb883c6c7078241876e733elibStatGen-1.0.14/bam/test/testFiles/chr1_partial.fa000066400000000000000000000267641254730101300222050ustar00rootroot00000000000000>1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNACTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTAACCCTAACCCTAACCCTA ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAA CCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCC TAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCC TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC CCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGG TCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC 
AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC GTTGCAAAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCG CAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCA GAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGA CACATGCTAGCGCGTCGGGGTGGAGGCGTGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGG CGCAGGCGCAGAGACACATGCTACCGCGTCCAGGGGTGGAGGCGTGGCGCAGGCGCAGAG AGGCGCACCGCGCCGGCGCAGGCGCAGAGACACATGCTAGCGCGTCCAGGGGTGGAGGCG TGGCGCAGGCGCAGAGACGCAAGCCTACGGGCGGGGGTTGGGGGGGCGTGTGTTGCAGGA GCAAAGTCGCACGGCGCCGGGCTGGGGCGGGGGGAGGGTGGCGCCGTGCACGCGCAGAAA CTCACGTCACGGTGGCGCGGCGCAGAGACGGGTAGAACCTCAGTAATCCGAAAAGCCGGG ATCGACCGCCCCTTGCTTGCAGCCGGGCACTACAGGACCCGCTTGCTCACGGTGCTGTGC CAGGGCGCCCCCTGCTGGCGACTAGGGCAACTGCAGGGCTCTCTTGCTTAGAGTGGTGGC CAGCGCCCCCTGCTGGCGCCGGGGCACTGCAGGGCCCTCTTGCTTACTGTATAGTGGTGG CACGCCGCCTGCTGGCAGCTAGGGACATTGCAGGGTCCTCTTGCTCAAGGTGTAGTGGCA GCACGCCCACCTGCTGGCAGCTGGGGACACTGCCGGGCCCTCTTGCTCCAACAGTACTGG CGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTCTCAGTAGACTCCTAAATATG libStatGen-1.0.14/bam/test/testFiles/sortedBam.bam000066400000000000000000000014511254730101300217070ustar00rootroot00000000000000BC ;hAH̞̋pcg悐˝l9!pb"BN%%(VZXZJE-4m bE^ 80ow)j`rګ&SUNDqDSj(*}DD9DP?J_r! 
—JQa) #,R$5c- &h a82vF0Lq2BI?ƮH.y]CߧT*l`[Q.(X#(D<9*؆a % cMLm?Gt2SHI6y+r_P]X*azvm~vɆuWZ^Y܋\L/ ĻFzUR/;bpɜU2/>d a7=$d~n3JŹsm)ãN"<}r;k;~j%rvi2'XN[)}'_$>^~$BqSJ_Il>z@3UhѺCϟIw>32HT4 &oY= aY"񰘜>STA[C3nNݫ'lsE.3Y yMlO+J'L20Ɏԍ~A9h6ۗ j)]mj~i2waWwAs 8u3>J:ʚf5mwya{72=i z>>P"f gWP}$4DZF4(:rC[@&nBClibStatGen-1.0.14/bam/test/testFiles/sortedBam.bam.bai000066400000000000000000000006401254730101300224400ustar00rootroot00000000000000BAII`J``IJIJlibStatGen-1.0.14/bam/test/testFiles/sortedBam2.bai000077700000000000000000000000001254730101300251122sortedBam.bam.baiustar00rootroot00000000000000libStatGen-1.0.14/bam/test/testFiles/sortedBam2.bam000077700000000000000000000000001254730101300243642sortedBam.bamustar00rootroot00000000000000libStatGen-1.0.14/bam/test/testFiles/testBam.bam000066400000000000000000000014421254730101300213660ustar00rootroot00000000000000BCKHTQǿyX)$_(@OpsFǙ)FTREEZ 7EoQn]m-jaqA{`)LMjgH"a0%P"J%#RCQ)$"Q$Ř[,}Ʌ  _*HL\JJKY@$1(G}};SN40cP8L(S6n`7.\ }RR"oB ]5ZE(ӄ*+QK,Pijc*PqL'Sf 0NxJXxÃŹS^-,^_Yi`b.W+ O$x$_޲BeI .=CŇIt=bg&)8{9c> {0SЗs{3~sՇǢTĪ^FgDUFUaFY\ªz|.iTyz|.eU\Rt.̬Z΅hTy\H* iۯ\̪\ܨ\U+~A5_)_ԯj|_\A~UՍ+W7:WدnZs+>?ԯrZί~_iJ:v~%k)_W@ըF~jfd5_hX{ ksNiKvsz2wkM&;I1YgɹNʍ:sfmϑIg0v41jFTN*VjkneoՕAm(ׯN__fuѢ/#MX/BClibStatGen-1.0.14/bam/test/testFiles/testEq.sam000066400000000000000000000056521254730101300212640ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 01:==== 73 1 10011 0 4M = 10009 0 CCTA I00? 02:===X 73 1 10011 0 4M = 10009 0 CCTT I00? 03:==X= 73 1 10011 0 4M = 10009 0 CCAA I00? 
04:==XX 73 1 10011 0 4M = 10009 0 CCAT I00? 05:=X== 73 1 10011 0 4M = 10009 0 CTTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CTTT I00? 07:=XX= 73 1 10011 0 4M = 10009 0 CTAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 CTAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 TCTA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TCTT I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCAA I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TTTA I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TTTT I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTAA I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 ==== I00? 02:===X 73 1 10011 0 4M = 10009 0 ===T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==A= I00? 04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =T== I00? 06:=X=X 73 1 10011 0 4M = 10009 0 =T=T I00? 07:=XX= 73 1 10011 0 4M = 10009 0 =TA= I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=== I00? 10:X==X 73 1 10011 0 4M = 10009 0 T==T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 T=A= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 T=AT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT== I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 01:==== 73 1 10011 0 4M = 10009 0 C=== I00? 02:===X 73 1 10011 0 4M = 10009 0 =C=T I00? 03:==X= 73 1 10011 0 4M = 10009 0 ==AA I00? 04:==XX 73 1 10011 0 4M = 10009 0 ==AT I00? 05:=X== 73 1 10011 0 4M = 10009 0 =TTA I00? 06:=X=X 73 1 10011 0 4M = 10009 0 CT=T I00? 07:=XX= 73 1 10011 0 4M = 10009 0 =TAA I00? 08:=XXX 73 1 10011 0 4M = 10009 0 =TAT I00? 09:X=== 73 1 10011 0 4M = 10009 0 T=TA I00? 10:X==X 73 1 10011 0 4M = 10009 0 TC=T I00? 11:X=X= 73 1 10011 0 4M = 10009 0 TCA= I00? 12:X=XX 73 1 10011 0 4M = 10009 0 TCAT I00? 13:XX== 73 1 10011 0 4M = 10009 0 TT=A I00? 14:XX=X 73 1 10011 0 4M = 10009 0 TT=T I00? 15:XXX= 73 1 10011 0 4M = 10009 0 TTA= I00? 16:XXXX 73 1 10011 0 4M = 10009 0 TTAT I00? 
Read:GGCCTA;Ref:CCTA 73 1 10011 0 2S4M = 10009 0 GGC=T= ??I00? Read:CCTA;Ref:CCTA 73 1 10011 0 4M4H = 10009 0 C=T= I00? Read:CCGTxxxC;Ref:CCxTAACC 73 1 10011 0 1M1P1M1I1M3D1M = 10009 0 C=GT= I00?? Read:CCxxAC;Ref:CCTAACC 73 1 10011 0 2M2N2M = 10009 0 C=A= I00? chromNotInRef 73 2 10011 0 4M = 10009 0 CCTA I00? chromNotInRef1 73 2 10011 0 4M = 10009 0 CC=A I00? libStatGen-1.0.14/bam/test/testFiles/testSam.sam000066400000000000000000000024111254730101300214250ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 @RG ID:myID LB:library SM:sample @RG ID:myID2 SM:sample2 LB:library2 @CO Comment 1 @CO Comment 2 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 XT:A:R AM:i:0 NM:i:0 MD:Z:37 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 tgcacgtn 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/testFiles/testSamSOcoord.sam000066400000000000000000000023001254730101300227130ustar00rootroot00000000000000@HD SO:coordinate VN:1.0 @SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 
LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 XT:A:R AM:i:0 NM:i:0 MD:Z:37 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 TGCACGTN 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/testFiles/testSamSOinvalid.sam000066400000000000000000000022721254730101300232430ustar00rootroot00000000000000@HD SO:junk VN:1.0 @SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 XT:A:R AM:i:0 NM:i:0 MD:Z:37 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 
ACGTN ;>>>> XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 TGCACGTN 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/testFiles/testSamSOquery.sam000066400000000000000000000022771254730101300227670ustar00rootroot00000000000000@HD SO:queryname VN:1.0 @SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 1:1011:F:255+17M15D20M 73 1 1011 0 5M2D = 1011 0 CCGAA 6>6+4 XT:A:R AM:i:0 NM:i:0 MD:Z:37 1:1011:F:255+17M15D20M 133 1 1012 0 * = 1011 0 CTGT >>9> 18:462+29M5I3M:F:295 97 1 75 0 5M 18 757 0 ACGTN ;>>>> XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:295 97 1 75 0 * 18 757 0 * * AM:i:0 18:462+29M5I3M:F:295 97 2 75 0 5M 18 757 0 ACGTN * XT:A:R AM:i:0 NM:i:2 MD:Z:30A0C5 18:462+29M5I3M:F:296 97 1 1751 0 3S2H5M 18 757 0 TGCACGTN 453;>>>> 18:462+29M5I3M:F:297 97 2 1751 0 3S5M1S3H 18 757 0 TGCACGTNG 453;>>>>5 Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 AACT ==;; 18:462+29M5I3M:F:298 97 3 75 0 3S5M4H 18 757 0 TGCACGTN 453;>>>> Y:16597235+13M13I11M:F:181 141 * 0 0 * * 0 0 * * libStatGen-1.0.14/bam/test/testFiles/testShift.bam000066400000000000000000000012071254730101300217430ustar00rootroot00000000000000BCE!lTA@wzicgvfg5@V4((I@@H( 
PhMwgV<1;osnmO6֖qrmXb[j rZY`Y$![ &j) 6`3B&DF}Bn6H De}5ĦHMS,q[r"[dbu+ !F8ӞhEIf=QMz<5!*J9 1 /ⅉ8JN}.y \j>)cуjͮJor0^w;N?\?T2|[¸NՆ 9aEG ^pLɽ  >>0qE zo!#GQ]l ZJBCsg)u6O!fO>:JML1dhX:;PelAnQQI Xbb1,'YP܉ \P We 6dT.ьba @e^X\&8sQ@\MMNjJJ@dfg-z BClibStatGen-1.0.14/bam/test/testFiles/testShift.sam000066400000000000000000000017051254730101300217670ustar00rootroot00000000000000@SQ SN:1 LN:247249719 @SQ SN:2 LN:242951149 @SQ SN:3 LN:199501827 @SQ SN:4 LN:191273063 @SQ SN:5 LN:180857866 @SQ SN:6 LN:170899992 @SQ SN:7 LN:158821424 @SQ SN:8 LN:146274826 @SQ SN:9 LN:140273252 @SQ SN:10 LN:135374737 @SQ SN:11 LN:134452384 @SQ SN:12 LN:132349534 @SQ SN:13 LN:114142980 @SQ SN:14 LN:106368585 @SQ SN:15 LN:100338915 @SQ SN:16 LN:88827254 @SQ SN:17 LN:78774742 @SQ SN:18 LN:76117153 @SQ SN:19 LN:63811651 @SQ SN:20 LN:62435964 @SQ SN:21 LN:46944323 @SQ SN:22 LN:49691432 @SQ SN:X LN:154913754 Read1 73 1 10011 0 8M4I2M = 10009 0 GCAACCTTAATTGC ?????????????? Read2 73 1 10011 0 4M4I2M = 10009 0 GCACACACGC ?????????? FZ:Z:abc Read3 73 1 10011 0 4M4D2M = 10009 0 GCACGC ?????? Read4 73 1 10011 0 8M4I2M = 10009 0 GGCCAACCAACCCC ?????????????? Read5 73 1 10011 0 2M4I8M = 10009 0 GGCCAACCAACCCC ?????????????? Read6 73 1 10011 0 2S6M4I2M = 10009 0 GGCCAACCAACCCC ?????????????? Read7 73 1 10011 0 6M3I7M = 10009 0 GTTCTCCTCTGTGCAA GTTCTCCTCTGTGCAA libStatGen-1.0.14/bam/test/testFiles/testVar.bam000066400000000000000000000012541254730101300214200ustar00rootroot00000000000000BC];lPO&4)jiIiRs߷]A+Hii% BL.C!H $X@]@]X ;go8klW"zucuW(W64Xj"-s5U"UHY&Z(-efhcfVy+)ϬK,O"߀k&\PHF:CnSMr nR -r˜68Ҿ]վԾ E~r+Jr&R_Kùm[ߏi̦ᔹAl& A}<¯=1(N8l˓ڛ/5Gg<) Uiy?ͭ%?-p! U. */ #include #include "BaseComposition.h" // Constructor // Initialize the base to ascii map based on the specified maptype. BaseComposition::BaseComposition(): myBaseAsciiMap() { } // Update the composition for the specified index with the specified character. // Return false if the character is not a valid raw sequence character. // Return true if it is valid. 
bool BaseComposition::updateComposition(unsigned int rawSequenceCharIndex, char baseChar) { bool validIndex = true; // Each time we return to index 0, reset the primer count in the base/ascii // map. if(rawSequenceCharIndex == 0) { myBaseAsciiMap.resetPrimerCount(); } // Check to see if the vector size is already sized to include this // index. If it is not sized appropriately, add entries until it contains // the rawSequenceCharIndex. while(rawSequenceCharIndex >= myBaseCountVector.size()) { // Add an entry of the base count array object to the vector. BaseCount baseCountEntry; myBaseCountVector.push_back(baseCountEntry); } // Get the base info for the specified character. int baseIndex = myBaseAsciiMap.getBaseIndex(baseChar); // Increment the count for the given character. This method returns false // if the character's index falls outside the range of the base array. // This relies on the myBaseAsciiMap indexes and the BaseCOunt object array // to use the same indexing values for valid bases. validIndex = myBaseCountVector[rawSequenceCharIndex].incrementCount(baseIndex); // Return whether or not the specified character was valid. return(validIndex); } // Print the composition. void BaseComposition::print() { std::cout << std::endl << "Base Composition Statistics:" << std::endl; std::cout.precision(2); // This assumes the relationship between indexes that are printed // by a BaseCount object to be in a specific order based on ATGCN. std::cout << std::fixed << "Read Index" << "\t%A" << "\t%C" << "\t%G" << "\t%T" << "\t%N" << "\tTotal Reads At Index" << std::endl; for(unsigned int i = 0; i < myBaseCountVector.size(); i++) { std::cout << std::setw(10) << i << " "; myBaseCountVector[i].printPercent(); } std::cout << std::endl; } // Clear the vector. 
void BaseComposition::clear() { myBaseCountVector.clear(); } libStatGen-1.0.14/fastq/BaseComposition.h000066400000000000000000000042001254730101300202060ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BASE_COMPOSITION_H__ #define __BASE_COMPOSITION_H__ #include #include "BaseAsciiMap.h" #include "BaseCount.h" /// Class that tracks the composition of base by read location. class BaseComposition { public: /// Constructor. BaseComposition(); /// Update the composition for the specified index with the specified /// character. /// \return false if the character is not a valid raw sequence character, /// true if it is valid. bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar); /// Get the space type for this composition. BaseAsciiMap::SPACE_TYPE getSpaceType() { return(myBaseAsciiMap.getSpaceType()); } /// Reset the base map type for this composition. void resetBaseMapType() { myBaseAsciiMap.resetBaseMapType(); }; /// Set the base map type for this composition. void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType) { myBaseAsciiMap.setBaseMapType(spaceType); } /// Print the composition. void print(); /// Clear the composition stored in the base count vector. void clear(); private: // Map of bases used to determine if a character is valid and if so // maps it to a number. 
BaseAsciiMap myBaseAsciiMap; // Vector used to store the occurrence of each base type at a given // read location. vector myBaseCountVector; }; #endif libStatGen-1.0.14/fastq/BaseCount.cpp000066400000000000000000000051651254730101300173410ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include "BaseCount.h" // Constructor. Initializes the array to be all 0s. BaseCount::BaseCount() { // Init each element of the array to 0. for(int i = 0; i < myBaseSize; i++) { myBaseCount[i] = 0; } } // Update the count for the specified index as well as the overall count // (The last index). // Returns false if the specified index is < 0 or >= myBaseSize-1. The // reason returns false if it is equal to the size-1 is because the last // index is used to track an overall count. bool BaseCount::incrementCount(int baseIndex) { // Check to see if the index is within range (>=0 & < myBaseSize-1) // The last entry of the array is invalid since it is used to track // total occurrence of all other entries. if((baseIndex < myBaseSize-1) && (baseIndex >= 0)) { // Valid index, so increment that index as well as the overall // count (index myBaseSize-1) and return true. 
myBaseCount[baseIndex]++; myBaseCount[myBaseSize-1]++; return true; } else { // Invalid index, return false return false; } } // Prints the percentage for each index 0 to myBaseSize-2. Also prints // the total number of entries (index myBaseSize-1). void BaseCount::printPercent() { // Do not divide by 0, so check to see if there are any bases by checking // the last index of the array. if(myBaseCount[myBaseSize-1] == 0) { // No entries for any index. std::cout << "No Valid Bases found."; } else { // Print the percentage for each index. for(int i = 0; i < myBaseSize -1; i++) { double percentage = (myBaseCount[i]/(double)myBaseCount[myBaseSize-1]) * 100; std::cout << " " << std::setw(7) << percentage; } // Print the total number of bases. std::cout << "\t" << myBaseCount[myBaseSize-1]; } std::cout << std::endl; } libStatGen-1.0.14/fastq/BaseCount.h000066400000000000000000000041061254730101300170000ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BASE_COUNT_H__ #define __BASE_COUNT_H__ /// This class is a wrapper around an array that has one index per base and an /// extra index for a total count of all bases. This class is used to keep /// a count of the number of times each index has occurred. /// It can print a percentage of the occurrence of each base against the total /// number of bases. 
/// Keeps a tally of how many times each base index has occurred.
/// Internally this is a fixed-size array with one slot per index plus one
/// extra, final slot that accumulates the total over all other slots.
class BaseCount
{
public:
    /// Constructor, sets every slot of the array (total included) to 0.
    BaseCount();

    /// Add one to the count for the specified index and one to the overall
    /// count kept in the last slot.
    /// \param baseIndex index to count; must be >= 0 and < myBaseSize-1.
    /// \return true if the index was counted, false if the specified index
    /// is < 0 or >= myBaseSize-1.  An index equal to myBaseSize-1 is
    /// rejected because that slot is used to track the overall count.
    bool incrementCount(int baseIndex);

    /// Print the percentage for each index, 0 to myBaseSize-2, followed by
    /// the total number of entries (index myBaseSize-1).
    void printPercent();

private:
    /// Number of slots in the array.  Used to size the array, to bound the
    /// loops over it, and to locate the last slot (myBaseSize-1), which
    /// holds the overall count.
    static const int myBaseSize = 6;

    /// Occurrence counts per index.  The last slot tracks the total number
    /// of occurrences of all the other indexes.
    int myBaseCount[myBaseSize];
};
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . libStatGen-1.0.14/fastq/FastQFile.cpp000066400000000000000000000652621254730101300173000ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "InputFile.h" #include "FastQFile.h" #include "BaseUtilities.h" // Constructor. // minReadLength - The minimum length that a base sequence must be for // it to be valid. // numPrintableErrors - The maximum number of errors that should be reported // in detail before suppressing the errors. 
// FastQFile::FastQFile(int minReadLength, int numPrintableErrors) : myFile(NULL), myBaseComposition(), myQualPerCycle(), myCountPerCycle(), myCheckSeqID(true), myMinReadLength(minReadLength), myNumPrintableErrors(numPrintableErrors), myMaxErrors(-1), myDisableMessages(false), myFileProblem(false) { // Reset the member data. reset(); } void FastQFile::disableMessages() { myDisableMessages = true; } void FastQFile::enableMessages() { myDisableMessages = false; } // Disable Unique Sequence ID checking. // Unique Sequence ID checking is enabled by default. void FastQFile::disableSeqIDCheck() { myCheckSeqID = false; } // Enable Unique Sequence ID checking. // Unique Sequence ID checking is enabled by default. void FastQFile::enableSeqIDCheck() { myCheckSeqID = true; } // Set the number of errors after which to quit reading/validating a file. void FastQFile::setMaxErrors(int maxErrors) { myMaxErrors = maxErrors; } // Open a FastQFile. FastQStatus::Status FastQFile::openFile(const char* fileName, BaseAsciiMap::SPACE_TYPE spaceType) { // reset the member data. reset(); myBaseComposition.resetBaseMapType(); myBaseComposition.setBaseMapType(spaceType); myQualPerCycle.clear(); myCountPerCycle.clear(); FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS; // Close the file if there is already one open - checked by close. status = closeFile(); if(status == FastQStatus::FASTQ_SUCCESS) { // Successfully closed a previously opened file if there was one. // Open the file myFile = ifopen(fileName, "rt"); myFileName = fileName; if(myFile == NULL) { // Failed to open the file. status = FastQStatus::FASTQ_OPEN_ERROR; } } if(status != FastQStatus::FASTQ_SUCCESS) { // Failed to open the file. std::string errorMessage = "ERROR: Failed to open file: "; errorMessage += fileName; logMessage(errorMessage.c_str()); } return(status); } // Close a FastQFile. FastQStatus::Status FastQFile::closeFile() { int closeStatus = 0; // Success. // If a file has been opened, close it. 
if(myFile != NULL) { // Close the file. closeStatus = ifclose(myFile); myFile = NULL; } if(closeStatus == 0) { // Success - either there wasn't a file to close or it was closed // successfully. return(FastQStatus::FASTQ_SUCCESS); } else { std::string errorMessage = "Failed to close file: "; errorMessage += myFileName.c_str(); logMessage(errorMessage.c_str()); return(FastQStatus::FASTQ_CLOSE_ERROR); } } // Check to see if the file is open. bool FastQFile::isOpen() { // Check to see if the file is open. if((myFile != NULL) && (myFile->isOpen())) { // File pointer exists and the file is open. return true; } // File is not open. return false; } // Check to see if the file is at the end of the file. bool FastQFile::isEof() { // Check to see if the file is open. if((myFile != NULL) && (ifeof(myFile))) { // At EOF. return true; } // Not at EOF. return false; } // Returns whether or not to keep reading the file. // Stop reading (false) if eof or there is a problem reading the file. bool FastQFile::keepReadingFile() { if(isEof() || myFileProblem) { return(false); } return(true); } // Validate the specified fastq file FastQStatus::Status FastQFile::validateFastQFile(const String& filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg) { // Open the fastqfile. if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS) { // Failed to open the specified file. return(FastQStatus::FASTQ_OPEN_ERROR); } // Track the total number of sequences that were validated. int numSequences = 0; // Keep reading the file until there are no more fastq sequences to process // and not configured to quit after a certain number of errors or there // has not yet been that many errors. // Or exit if there is a problem reading the file. FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS; while (keepReadingFile() && ((myMaxErrors == -1) || (myMaxErrors > myNumErrors))) { // Validate one sequence. This call will read all the lines for // one sequence. 
status = readFastQSequence(); if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID)) { // Read a sequence and it is either valid or invalid, but // either way, a sequence was read, so increment the sequence count. ++numSequences; } else { // Other error, so break out of processing. break; } } // Report Base Composition Statistics. if(printBaseComp) { myBaseComposition.print(); } if(printQualAvg) { printAvgQual(); } std::string finishMessage = "Finished processing "; finishMessage += myFileName.c_str(); char buffer[100]; if(sprintf(buffer, " with %u lines containing %d sequences.", myLineNum, numSequences) > 0) { finishMessage += buffer; logMessage(finishMessage.c_str()); } if(sprintf(buffer, "There were a total of %d errors.", myNumErrors) > 0) { logMessage(buffer); } // Close the input file. FastQStatus::Status closeStatus = closeFile(); if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID)) { // Stopped validating due to some error other than invalid, so // return that error. return(status); } else if(myNumErrors == 0) { // No errors, check to see if there were any sequences. // Finished processing all of the sequences in the file. // If there are no sequences, report an error. if(numSequences == 0) { // Empty file, return error. logMessage("ERROR: No FastQSequences in the file."); return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); } return(FastQStatus::FASTQ_SUCCESS); } else { // The file is invalid. But check the close status. If the close // failed, it means there is a problem with the file itself not just // with validation, so the close failure should be returned. if(closeStatus != FastQStatus::FASTQ_SUCCESS) { return(closeStatus); } return(FastQStatus::FASTQ_INVALID); } } // Reads and validates a single fastq sequence from myFile. FastQStatus::Status FastQFile::readFastQSequence() { // First verify that a file is open, if not, return failure. 
if(!isOpen()) { std::string message = "ERROR: Trying to read a fastq file but no file is open."; logMessage(message.c_str()); return(FastQStatus::FASTQ_ORDER_ERROR); } // Reset variables for each sequence. resetForEachSequence(); bool valid = true; // No sequence was read. if(isTimeToQuit()) { return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); } // The first line is the sequence identifier, so validate that. valid = validateSequenceIdentifierLine(); if(myFileProblem) { return(FastQStatus::FASTQ_READ_ERROR); } // If we are at the end of the file, check to see if it is a partial // sequence or just an empty line at the end. if(ifeof(myFile)) { // If the sequence identifier line was empty and we are at the // end of the file, there is nothing more to validate. if(mySequenceIdLine.Length() != 0) { // There was a sequence identifier line, so this is an incomplete // sequence. myErrorString = "Incomplete Sequence.\n"; reportErrorOnLine(); valid = false; } if(valid) { // Return failure - no sequences were left to read. At the end // of the file. It wasn't invalid and it wasn't really an error. return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); } else { return(FastQStatus::FASTQ_INVALID); } } // If enough errors, quit before reading any more. if(isTimeToQuit()) { // Means there was an error, so mark it as invalid. return(FastQStatus::FASTQ_INVALID); } // Validate the Raw Sequence Line(s) and the "+" line. valid &= validateRawSequenceAndPlusLines(); if(myFileProblem) { return(FastQStatus::FASTQ_READ_ERROR); } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(FastQStatus::FASTQ_INVALID); } // If it is the end of a file, it is missing the quality string. if(ifeof(myFile)) { // There was a sequence identifier line, so this is an incomplete // sequence. myErrorString = "Incomplete Sequence, missing Quality String."; reportErrorOnLine(); valid = false; return(FastQStatus::FASTQ_INVALID); } // All that is left is to validate the quality string line(s). 
valid &= validateQualityStringLines(); if(myFileProblem) { return(FastQStatus::FASTQ_READ_ERROR); } if(valid) { return(FastQStatus::FASTQ_SUCCESS); } return(FastQStatus::FASTQ_INVALID); } // Reads and validates the sequence identifier line of a fastq sequence. bool FastQFile::validateSequenceIdentifierLine() { // Read the first line of the sequence. int readStatus = mySequenceIdLine.ReadLine(myFile); // Check to see if the read was successful. if(readStatus <= 0) { // If EOF, not an error. if(ifeof(myFile)) { return true; } myFileProblem = true; myErrorString = "Failure trying to read sequence identifier line"; reportErrorOnLine(); return false; } // If the line is 0 length and it is the end of the file, just // return since this is the eof - no error. if((mySequenceIdLine.Length() == 0) && (ifeof(myFile))) { // Not an error, just a new line at the end of the file. return true; } // Increment the line number. myLineNum++; // Verify that the line has at least 2 characters: '@' and at least // one character for the sequence identifier. if(mySequenceIdLine.Length() < 2) { // Error. Sequence Identifier line not long enough. myErrorString = "The sequence identifier line was too short."; reportErrorOnLine(); return false; } // The sequence identifier line must start wtih a '@' if(mySequenceIdLine[0] != '@') { // Error - sequence identifier line does not begin with an '@'. myErrorString = "First line of a sequence does not begin with @"; reportErrorOnLine(); return false; } // Valid Sequence Identifier Line. // The sequence identifier ends at the first space or at the end of the // line if there is no space. // Use fast find since this is a case insensitive search. // Start at 1 since we know that 0 is '@' int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1); // Check if a " " was found. if(endSequenceIdentifier == -1) { // Did not find a ' ', so the identifier is the rest of the line. // It starts at 1 since @ is at offset 0. 
mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str(); } else { // Found a ' ', so the identifier ends just before that. // The sequence identifier must be at least 1 character long, // therefore the endSequenceIdentifier must be greater than 1. if(endSequenceIdentifier <= 1) { myErrorString = "No Sequence Identifier specified before the comment."; reportErrorOnLine(); return false; } mySequenceIdentifier = (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str(); } // Check if sequence identifier should be validated for uniqueness. if(myCheckSeqID) { // Check to see if the sequenceIdentifier is a repeat by adding // it to the set and seeing if it already existed. std::pair::iterator,bool> insertResult; insertResult = myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), myLineNum)); if(insertResult.second == false) { // Sequence Identifier is a repeat. myErrorString = "Repeated Sequence Identifier: "; myErrorString += mySequenceIdentifier.c_str(); myErrorString += " at Lines "; myErrorString += insertResult.first->second; myErrorString += " and "; myErrorString += myLineNum; reportErrorOnLine(); return(false); } } // Valid, return true. return(true); } // Reads and validates the raw sequence line(s) and the plus line. Both are // included in one method since it is unknown when the raw sequence line // ends until you find the plus line that divides it from the quality // string. Since this method will read the plus line to know when the // raw sequence ends, it also validates that line. bool FastQFile::validateRawSequenceAndPlusLines() { // Read the raw sequence. int readStatus = myRawSequence.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read sequence line"; reportErrorOnLine(); return false; } // Offset into the raw sequence to be validated. int offset = 0; // Validate the raw sequence. 
bool valid = validateRawSequence(offset); // Increment the offset for what was just read. offset = myRawSequence.Length(); // The next line is either a continuation of the raw sequence or it starts // with a '+' // Keep validating til the '+' line or the end of file is found. bool stillRawLine = true; while(stillRawLine && !ifeof(myFile)) { // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Read the next line. // Read into the plus line, but if it isn't a plus line, then // it will be copied into the raw sequence line. readStatus = myPlusLine.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read sequence/plus line"; reportErrorOnLine(); return false; } // Check if the next line is blank if(myPlusLine.Length() == 0) { // The next line is blank. Assume it is part of the raw sequence and // report an error since there are no valid characters on the line. myErrorString = "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence."; reportErrorOnLine(); } // Check for the plus line. else if(myPlusLine[0] == '+') { // This is the + line. valid &= validateSequencePlus(); stillRawLine = false; } else { // Not a plus line, so assume this is a continuation of the Raw // Sequence. // Copy from the plus line to the raw sequence line. myRawSequence += myPlusLine; myPlusLine.SetLength(0); valid &= validateRawSequence(offset); // Increment the offset. offset = myRawSequence.Length(); } } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Now that the entire raw sequence has been obtained, check its length // against the minimum allowed length. if(myRawSequence.Length() < myMinReadLength) { // Raw sequence is not long enough - error. 
myErrorString = "Raw Sequence is shorter than the min read length: "; myErrorString += myRawSequence.Length(); myErrorString += " < "; myErrorString += myMinReadLength; reportErrorOnLine(); valid = false; } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // if the flag still indicates it is processing the raw sequence that means // we reached the end of the file without a '+' line. So report that error. if(stillRawLine) { myErrorString = "Reached the end of the file without a '+' line."; reportErrorOnLine(); valid = false; } return(valid); } // Reads and validates the quality string line(s). bool FastQFile::validateQualityStringLines() { // Read the quality string. int readStatus = myQualityString.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read quality line"; reportErrorOnLine(); return false; } // track the offset into the quality string to validate. int offset = 0; // Validate this line of the quality string. bool valid = validateQualityString(offset); offset = myQualityString.Length(); // Keep reading quality string lines until the length of the // raw sequence has been hit or the end of the file is reached. while((myQualityString.Length() < myRawSequence.Length()) && (!ifeof(myFile))) { // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Read another line of the quality string. readStatus = myTempPartialQuality.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read quality line"; reportErrorOnLine(); return false; } myQualityString += myTempPartialQuality; myTempPartialQuality.Clear(); // Validate this line of the quality string. valid &= validateQualityString(offset); offset = myQualityString.Length(); } // If enough errors, quit before reading any more. 
if(isTimeToQuit()) { return(false); } // Validate that the quality string length is the same as the // raw sequence length. if(myQualityString.Length() != myRawSequence.Length()) { myErrorString = "Quality string length ("; myErrorString += myQualityString.Length(); myErrorString += ") does not equal raw sequence length ("; myErrorString += myRawSequence.Length(); myErrorString += ")"; reportErrorOnLine(); valid = false; } return(valid); } // Method to validate a line that contains part of the raw sequence. bool FastQFile::validateRawSequence(int offset) { bool validBase = false; bool valid = true; // Loop through validating each character is valid for the raw sequence. for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length(); sequenceIndex++) { // Update the composition for this position. Returns false if the // character was not a valid base. validBase = myBaseComposition.updateComposition(sequenceIndex, myRawSequence[sequenceIndex]); // Check the return if(!validBase) { // Error, found a value that is not a valid base character. myErrorString = "Invalid character ('"; myErrorString += myRawSequence[sequenceIndex]; myErrorString += "') in base sequence."; reportErrorOnLine(); valid = false; // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } } } return(valid); } // Method to validate the "+" line that seperates the raw sequence and the // quality string. bool FastQFile::validateSequencePlus() { // Validate that optional sequence identifier is the same // as the one on the @ line. // Check to see if there is more to the line than just the plus int lineLength = myPlusLine.Length(); // If the line is only 1 character or the second character is a space, // then there is no sequence identifier on this line and there is nothing // further to validate. if((lineLength == 1) || (myPlusLine[1] == ' ')) { // No sequence identifier, so just return valid. 
return true; } // There is a sequence identifier on this line, so validate that // it matches the one from the associated @ line. // The read in line must be at least 1 more character ('+') than the // sequence identifier read from the '@' line. // If it is not longer than the sequence identifier, then we know that it // cannot be the same. int sequenceIdentifierLength = mySequenceIdentifier.Length(); if(lineLength <= sequenceIdentifierLength) { myErrorString = "Sequence Identifier on '+' line does not equal the one on the '@' line."; reportErrorOnLine(); return false; } bool same = true; int seqIndex = 0; int lineIndex = 1; // Start at 1 since offset 0 has '+' // Loop through the sequence index and the line buffer verifying they // are the same until a difference is found or the end of the sequence // identifier is found. while((same == true) && (seqIndex < sequenceIdentifierLength)) { if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex]) { myErrorString = "Sequence Identifier on '+' line does not equal the one on the '@' line."; reportErrorOnLine(); same = false; } lineIndex++; seqIndex++; } return(same); } // Method to validate the quality string. bool FastQFile::validateQualityString(int offset) { bool valid = true; if(myQualityString.Length() > (int)(myQualPerCycle.size())) { myQualPerCycle.resize(myQualityString.Length()); myCountPerCycle.resize(myQualityString.Length()); } // For each character in the line, verify that it is ascii > 32. for(int i=offset; i < myQualityString.Length(); i++) { if(myQualityString[i] <= 32) { myErrorString = "Invalid character ('"; myErrorString += myQualityString[i]; myErrorString += "') in quality string."; reportErrorOnLine(); valid = false; // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } } else { myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]); myCountPerCycle[i] += 1; } } return(valid); } // Helper method for printing the contents of myErrorString. 
It will // only print the errors until the maximum number of reportable errors is // reached. void FastQFile::reportErrorOnLine() { // Increment the total number of errors. myNumErrors++; // Only display the first X number of errors. if(myNumErrors <= myNumPrintableErrors) { // Write the error with the line number. char buffer[100]; sprintf(buffer, "ERROR on Line %u: ", myLineNum); std::string message = buffer; message += myErrorString.c_str(); logMessage(message.c_str()); } } // Reset member data that is unique for each fastQFile. void FastQFile::reset() { // Each fastq file processing needs to also reset the member data for each // sequence. resetForEachSequence(); myNumErrors = 0; // per fastqfile myLineNum = 0; // per fastqfile myFileName.SetLength(0); // reset the filename string. myIdentifierMap.clear(); // per fastqfile myBaseComposition.clear(); // clear the base composition. myQualPerCycle.clear(); myCountPerCycle.clear(); myFileProblem = false; } // Reset the member data that is unique for each sequence. void FastQFile::resetForEachSequence() { myLineBuffer.SetLength(0); myErrorString.SetLength(0); myRawSequence.SetLength(0); mySequenceIdLine.SetLength(0); mySequenceIdentifier.SetLength(0); myPlusLine.SetLength(0); myQualityString.SetLength(0); myTempPartialQuality.SetLength(0); } void FastQFile::logMessage(const char* logMessage) { // Write the message if they are not disabled. if(!myDisableMessages) { std::cout << logMessage << std::endl; } } // Determine if it is time to quit by checking if we are to quit after a // certain number of errors and that many errors have been encountered. bool FastQFile::isTimeToQuit() { // It is time to quit if we are to quit after a certain number of errors // and that many errors have been encountered. 
if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors)) { return(true); } return(false); } void FastQFile::printAvgQual() { std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl; std::cout.precision(2); std::cout << std::fixed << "Read Index\tAverage Quality" << std::endl; if(myQualPerCycle.size() != myCountPerCycle.size()) { // This is a code error and should NEVER happen. std::cerr << "ERROR calculating the average Qualities per cycle\n"; } double sumQual = 0; double count = 0; double avgQual = 0; for(unsigned int i = 0; i < myQualPerCycle.size(); i++) { avgQual = 0; if(myCountPerCycle[i] != 0) { avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]); } std::cout << i << "\t" << avgQual << "\n"; sumQual += myQualPerCycle[i]; count += myCountPerCycle[i]; } std::cout << std::endl; avgQual = 0; if(count != 0) { avgQual = sumQual / count; } std::cout << "Overall Average Phred Quality = " << avgQual << std::endl; } libStatGen-1.0.14/fastq/FastQFile.h000066400000000000000000000207631254730101300167420ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __FASTQ_VALIDATOR_H__ #define __FASTQ_VALIDATOR_H__ #include #include #include "StringBasics.h" #include "InputFile.h" #include "BaseComposition.h" #include "FastQStatus.h" /// Class for reading/validating a fastq file. 
class FastQFile { public: /// Constructor. /// /param minReadLength The minimum length that a base sequence must be for /// it to be valid. /// \param numPrintableErrors The maximum number of errors that should be reported /// in detail before suppressing the errors. FastQFile(int minReadLength = 10, int numPrintableErrors = 20); /// Disable messages - do not write to cout. void disableMessages(); /// Enable messages - write to cout. void enableMessages(); /// Disable Unique Sequence ID checking /// (Unique Sequence ID checking is enabled by default). void disableSeqIDCheck(); /// Enable Unique Sequence ID checking. /// (Unique Sequence ID checking is enabled by default). void enableSeqIDCheck(); /// Set the number of errors after which to quit reading/validating a file, /// defaults to -1. /// \param maxErrors # of errors before quitting, /// -1 indicates to not quit until the entire file has been read/validated (default), /// 0 indicates to quit without reading/validating anything. void setMaxErrors(int maxErrors); /// Open a FastQFile. /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN. FastQStatus::Status openFile(const char* fileName, BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN); /// Close a FastQFile. FastQStatus::Status closeFile(); /// Check to see if the file is open. bool isOpen(); /// Check to see if the file is at the end of the file. bool isEof(); /// Returns whether or not to keep reading the file, /// it stops reading (false) if eof or there is a problem reading the file. bool keepReadingFile(); /// Validate the specified fastq file /// \param filename fastq file to be validated. /// \param printBaseComp whether or not to print the base composition for the file. /// true means print it, false means do not. 
/// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE, /// or UNKNOWN (UNKNOWN means to determine the spaceType to /// validate against from the first character of the first /// sequence). /// \param printQualAvg whether or not to print the quality averages for the file. /// true means to print it, false (default) means do not. /// \return the fastq validation status, SUCCESS on a successfully /// validated fastq file. FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg = false); /// Read 1 FastQSequence, validating it. FastQStatus::Status readFastQSequence(); /////////////////////// /// @name Public Sequence Line variables. /// Keep public variables for a sequence's line so they can be accessed /// without having to do string copies. //@{ String myRawSequence; String mySequenceIdLine; String mySequenceIdentifier; String myPlusLine; String myQualityString; //@} /// Get the space type used for this file. inline BaseAsciiMap::SPACE_TYPE getSpaceType() { return(myBaseComposition.getSpaceType()); } private: // Validates a single fastq sequence from myFile. bool validateFastQSequence(); // Reads and validates the sequence identifier line of a fastq sequence. bool validateSequenceIdentifierLine(); // Reads and validates the raw sequence line(s) and the plus line. Both are // included in one method since it is unknown when the raw sequence line // ends until you find the plus line that divides it from the quality // string. Since this method will read the plus line to know when the // raw sequence ends, it also validates that line. bool validateRawSequenceAndPlusLines(); // Reads and validates the quality string line(s). bool validateQualityStringLines(); // Method to validate a line that contains part of the raw sequence. // offset specifies where in the sequence to start validating. 
bool validateRawSequence(int offset); // Method to validate the "+" line that seperates the raw sequence and the // quality string. bool validateSequencePlus(); // Method to validate the quality string. // offset specifies where in the quality string to start validating. bool validateQualityString(int offset); // Helper method to read a line from the input file into a string. // It also tracks the line number. void readLine(); // Helper method for printing the contents of myErrorString. It will // only print the errors until the maximum number of reportable errors is // reached. void reportErrorOnLine(); // Reset the member data for each fastq file. void reset(); // Reset the member data for each sequence. void resetForEachSequence(); // Log the specified message if enabled. void logMessage(const char* message); // Determine if it is time to quit by checking if we are to quit after a // certain number of errors and that many errors have been encountered. bool isTimeToQuit(); void printAvgQual(); ////////////////////////////////////////////////////////////////////// // Following member data elements are reset for each validated sequence. // // Buffer for storing the contents of the line read. // Stored as member data so memory allocation is only done once. String myLineBuffer; // Buffer for storing the error string. This prevents the reallocation of // the string buffer for each error. String myErrorString; String myTempPartialQuality; ////////////////////////////////////////////////////////////////////// // Following member data elements are reset for each validated file. // IFILE myFile; // Input file to be read. String myFileName; // Name of file being processed. int myNumErrors; // Tracks the number of errors. unsigned int myLineNum; // Track the line number - used for reporting errors. BaseComposition myBaseComposition; // Tracks the base composition. std::vector myQualPerCycle; // Tracks the quality by cycle. 
std::vector myCountPerCycle; // Tracks the number of entries by cycle. // Whether or not to check the sequence identifier for uniqueness. // Checking may use up a lot of memory. bool myCheckSeqID; // Map to track which identifiers have appeared in the file. std::map myIdentifierMap; ////////////////////////////////////////////////////////////////////// // Following member data do not change for each call to the validator. // int myMinReadLength; // Min Length for a read. int myNumPrintableErrors; // Max number of errors to print the details of. // Number of errors after which to quit reading/validating a file. // Defaults to -1. // -1 indicates to not quit until the entire file has been read/validated. // 0 indicates to quit without reading/validating anything. int myMaxErrors; // Whether or not messages should be printed. // Defaulted to false (they should be printed). bool myDisableMessages; // Track if there is a problem reading the file. If there are read // problems, stop reading the file. bool myFileProblem; }; #endif libStatGen-1.0.14/fastq/FastQStatus.cpp000066400000000000000000000020231254730101300176660ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "FastQStatus.h" const char* FastQStatus::enumString[] = {"FASTQ_SUCCESS", "FASTQ_INVALID", "FASTQ_ORDER_ERROR", "FASTQ_OPEN_ERROR", "FASTQ_CLOSE_ERROR", "FASTQ_READ_ERROR", "FASTQ_NO_SEQUENCE_ERROR"}; const char* FastQStatus::getStatusString(Status status) { return(enumString[status]); } libStatGen-1.0.14/fastq/FastQStatus.h000066400000000000000000000033631254730101300173430ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __FASTQ_STATUS_H__ #define __FASTQ_STATUS_H__ #include /// Status for FastQ operations. class FastQStatus { public: /// Return value enum for the FastQFile class methods, indicating /// success or error codes. enum Status { FASTQ_SUCCESS = 0, ///< indicates method finished successfully. FASTQ_INVALID, ///< means that the sequence was invalid. FASTQ_ORDER_ERROR, ///< means the methods are called out of order, like trying to read a file before opening it. FASTQ_OPEN_ERROR, ///< means the file could not be opened. FASTQ_CLOSE_ERROR, ///< means the file could not be closed. FASTQ_READ_ERROR, ///< means that a problem occurred on a read. FASTQ_NO_SEQUENCE_ERROR ///< means there were no errors, but no sequences read. }; /// Get the enum string for the status. 
static const char* getStatusString(Status status); private: static const char* enumString[]; }; #endif libStatGen-1.0.14/fastq/Makefile000066400000000000000000000001571254730101300164060ustar00rootroot00000000000000# Source File Set TOOLBASE = FastQFile BaseCount BaseComposition FastQStatus include ../Makefiles/Makefile.liblibStatGen-1.0.14/fastq/Makefile.depends000066400000000000000000000036451254730101300200340ustar00rootroot00000000000000# DO NOT DELETE $(OBJDIR_OPT)/FastQFile.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/FastQFile.o: FastQFile.h ../include/StringBasics.h $(OBJDIR_OPT)/FastQFile.o: BaseComposition.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/FastQFile.o: BaseCount.h FastQStatus.h $(OBJDIR_OPT)/BaseCount.o: BaseCount.h $(OBJDIR_OPT)/BaseComposition.o: BaseComposition.h ../include/BaseAsciiMap.h $(OBJDIR_OPT)/BaseComposition.o: ../include/StringBasics.h $(OBJDIR_OPT)/BaseComposition.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_OPT)/BaseComposition.o: BaseCount.h $(OBJDIR_OPT)/FastQStatus.o: FastQStatus.h $(OBJDIR_DEBUG)/FastQFile.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_DEBUG)/FastQFile.o: FastQFile.h ../include/StringBasics.h $(OBJDIR_DEBUG)/FastQFile.o: BaseComposition.h ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/FastQFile.o: BaseCount.h FastQStatus.h $(OBJDIR_DEBUG)/BaseCount.o: BaseCount.h $(OBJDIR_DEBUG)/BaseComposition.o: BaseComposition.h $(OBJDIR_DEBUG)/BaseComposition.o: ../include/BaseAsciiMap.h $(OBJDIR_DEBUG)/BaseComposition.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/BaseComposition.o: ../include/InputFile.h $(OBJDIR_DEBUG)/BaseComposition.o: ../include/FileType.h BaseCount.h $(OBJDIR_DEBUG)/FastQStatus.o: FastQStatus.h $(OBJDIR_PROFILE)/FastQFile.o: ../include/InputFile.h ../include/FileType.h $(OBJDIR_PROFILE)/FastQFile.o: FastQFile.h ../include/StringBasics.h $(OBJDIR_PROFILE)/FastQFile.o: BaseComposition.h ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/FastQFile.o: BaseCount.h FastQStatus.h 
$(OBJDIR_PROFILE)/BaseCount.o: BaseCount.h $(OBJDIR_PROFILE)/BaseComposition.o: BaseComposition.h $(OBJDIR_PROFILE)/BaseComposition.o: ../include/BaseAsciiMap.h $(OBJDIR_PROFILE)/BaseComposition.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/BaseComposition.o: ../include/InputFile.h $(OBJDIR_PROFILE)/BaseComposition.o: ../include/FileType.h BaseCount.h $(OBJDIR_PROFILE)/FastQStatus.o: FastQStatus.h libStatGen-1.0.14/fastq/test/000077500000000000000000000000001254730101300157225ustar00rootroot00000000000000libStatGen-1.0.14/fastq/test/.gitignore000066400000000000000000000000231254730101300177050ustar00rootroot00000000000000fastqTest results/ libStatGen-1.0.14/fastq/test/FastQFileTest.cpp000066400000000000000000001346301254730101300211130ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "FastQFile.h" #include const String FIRST_SEQID_LINE = "@Valid with comment"; const String FIRST_SEQID = "Valid"; const String FIRST_RAW_SEQ = "ACTGNactng.0123"; const String FIRST_PLUS_LINE = "+"; const String FIRST_QUALITY = "!#\"$%&'()*+,-./"; const String SECOND_SEQID_LINE = "@Valid1 with comment"; const String SECOND_SEQID = "Valid1"; const String SECOND_RAW_SEQ = "ACTGACTNactngaac"; const String SECOND_PLUS_LINE = "+"; const String SECOND_QUALITY = "0123456789:;<=>@"; const String THIRD_SEQID_LINE = "@Valid2"; const String THIRD_SEQID = "Valid2"; const String THIRD_RAW_SEQ = "A123.0321.011"; const String THIRD_PLUS_LINE = "+"; const String THIRD_QUALITY = "?@ABCDEFGHIJK"; const String FOURTH_SEQID_LINE = "@Valid3"; const String FOURTH_SEQID = "Valid3"; const String FOURTH_RAW_SEQ = "ACTGACTNactngACTGACTNactng"; const String FOURTH_PLUS_LINE = "+"; const String FOURTH_QUALITY = "LMNOPQRSTUVWXYZ[\\]^_'abcde"; const String FIFTH_SEQID_LINE = "@Valid4"; const String FIFTH_SEQID = "Valid4"; const String FIFTH_RAW_SEQ = "ACTGACTNactngACTGACTNactng"; const String FIFTH_PLUS_LINE = "+"; const String FIFTH_QUALITY = "fghijklmnopqrstuvwxyz{|}~~"; const String SIXTH_SEQID_LINE = "@"; const String SIXTH_SEQID = ""; const String SIXTH_RAW_SEQ = "ACTGACTNactng"; const String SIXTH_PLUS_LINE = "+"; const String SIXTH_QUALITY = "?@ABCDEFGHIJK"; const String SEVENTH_SEQID_LINE = "Line no start with @"; const String SEVENTH_SEQID = ""; const String SEVENTH_RAW_SEQ = "ACTGACTNactng"; const String SEVENTH_PLUS_LINE = "+"; const String SEVENTH_QUALITY = "LMNOPQRSTUVWX"; const String EIGHTH_SEQID_LINE = "@ a"; const String EIGHTH_SEQID = ""; const String EIGHTH_RAW_SEQ = "ACTGACTNactng"; const String EIGHTH_PLUS_LINE = "+"; const String EIGHTH_QUALITY = "YZ[\\]^_'abcde"; const String NINTH_SEQID_LINE = "@ "; const String NINTH_SEQID = ""; const String NINTH_RAW_SEQ = "ACTGACTNactng"; const String NINTH_PLUS_LINE = "+"; const String NINTH_QUALITY = "fghijklmnopqr"; 
const String TENTH_SEQID_LINE = "@Valid"; const String TENTH_SEQID = "Valid"; const String TENTH_RAW_SEQ = "ACTGNactng"; const String TENTH_PLUS_LINE = "+"; const String TENTH_QUALITY = "!#\"$%&'()*"; const String ELEVENTH_SEQID_LINE = "@RawError1"; const String ELEVENTH_SEQID = "RawError1"; const String ELEVENTH_RAW_SEQ = "ACTNaHtng0aBZa"; const String ELEVENTH_PLUS_LINE = "+"; const String ELEVENTH_QUALITY = "ACTNactng0aBaZ"; const String TWELFTH_SEQID_LINE = "@RawError2"; const String TWELFTH_SEQID = "RawError2"; const String TWELFTH_RAW_SEQ = "aaa"; const String TWELFTH_PLUS_LINE = "+"; const String TWELFTH_QUALITY = "aaa"; const String THIRTEENTH_SEQID_LINE = "@RawError3"; const String THIRTEENTH_SEQID = "RawError3"; const String THIRTEENTH_RAW_SEQ = "ACTGACTNactng"; const String THIRTEENTH_PLUS_LINE = "+"; const String THIRTEENTH_QUALITY = "ACTGACTNactng"; const String FOURTEENTH_SEQID_LINE = "@QualityError1"; const String FOURTEENTH_SEQID = "QualityError1"; const String FOURTEENTH_RAW_SEQ = "ACTGCacgnc"; const String FOURTEENTH_PLUS_LINE = "+"; const String FOURTEENTH_QUALITY = "ac gcacg n"; const String FIFTEENTH_SEQID_LINE = "@QualityError2"; const String FIFTEENTH_SEQID = "QualityError2"; const String FIFTEENTH_RAW_SEQ = "ACTGCacgnc"; const String FIFTEENTH_PLUS_LINE = "+"; const String FIFTEENTH_QUALITY = "actgc@cgnc"; const String SIXTEENTH_SEQID_LINE = "@QualityError3"; const String SIXTEENTH_SEQID = "QualityError3"; const String SIXTEENTH_RAW_SEQ = "ACTGCacgnc"; const String SIXTEENTH_PLUS_LINE = "+"; const String SIXTEENTH_QUALITY = "actgc77acgnc"; const String SEVENTEENTH_SEQID_LINE = "@PlusValid1"; const String SEVENTEENTH_SEQID = "PlusValid1"; const String SEVENTEENTH_RAW_SEQ = "ACTGCacgnc"; const String SEVENTEENTH_PLUS_LINE = "+PlusValid1"; const String SEVENTEENTH_QUALITY = "actgcacgnc"; const String EIGHTEENTH_SEQID_LINE = "@PlusValid2"; const String EIGHTEENTH_SEQID = "PlusValid2"; const String EIGHTEENTH_RAW_SEQ = "ACTGCacgnc"; const String 
EIGHTEENTH_PLUS_LINE = "+PlusValid2 Added comment"; const String EIGHTEENTH_QUALITY = "actgcacgnc"; const String NINETEENTH_SEQID_LINE = "@PlusError1"; const String NINETEENTH_SEQID = "PlusError1"; const String NINETEENTH_RAW_SEQ = "ACTGCacgnc"; const String NINETEENTH_PLUS_LINE = "+PlusError2"; const String NINETEENTH_QUALITY = "actgcacgnc"; const String TWENTIETH_SEQID_LINE = "@InvalidColor"; const String TWENTIETH_SEQID = "InvalidColor"; const String TWENTIETH_RAW_SEQ = "0123.0321.011"; const String TWENTIETH_PLUS_LINE = "+"; const String TWENTIETH_QUALITY = "0123.0321.011"; const String TWENTY_FIRST_SEQID_LINE = "@PlusError2"; const String TWENTY_FIRST_SEQID = "PlusError2"; const String TWENTY_FIRST_RAW_SEQ = "ACTGCacgnc"; const String TWENTY_FIRST_PLUS_LINE = ""; const String TWENTY_FIRST_QUALITY = ""; void testReadUnOpenedFile() { FastQFile fastqfile; assert(fastqfile.isOpen() == false); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_ORDER_ERROR); assert(fastqfile.isOpen() == false); } void testOpenFile() { FastQFile fastqfile; // Test for non-existent file. assert(fastqfile.isOpen() == false); assert(fastqfile.openFile("noexist.txt", BaseAsciiMap::UNKNOWN) == FastQStatus::FASTQ_OPEN_ERROR); assert(fastqfile.isOpen() == false); } void testCloseFile() { FastQFile fastqfile; // Test closing a file even though there isn't one open - counts as success. assert(fastqfile.isOpen() == false); assert(fastqfile.closeFile() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == false); } void testReadSequence() { FastQFile fastqfile; assert(fastqfile.isOpen() == false); assert(fastqfile.openFile("testFile.txt") == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == true); assert(fastqfile.getSpaceType() == BaseAsciiMap::UNKNOWN); // Read Sequence from test file. 
assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIRST_SEQID); assert(fastqfile.myRawSequence == FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == FIRST_PLUS_LINE); assert(fastqfile.myQualityString == FIRST_QUALITY); assert(fastqfile.getSpaceType() == BaseAsciiMap::BASE_SPACE); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SECOND_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SECOND_SEQID); assert(fastqfile.myRawSequence == SECOND_RAW_SEQ); assert(fastqfile.myPlusLine == SECOND_PLUS_LINE); assert(fastqfile.myQualityString == SECOND_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == THIRD_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRD_SEQID); assert(fastqfile.myRawSequence == THIRD_RAW_SEQ); assert(fastqfile.myPlusLine == THIRD_PLUS_LINE); assert(fastqfile.myQualityString == THIRD_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FOURTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTH_SEQID); assert(fastqfile.myRawSequence == FOURTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTH_SEQID); assert(fastqfile.myRawSequence == FIFTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTH_SEQID); assert(fastqfile.myRawSequence == SIXTH_RAW_SEQ); assert(fastqfile.myPlusLine == 
SIXTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == EIGHTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTH_SEQID); assert(fastqfile.myRawSequence == EIGHTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINTH_SEQID); assert(fastqfile.myRawSequence == NINTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINTH_PLUS_LINE); assert(fastqfile.myQualityString == NINTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TENTH_SEQID); assert(fastqfile.myRawSequence == TENTH_RAW_SEQ); assert(fastqfile.myPlusLine == TENTH_PLUS_LINE); assert(fastqfile.myQualityString == TENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == ELEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == ELEVENTH_SEQID); assert(fastqfile.myRawSequence == ELEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == ELEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == ELEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWELFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWELFTH_SEQID); assert(fastqfile.myRawSequence == TWELFTH_RAW_SEQ); 
assert(fastqfile.myPlusLine == TWELFTH_PLUS_LINE); assert(fastqfile.myQualityString == TWELFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == THIRTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRTEENTH_SEQID); assert(fastqfile.myRawSequence == THIRTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == THIRTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == THIRTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FOURTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTEENTH_SEQID); assert(fastqfile.myRawSequence == FOURTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTEENTH_SEQID); assert(fastqfile.myRawSequence == FIFTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTEENTH_SEQID); assert(fastqfile.myRawSequence == SIXTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SIXTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SEVENTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTEENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == 
EIGHTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTEENTH_SEQID); assert(fastqfile.myRawSequence == EIGHTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINETEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINETEENTH_SEQID); assert(fastqfile.myRawSequence == NINETEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINETEENTH_PLUS_LINE); assert(fastqfile.myQualityString == NINETEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTIETH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTIETH_SEQID); assert(fastqfile.myRawSequence == TWENTIETH_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTIETH_PLUS_LINE); assert(fastqfile.myQualityString == TWENTIETH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTY_FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTY_FIRST_SEQID); assert(fastqfile.myRawSequence == TWENTY_FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTY_FIRST_PLUS_LINE); assert(fastqfile.myQualityString == TWENTY_FIRST_QUALITY); // Close the file, and verify isOpen = false; assert(fastqfile.closeFile() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == false); ////////////////////////////////// // Repeat test specifying base space assert(fastqfile.isOpen() == false); assert(fastqfile.openFile("testFile.txt", BaseAsciiMap::BASE_SPACE) == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == true); assert(fastqfile.getSpaceType() == BaseAsciiMap::BASE_SPACE); // Read Sequence from test file. 
assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIRST_SEQID); assert(fastqfile.myRawSequence == FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == FIRST_PLUS_LINE); assert(fastqfile.myQualityString == FIRST_QUALITY); assert(fastqfile.getSpaceType() == BaseAsciiMap::BASE_SPACE); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SECOND_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SECOND_SEQID); assert(fastqfile.myRawSequence == SECOND_RAW_SEQ); assert(fastqfile.myPlusLine == SECOND_PLUS_LINE); assert(fastqfile.myQualityString == SECOND_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == THIRD_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRD_SEQID); assert(fastqfile.myRawSequence == THIRD_RAW_SEQ); assert(fastqfile.myPlusLine == THIRD_PLUS_LINE); assert(fastqfile.myQualityString == THIRD_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FOURTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTH_SEQID); assert(fastqfile.myRawSequence == FOURTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTH_SEQID); assert(fastqfile.myRawSequence == FIFTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTH_SEQID); assert(fastqfile.myRawSequence == SIXTH_RAW_SEQ); assert(fastqfile.myPlusLine == 
SIXTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == EIGHTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTH_SEQID); assert(fastqfile.myRawSequence == EIGHTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINTH_SEQID); assert(fastqfile.myRawSequence == NINTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINTH_PLUS_LINE); assert(fastqfile.myQualityString == NINTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TENTH_SEQID); assert(fastqfile.myRawSequence == TENTH_RAW_SEQ); assert(fastqfile.myPlusLine == TENTH_PLUS_LINE); assert(fastqfile.myQualityString == TENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == ELEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == ELEVENTH_SEQID); assert(fastqfile.myRawSequence == ELEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == ELEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == ELEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWELFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWELFTH_SEQID); assert(fastqfile.myRawSequence == TWELFTH_RAW_SEQ); 
assert(fastqfile.myPlusLine == TWELFTH_PLUS_LINE); assert(fastqfile.myQualityString == TWELFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == THIRTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRTEENTH_SEQID); assert(fastqfile.myRawSequence == THIRTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == THIRTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == THIRTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FOURTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTEENTH_SEQID); assert(fastqfile.myRawSequence == FOURTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTEENTH_SEQID); assert(fastqfile.myRawSequence == FIFTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTEENTH_SEQID); assert(fastqfile.myRawSequence == SIXTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SIXTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SEVENTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTEENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == 
EIGHTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTEENTH_SEQID); assert(fastqfile.myRawSequence == EIGHTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINETEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINETEENTH_SEQID); assert(fastqfile.myRawSequence == NINETEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINETEENTH_PLUS_LINE); assert(fastqfile.myQualityString == NINETEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTIETH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTIETH_SEQID); assert(fastqfile.myRawSequence == TWENTIETH_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTIETH_PLUS_LINE); assert(fastqfile.myQualityString == TWENTIETH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTY_FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTY_FIRST_SEQID); assert(fastqfile.myRawSequence == TWENTY_FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTY_FIRST_PLUS_LINE); assert(fastqfile.myQualityString == TWENTY_FIRST_QUALITY); // Close the file, and verify isOpen = false; assert(fastqfile.closeFile() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == false); //////////////////////////////// // Repeat test specifying color space assert(fastqfile.isOpen() == false); assert(fastqfile.openFile("testFile.txt", BaseAsciiMap::COLOR_SPACE) == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == true); assert(fastqfile.getSpaceType() == BaseAsciiMap::COLOR_SPACE); // Read Sequence from test file. 
assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIRST_SEQID); assert(fastqfile.myRawSequence == FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == FIRST_PLUS_LINE); assert(fastqfile.myQualityString == FIRST_QUALITY); assert(fastqfile.getSpaceType() == BaseAsciiMap::COLOR_SPACE); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SECOND_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SECOND_SEQID); assert(fastqfile.myRawSequence == SECOND_RAW_SEQ); assert(fastqfile.myPlusLine == SECOND_PLUS_LINE); assert(fastqfile.myQualityString == SECOND_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == THIRD_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRD_SEQID); assert(fastqfile.myRawSequence == THIRD_RAW_SEQ); assert(fastqfile.myPlusLine == THIRD_PLUS_LINE); assert(fastqfile.myQualityString == THIRD_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FOURTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTH_SEQID); assert(fastqfile.myRawSequence == FOURTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTH_SEQID); assert(fastqfile.myRawSequence == FIFTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTH_SEQID); assert(fastqfile.myRawSequence == SIXTH_RAW_SEQ); assert(fastqfile.myPlusLine == 
SIXTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == EIGHTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTH_SEQID); assert(fastqfile.myRawSequence == EIGHTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINTH_SEQID); assert(fastqfile.myRawSequence == NINTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINTH_PLUS_LINE); assert(fastqfile.myQualityString == NINTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TENTH_SEQID); assert(fastqfile.myRawSequence == TENTH_RAW_SEQ); assert(fastqfile.myPlusLine == TENTH_PLUS_LINE); assert(fastqfile.myQualityString == TENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == ELEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == ELEVENTH_SEQID); assert(fastqfile.myRawSequence == ELEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == ELEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == ELEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWELFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWELFTH_SEQID); assert(fastqfile.myRawSequence == TWELFTH_RAW_SEQ); 
assert(fastqfile.myPlusLine == TWELFTH_PLUS_LINE); assert(fastqfile.myQualityString == TWELFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == THIRTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRTEENTH_SEQID); assert(fastqfile.myRawSequence == THIRTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == THIRTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == THIRTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FOURTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTEENTH_SEQID); assert(fastqfile.myRawSequence == FOURTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIFTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTEENTH_SEQID); assert(fastqfile.myRawSequence == FIFTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTEENTH_SEQID); assert(fastqfile.myRawSequence == SIXTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SIXTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SEVENTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTEENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == 
EIGHTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTEENTH_SEQID); assert(fastqfile.myRawSequence == EIGHTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINETEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINETEENTH_SEQID); assert(fastqfile.myRawSequence == NINETEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINETEENTH_PLUS_LINE); assert(fastqfile.myQualityString == NINETEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTIETH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTIETH_SEQID); assert(fastqfile.myRawSequence == TWENTIETH_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTIETH_PLUS_LINE); assert(fastqfile.myQualityString == TWENTIETH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTY_FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTY_FIRST_SEQID); assert(fastqfile.myRawSequence == TWENTY_FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTY_FIRST_PLUS_LINE); assert(fastqfile.myQualityString == TWENTY_FIRST_QUALITY); // Close the file, and verify isOpen = false; assert(fastqfile.closeFile() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == false); //////////////////////////////// // Repeat test specifying Unknown space assert(fastqfile.isOpen() == false); assert(fastqfile.openFile("testFile.txt", BaseAsciiMap::UNKNOWN) == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == true); assert(fastqfile.getSpaceType() == BaseAsciiMap::UNKNOWN); // Read Sequence from test file. 
assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIRST_SEQID); assert(fastqfile.myRawSequence == FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == FIRST_PLUS_LINE); assert(fastqfile.myQualityString == FIRST_QUALITY); assert(fastqfile.getSpaceType() == BaseAsciiMap::BASE_SPACE); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SECOND_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SECOND_SEQID); assert(fastqfile.myRawSequence == SECOND_RAW_SEQ); assert(fastqfile.myPlusLine == SECOND_PLUS_LINE); assert(fastqfile.myQualityString == SECOND_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == THIRD_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRD_SEQID); assert(fastqfile.myRawSequence == THIRD_RAW_SEQ); assert(fastqfile.myPlusLine == THIRD_PLUS_LINE); assert(fastqfile.myQualityString == THIRD_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FOURTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTH_SEQID); assert(fastqfile.myRawSequence == FOURTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTH_SEQID); assert(fastqfile.myRawSequence == FIFTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTH_SEQID); assert(fastqfile.myRawSequence == SIXTH_RAW_SEQ); assert(fastqfile.myPlusLine == 
SIXTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == EIGHTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTH_SEQID); assert(fastqfile.myRawSequence == EIGHTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINTH_SEQID); assert(fastqfile.myRawSequence == NINTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINTH_PLUS_LINE); assert(fastqfile.myQualityString == NINTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TENTH_SEQID); assert(fastqfile.myRawSequence == TENTH_RAW_SEQ); assert(fastqfile.myPlusLine == TENTH_PLUS_LINE); assert(fastqfile.myQualityString == TENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == ELEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == ELEVENTH_SEQID); assert(fastqfile.myRawSequence == ELEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == ELEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == ELEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWELFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWELFTH_SEQID); assert(fastqfile.myRawSequence == TWELFTH_RAW_SEQ); 
assert(fastqfile.myPlusLine == TWELFTH_PLUS_LINE); assert(fastqfile.myQualityString == TWELFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == THIRTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRTEENTH_SEQID); assert(fastqfile.myRawSequence == THIRTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == THIRTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == THIRTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FOURTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTEENTH_SEQID); assert(fastqfile.myRawSequence == FOURTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTEENTH_SEQID); assert(fastqfile.myRawSequence == FIFTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTEENTH_SEQID); assert(fastqfile.myRawSequence == SIXTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SIXTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SEVENTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTEENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == 
EIGHTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTEENTH_SEQID); assert(fastqfile.myRawSequence == EIGHTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINETEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINETEENTH_SEQID); assert(fastqfile.myRawSequence == NINETEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINETEENTH_PLUS_LINE); assert(fastqfile.myQualityString == NINETEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTIETH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTIETH_SEQID); assert(fastqfile.myRawSequence == TWENTIETH_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTIETH_PLUS_LINE); assert(fastqfile.myQualityString == TWENTIETH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTY_FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTY_FIRST_SEQID); assert(fastqfile.myRawSequence == TWENTY_FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTY_FIRST_PLUS_LINE); assert(fastqfile.myQualityString == TWENTY_FIRST_QUALITY); // Close the file, and verify isOpen = false; assert(fastqfile.closeFile() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == false); //////////////////////////////// // Repeat test specifying to not check for unique sequence id. fastqfile.disableSeqIDCheck(); assert(fastqfile.isOpen() == false); assert(fastqfile.openFile("testFile.txt", BaseAsciiMap::UNKNOWN) == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == true); assert(fastqfile.getSpaceType() == BaseAsciiMap::UNKNOWN); // Read Sequence from test file. 
assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIRST_SEQID); assert(fastqfile.myRawSequence == FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == FIRST_PLUS_LINE); assert(fastqfile.myQualityString == FIRST_QUALITY); assert(fastqfile.getSpaceType() == BaseAsciiMap::BASE_SPACE); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SECOND_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SECOND_SEQID); assert(fastqfile.myRawSequence == SECOND_RAW_SEQ); assert(fastqfile.myPlusLine == SECOND_PLUS_LINE); assert(fastqfile.myQualityString == SECOND_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == THIRD_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRD_SEQID); assert(fastqfile.myRawSequence == THIRD_RAW_SEQ); assert(fastqfile.myPlusLine == THIRD_PLUS_LINE); assert(fastqfile.myQualityString == THIRD_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FOURTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTH_SEQID); assert(fastqfile.myRawSequence == FOURTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTH_SEQID); assert(fastqfile.myRawSequence == FIFTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTH_SEQID); assert(fastqfile.myRawSequence == SIXTH_RAW_SEQ); assert(fastqfile.myPlusLine == 
SIXTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == EIGHTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTH_SEQID); assert(fastqfile.myRawSequence == EIGHTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINTH_SEQID); assert(fastqfile.myRawSequence == NINTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINTH_PLUS_LINE); assert(fastqfile.myQualityString == NINTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == TENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TENTH_SEQID); assert(fastqfile.myRawSequence == TENTH_RAW_SEQ); assert(fastqfile.myPlusLine == TENTH_PLUS_LINE); assert(fastqfile.myQualityString == TENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == ELEVENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == ELEVENTH_SEQID); assert(fastqfile.myRawSequence == ELEVENTH_RAW_SEQ); assert(fastqfile.myPlusLine == ELEVENTH_PLUS_LINE); assert(fastqfile.myQualityString == ELEVENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWELFTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWELFTH_SEQID); assert(fastqfile.myRawSequence == TWELFTH_RAW_SEQ); 
assert(fastqfile.myPlusLine == TWELFTH_PLUS_LINE); assert(fastqfile.myQualityString == TWELFTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == THIRTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == THIRTEENTH_SEQID); assert(fastqfile.myRawSequence == THIRTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == THIRTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == THIRTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == FOURTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FOURTEENTH_SEQID); assert(fastqfile.myRawSequence == FOURTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FOURTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FOURTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == FIFTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == FIFTEENTH_SEQID); assert(fastqfile.myRawSequence == FIFTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == FIFTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == FIFTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == SIXTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SIXTEENTH_SEQID); assert(fastqfile.myRawSequence == SIXTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SIXTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SIXTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == SEVENTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == SEVENTEENTH_SEQID); assert(fastqfile.myRawSequence == SEVENTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == SEVENTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == SEVENTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.mySequenceIdLine == 
EIGHTEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == EIGHTEENTH_SEQID); assert(fastqfile.myRawSequence == EIGHTEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == EIGHTEENTH_PLUS_LINE); assert(fastqfile.myQualityString == EIGHTEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == NINETEENTH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == NINETEENTH_SEQID); assert(fastqfile.myRawSequence == NINETEENTH_RAW_SEQ); assert(fastqfile.myPlusLine == NINETEENTH_PLUS_LINE); assert(fastqfile.myQualityString == NINETEENTH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTIETH_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTIETH_SEQID); assert(fastqfile.myRawSequence == TWENTIETH_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTIETH_PLUS_LINE); assert(fastqfile.myQualityString == TWENTIETH_QUALITY); assert(fastqfile.readFastQSequence() == FastQStatus::FASTQ_INVALID); assert(fastqfile.mySequenceIdLine == TWENTY_FIRST_SEQID_LINE); assert(fastqfile.mySequenceIdentifier == TWENTY_FIRST_SEQID); assert(fastqfile.myRawSequence == TWENTY_FIRST_RAW_SEQ); assert(fastqfile.myPlusLine == TWENTY_FIRST_PLUS_LINE); assert(fastqfile.myQualityString == TWENTY_FIRST_QUALITY); // Close the file, and verify isOpen = false; assert(fastqfile.closeFile() == FastQStatus::FASTQ_SUCCESS); assert(fastqfile.isOpen() == false); } int main(int argc, char ** argv) { testReadUnOpenedFile(); testOpenFile(); testCloseFile(); testReadSequence(); } libStatGen-1.0.14/fastq/test/FastQValidatorTest.sh000077500000000000000000000003161254730101300220050ustar00rootroot00000000000000ERROR=false ./fastqTest > results/fastqTest.txt diff results/fastqTest.txt expectedResults/ExpectedResultsFastqTestResults.txt if [ $? 
-ne 0 ] then ERROR=true fi if($ERROR == true) then exit 1 fi libStatGen-1.0.14/fastq/test/Makefile000066400000000000000000000002111254730101300173540ustar00rootroot00000000000000TEST_COMMAND = mkdir -p results; ./FastQValidatorTest.sh EXE=fastqTest SRCONLY = FastQFileTest.cpp include ../../Makefiles/Makefile.testlibStatGen-1.0.14/fastq/test/README.txt000066400000000000000000000024071254730101300174230ustar00rootroot00000000000000Lines 1 - 24 - test that all valid quality string characters are accepted & tests multiple line Raw Sequence and Quality Strings. Sequence Identifier Line Validates: * Line 25: line is at least 2 characters long ('@' and at least 1 for the sequence identifier) * Line 29: line starts with an '@' * Line 33 & 37: no space between the '@' & the sequence identifier (which must be at least 1 character) * Line 41: sequence identifier is unique within the file Raw Sequence Line Validates: * Line 46 & 47: every character is in ACTGNactgn0123. * Line 51: the raw sequence after it is completely read is at least a configurable minimum length * Line 56 & 57: assumes all lines are part of the raw sequence until a line begins with a '+' or the end of the file is reached Plus Line Validates: * Line 88: sequence identifier on + line does not match the one on the @ line. 
* Line 91: that this line exists for each sequence Quality Line Validates: * Line 63 & 64: each character is > ascii 32 * Line 70: assumes all lines are part of the quality string until the total length of quality characters is >= the raw sequence length or the end of the file is reached * Line 77: length of the quality string equals the length of the raw sequence libStatGen-1.0.14/fastq/test/expectedResults/000077500000000000000000000000001254730101300211055ustar00rootroot00000000000000libStatGen-1.0.14/fastq/test/expectedResults/ExpectedResultsFastqTestResults.txt000066400000000000000000000137601254730101300302210ustar00rootroot00000000000000ERROR: Trying to read a fastq file but no file is open. ERROR: Failed to open file: noexist.txt ERROR on Line 2: Invalid character ('.') in base sequence. ERROR on Line 2: Invalid character ('0') in base sequence. ERROR on Line 2: Invalid character ('1') in base sequence. ERROR on Line 2: Invalid character ('2') in base sequence. ERROR on Line 2: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 25: The sequence identifier line was too short. ERROR on Line 29: First line of a sequence does not begin with @ ERROR on Line 33: No Sequence Identifier specified before the comment. 
ERROR on Line 2: Invalid character ('.') in base sequence. ERROR on Line 2: Invalid character ('0') in base sequence. ERROR on Line 2: Invalid character ('1') in base sequence. ERROR on Line 2: Invalid character ('2') in base sequence. ERROR on Line 2: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 25: The sequence identifier line was too short. ERROR on Line 29: First line of a sequence does not begin with @ ERROR on Line 33: No Sequence Identifier specified before the comment. ERROR on Line 2: Invalid character ('C') in base sequence. ERROR on Line 2: Invalid character ('T') in base sequence. ERROR on Line 2: Invalid character ('G') in base sequence. ERROR on Line 2: Invalid character ('N') in base sequence. ERROR on Line 2: Invalid character ('a') in base sequence. ERROR on Line 2: Invalid character ('c') in base sequence. ERROR on Line 2: Invalid character ('t') in base sequence. ERROR on Line 2: Invalid character ('n') in base sequence. ERROR on Line 2: Invalid character ('g') in base sequence. ERROR on Line 6: Invalid character ('C') in base sequence. ERROR on Line 6: Invalid character ('T') in base sequence. ERROR on Line 6: Invalid character ('G') in base sequence. ERROR on Line 6: Invalid character ('A') in base sequence. 
ERROR on Line 6: Invalid character ('C') in base sequence. ERROR on Line 6: Invalid character ('T') in base sequence. ERROR on Line 6: Invalid character ('N') in base sequence. ERROR on Line 6: Invalid character ('a') in base sequence. ERROR on Line 6: Invalid character ('c') in base sequence. ERROR on Line 6: Invalid character ('t') in base sequence. ERROR on Line 6: Invalid character ('n') in base sequence. ERROR on Line 2: Invalid character ('.') in base sequence. ERROR on Line 2: Invalid character ('0') in base sequence. ERROR on Line 2: Invalid character ('1') in base sequence. ERROR on Line 2: Invalid character ('2') in base sequence. ERROR on Line 2: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 25: The sequence identifier line was too short. ERROR on Line 29: First line of a sequence does not begin with @ ERROR on Line 33: No Sequence Identifier specified before the comment. ERROR on Line 2: Invalid character ('.') in base sequence. ERROR on Line 2: Invalid character ('0') in base sequence. ERROR on Line 2: Invalid character ('1') in base sequence. ERROR on Line 2: Invalid character ('2') in base sequence. ERROR on Line 2: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. 
ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('3') in base sequence. ERROR on Line 11: Invalid character ('2') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('.') in base sequence. ERROR on Line 11: Invalid character ('0') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 11: Invalid character ('1') in base sequence. ERROR on Line 25: The sequence identifier line was too short. ERROR on Line 29: First line of a sequence does not begin with @ ERROR on Line 33: No Sequence Identifier specified before the comment. libStatGen-1.0.14/fastq/test/testFile.txt000066400000000000000000000016211254730101300202420ustar00rootroot00000000000000@Valid with comment ACTGNactng.0123 + !#"$%&'()*+,-./ @Valid1 with comment ACTGACTNactngaac + 0123456789:;<=> @ @Valid2 A123.0321.011 + ?@ABCDEFGHIJK @Valid3 ACTGACTN actngACTGACTNactng + LMNOPQRSTUVWXYZ [\]^_'abcde @Valid4 ACTGACTNactng ACTGACTNactng + fghijklmnopqrstuvwxyz{|}~~ @ ACTGACTNactng + ?@ABCDEFGHIJK Line no start with @ ACTGACTNactng + LMNOPQRSTUVWX @ a ACTGACTNactng + YZ[\]^_'abcde @ ACTGACTNactng + fghijklmnopqr @Valid ACTGNactng + !#"$%&'()* @RawError1 ACTNaHtng0 aBZa + ACTNactng0aBaZ @RawError2 aaa + aaa @RawError3 ACTGACTNactng + ACTGACTNactng @QualityError1 ACTGCacgnc + ac gc acg n @QualityError2 ACTGC acgnc + actgc @cgnc @QualityError3 ACTGC acgnc + actgc77 acgnc @PlusValid1 ACTGCacgnc +PlusValid1 actgcacgnc @PlusValid2 ACTGCacgnc +PlusValid2 Added comment actgcacgnc @PlusError1 ACTGCacgnc +PlusError2 actgcacgnc @InvalidColor 0123.0321.011 + 0123.0321.011 @PlusError2 
ACTGCacgnclibStatGen-1.0.14/general/000077500000000000000000000000001254730101300152425ustar00rootroot00000000000000libStatGen-1.0.14/general/BaseAsciiMap.cpp000066400000000000000000000163651254730101300202420ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "BaseAsciiMap.h" // // Map ASCII values to a 2 (or 3) bit encoding for the base pair value for // both base and color space. // class 0 -> 'A' (Adenine - 0x41 and 0x61) // class 1 -> 'C' (Cytosine - 0x43 and 0x63) // class 2 -> 'G' (Guanine - 0x47 and 0x67) // class 3 -> 'T' (Thymine - 0x54 and 0x74) // class 4 -> 'N' (Unknown - read error or incomplete data - 0x4E and 0x6E) // class 5 -> not a valid DNA base pair character // // Note: The +1 array size is for the terminating NUL character // // NB: This table also maps 0, 1, 2, and 3 to the corresponding integers, // and '.' to class 4. This allows ABI SOLiD reads to be converted // to integers via ReadIndexer::Word2Integer. 
// unsigned char BaseAsciiMap::baseColor2int[256+1] = "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x00-0x0F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x10-0x1F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005" // 0x20-0x2F "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005" // 0x30-0x3F "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005" // 0x40-0x4F "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005" // 0x50-0x5F "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005" // 0x60-0x6F "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005" // 0x70-0x7F // not used, but included for completeness: "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x80-0x8F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x90-0x9F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xA0-0xAF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xB0-0xBF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xC0-0xCF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xD0-0xDF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xE0-0xEF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xF0-0xFF ; // Map ASCII values to a 2 (or 3) bit encoding for the base pair value for // just base space (ACTGNactgn). 
unsigned char BaseAsciiMap::base2int[256+1] = "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x00-0x0F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x10-0x1F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x20-0x2F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x30-0x3F "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005" // 0x40-0x4F "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005" // 0x50-0x5F "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005" // 0x60-0x6F "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005" // 0x70-0x7F // not used, but included for completeness: "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x80-0x8F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x90-0x9F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xA0-0xAF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xB0-0xBF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xC0-0xCF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xD0-0xDF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xE0-0xEF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xF0-0xFF ; // Map ASCII values to a 2 (or 3) bit encoding for the base pair value for // just color space (0123). 
unsigned char BaseAsciiMap::color2int[256+1] = "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x00-0x0F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x10-0x1F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005" // 0x20-0x2F "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005" // 0x30-0x3F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x40-0x4F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x50-0x5F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x60-0x6F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x70-0x7F // not used, but included for completeness: "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x80-0x8F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0x90-0x9F "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xA0-0xAF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xB0-0xBF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xC0-0xCF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xD0-0xDF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xE0-0xEF "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" // 0xF0-0xFF ; // // This is obviously for base space use only: // const char BaseAsciiMap::int2base[] = "ACGTNMXXXXXXXXXX"; // // convert int to color space value // const char BaseAsciiMap::int2colorSpace[] = "0123NXXXXXXXXXXX"; /// This table maps 5' base space to the 3' complement base space /// values, as well as 5' color space values to the corresponding /// 3' complement color space values. /// /// In both cases, invalids are mapped to 'N', which isn't accurate /// for ABI SOLiD, but internally it shouldn't matter (on output it /// will). 
unsigned char BaseAsciiMap::base2complement[256+1 /* for NUL char */] = "NNNNNNNNNNNNNNNN" // 0x00-0x0F "NNNNNNNNNNNNNNNN" // 0x10-0x1F "NNNNNNNNNNNNNNNN" // 0x20-0x2F "0123NNNNNNNNNNNN" // 0x30-0x3F "NTNGNNNCNNNNNNNN" // 0x40-0x4F "NNNNANNNNNNNNNNN" // 0x50-0x5F "NTNGNNNCNNNNNNNN" // 0x60-0x6F "NNNNANNNNNNNNNNN" // 0x70-0x7F // not used, but included for completeness: "NNNNNNNNNNNNNNNN" // 0x80-0x8F "NNNNNNNNNNNNNNNN" // 0x90-0x9F "NNNNNNNNNNNNNNNN" // 0xA0-0xAF "NNNNNNNNNNNNNNNN" // 0xB0-0xBF "NNNNNNNNNNNNNNNN" // 0xC0-0xCF "NNNNNNNNNNNNNNNN" // 0xD0-0xDF "NNNNNNNNNNNNNNNN" // 0xE0-0xEF "NNNNNNNNNNNNNNNN" // 0xF0-0xFF ; BaseAsciiMap::BaseAsciiMap() : myNumPrimerBases(1) { myBase2IntMapPtr = NULL; } BaseAsciiMap::~BaseAsciiMap() { } libStatGen-1.0.14/general/BaseAsciiMap.h000066400000000000000000000152431254730101300177010ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef _BASE_ASCII_MAP_H #define _BASE_ASCII_MAP_H #include "StringBasics.h" /// Map between characters and the associated base type. class BaseAsciiMap { public: /// Value associated with 'N' in the ascii to base map (bad read). static const int baseNIndex = 004; /// Value associated with any non-base character in the ascii to base /// map (unknown, bad data). 
static const int baseXIndex = 005; // Two arrays for converting back and forth between base pair character // value (ASCII) to a base integer in the range 0..3. Note there is actually // a value 4 and 5, for 'N' (indelible) and 'M' (unknown to me). // /// Convert from int representation to the base. static const char int2base[]; /// Convert from int representation to colorspace representation. static const char int2colorSpace[]; static unsigned char base2complement[]; /// The type of space (color or base) to use in the mapping. enum SPACE_TYPE { /// Base decision on the first raw seq character/type has yet /// to be determined. UNKNOWN, BASE_SPACE, ///< Bases only (A,C,G,T,N). COLOR_SPACE ///< Color space only (0,1,2,3,.). }; /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for /// both base and color space. /// 'A'/'a'/'0' -> 0; 'C'/'c'/'1' -> 1; 'G'/'g'/'2' -> 2; 'T'/'t'/'3' -> 3; /// 'N'/'n'/'4' -> 4; anything else -> 5. static unsigned char baseColor2int[256+1]; // base space read (ATCG) /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for /// just base space (ACTGNactgn). /// 'A'/'a' -> 0; 'C'/'c' -> 1; 'G'/'g' -> 2; 'T'/'t' -> 3; /// 'N'/'n' -> 4; anything else -> 5. static unsigned char base2int[256+1]; // base space read (ATCG) /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for /// just color space (0123). /// '0' -> 0; '1' -> 1; '2' -> 2; '3' -> 3; '4' -> 4; anything else -> 5. static unsigned char color2int[256+1]; // base space read (ATCG) public: BaseAsciiMap(); ~BaseAsciiMap(); /// Set the base type based on the passed in option. inline void setBaseMapType(SPACE_TYPE spaceType) { resetPrimerCount(); //First check to see if it is in base space. switch (spaceType) { case BASE_SPACE: // base space. myBase2IntMapPtr = base2int; break; case COLOR_SPACE: // color space. myBase2IntMapPtr = color2int; break; default: // Unknown map type, zero the pointer. 
myBase2IntMapPtr = NULL; break; } }; /// Returns the baseIndex value for the character passed in. inline int getBaseIndex(const char& letter) { if (myBase2IntMapPtr == NULL) { // Check to see if we have hit the number of primer bases. if (myPrimerCount < myNumPrimerBases) { // Still expecting primer bases, so lookup // the letter in the base map. ++myPrimerCount; return(base2int[(int)letter]); } // Have already processed all the primers, so determine // whether this is base or color space. // Need to determime the base type. setBaseMapType(letter); // If it is still null, return invalid. Will be set when the first // letter is either color or base. if (myBase2IntMapPtr == NULL) { return(baseXIndex); } } // Also check if configured as color space that the primers are correct. if ((myBase2IntMapPtr == color2int) && (myPrimerCount < myNumPrimerBases)) { // Still expecting primer bases, so lookup // the letter in the base map. ++myPrimerCount; return(base2int[(int)letter]); } return myBase2IntMapPtr[(int)letter]; } /// Return the space type that is currently set. inline SPACE_TYPE getSpaceType() { if (myBase2IntMapPtr == base2int) { return(BASE_SPACE); } else if (myBase2IntMapPtr == color2int) { return(COLOR_SPACE); } else { return(UNKNOWN); } } /// Set the number of primer bases expected before the actual /// base/color space type occurs for the rest of the entries. void setNumPrimerBases(int numPrimerBases) { myNumPrimerBases = numPrimerBases; } /// Reset the number of primers to 0. void resetPrimerCount() { myPrimerCount = 0; }; /// Reset the base mapping type to UNKNOWN. void resetBaseMapType() { myBase2IntMapPtr = NULL; resetPrimerCount(); }; private: // Set the base type based on the passed in letter. // If the letter is in neither the color space or the base space, both // will be allowed. inline void setBaseMapType(const char& letter) { //First check to see if it is in base space. 
if (base2int[(int)letter] != baseXIndex) { // This is a valid base space index, so it is base space. myBase2IntMapPtr = base2int; } else if (color2int[(int)letter] != baseXIndex) { // This is a valid color space index, so it is base space. myBase2IntMapPtr = color2int; } else { // Unknown map type, zero the pointer. myBase2IntMapPtr = NULL; } }; // The number of primer bases to expect for a color-space file. unsigned int myNumPrimerBases; // This is the number of primer bases that have been seen since // the map type was set/reset. unsigned int myPrimerCount; unsigned char* myBase2IntMapPtr; }; #endif libStatGen-1.0.14/general/BaseQualityHelper.cpp000066400000000000000000000023241254730101300213320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "BaseQualityHelper.h" #include baseQualityConvertor bQualityConvertor; baseQualityConvertor::baseQualityConvertor() { // Create a quick lookup table to speed up conversion of // base quality values stored as log10 (error rates) into // fractional error rates for (int i = 0; i <= 255; i++) doubleLookup[i] = pow(0.1, i * 0.1); // doubleLookup[255] = 0.0; } double baseQualityConvertor::toDouble(unsigned char bq) { return doubleLookup[bq]; } libStatGen-1.0.14/general/BaseQualityHelper.h000066400000000000000000000017601254730101300210020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BASEQUALITY_H__ #define __BASEQUALITY_H__ class baseQualityConvertor { public: baseQualityConvertor(); double toDouble(unsigned char baseQuality); private: double doubleLookup[256]; }; extern baseQualityConvertor bQualityConvertor; #endif libStatGen-1.0.14/general/BaseUtilities.cpp000066400000000000000000000050771254730101300205250ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "BaseUtilities.h" #include #include "BaseAsciiMap.h" bool BaseUtilities::isAmbiguous(char base) { switch(base) { case 'N': case 'n': case '.': return(true); default: break; }; // Not 'N', 'n', or '.', so return false. return(false); } bool BaseUtilities::areEqual(char base1, char base2) { // If they are the same, return true. if(base1 == base2) { return(true); } // If one of the bases is '=', return true. if((base1 == '=') || (base2 == '=')) { return(true); } // Check both in upercase. if(toupper(base1) == toupper(base2)) { // same in upper case. return(true); } // The bases are different. return(false); } // Get phred base quality from the specified ascii quality. uint8_t BaseUtilities::getPhredBaseQuality(char charQuality) { if(charQuality == UNKNOWN_QUALITY_CHAR) { return(UNKNOWN_QUALITY_INT); } return(charQuality - 33); } char BaseUtilities::getAsciiQuality(uint8_t phredQuality) { if(phredQuality == UNKNOWN_QUALITY_INT) { return(UNKNOWN_QUALITY_CHAR); } return(phredQuality + 33); } void BaseUtilities::reverseComplement(std::string& sequence) { int start = 0; int end = sequence.size() - 1; char tempChar; while(start < end) { tempChar = sequence[start]; sequence[start] = BaseAsciiMap::base2complement[(int)(sequence[end])]; sequence[end] = BaseAsciiMap::base2complement[(int)tempChar]; // Move both pointers. ++start; --end; } // there was an odd number of entries, complement the middle one. 
if(start == end) { tempChar = sequence[start]; sequence[start] = BaseAsciiMap::base2complement[(int)tempChar]; } } libStatGen-1.0.14/general/BaseUtilities.h000066400000000000000000000035041254730101300201630ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BASE_UTILITIES_H__ #define __BASE_UTILITIES_H__ #include #include /// This class contains static utilities for performing /// basic operations on bases. class BaseUtilities { public: /// Returns whether or not the specified bases is /// an indicator for ambiguity. /// \return true if base = 'n' or 'N' or '.' static bool isAmbiguous(char base); /// Returns whether or not two bases are equal (case insensitive), /// if one of the bases is '=', the bases are consided /// to be equal. static bool areEqual(char base1, char base2); /// Get phred base quality from the specified ascii quality. static uint8_t getPhredBaseQuality(char charQuality); /// Get ascii quality from the specified phred quality. static char getAsciiQuality(uint8_t phredQuality); static void reverseComplement(std::string& sequence); /// Character used when the quality is unknown. static const char UNKNOWN_QUALITY_CHAR = ' '; /// Int value used when the quality is unknown. 
static const uint8_t UNKNOWN_QUALITY_INT = 0xFF; }; #endif libStatGen-1.0.14/general/BasicHash.cpp000066400000000000000000000073571254730101300176070ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "BasicHash.h" #include "Error.h" #include BasicHash::BasicHash(int startsize) { count = 0; size = startsize; mask = startsize - 1; // In this implementation, the size of hash tables must be a power of two if (startsize & mask) error("BasicHash: Hash table size must be a power of two.\n"); objects = new void * [size]; keys = new unsigned int [size]; for (unsigned int i = 0; i < size; i++) { objects[i] = NULL; } }; BasicHash::~BasicHash() { delete [] objects; delete [] keys; } void BasicHash::Clear() { // printf("Clearing...\n"); count = 0; if (size > 16) SetSize(16); for (unsigned int i = 0; i < size; i++) objects[i] = NULL; } void BasicHash::SetSize(int newsize) { int newmask = newsize - 1; void ** newobjects = new void * [newsize]; unsigned int * newkeys = new unsigned int [newsize]; for (int i = 0; i < newsize; i++) { newobjects[i] = NULL; } if (count) for (unsigned int i = 0; i < size; i++) if (objects[i] != NULL) { unsigned int key = keys[i]; unsigned int h = key & newmask; while (newobjects[h] != NULL && newkeys[h] != h) h = (h + 1) & newmask; newkeys[h] = key; newobjects[h] = objects[i]; } delete [] objects; delete [] 
keys; objects = newobjects; keys = newkeys; size = newsize; mask = newmask; } int BasicHash::Add(int key, void * object) { if (count * 2 > size) Grow(); unsigned int h = Iterate(key); while ((objects[h] != NULL) && (objects[h] != object)) h = ReIterate(key, h); if (objects[h] == NULL) { // printf("At position %d, inserted %x\n", h, key); keys[h] = key; count++; } objects[h] = object; return h; } int BasicHash::Find(int key) { int h = Iterate(key); return objects[h] == NULL ? -1 : h; } int BasicHash::Rehash(int key, int h) { h = ReIterate(key, h); return objects[h] == NULL ? -1 : h; } void BasicHash::Delete(unsigned int index) { if (index >= size || objects[index] == NULL) return; objects[index] = NULL; count--; if (count * 8 < size && size > 32) Shrink(); else { // rehash the next entries until we find empty slot index = (index + 1) & mask; while (objects[index] != NULL) { if ((keys[index] & mask) != index) { unsigned int h = Iterate(keys[index]); while ((objects[h] != NULL) && (objects[h] != objects[index])) h = ReIterate(keys[index], h); if (h != (unsigned int) index) { keys[h] = keys[index]; objects[h] = objects[index]; objects[index] = NULL; } } index = (index + 1) & mask; } } } libStatGen-1.0.14/general/BasicHash.h000066400000000000000000000043131254730101300172410ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __BASICHASH_H__ #define __BASICHASH_H__ #include class BasicHash { protected: void ** objects; unsigned int * keys; unsigned int count, size; unsigned int mask; public: BasicHash(int startsize = 32); virtual ~BasicHash(); void Grow() { SetSize(size * 2); } void Shrink() { SetSize(size / 2); } void SetSize(int newsize); void Clear(); int Capacity() const { return size; } int Entries() const { return count; } void * Object(int i) const { return objects[i]; } void SetObject(int i, void * object) { objects[i] = object; } int Add(int key, void * object = NULL); int Find(int key); int Rehash(int key, int h); BasicHash & operator = (const BasicHash & rhs); void * operator [](int i) const { return objects[i]; } void Delete(unsigned int index); bool SlotInUse(int index) { return objects[index] != NULL; } private: unsigned int Iterate(unsigned int key) const { unsigned int h = key & mask; while (objects[h] != NULL && keys[h] != key) h = (h + 1) & mask; return h; } unsigned int ReIterate(unsigned int key, unsigned int h) const { h = (h + 1) & mask; while (objects[h] != NULL && keys[h] != key) h = (h + 1) & mask; return h; } }; #endif libStatGen-1.0.14/general/BgzfFileType.cpp000066400000000000000000000047331254730101300203070ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifdef __ZLIB_AVAILABLE__ #include #include #include "BgzfFileType.h" // Default to require the EOF block at the end of the file. bool BgzfFileType::ourRequireEofBlock = true; BgzfFileType::BgzfFileType(const char * filename, const char * mode) { // If the file is for write and is '-', then write to stdout. if(((mode[0] == 'w') || (mode[0] == 'W')) && (strcmp(filename, "-") == 0)) { // Write to stdout. bgzfHandle = bgzf_dopen(fileno(stdout), mode); } else if(((mode[0] == 'r') || (mode[0] == 'R')) && (strcmp(filename, "-") == 0)) { // read from stdin bgzfHandle = bgzf_dopen(fileno(stdin), mode); } else { bgzfHandle = bgzf_open(filename, mode); } myStartPos = 0; if (bgzfHandle != NULL) { // Check to see if the file is being opened for read, if the eof block // is required, and if it is, if it is there. if ((mode[0] == 'r' || mode[0] == 'R') && (strcmp(filename, "-") != 0) && ourRequireEofBlock && (bgzf_check_EOF(bgzfHandle) != 1)) { std::cerr << "BGZF EOF marker is missing in " << filename << std::endl; // the block is supposed to be there, but isn't, so close the file. close(); } else { // Successfully opened a properly formatted file, so get the start // position. myStartPos = bgzf_tell(bgzfHandle); } } myEOF = false; } // Set whether or not to require the EOF block at the end of the // file. True - require the block. False - do not require the block. void BgzfFileType::setRequireEofBlock(bool requireEofBlock) { ourRequireEofBlock = requireEofBlock; } #endif libStatGen-1.0.14/general/BgzfFileType.h000066400000000000000000000116601254730101300177510ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BGZFFILETYPE_H__ #define __BGZFFILETYPE_H__ #ifdef __ZLIB_AVAILABLE__ #include // stdexcept header file #include "bgzf.h" #include "FileType.h" class BgzfFileType : public FileType { public: BgzfFileType() { bgzfHandle = NULL; myEOF = false; } virtual ~BgzfFileType() { bgzfHandle = NULL; } BgzfFileType(const char * filename, const char * mode); virtual bool operator == (void * rhs) { // No two file pointers are the same, so if rhs is not NULL, then // the two pointers are different (false). if (rhs != NULL) return false; return (bgzfHandle == rhs); } virtual bool operator != (void * rhs) { // No two file pointers are the same, so if rhs is not NULL, then // the two pointers are different (true). if (rhs != NULL) return true; return (bgzfHandle != rhs); } // Close the file. virtual inline int close() { int result = bgzf_close(bgzfHandle); bgzfHandle = NULL; return result; } // Reset to the beginning of the file. virtual inline void rewind() { // Just call rewind to move to the beginning of the file. seek(myStartPos, SEEK_SET); } // Check to see if we have reached the EOF. virtual inline int eof() { // check the file for eof. return myEOF; } // Check to see if the file is open. virtual inline bool isOpen() { if (bgzfHandle != NULL) { // bgzfHandle is not null, so the file is open. return(true); } return(false); } // Write to the file virtual inline unsigned int write(const void * buffer, unsigned int size) { return bgzf_write(bgzfHandle, buffer, size); } // Read into a buffer from the file. 
Since the buffer is passed in and // this would bypass the fileBuffer used by this class, this method must // be protected. virtual inline int read(void * buffer, unsigned int size) { int bytesRead = bgzf_read(bgzfHandle, buffer, size); if ((bytesRead == 0) && (size != 0)) { myEOF = true; } else if((bytesRead != (int)size) & (bytesRead >= 0)) { // Less then the requested size was read // and an error was not returned (bgzf_read returns -1 on error). myEOF = true; } else { myEOF = false; } return bytesRead; } // Get current position in the file. // -1 return value indicates an error. virtual inline int64_t tell() { if(myUsingBuffer) { throw std::runtime_error("IFILE: CANNOT use buffered reads and tell for BGZF files"); } return bgzf_tell(bgzfHandle); } // Seek to the specified offset from the origin. // origin can be any of the following: // Note: not all are valid for all filetypes. // SEEK_SET - Beginning of file // SEEK_CUR - Current position of the file pointer // SEEK_END - End of file // Returns true on successful seek and false on a failed seek. virtual inline bool seek(int64_t offset, int origin) { int64_t returnVal = bgzf_seek(bgzfHandle, offset, origin); // Check for failure. if (returnVal == -1) { return false; } // Successful. // Reset EOF, assuming no longer at eof - first read will indicate // eof if it is at the end. myEOF = false; return true; } // Set whether or not to require the EOF block at the end of the // file. True - require the block. False - do not require the block. static void setRequireEofBlock(bool requireEofBlock); protected: // A bgzfFile is used. BGZF* bgzfHandle; // Flag indicating EOF since there isn't one on the handle. bool myEOF; int64_t myStartPos; // Static variable to track whether or not to require the EOF Block // at the end of the file. If the block is required, but not on the file, // the constructor fails to open the file. 
static bool ourRequireEofBlock; }; #endif #endif libStatGen-1.0.14/general/BgzfFileTypeRecovery.cpp000066400000000000000000000412501254730101300220210ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifdef __ZLIB_AVAILABLE__ #include "BgzfFileTypeRecovery.h" #include #include #include #include #include #include #include #include #include #include #include #include #pragma pack(push,1) #define debug false class RecoveryGzipHeader { private: uint8_t m_ID1; uint8_t m_ID2; uint8_t m_CM; uint8_t m_FLG; uint32_t m_MTIME; uint8_t m_XFL; uint8_t m_OS; uint16_t m_XLEN; public: RecoveryGzipHeader() : m_ID1(0), m_ID2(0), m_CM(0), m_FLG(0), m_MTIME(0), m_XFL(0), m_OS(0), m_XLEN(0) {;} void defaults() { m_ID1 = 31; m_ID2 = 139; m_CM = 8; m_FLG = 4; m_MTIME = 0; m_XFL = 0; m_OS = 255; m_XLEN = 6; } uint8_t ID1() {return m_ID1;} uint8_t ID2() {return m_ID2;} uint8_t CM() {return m_CM;} uint8_t FLG() {return m_FLG;} uint32_t MTIME() {return m_MTIME;} uint8_t XFL() {return m_XFL;} uint8_t OS() {return m_OS;} uint16_t XLEN() {return m_XLEN;} bool sane() { return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6); } }; class BGZFHeader : public RecoveryGzipHeader { private: uint8_t m_SI1; uint8_t m_SI2; uint16_t m_SLEN; // little endian uint16_t m_BSIZE; // little endian public: BGZFHeader( 
uint8_t m_SI1 = 'B', uint8_t m_SI2 = 'C', uint16_t m_SLEN = 2, uint16_t m_BSIZE = 0 ) : m_SI1(m_SI1), m_SI2(m_SI2), m_SLEN(m_SLEN), m_BSIZE(m_BSIZE) {;} uint8_t SI1() {return m_SI1;} uint8_t SI2() {return m_SI2;} uint16_t SLEN() {return m_SLEN;} uint16_t BSIZE() {return m_BSIZE;} bool sane() { return RecoveryGzipHeader::sane() && (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader)); } }; #pragma pack(pop) // // PeekaheadBuffer allows non-destructive peekahead and resyncing // after read errors when the underlying stream has signatures in the // data that allow it. // // In addition, it has a peek() capability to allow // the caller to look ahead in the stream to see // a certain number of bytes before actually consuming them. // // The intent is that this class behave as something of a poor // man's FIFO - with the cost of buffer movement when data is removed. // // This is far from ideal, but we basically are moving data around // when allowing arbitrary peekahead regardless. // // The basis for the design is the fact that most read calls to // various streams at best allow a single character to be peeked // at, and secondly, do not allow for recovery after an underfling // framing error occurs. // // That is, getchar()/putchar/ungetchar() support a single byte // peek. This may be fine for simply parsing applications, but here // we need to look at blocks up to 64K or more in size to search // for signatures while re-synchronizing on the underlying stream. // class PeekaheadBuffer : public std::vector { protected: ssize_t m_startPosition; // start of fresh data public: enum ReturnCode { endOfFile = -1, reSync = 0, ok = 1 }; ssize_t startPosition() {return m_startPosition;} private: // // when remaining data is 1/8 the size of the full // buffer, shift it back down to the start. // // for use by read(), which will consume data from the buffer. 
// void shiftData() { if(dataRemaining() < (ssize_t) (std::vector::size() / 8) ) { erase(begin(), begin() + m_startPosition); m_startPosition = 0; } } // called when read reports an error for some // reason - virtual ReturnCode sync(); public: PeekaheadBuffer(); virtual ~PeekaheadBuffer(); // return the amount of unused data: ssize_t dataRemaining(); // // overload size() to throw an exception - too confusing otherwise // size_t size() {abort();} // // just populate data in buffer from stream - not generic // // XXX note that it simply ensures that count bytes of data // are actually loaded into the buffer - if that amount of // data (or more) is present, this call is a NOP. // virtual ReturnCode readahead(ssize_t count) = 0; // read is generic. // remove data from our buffer - call non-generic readahead to populate data. ReturnCode read(uint8_t *buffer, ssize_t count) { ReturnCode rc; rc = readahead(count); if(rc == ok) { uint8_t *src = &(*begin()) + m_startPosition; uint8_t *dest = buffer; memcpy(dest, src, count); m_startPosition += count; // consume data // recover space if wasting too much: shiftData(); } else if(rc == reSync) { // peek puked - CRC error, other errors, see if we can sync forwards return reSync; } else { // failed to get needed data - premature EOF, I guess return endOfFile; } return ok; } }; PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0) { } PeekaheadBuffer::~PeekaheadBuffer() { } PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() { clear(); return ok; } ssize_t PeekaheadBuffer::dataRemaining() { return std::vector::size() - m_startPosition; } // peekahead buffered file reader class class FileReader : public PeekaheadBuffer { FILE *m_stream; public: FileReader(); ~FileReader(); FileReader(FILE *stream); PeekaheadBuffer::ReturnCode readahead(ssize_t count); FILE *stream() {return m_stream;} bool eof() {return m_stream ? 
feof(m_stream) : false;} }; FileReader::FileReader() { m_stream = NULL; } FileReader::FileReader(FILE *stream) : m_stream(stream) { } FileReader::~FileReader() { fclose(m_stream); m_stream = NULL; } // // fill buffer until we have count bytes of valid // data. // // need to detect error and eof and return appropriate values. // PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count) { uint8_t buffer[4096]; while(dataRemaining() < count) { int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream); if(bytesRead==0) { if(ferror(m_stream)) { return reSync; } // ain't getting no more data... return endOfFile; } #if 0 fprintf(stderr, "\n\n"); int possible = -1; for(int i=0;i0) { fprintf(stderr,"possible signature at %08x\n", possible); } #endif insert(end(), &buffer[0], &buffer[0] + bytesRead); } return ok; } class BGZFReader : public PeekaheadBuffer { FileReader m_fileReader; public: BGZFReader(FILE *stream) : m_fileReader(stream) {;} PeekaheadBuffer::ReturnCode readahead(ssize_t count); // // This will be reading data, and needs to return EOF, etc // ReturnCode sync() { // my internal data is now bad, so we'll scan ahead seeing // if we can find a good header clear(); PeekaheadBuffer::ReturnCode rc; while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) { BGZFHeader *header; if(rc==endOfFile) return rc; // a rc==reSync is ok provided readahead still ensures that header is present void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition(); header = (BGZFHeader *) src; if(header->sane()) { if(debug) std::cerr << "BGZFReader::sync returning reSync\n"; return reSync; // tell caller they need to sync up } // consume a byte, then see if we're at a valid block header uint8_t throwAwayBuffer; rc = m_fileReader.read(&throwAwayBuffer, 1); } return rc; } FILE *stream() {return m_fileReader.stream();} bool eof() {return dataRemaining()==0 && m_fileReader.eof();} }; PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count) { BGZFHeader header; // 
size of inflateBuffer can be determined from ISIZE, I think uint8_t inflateBuffer[64*1024]; uint8_t gzipBuffer[64*1024+1]; while(dataRemaining() < count) { static int loopCount = 0; if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n"; // here we actually read data: // read what should be the header // verify the header // read the remainder of the block // check the CRC validity or perhaps just call unzip // // XXX the sizeof(header) is wrong: PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header)); if(rc == endOfFile) { return endOfFile; } // if we have a bad header, start looking forward for a good one, if(!header.sane()) { // sync does not consume the next good header, it simply syncs() // the data stream to the next believed good BGZF header: if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n"; rc = sync(); // // even though we can now decompress, we need to tell the caller // what is up before they call for more data (caller needs to // sync its own record stream): return rc; } // Read the remainder of the block. // BSIZE is size of the entire block - 1, so compensate. rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header)); if(rc == reSync) { if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n"; sync(); return reSync; } // // we read a header, but our attempt to read more data ended early, // so best to just return EOF // if(rc == endOfFile) { return rc; } PeekaheadBuffer::ReturnCode bgzf_rc = ok; // zs.opaque is set when zalloc is NULL // // NB: zlib inflateInit2() has valgrind errors // in versions <1.2.4 - those can be ignored. 
// z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = gzipBuffer; zs.avail_in = header.BSIZE() - 16; // XXX need to check docs for inflate zs.next_out = inflateBuffer; zs.avail_out = sizeof(inflateBuffer); // -15 --> raw inflate - don't look for gzip or zlib header // This can be optimized - inflateInit2 does a malloc of // approximately 10K (sizeof(inflate_state)) if(inflateInit2(&zs, -15) != Z_OK) { bgzf_rc = reSync; if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n"; // XXX fatal? } if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) { bgzf_rc = reSync; if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n"; } if(bgzf_rc == ok) { if(inflateEnd(&zs) == Z_OK) { // do something with zs.total_out if(debug) std::cout << "hey, got data! zs.total_out == " << zs.total_out << "\n"; // append the newly decompressed data insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out); } else { // seems exceptionall unlikely, but check this error case too bgzf_rc = reSync; if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n"; // XXX fatal? } } if(bgzf_rc != ok) { inflateEnd(&zs); sync(); return bgzf_rc; } // may need to get more data - loop back till all is complete } return ok; } #if 0 void testBGZFBuffer() { BGZFReader b(stdin); std::vector::iterator position; BGZFReader::ReturnCode rc; std::cout << "size = " << b.dataRemaining() << "\n"; // // this should: // decompress a BGZF block, populating the buffer with // unzipped data, possibly returning a BGZFBuffer::ReturnCode of // resync if it turns out the BGZF data was interrupted by bad // CRC checks. 
// rc = b.readahead(64); std::cout << "rc = " << rc << " - expect ok (1)\n"; std::cout << "size (expect 64) = " << b.size() << "\n"; } int main(int argc, const char **argv) { testBGZFBuffer(); } #endif int BgzfFileTypeRecovery::close() { if(bgzfReader) delete bgzfReader; bgzfReader = NULL; return true; } BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode) { if(tolower(mode[0])=='r') { FILE *f = fopen(filename,"r"); bgzfReader = new BGZFReader(f); } else { // die for now if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n"; close(); } } // // Why is this ever called? // bool BgzfFileTypeRecovery::operator == (void * rhs) { throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use"); return false; } bool BgzfFileTypeRecovery::operator != (void * rhs) { throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use"); return false; } int BgzfFileTypeRecovery::eof() { return bgzfReader->eof(); } unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size) { // currently unsupported return 0; } int BgzfFileTypeRecovery::read(void * buffer, unsigned int size) { if(bgzfReader == NULL) { return 0; } PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size); // endOfFile = -1, // reSync = 0, // ok = 1 switch(rc) { case PeekaheadBuffer::endOfFile: // set a flag? return 0; case PeekaheadBuffer::reSync: // we could encode more info in the exception message here: if(debug) std::cerr << "throwing BGZF sync exception\n"; throw std::runtime_error("BGZF stream resync"); case PeekaheadBuffer::ok: // // in bgzfReader, we always are ensured we // get the full amount of the read, otherwise // an error is thrown. 
// return size; } // NOTREACHED return 0; } int64_t BgzfFileTypeRecovery::tell() { // currently unsupported return 0; } bool BgzfFileTypeRecovery::seek(int64_t offset, int origin) { // currently unsupported return 0; } bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length) { // // creep along a byte at a time, checking for signature. // // possibly slow. should only need to scan ahead < 64K bytes // or so, however, so should recover in "reasonable" time. // while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) { char ch; void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition(); // // readahead ensures we have 'length' bytes of // data to check that is valid in the buffer. // if((*checkSignature)(src)) return true; PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) &ch, 1); if(rc!=PeekaheadBuffer::ok) return false; // we consumed a byte, so go back to top of loop, // resume filling buffer (if need be) and re-check } return false; } #endif libStatGen-1.0.14/general/BgzfFileTypeRecovery.h000066400000000000000000000052751254730101300214750ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __BGZFFILETYPERECOVERY_H__ #define __BGZFFILETYPERECOVERY_H__ #ifdef __ZLIB_AVAILABLE__ #include "FileType.h" #include // for NULL class BGZFReader; class BgzfFileTypeRecovery : public FileType { public: BgzfFileTypeRecovery() { bgzfReader = NULL; } ~BgzfFileTypeRecovery() { close(); } BgzfFileTypeRecovery(const char * filename, const char * mode); // these methods should not be used. They are // misleading because the rhs could be anything, // (specifically not a BgzfFileTypeRecover object). bool operator == (void * rhs); bool operator != (void * rhs); // Close the file. int close(); // Reset to the beginning of the file. inline void rewind() { // Just call rewind to move to the beginning of the file. seek(0LL, SEEK_SET); } // Check to see if we have reached the EOF. int eof(); // Check to see if the file is open. bool isOpen() { return (bgzfReader != NULL); } // Write to the file unsigned int write(const void * buffer, unsigned int size); // Read into a buffer from the file. Since the buffer is passed in and // this would bypass the fileBuffer used by this class, this method must // be protected. int read(void * buffer, unsigned int size); // Get current position in the file. // -1 return value indicates an error. int64_t tell(); // Seek to the specified offset from the origin. // origin can be any of the following: // Note: not all are valid for all filetypes. // SEEK_SET - Beginning of file // SEEK_CUR - Current position of the file pointer // SEEK_END - End of file // Returns true on successful seek and false on a failed seek. 
bool seek(int64_t offset, int origin); bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length); protected: // Read via BGZFReader BGZFReader* bgzfReader; }; #endif #endif libStatGen-1.0.14/general/CSG_MD5.h000066400000000000000000000305531254730101300165020ustar00rootroot00000000000000// // This is a C++ friendly, limited use MD5 checksum copied // from the reference implementation at http://www.ietf.org/rfc/rfc1321.txt // // The sole purpose of this file is to eliminate dependence on the // openssl MD5 code, which poses a maintenance problem for general // users trying to build libstatgen. // // It is intended to be included solely by GenomeSequence.cpp, which // needs it to compute the MD5 checksum of chromosomes it is building. // /* MDDRIVER.C - test driver for MD2, MD4 and MD5 */ /* Copyright (C) 1990-2, RSA Data Security, Inc. Created 1990. All rights reserved. RSA Data Security, Inc. makes no representations concerning either the merchantability of this software or the suitability of this software for any particular purpose. It is provided "as is" without express or implied warranty of any kind. These notices must be retained in any copies of any part of this documentation and/or software. */ #include #include #include /* MD5.H - header file for MD5C.C */ /* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. License to copy and use this software is granted provided that it is identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing this software or this function. License is also granted to make and use derivative works provided that such works are identified as "derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing the derived work. RSA Data Security, Inc. 
makes no representations concerning either the merchantability of this software or the suitability of this software for any particular purpose. It is provided "as is" without express or implied warranty of any kind. These notices must be retained in any copies of any part of this documentation and/or software. */ #include /* MD5 context. */ typedef struct { uint32_t state[4]; /* state (ABCD) */ uint32_t count[2]; /* number of bits, modulo 2^64 (lsb first) */ unsigned char buffer[64]; /* input buffer */ } MD5_CTX; static void MD5Init(MD5_CTX *); static void MD5Update(MD5_CTX *, unsigned char *, unsigned int); static void MD5Final(unsigned char [16], MD5_CTX *); /* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm */ /* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. License to copy and use this software is granted provided that it is identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing this software or this function. License is also granted to make and use derivative works provided that such works are identified as "derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing the derived work. RSA Data Security, Inc. makes no representations concerning either the merchantability of this software or the suitability of this software for any particular purpose. It is provided "as is" without express or implied warranty of any kind. These notices must be retained in any copies of any part of this documentation and/or software. */ /* Constants for MD5Transform routine. 
*/

/* Per-round left-rotation amounts: Sxy is used in round x, at position y
   within each group of four steps. */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21

/* Generic byte pointer used by the memcpy/memset helpers. */
typedef unsigned char *POINTER;

static void MD5Transform(uint32_t [4], unsigned char [64]);
static void Encode(unsigned char *, uint32_t *, unsigned int);
static void Decode(uint32_t *, unsigned char *, unsigned int);
static void MD5_memcpy(POINTER, POINTER, unsigned int);
static void MD5_memset(POINTER, int, unsigned int);

/* Padding source: a single 0x80 byte followed by zeros. */
static unsigned char PADDING[64] = {
  0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* F, G, H and I are basic MD5 functions.
   Note: x in F and z in G/I are negated without their own parentheses, so
   these macros should only be given simple operands. */
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))

/* ROTATE_LEFT rotates x left n bits.
 */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
Rotation is separate from addition to prevent recomputation.
Each macro updates its first argument (a) in place.
 */
#define FF(a, b, c, d, x, s, ac) { \
 (a) += F ((b), (c), (d)) + (x) + (uint32_t)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }
#define GG(a, b, c, d, x, s, ac) { \
 (a) += G ((b), (c), (d)) + (x) + (uint32_t)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }
#define HH(a, b, c, d, x, s, ac) { \
 (a) += H ((b), (c), (d)) + (x) + (uint32_t)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }
#define II(a, b, c, d, x, s, ac) { \
 (a) += I ((b), (c), (d)) + (x) + (uint32_t)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }

/* MD5 initialization. Begins an MD5 operation, writing a new context.
*/
static void MD5Init (
MD5_CTX *context)                                        /* context */
{
  /* No bits have been processed yet. */
  context->count[0] = context->count[1] = 0;
  /* Load magic initialization constants.
  */
  context->state[0] = 0x67452301;
  context->state[1] = 0xefcdab89;
  context->state[2] = 0x98badcfe;
  context->state[3] = 0x10325476;
}

/* MD5 block update operation. Continues an MD5 message-digest
  operation, processing another message block, and updating the
  context.
 */
static void MD5Update (
MD5_CTX *context,                                        /* context */
unsigned char *input,                                /* input block */
unsigned int inputLen)                     /* length of input block */
{
  unsigned int i, index, partLen;

  /* Compute number of bytes mod 64 */
  index = (unsigned int)((context->count[0] >> 3) & 0x3F);

  /* Update number of bits; a wrap of the low word carries into the
     high word. */
  if ((context->count[0] += ((uint32_t)inputLen << 3))
   < ((uint32_t)inputLen << 3))
 context->count[1]++;
  context->count[1] += ((uint32_t)inputLen >> 29);

  /* Bytes needed to complete the partially filled context buffer. */
  partLen = 64 - index;

  /* Transform as many times as possible.
  */
  if (inputLen >= partLen) {
 MD5_memcpy
   ((POINTER)&context->buffer[index], (POINTER)input, partLen);
 MD5Transform (context->state, context->buffer);

 /* Remaining whole 64-byte blocks are processed straight from input. */
 for (i = partLen; i + 63 < inputLen; i += 64)
   MD5Transform (context->state, &input[i]);

 index = 0;
  }
  else
 i = 0;

  /* Buffer remaining input */
  MD5_memcpy
 ((POINTER)&context->buffer[index], (POINTER)&input[i],
  inputLen-i);
}

/* MD5 finalization. Ends an MD5 message-digest operation, writing the
  the message digest and zeroizing the context.
 */
static void MD5Final (
unsigned char digest[16],                         /* message digest */
MD5_CTX *context)                                       /* context */
{
  unsigned char bits[8];
  unsigned int index, padLen;

  /* Save number of bits */
  Encode (bits, context->count, 8);

  /* Pad out to 56 mod 64.
  */
  index = (unsigned int)((context->count[0] >> 3) & 0x3f);
  padLen = (index < 56) ? (56 - index) : (120 - index);
  MD5Update (context, PADDING, padLen);

  /* Append length (before padding) */
  MD5Update (context, bits, 8);

  /* Store state in digest */
  Encode (digest, context->state, 16);

  /* Zeroize sensitive information.
  */
  MD5_memset ((POINTER)context, 0, sizeof (*context));
}

/* MD5 basic transformation. Transforms state based on block.
 */
static void MD5Transform (
uint32_t state[4],
unsigned char block[64])
{
  uint32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];

  /* Unpack the 64 input bytes into sixteen 32-bit words. */
  Decode (x, block, 64);

  /* Round 1 */
  FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
  FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
  FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
  FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
  FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
  FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
  FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
  FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
  FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
  FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
  FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
  FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
  FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
  FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
  FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
  FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */

  /* Round 2 */
  GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
  GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
  GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
  GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
  GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
  GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
  GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
  GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
  GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
  GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
  GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
  GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
  GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
  GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
  GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
  GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */

  /* Round 3 */
  HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
  HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
  HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
  HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
  HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
  HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
  HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
  HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
  HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
  HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
  HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
  HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
  HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
  HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
  HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
  HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */

  /* Round 4 */
  II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
  II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
  II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
  II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
  II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
  II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
  II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
  II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
  II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
  II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
  II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
  II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
  II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
  II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
  II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
  II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */

  /* Fold this block's result into the running state. */
  state[0] += a;
  state[1] += b;
  state[2] += c;
  state[3] += d;

  /* Zeroize sensitive information.
  */
  MD5_memset ((POINTER)x, 0, sizeof (x));
}

/* Encodes input (uint32_t) into output (unsigned char). Assumes len is
  a multiple of 4. Each word is written least significant byte first.
 */
static void Encode (
unsigned char *output,
uint32_t *input,
unsigned int len)
{
  unsigned int i, j;

  for (i = 0, j = 0; j < len; i++, j += 4) {
 output[j] = (unsigned char)(input[i] & 0xff);
 output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
 output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
 output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
  }
}

/* Decodes input (unsigned char) into output (uint32_t). Assumes len is
  a multiple of 4. Each word is read least significant byte first.
 */
static void Decode (
uint32_t *output,
unsigned char *input,
unsigned int len)
{
  unsigned int i, j;

  for (i = 0, j = 0; j < len; i++, j += 4)
 output[i] = ((uint32_t)input[j]) | (((uint32_t)input[j+1]) << 8) |
   (((uint32_t)input[j+2]) << 16) | (((uint32_t)input[j+3]) << 24);
}

/* Note: Replace "for loop" with standard memcpy if possible.
   Copies len bytes from input to output, lowest index first.
 */

static void MD5_memcpy (
POINTER output,
POINTER input,
unsigned int len)
{
  unsigned int i;

  for (i = 0; i < len; i++)
 output[i] = input[i];
}

/* Note: Replace "for loop" with standard memset if possible.
   Sets len bytes at output to value.
 */
static void MD5_memset (
POINTER output,
int value,
unsigned int len)
{
  unsigned int i;

  for (i = 0; i < len; i++)
 ((char *)output)[i] = (char)value;
}
libStatGen-1.0.14/general/CharBuffer.cpp000066400000000000000000000076371254730101300177720ustar00rootroot00000000000000
/*
 *  Copyright (C) 2010  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/ #include #include "CharBuffer.h" CharBuffer::CharBuffer() : myBuffer(NULL) { myBuffer = (char *) malloc(DEFAULT_BUFFER_SIZE); myBufferAllocatedLen = DEFAULT_BUFFER_SIZE; reset(); } CharBuffer::CharBuffer(int32_t initialSize) : myBuffer(NULL) { myBuffer = (char *) malloc(initialSize); myBufferAllocatedLen = DEFAULT_BUFFER_SIZE; reset(); } CharBuffer::~CharBuffer() { reset(); if(myBuffer != NULL) { free(myBuffer); myBuffer = NULL; } } // Copy Constructor CharBuffer::CharBuffer(const CharBuffer& buffer) : myBuffer(NULL) { myBuffer = (char *) malloc(DEFAULT_BUFFER_SIZE); myBufferAllocatedLen = DEFAULT_BUFFER_SIZE; reset(); copy(buffer); } // Overload operator = to copy the passed in buffer into this buffer. CharBuffer& CharBuffer::operator = (const CharBuffer& buffer) { copy(buffer); return(*this); } // Overload operator = to copy the passed in buffer into this buffer. CharBuffer& CharBuffer::operator = (const std::string& stringBuffer) { // First check lengh if(prepareNewLength(stringBuffer.length())) { memcpy(myBuffer, stringBuffer.c_str(), stringBuffer.length()); } // TODO: on failure of prepareNewLength, should it throw an exception? return(*this); } bool CharBuffer::copy(const CharBuffer& buffer) { // Check to see if the passed in value is the same as this. if(this == &buffer) { return(true); } // Copy the buffer. // First check lengh prepareNewLength(buffer.myBufferLen); memcpy(myBuffer, buffer.myBuffer, buffer.myBufferLen); myBufferLen = buffer.myBufferLen; return(true); } // Reset the buffer for a new entry, clearing out previous values. void CharBuffer::reset() { myBufferLen = 0; if(myBuffer != NULL) { myBuffer[0] = 0; } } // Read from a file into the buffer. length is the amount of data to read. // Returns the number of bytes read. int CharBuffer::readFromFile(IFILE filePtr, int32_t length) { if(filePtr == NULL) { return(0); } if(prepareNewLength(length)) { return(ifread(filePtr, myBuffer, length)); } // failed to setup the buffer, return false. 
return(false); } // newLen is the new length that this buffer needs to be. bool CharBuffer::prepareNewLength(int32_t newLen) { if(newLen < 0) { // Invalid length. return(false); } // myBufferAllocatedLen must be bigger than new length, because the // newLen position is set to 0. if(myBufferAllocatedLen <= newLen) { // Not enough space is allocated, so allocate more space. char* tmpBufferPtr = (char *)realloc(myBuffer, newLen); if(tmpBufferPtr == NULL) { // FAILED to allocate memory fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); // myStatus.addError(GlfStatus::FAIL_MEM, "Failed Memory Allocation."); return(false); } // Successfully allocated memory, so set myRecordPtr. myBuffer = tmpBufferPtr; myBufferAllocatedLen = newLen; } myBufferLen = newLen; myBuffer[newLen] = 0; return(true); } libStatGen-1.0.14/general/CharBuffer.h000066400000000000000000000036621254730101300174310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BUFFER_H__ #define __BUFFER_H__ #include #include "InputFile.h" class CharBuffer { public: CharBuffer(); CharBuffer(int32_t initialSize); ~CharBuffer(); // Copy Constructor CharBuffer(const CharBuffer& buffer); // Overload operator = to copy the passed in buffer into this buffer. CharBuffer& operator = (const CharBuffer& buffer); // Overload operator = to copy the passed in buffer into this buffer. 
CharBuffer& operator = (const std::string& stringBuffer); // Overload operator = to copy the passed in buffer into this buffer. bool copy(const CharBuffer& buffer); void reset(); // Read from a file into the buffer. length is the amount of data to read. // Returns the number of bytes read. int readFromFile(IFILE filePtr, int32_t length); inline const char* c_str() const { return(myBuffer); } inline int32_t length() const { return(myBufferLen); } private: // newLen is the new length for the buffer. bool prepareNewLength(int32_t newLen); int32_t myBufferLen; char* myBuffer; int32_t myBufferAllocatedLen; static const int32_t DEFAULT_BUFFER_SIZE = 100; }; #endif libStatGen-1.0.14/general/Chromosome.cpp000066400000000000000000000032761254730101300200710ustar00rootroot00000000000000#include #include "Chromosome.h" Chromosome::Chromosome(GenomeSequence* gs, unsigned int chromosomeIndex) { assert(gs); assert(chromosomeIndex < (unsigned int)gs->getChromosomeCount()); this->gs = gs; this->chromosomeIndex = chromosomeIndex; this->offset = gs->getChromosomeStart((int)chromosomeIndex); this->chromosomeSize = gs->getChromosomeSize((int)chromosomeIndex); } Chromosome::Chromosome(GenomeSequence* gs, const char* chromosomeName) { assert(gs); this->gs = gs; this->chromosomeIndex = gs->getChromosome(chromosomeName); assert(chromosomeIndex != INVALID_CHROMOSOME_INDEX); this->offset = gs->getChromosomeStart((int)chromosomeIndex); this->chromosomeSize = gs->getChromosomeSize((int)chromosomeIndex); } Chromosome::Chromosome(const char* genomseSequenceFileName, unsigned int chromosomeIndex, bool isColorSpace) { std::string s(genomseSequenceFileName); if (this->gs) delete gs; gs = new GenomeSequence; assert(gs); gs->setReferenceName(s); assert(!gs->open(isColorSpace)); this->chromosomeIndex = chromosomeIndex; this->offset = gs->getChromosomeStart((int)chromosomeIndex); this->chromosomeSize = gs->getChromosomeSize((int)chromosomeIndex); } Chromosome::Chromosome(const std::string& 
genomseSequenceFileName, unsigned int chromosomeIndex, bool isColorSpace) { if (this->gs) delete gs; gs = new GenomeSequence; assert(gs); gs->setReferenceName(genomseSequenceFileName); assert(!gs->open(isColorSpace)); this->chromosomeIndex = chromosomeIndex; this->offset = gs->getChromosomeStart((int)chromosomeIndex); this->chromosomeSize = gs->getChromosomeSize((int)chromosomeIndex); } libStatGen-1.0.14/general/Chromosome.h000066400000000000000000000020221254730101300175220ustar00rootroot00000000000000#ifndef _CHROMOSOME_H_ #define _CHROMOSOME_H_ #include "GenomeSequence.h" class Chromosome{ public: explicit Chromosome(GenomeSequence* gs, unsigned int chrosomeIndex); explicit Chromosome(GenomeSequence* gs, const char* chromosomeName); explicit Chromosome(const char* genomseSequenceFileName, unsigned int chromosomeIndex, bool isColorSpace); explicit Chromosome(const std::string& genomseSequenceFileName, unsigned int chromosomeIndex, bool isColorSpace); genomeIndex_t Length() const { return chromosomeSize; } // 0-based index inline char operator[](genomeIndex_t index) const { index += offset; return (*gs)[index]; } const char* Name() const { return gs->getChromosomeName(this->chromosomeIndex); } private: GenomeSequence* gs; int chromosomeIndex; genomeIndex_t offset; // chromosome index 0 corresponds (*gs)[offset] genomeIndex_t chromosomeSize; // return the length of the chromosome }; #endif /* _CHROMOSOME_H_ */ libStatGen-1.0.14/general/Cigar.cpp000066400000000000000000000343041254730101300167770ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include "Cigar.h" #include "STLUtilities.h" // Initialize INDEX_NA. const int32_t Cigar::INDEX_NA = -1; //////////////////////////////////////////////////////////////////////// // // Cigar Class // // // Set the passed in string to the string reprentation of the Cigar operations // in this object. // void Cigar::getCigarString(std::string& cigarString) const { using namespace STLUtilities; std::vector::const_iterator i; cigarString.clear(); // clear result string // Progressively append the character representations of the operations to // the cigar string. for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) { cigarString << (*i).count << (*i).getChar(); } } void Cigar::getCigarString(String& cigarString) const { std::string cigar; getCigarString(cigar); cigarString = cigar.c_str(); return; } void Cigar::getExpandedString(std::string &s) const { s = ""; std::vector::const_iterator i; // Progressively append the character representations of the operations to // the string passed in for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) { for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar(); } return; } bool Cigar::operator == (Cigar &rhs) const { if (this->size() != rhs.size()) return false; for (int i = 0; i < this->size(); i++) { if (cigarOperations[i]!=rhs.cigarOperations[i]) return false; } return true; } // return the length of the read that corresponds to // the current CIGAR string. 
int Cigar::getExpectedQueryBaseCount() const { int matchCount = 0; std::vector::const_iterator i; for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) { switch (i->operation) { case match: case mismatch: case softClip: case insert: matchCount += i->count; break; default: // we only care about operations that are in the query sequence. break; } } return matchCount; } // return the number of bases in the reference that // this read "spans" int Cigar::getExpectedReferenceBaseCount() const { int matchCount = 0; std::vector::const_iterator i; for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) { switch (i->operation) { case match: case mismatch: case del: case skip: matchCount += i->count; break; default: // we only care about operations that are in the reference sequence. break; } } return matchCount; } // Return the number of clips that are at the beginning of the cigar. int Cigar::getNumBeginClips() const { int numBeginClips = 0; for (unsigned int i = 0; i != cigarOperations.size(); i++) { if ((cigarOperations[i].operation == softClip) || (cigarOperations[i].operation == hardClip)) { // Clipping operator, increment the counter. numBeginClips += cigarOperations[i].count; } else { // Break out of the loop since a non-clipping operator was found. break; } } return(numBeginClips); } // Return the number of clips that are at the end of the cigar. int Cigar::getNumEndClips() const { int numEndClips = 0; for (int i = (cigarOperations.size() - 1); i >= 0; i--) { if ((cigarOperations[i].operation == softClip) || (cigarOperations[i].operation == hardClip)) { // Clipping operator, increment the counter. numEndClips += cigarOperations[i].count; } else { // Break out of the loop since a non-clipping operator was found. break; } } return(numEndClips); } int32_t Cigar::getRefOffset(int32_t queryIndex) { // If the vectors aren't set, set them. 
if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size())) { return(INDEX_NA); } return(queryToRef[queryIndex]); } int32_t Cigar::getQueryIndex(int32_t refOffset) { // If the vectors aren't set, set them. if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size())) { return(INDEX_NA); } return(refToQuery[refOffset]); } int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos) { // If the vectors aren't set, set them. if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size())) { return(INDEX_NA); } if (queryToRef[queryIndex] != INDEX_NA) { return(queryToRef[queryIndex] + queryStartPos); } return(INDEX_NA); } // Return the query index associated with the specified reference position // when the query starts at the specified reference position based on // this cigar. int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos) { // If the vectors aren't set, set them. if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } int32_t refOffset = refPosition - queryStartPos; if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size())) { return(INDEX_NA); } return(refToQuery[refOffset]); } int32_t Cigar::getExpandedCigarIndexFromQueryIndex(int32_t queryIndex) { // If the vectors aren't set, set them. if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToCigar.size())) { return(INDEX_NA); } return(queryToCigar[queryIndex]); } int32_t Cigar::getExpandedCigarIndexFromRefOffset(int32_t refOffset) { // If the vectors aren't set, set them. 
if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } if ((refOffset < 0) || ((uint32_t)refOffset >= refToCigar.size())) { return(INDEX_NA); } return(refToCigar[refOffset]); } int32_t Cigar::getExpandedCigarIndexFromRefPos(int32_t refPosition, int32_t queryStartPos) { return(getExpandedCigarIndexFromRefOffset(refPosition - queryStartPos)); } char Cigar::getCigarCharOp(int32_t expandedCigarIndex) { // Check if the expanded cigar has been set yet if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { // Set the expanded cigar. setQueryAndReferenceIndexes(); } // Check to see if the index is in range. if((expandedCigarIndex < 0) || ((uint32_t)expandedCigarIndex >= myExpandedCigar.length())) { return('?'); } return(myExpandedCigar[expandedCigarIndex]); } char Cigar::getCigarCharOpFromQueryIndex(int32_t queryIndex) { return(getCigarCharOp(getExpandedCigarIndexFromQueryIndex(queryIndex))); } char Cigar::getCigarCharOpFromRefOffset(int32_t refOffset) { return(getCigarCharOp(getExpandedCigarIndexFromRefOffset(refOffset))); } char Cigar::getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos) { return(getCigarCharOp(getExpandedCigarIndexFromRefPos(refPosition, queryStartPos))); } // Return the number of bases that overlap the reference and the // read associated with this cigar that falls within the specified region. uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos) { // Get the overlap info. if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) { setQueryAndReferenceIndexes(); } // Get the start and end offsets. int32_t startRefOffset = 0; // If the specified start is more than the queryStartPos, set // the startRefOffset to the appropriate non-zero value. // (if start is <= queryStartPos, than startRefOffset is 0 - it should // not be set to a negative value.) 
if (start > queryStartPos) { startRefOffset = start - queryStartPos; } int32_t endRefOffset = end - queryStartPos; if (end == -1) { // -1 means that the region goes to the end of the refrerence. // So set endRefOffset to the max refOffset + 1 which is the // size of the refToQuery vector. endRefOffset = refToQuery.size(); } // if endRefOffset is less than 0, then this read does not fall within // the specified region, so return 0. if (endRefOffset < 0) { return(0); } // Get the overlaps for these offsets. // Loop through the read counting positions that match the reference // within this region. int32_t refOffset = 0; int32_t numOverlaps = 0; for (unsigned int queryIndex = 0; queryIndex < queryToRef.size(); queryIndex++) { refOffset = getRefOffset(queryIndex); if (refOffset > endRefOffset) { // Past the end of the specified region, so stop checking // for overlaps since there will be no more. break; } else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset)) { // within the region, increment the counter. ++numOverlaps; } } return(numOverlaps); } // Return whether or not the cigar has an indel bool Cigar::hasIndel() { for(unsigned int i = 0; i < cigarOperations.size(); i++) { if((cigarOperations[i].operation == insert) || (cigarOperations[i].operation == del)) { // Found an indel, so return true. return(true); } } // Went through all the operations, and found no indel, so return false. return(false); } // Clear the query index/reference offset index vectors. void Cigar::clearQueryAndReferenceIndexes() { queryToRef.clear(); refToQuery.clear(); refToCigar.clear(); queryToCigar.clear(); myExpandedCigar.clear(); } /////////////////////////////////////////////////////// // Set the query index/reference offset index vectors. // // For Cigar: 3M2I2M1D1M // That total count of cigar elements is 9 (3+2+2+1+1) // // The entries that are valid in the query/reference contain the index/offset // where they are found in the query/reference. 
N/A are marked by 'x': // query indexes: 0123456x7 // --------- // reference offsets: 012xx3456 // // This shows what query index is associated with which reference offset and // vice versa. // For ones where an x appears, -1 would be returned. // void Cigar::setQueryAndReferenceIndexes() { // First ensure that the vectors are clear by clearing them. clearQueryAndReferenceIndexes(); int extPos = 0; // Process each cigar index. for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++) { // Process the cigar operation. switch (cigarOperations[cigarIndex].operation) { case match: case mismatch: // For match/mismatch, update the maps between query // and reference for the number of matches/mismatches. for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) { // The associated indexes are the next location in // each array, which is equal to the current size. int32_t queryToRefLen = queryToRef.size(); int32_t refToQueryLen = refToQuery.size(); queryToRef.push_back(refToQueryLen); refToQuery.push_back(queryToRefLen); refToCigar.push_back(extPos); queryToCigar.push_back(extPos++); myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); } break; case insert: case softClip: // Add N/A reference offset for each query index that this // insert covers. for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) { queryToRef.push_back(INDEX_NA); queryToCigar.push_back(extPos++); myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); } break; case del: case skip: // Add N/A query index for each reference offset that this // deletion/skip covers. 
for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) { refToQuery.push_back(INDEX_NA); refToCigar.push_back(extPos++); myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); } break; case hardClip: case pad: case none: for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) { myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); ++extPos; } break; }; } } libStatGen-1.0.14/general/Cigar.h000066400000000000000000000460771254730101300164560ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #if !defined(_CIGAR_H) #define _CIGAR_H #include // for inline use of strcat, etc #include // for INT_MAX #include // for uint32_t and friends #include #include #include #include #include #include #include #include #include "Generic.h" #include "StringBasics.h" /// This class represents the CIGAR without any methods to set the cigar /// (see CigarRoller for that). // // Docs from Sam1.pdf: // // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one. // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is // an example. 
Suppose the clipped alignment is: // REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG // READ: gggGTGTAACC-GACTAGgggg // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for // this alignment is: 3S8M1D6M4S. // // // If the mapping position of the query is not available, RNAME and // CIGAR are set as “*” // // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: // // op Description // -- ----------- // M Match or mismatch // I Insertion to the reference // D Deletion from the reference // N Skipped region from the reference // S Soft clip on the read (clipped sequence present in ) // H Hard clip on the read (clipped sequence NOT present in ) // P Padding (silent deletion from the padded reference sequence) // //////////////////////////////////////////////////////////////////////// /// /// This class represents the CIGAR. It contains methods for converting /// to strings and extracting information from the cigar on how a read /// maps to the reference. /// /// It only contains read only methods. There are no ways to set /// values. To set a value, a child class must be used. /// class Cigar { public: /// Enum for the cigar operations. enum Operation { none=0, ///< no operation has been set. match, ///< match/mismatch operation. Associated with CIGAR Operation "M" mismatch, ///< mismatch operation. Associated with CIGAR Operation "M" insert, ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference). 
Associated with CIGAR Operation "I" del, ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "D" skip, ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "N" softClip, ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference). Associated with CIGAR Operation "S" hardClip, ///< Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H" pad ///< Padding (not in reference or query). Associated with CIGAR Operation "P" }; // The maximum value in the operation enum (used for setting up a bitset of // operations. static const int MAX_OP_VALUE = pad; //////////////////////////////////////////////////////////////////////// // // Nested Struct : CigarOperator // struct CigarOperator { CigarOperator() { operation = none; count = 0; } /// Set the cigar operator with the specified operation and /// count length. CigarOperator(Operation operation, uint32_t count) : operation(operation), count(count) {}; Operation operation; uint32_t count; /// Get the character code (M, I, D, N, S, H, or P) associated with /// this operation. char getChar() const { switch (operation) { case none: return '?'; // error case match: case mismatch: return'M'; case insert: return 'I'; case del: return'D'; case skip: return 'N'; case softClip: return 'S'; case hardClip: return 'H'; case pad: return 'P'; } return '?'; // actually it is an error to get here } /// Compare only on the operator, true if they are the same, false if not. Match and mismatch are considered the same for CIGAR strings. 
bool operator == (const CigarOperator &rhs) const { if (operation==rhs.operation) return true; if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match)) return true; return false; } /// Compare only on the operator, false if they are the same, true if not. Match and mismatch are considered the same for CIGAR strings. bool operator != (const CigarOperator &rhs) const { return !((*this) == rhs) ; } }; //////////////////////////////////////////////////////////////////////// // // Cigar Class statics // /// Return true if the specified operation is found in the /// reference sequence, false if not. static bool foundInReference(Operation op) { switch(op) { case match: case mismatch: case del: case skip: return true; default: return false; } return false; } /// Return true if the specified operation is found in the /// reference sequence, false if not. static bool foundInReference(char op) { switch(op) { case 'M': case '=': case 'X': case 'D': case 'N': return true; default: return false; } return false; } /// Return true if the specified operation is found in the /// reference sequence, false if not. static bool foundInReference(const CigarOperator &op) { return(foundInReference(op.operation)); } /// Return true if the specified operation is found in the /// query sequence, false if not. static bool foundInQuery(Operation op) { switch(op) { case match: case mismatch: case insert: case softClip: return true; default: return false; } return false; } /// Return true if the specified operation is found in the /// query sequence, false if not. static bool foundInQuery(char op) { switch(op) { case 'M': case '=': case 'X': case 'I': case 'S': return true; default: return false; } return false; } /// Return true if the specified operation is found in the /// query sequence, false if not. 
static bool foundInQuery(const CigarOperator &op) { return(foundInQuery(op.operation)); } /// Return true if the specified operation is a clipping operation, /// false if not. static bool isClip(Operation op) { switch(op) { case softClip: case hardClip: return true; default: return false; } return false; } /// Return true if the specified operation is a clipping operation, /// false if not. static bool isClip(char op) { switch(op) { case 'S': case 'H': return true; default: return false; } return false; } /// Return true if the specified operation is a clipping operation, /// false if not. static bool isClip(const CigarOperator &op) { return(isClip(op.operation)); } /// Return true if the specified operation is a match/mismatch operation, /// false if not. static bool isMatchOrMismatch(Operation op) { switch(op) { case match: case mismatch: return true; default: return false; } return false; } /// Return true if the specified operation is a match/mismatch operation, /// false if not. static bool isMatchOrMismatch(const CigarOperator &op) { return(isMatchOrMismatch(op.operation)); } //////////////////////////////////////////////////////////////////////// // // Cigar Class non static // friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar); /// Default constructor initializes as a CIGAR with no operations. Cigar() { clearQueryAndReferenceIndexes(); } /// Set the passed in String to the string reprentation of the Cigar /// operations in this object. void getCigarString(String& cigarString) const; /// Set the passed in std::string to the string reprentation of the Cigar /// operations in this object. void getCigarString(std::string& cigarString) const; /// Sets the specified string to a valid CIGAR string of characters that /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM"). /// The returned string is actually also a valid CIGAR string. /// In theory this makes it easier to parse some reads. 
/// \return s the string to populate void getExpandedString(std::string &s) const; /// Return the Cigar Operation at the specified index (starting at 0). const CigarOperator & operator [](int i) const { return cigarOperations[i]; } /// Return the Cigar Operation at the specified index (starting at 0). const CigarOperator & getOperator(int i) const { return cigarOperations[i]; } /// Return true if the 2 Cigars are the same /// (the same operations of the same sizes). bool operator == (Cigar &rhs) const; /// Return the number of cigar operations int size() const { return cigarOperations.size(); } /// Write this object as a string to cout. void Dump() const { String cigarString; getCigarString(cigarString); std::cout << cigarString ; } /// Return the length of the read that corresponds to /// the current CIGAR string. /// /// For validation, we should expect that a sequence /// read in a SAM file will be the same length as the /// value returned by this method. /// /// Example: 3M2D3M describes a read with three bases /// matching the reference, then skips 2 bases, then has /// three more bases that match the reference (match/mismatch). /// In this case, the read length is expected to be 6. /// /// Example: 3M2I3M describes a read with 3 match/mismatch /// bases, two extra bases, and then 3 more match/mistmatch /// bases. The total in this example is 8 bases. /// /// \return returns the expected read length int getExpectedQueryBaseCount() const; /// Return the number of bases in the reference that /// this CIGAR "spans". /// /// When doing range checking, we occassionally need to know /// how many total bases the CIGAR string represents as compared /// to the reference. /// /// Examples: 3M2D3M describes a read that overlays 8 bases in /// the reference. 3M2I3M describes a read with 3 bases that /// match the reference, two additional bases that aren't in the /// reference, and 3 more bases that match the reference, so it /// spans 6 bases in the reference. 
/// /// \return how many bases in the reference are spanned /// by the given CIGAR string /// int getExpectedReferenceBaseCount() const; /// Return the number of clips that are at the beginning of the cigar. int getNumBeginClips() const; /// Return the number of clips that are at the end of the cigar. int getNumEndClips() const; /// Return the reference offset associated with the specified /// query index or INDEX_NA based on this cigar. int32_t getRefOffset(int32_t queryIndex); /// Return the query index associated with the specified /// reference offset or INDEX_NA based on this cigar. int32_t getQueryIndex(int32_t refOffset); /// Return the reference position associated with the specified query index /// or INDEX_NA based on this cigar and the specified queryStartPos which /// is the leftmost mapping position of the first matching base in the /// query. int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos); /// Return the query index or INDEX_NA associated with the specified /// reference offset when the query starts at the specified reference /// position. int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos); /// Returns the index into the expanded cigar for the cigar /// associated with the specified queryIndex. /// INDEX_NA returned if the index is out of range. int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex); /// Returns the index into the expanded cigar for the cigar /// associated with the specified reference offset. /// INDEX_NA returned if the offset is out of range. int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset); /// Returns the index into the expanded cigar for the cigar /// associated with the specified reference position and queryStartPos. /// INDEX_NA returned if the position is out of range. int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, int32_t queryStartPos); /// Return the character code of the cigar operator associated with the /// specified expanded CIGAR index. '?' 
is returned for an out of range /// index. char getCigarCharOp(int32_t expandedCigarIndex); /// Return the character code of the cigar operator associated with /// the specified queryIndex. '?' is returned for an out of range index. char getCigarCharOpFromQueryIndex(int32_t queryIndex); /// Return the character code of the cigar operator associated with /// the specified reference offset. '?' is returned for an out of range offset. char getCigarCharOpFromRefOffset(int32_t refOffset); /// Return the character code of the cigar operator associated with /// the specified reference position. '?' is returned for an out of /// range reference position. char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos); /// Return the number of bases that overlap the reference and the /// read associated with this cigar that falls within the specified region. /// \param start : inclusive 0-based start position (reference position) of /// the region to check for overlaps in /// (-1 indicates to start at the beginning of the reference.) /// \param end : exclusive 0-based end position (reference position) of the /// region to check for overlaps in /// (-1 indicates to go to the end of the reference.) /// \param queryStartPos : 0-based leftmost mapping position of the first /// matcihng base in the query. uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos); /// Return whether or not the cigar has indels (insertions or delections) /// \return true if it has an insertion or deletion, false if not. bool hasIndel(); /// Value associated with an index that is not applicable/does not exist, /// used for converting between query and reference indexes/offsets when /// an associated index/offset does not exist. static const int32_t INDEX_NA; protected: // Clear the query index/reference offset index vectors. void clearQueryAndReferenceIndexes(); // Set the query index/reference offset index vectors. 
void setQueryAndReferenceIndexes(); // Container for the cigar operations in this cigar. std::vector cigarOperations; private: // The vector is indexed by query index and contains the reference // offset associated with that query index. // The vector is reset each time a new cigar operation is added, and // is calculated when accessed if it is not already set. std::vector queryToRef; // The vector is indexed by reference offset and contains the query // index associated with that reference offset. // The vector is reset each time a new cigar operation is added, and // is calculated when accessed if it is not already set. std::vector refToQuery; // The vector is indexed by reference offset and contains the offset into // the expanded cigar associated with that reference offset. // The vector is reset each time a new cigar operation is added, and // is calculated when accessed if it is not already set. std::vector refToCigar; // The vector is indexed by query index and contains the offset into // the expanded cigar associated with that query index. // The vector is reset each time a new cigar operation is added, and // is calculated when accessed if it is not already set. std::vector queryToCigar; std::string myExpandedCigar; }; /// Writes the specified cigar operation to the specified stream as (3M). inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o) { stream << o.count << o.getChar(); return stream; } /// Writes all of the cigar operations contained in the cigar to the passed in stream. 
inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar) { stream << cigar.cigarOperations; return stream; } #endif libStatGen-1.0.14/general/CigarRoller.cpp000077500000000000000000000205311254730101300201570ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include "CigarRoller.h" //////////////////////////////////////////////////////////////////////// // // Cigar Roller Class // CigarRoller & CigarRoller::operator += (CigarRoller &rhs) { std::vector::iterator i; for (i = rhs.cigarOperations.begin(); i != rhs.cigarOperations.end(); i++) { (*this) += *i; } return *this; } // // Append a new operator at the end of the sequence. // CigarRoller & CigarRoller::operator += (const CigarOperator &rhs) { // Adding to the cigar, so the query & reference indexes would be // incomplete, so just clear them. 
clearQueryAndReferenceIndexes(); if (rhs.count==0) { // nothing to do } else if (cigarOperations.empty() || cigarOperations.back() != rhs) { cigarOperations.push_back(rhs); } else { // last stored operation is the same as the new one, so just add it in cigarOperations.back().count += rhs.count; } return *this; } CigarRoller & CigarRoller::operator = (CigarRoller &rhs) { clear(); (*this) += rhs; return *this; } // void CigarRoller::Add(Operation operation, int count) { CigarOperator rhs(operation, count); (*this) += rhs; } void CigarRoller::Add(char operation, int count) { switch (operation) { case 0: case 'M': Add(match, count); break; case 1: case 'I': Add(insert, count); break; case 2: case 'D': Add(del, count); break; case 3: case 'N': Add(skip, count); break; case 4: case 'S': Add(softClip, count); break; case 5: case 'H': Add(hardClip, count); break; case 6: case 'P': Add(pad, count); break; case 7: case '=': Add(match, count); break; case 8: case 'X': Add(match, count); break; default: // Hmmm... what to do? std::cerr << "ERROR " << "(" << __FILE__ << ":" << __LINE__ <<"): " << "Parsing CIGAR - invalid character found " << "with parameter " << operation << " and " << count << std::endl; break; } } void CigarRoller::Add(const char *cigarString) { int operationCount = 0; while (*cigarString) { if (isdigit(*cigarString)) { char *endPtr; operationCount = strtol((char *) cigarString, &endPtr, 10); cigarString = endPtr; } else { Add(*cigarString, operationCount); cigarString++; } } } bool CigarRoller::Remove(int index) { if((index < 0) || ((unsigned int)index >= cigarOperations.size())) { // can't remove, out of range, return false. return(false); } cigarOperations.erase(cigarOperations.begin() + index); // Modifying the cigar, so the query & reference indexes are out of date, // so clear them. 
clearQueryAndReferenceIndexes(); return(true); } bool CigarRoller::IncrementCount(int index, int increment) { if((index < 0) || ((unsigned int)index >= cigarOperations.size())) { // can't update, out of range, return false. return(false); } cigarOperations[index].count += increment; // Modifying the cigar, so the query & reference indexes are out of date, // so clear them. clearQueryAndReferenceIndexes(); return(true); } bool CigarRoller::Update(int index, Operation op, int count) { if((index < 0) || ((unsigned int)index >= cigarOperations.size())) { // can't update, out of range, return false. return(false); } cigarOperations[index].operation = op; cigarOperations[index].count = count; // Modifying the cigar, so the query & reference indexes are out of date, // so clear them. clearQueryAndReferenceIndexes(); return(true); } void CigarRoller::Set(const char *cigarString) { clear(); Add(cigarString); } void CigarRoller::Set(const uint32_t* cigarBuffer, uint16_t bufferLen) { clear(); // Parse the buffer. for (int i = 0; i < bufferLen; i++) { int opLen = cigarBuffer[i] >> 4; Add(cigarBuffer[i] & 0xF, opLen); } } // // when we examine CIGAR strings, we need to know how // many cumulative insert and delete positions there are // so that we can adjust the read location appropriately. // // Here, we iterate over the vector of CIGAR operations, // summaring the count for each insert or delete (insert // increases the offset, delete decreases it). // // The use case for this is when we have a genome match // position based on an index word other than the first one, // and there is also a insert or delete between the beginning // of the read and the index word. We can't simply report // the match position without taking into account the indels, // otherwise we'll be off by N where N is the sum of this // indel count. // // DEPRECATED - do not use. There are better ways to accomplish that by using // read lengths, reference lengths, span of the read, etc. 
int CigarRoller::getMatchPositionOffset() { int offset = 0; std::vector::iterator i; for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) { switch (i->operation) { case insert: offset += i->count; break; case del: offset -= i->count; break; // TODO anything for case skip:???? default: break; } } return offset; } // // Get the string reprentation of the Cigar operations in this object. // Caller must delete the returned value. // const char * CigarRoller::getString() { // NB: the exact size of the string is not important, it just needs to be guaranteed // larger than the largest number of characters we could put into it. // we do not explicitly manage memory usage, and we expect when program exits, the memory used here will be freed static char *ret = NULL; static unsigned int retSize = 0; if (ret == NULL) { retSize = cigarOperations.size() * 12 + 1; // 12 == a magic number -> > 1 + log base 10 of MAXINT ret = (char*) malloc(sizeof(char) * retSize); assert(ret != NULL); } else { // currently, ret pointer has enough memory to use if (retSize > cigarOperations.size() * 12 + 1) { } else { retSize = cigarOperations.size() * 12 + 1; free(ret); ret = (char*) malloc(sizeof(char) * retSize); } assert(ret != NULL); } char *ptr = ret; char buf[12]; // > 1 + log base 10 of MAXINT std::vector::iterator i; // Progressively append the character representations of the operations to // the cigar string we allocated above. *ptr = '\0'; // clear result string for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) { sprintf(buf, "%d%c", (*i).count, (*i).getChar()); strcat(ptr, buf); while (*ptr) { ptr++; // limit the cost of strcat above } } return ret; } void CigarRoller::clear() { // Clearing the cigar, so the query & reference indexes are out of // date, so clear them. 
clearQueryAndReferenceIndexes(); cigarOperations.clear(); } libStatGen-1.0.14/general/CigarRoller.h000066400000000000000000000146771254730101300176370ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #if !defined(_CIGAR_ROLLER_H) #define _CIGAR_ROLLER_H #include "Cigar.h" /// The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object. It is a child class of Cigar. /// /// Docs from Sam1.pdf: /// /// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one. /// Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is /// an example. Suppose the clipped alignment is: /// REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG /// READ: gggGTGTAACC-GACTAGgggg /// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for /// this alignment is: 3S8M1D6M4S. /// /// /// If the mapping position of the query is not available, RNAME and /// CIGAR are set as “*” /// /// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows /// for three types of operations: M for match or mismatch, I for insertion and D for deletion. 
The extended CIGAR format /// further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: /// /// op Description /// -- ----------- /// M Match or mismatch /// I Insertion to the reference /// D Deletion from the reference /// N Skipped region from the reference /// S Soft clip on the read (clipped sequence present in ) /// H Hard clip on the read (clipped sequence NOT present in ) /// P Padding (silent deletion from the padded reference sequence) /// //////////////////////////////////////////////////////////////////////// /// /// CigarRoller is an aid to correctly generating the CIGAR strings /// necessary to represent how a read maps to the reference. /// /// It is called once a particular match candidate is being written /// out, so it is far less performance sensitive than the Smith Waterman /// code below. /// class CigarRoller : public Cigar { public: //////////////////////////////////////////////////////////////////////// // // Cigar Roller Class // /// Writes all of the cigar operations contained in this roller to the /// passed in stream. friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller); /// Default constructor initializes as a CIGAR with no operations. CigarRoller() { clearQueryAndReferenceIndexes(); } /// Constructor that initializes the object with the specified cigarString. CigarRoller(const char *cigarString) { Set(cigarString); } /// Add the contents of the specified CigarRoller to this object. CigarRoller & operator += (CigarRoller &rhs); /// Append the specified operator to this object. CigarRoller & operator += (const CigarOperator &rhs); /// Set this object to be equal to the specified CigarRoller. CigarRoller & operator = (CigarRoller &rhs); /// Append the specified operation with the specified count to this object. void Add(Operation operation, int count); /// Append the specified operation with the specified count to this object. 
void Add(char operation, int count); /// Append the specified cigarString to this object. void Add(const char *cigarString); /// Append the specified Cigar object to this object. void Add(CigarRoller &rhs) { (*this) += rhs; } /// Remove the operation at the specified index. /// \return true if successfully removed, false if not. bool Remove(int index); /// Increments the count for the operation at the specified index /// by the specified value, specify a negative value to decrement. /// \return true if it is successfully incremented, false if not. bool IncrementCount(int index, int increment); /// Updates the operation at the specified index to be the specified /// operation and have the specified count. /// \return true if it is successfully updated, false if not. bool Update(int index, Operation op, int count); /// Sets this object to the specified cigarString. void Set(const char *cigarString); /// Sets this object to the BAM formatted cigar found at the beginning /// of the specified buffer which is bufferLen long. void Set(const uint32_t* cigarBuffer, uint16_t bufferLen); // // when we examine CIGAR strings, we need to know how // many cumulative insert and delete positions there are // so that we can adjust the read location appropriately. // // Here, we iterate over the vector of CIGAR operations, // summaring the count for each insert or delete (insert // increases the offset, delete decreases it). // // The use case for this is when we have a genome match // position based on an index word other than the first one, // and there is also a insert or delete between the beginning // of the read and the index word. We can't simply report // the match position without taking into account the indels, // otherwise we'll be off by N where N is the sum of this // indel count. // /// DEPRECATED - do not use, there are better ways to accomplish that by /// using read lengths, reference lengths, span of the read, etc. 
int getMatchPositionOffset(); /// Get the string reprentation of the Cigar operations in this object, /// caller must delete the returned value. const char *getString(); /// Clear this object so that it has no Cigar Operations. void clear(); private: }; inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller) { stream << roller.cigarOperations; return stream; } #endif libStatGen-1.0.14/general/Constant.h000066400000000000000000000037161254730101300172130ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef _CONSTANT_H_ #define _CONSTANT_H_ #define COMPAREFUNC (int (*)(const void *, const void *)) #define BUFSIZE 1024 #define FILENAMELEN 100 #define IDLEN 20 #define SEPARATORS " \t\n\r\f/" #define WHITESPACE " \t\n\r\f" #define SWTABLESKIP 9 #define SWTABLEMAX 10000 #define _NAN_ ((double) (6.66666e-66)) #define QTDTDATA "qtdt.dat" #define QTDTPED "qtdt.ped" #define QTDTIBD "qtdt.ibd" #define QTDTRAW "regress.tbl" #define GENIHDATAIN "genih.dat" #ifndef _WIN32 #define stricmp strcasecmp #endif // Constants for older haplotype handling programs // Constants for HAPLOXT #define XT_MAX_ALLELES 50 // Maximum alleles for crosstabulation #define XT_VECTORSIZE 10000 // Total haplotypes in population #define XT_POOLTRESH 7 // Threshold for pooling rare alleles // Simwalk Haplotype Vectors #define HV_MAXSIZE 100 // Haplotypes in single SimWalk pedigree #define HV_INFOTRESH 75 // Percentage of loci typed #define HV_STATELENGTH 100 // Markers per haplotype #define HV_SKIPLINES 4 // lines to skip at bottom of family tree // Simwalk Summary Files #define HT_TABLE_SIZE 1000 #define HT_SKIP_LINES 9 #endif libStatGen-1.0.14/general/Error.cpp000066400000000000000000000030601254730101300170360ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "Error.h" #include #include #include #include "PhoneHome.h" // Declare a dummy class to ensure that compilers recognize this as C++ code class String; void error(const char * msg, ...) { va_list ap; va_start(ap, msg); printf("\nFATAL ERROR - \n"); vprintf(msg, ap); printf("\n\n"); va_end(ap); PhoneHome::completionStatus("error: Exiting due to Fatal Error"); exit(EXIT_FAILURE); } void warning(const char * msg, ...) { va_list ap; va_start(ap, msg); printf("\n\aWARNING - \n"); vprintf(msg, ap); printf("\n"); va_end(ap); } void numerror(const char * msg , ...) { va_list ap; va_start(ap, msg); printf("\nFATAL NUMERIC ERROR - "); vprintf(msg, ap); printf("\n\n"); va_end(ap); exit(EXIT_FAILURE); } libStatGen-1.0.14/general/Error.h000066400000000000000000000017311254730101300165060ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef _ERROR_H_ #define _ERROR_H_ // #ifdef __cplusplus // extern "C" { // #endif void error(const char * msg, ...); void warning(const char * msg, ...); void numerror(const char * msg, ...); // #ifdef __cplusplus // }; // #endif #endif libStatGen-1.0.14/general/ErrorHandler.cpp000066400000000000000000000033011254730101300203320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "ErrorHandler.h" #include "PhoneHome.h" #include #include // Constructor ErrorHandler::ErrorHandler() { } // Destructor ErrorHandler::~ErrorHandler() { } void ErrorHandler::handleError(const char* message, HandlingType handlingType) { // Check the handling type. 
switch(handlingType) { case(EXCEPTION): throw(std::runtime_error(message)); break; case(ABORT): std::cerr << message << "\nExiting" << std::endl; PhoneHome::completionStatus("ErrorHandler: Exiting due to Error"); exit(-1); break; case(RETURN): return; break; default: std::cerr << message << "\nUnknown Handle Type: Exiting" << std::endl; PhoneHome::completionStatus("Exiting, ErrorHandler::unknown handle type."); exit(-1); break; } } libStatGen-1.0.14/general/ErrorHandler.h000066400000000000000000000026751254730101300200140ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __ERROR_HANDLER_H__ #define __ERROR_HANDLER_H__ #include /// Class that controls the handling of errors. class ErrorHandler { public: /// This specifies how this class should respond to errors. enum HandlingType {EXCEPTION, ///< throw an exception for the error ABORT, ///< exit the program on the error RETURN ///< just return failure on the error }; /// Constructor ErrorHandler(); /// Destructor ~ErrorHandler(); /// Handle an error based on the error handling type. 
static void handleError(const char* message, HandlingType handlingType = EXCEPTION); private: }; #endif libStatGen-1.0.14/general/FileType.cpp000066400000000000000000000026251254730101300174740ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "FileType.h" FileType::FileType() { }; FileType::~FileType() { }; // Set by the InputFile to inform this class if buffering // is used. Maybe used by child clases (bgzf) to disable // tell. NOTE: this class does no buffering, the // buffering is handled by the calling class. void FileType::setBuffered(bool buffered) { myUsingBuffer = buffered; } // // one class, BgzfFileTypeRecovery overloads this method because // it is able to sync on a new record using the checkSignature // callback function. // // For all other classes, this is a NOP (sync fails). // bool FileType::attemptRecoverySync(bool (*checkSignature)(void *data) , int length) { return false; } libStatGen-1.0.14/general/FileType.h000066400000000000000000000051201254730101300171320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __FILETYPE_H__ #define __FILETYPE_H__ #include class FileType { public: FileType(); virtual ~FileType(); virtual bool operator == (void * rhs) = 0; virtual bool operator != (void * rhs) = 0; // Close the file. virtual int close() = 0; // Reset to the beginning of the file. virtual void rewind() = 0; // Check to see if we have reached the EOF. virtual int eof() = 0; // Check to see if the file is open. virtual bool isOpen() = 0; // Write to the file. virtual unsigned int write(const void * buffer, unsigned int size) = 0; // Read into a buffer from the file. virtual int read(void * buffer, unsigned int size) = 0; // Get current position in the file. // -1 return value indicates an error. virtual int64_t tell() = 0; // Seek to the specified offset from the origin. // origin can be any of the following: // Note: not all are valid for all filetypes. // SEEK_SET - Beginning of file // SEEK_CUR - Current position of the file pointer // SEEK_END - End of file // Returns true on successful seek and false on a failed seek. virtual bool seek(int64_t offset, int origin) = 0; // Set by the InputFile to inform this class if buffering // is used. Maybe used by child clases (bgzf) to disable // tell. NOTE: this class does no buffering, the // buffering is handled by the calling class. void setBuffered(bool buffered); // // When caller catches an exception, it may call this method. // It is implemented only in BgzfFileTypeRecovery. // virtual bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length); protected: // Set by the InputFile to inform this class if buffering // is used. 
Maybe used by child clases (bgzf) to disable // tell. bool myUsingBuffer; }; #endif libStatGen-1.0.14/general/FortranFormat.cpp000066400000000000000000000214371254730101300205410ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "FortranFormat.h" #include "Error.h" FortranFormat::FortranFormat() { inputPos = -1; endOfPattern = false; } void FortranFormat::SetInputFile(IFILE & file) { input = file; inputPos = -1; endOfPattern = false; } void FortranFormat::SetFormat(const String & formatString) { format = formatString; inputPos = -1; endOfPattern = false; repeatCount = 0; format.Clear(); // Remove blank spaces from format statement and extract // the first bracketed expression int level = 0; for (int i = 0; i < formatString.Length(); i++) { if (formatString[i] == ' ' || formatString[i] == '\t' || formatString[i] == '\n' || formatString[i] == '\r') continue; if (formatString[i] == '(') level++; if (formatString[i] == ')') level--; format += formatString[i]; if (level == 0) break; } if (format[0] != '(' || format[format.Length() - 1] != ')') error("Invalid FORTRAN format statement\n\n" "The statement \"%s\" is not bracketed correctly.\n", (const char *) formatString); lastBracket = 1; lastCount = 0; formatPos = 1; repeatCount = 0; bracketStack.Clear(); bracketCounter.Clear(); bracketCount.Clear(); } int 
FortranFormat::GetNextInteger() { GetNextField(buffer); return buffer.AsInteger(); } char FortranFormat::GetNextCharacter() { GetNextField(buffer); return buffer[0]; } void FortranFormat::GetNextField(String & field) { while (!ProcessToken(field)) ; } bool FortranFormat::ProcessToken(String & field) { // This flag only gets set if we encounter the final bracket or a ':' endOfPattern = false; // Read input from file, if appropriate if (inputPos == -1) { inputLine.ReadLine(input); inputPos = 0; } // First read repeat count specifier if (repeatCount == 0) repeatCount = GetIntegerFromFormat(); // By default, the repeat count should be 1 if (repeatCount == 0) repeatCount = 1; int repeatPos = formatPos; // Check if this is a new bracketed grouping if (format[formatPos] == '(') { formatPos++; bracketStack.Push(formatPos); bracketCounter.Push(repeatCount); bracketCount.Push(repeatCount); repeatCount = 0; return false; } // Check if this an 'X' field if (format[formatPos] == 'X') { formatPos++; // No width specifier allowed for these fields RejectWidth('X'); // Skip appropriate number of characters inputPos += repeatCount; // Reset repeat count repeatCount = 0; FinishField(); return false; } // Check if this is a '/' (vertical tab field) if (format[formatPos] == '/') { formatPos++; // No width specifier allowed for these fields RejectWidth('/'); // Skip the appropriate number of lines while (repeatCount--) inputLine.ReadLine(input); inputPos = 0; // Separators are optional, so we might already be at the next field if (format[formatPos] == ',' || format[formatPos] || ')') FinishField(); return false; } // Check that we haven't encountered a rare, but unsupported input type if (format[formatPos] == 'Q' || format[formatPos] == 'P' || format[formatPos] == 'B') { formatPos++; int problemStart = formatPos; while (format[formatPos] != ',' && format[formatPos] != ')' && format[formatPos] != '/') formatPos++; error("Unsupported pattern in FORMAT statement\n\n" "Statement \"%s\" 
includes unsupporterd pattern '%s'\n", (const char *) format, (const char *) format.SubStr(problemStart, formatPos - problemStart)); } if (format[formatPos] == ':') { formatPos++; if (format[formatPos] == ',' || format[formatPos] || ')') FinishField(); repeatCount = 0; endOfPattern = true; return false; } // All the other types we recognize include a width specifier // Identify the location of the type specifier int typeStart = formatPos; while (CharacterFollows()) formatPos++; int typeLen = formatPos - typeStart; // Retrieve the field width int width = GetIntegerFromFormat(); if (width == 0) error("Unrecognized FORMAT statement\n\n" "Statement \"%s\" is missing a width specifier for a field of type '%s'\n", (const char *) format, (const char *) format.SubStr(typeStart, typeLen)); // Check for horizontal tab character if (format[typeStart] == 'T') { // Move left by a specified number of characters if (format[typeStart + 1] == 'L') inputPos = width > inputPos ? 0 : inputPos - width; // Move right by a specified number of characters else if (format[typeStart + 1] == 'R') inputPos += width; // Or simply set the appropriate horizontal position else inputPos = width; repeatCount--; if (repeatCount) formatPos = repeatPos; else FinishField(); return false; } // Assume that if we got here, we are looking at a data field! 
field.Copy(inputLine, inputPos, width); field.Trim(); inputPos += width; repeatCount--; if (repeatCount) formatPos = repeatPos; else FinishField(); return true; } int FortranFormat::GetIntegerFromFormat() { int result = 0; while (DigitFollows()) result = result * 10 + (int)(format[formatPos++] - '0'); return result; } bool FortranFormat::DigitFollows() { return (format[formatPos] >= '0') && (format[formatPos] <= '9'); } bool FortranFormat::CharacterFollows() { return (format[formatPos] >= 'A') && (format[formatPos] <= 'Z'); } void FortranFormat::RejectWidth(char ch) { // No width allowed for field types 'X' and '\' if (DigitFollows()) error("Unrecognized FORTRAN format statement\n\n" "The statement \"%s\" includes width specifier for field of type '%c'.\n", (const char *) format, ch); } void FortranFormat::FinishField(bool) { // Find the next field separator while (format[formatPos] != ',' && format[formatPos] != ')') { if (format[formatPos] == '/') return; formatPos++; } // Skip commas if (format[formatPos] == ',') { formatPos++; return; } // If we found a bracket, then it is either the end of the statement // (if bracketStack is empty) or we finish an internal grouping if (bracketStack.Length()) { // Retrieve information about this grouping lastBracket = bracketStack.Pop(); lastCount = bracketCount.Pop(); int lastCounter = bracketCounter.Pop() - 1; // Loop if required if (lastCounter) { bracketStack.Push(lastBracket); bracketCount.Push(lastCount); bracketCounter.Push(lastCounter); formatPos = lastBracket; } else // Otherwise find the next separator { formatPos++; FinishField(); return; } } else { // If we finished the input line, then activate reset input counter inputPos = -1; endOfPattern = true; // And re-use input tokens starting at the last bracket formatPos = lastBracket; if (lastBracket == 1) return; // With appropriate repeat counts bracketStack.Push(lastBracket); bracketCounter.Push(lastCount); bracketCount.Push(lastCount); } } void 
FortranFormat::Flush() { while (!endOfPattern) ProcessToken(buffer); inputPos = -1; lastBracket = 1; lastCount = 0; formatPos = 1; repeatCount = 0; bracketStack.Clear(); bracketCounter.Clear(); bracketCount.Clear(); } libStatGen-1.0.14/general/FortranFormat.h000066400000000000000000000053631254730101300202060ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __FORTRAN_FORMAT__ #define __FORTRAN_FORMAT__ #include "StringBasics.h" #include "IntArray.h" class FortranFormat { public: // This class reads a user specified input file, one line at a time, // and returns individual fields according to a user specified format // statement FortranFormat(); // Set the fortran format statement void SetFormat(const String & formatString); // Set the input file void SetInputFile(IFILE & file); // Read one field from input file void GetNextField(String & field); int GetNextInteger(); char GetNextCharacter(); // Process a token in format statement and return true // if token corresponds to input field. Return false if // token led to processing of white-space or input line // positioning bool ProcessToken(String & field); // Flush the pattern -- this finishes processing the current // pattern and ensures that all trailing new-lines, etc. 
are // handled correctly void Flush(); private: // The input line and current position along it String inputLine; int inputPos; // The Fortran format statement and current position along it String format; int formatPos; // The position of the pattern we are repeating, if any int repeatCount; // Returns an integer from the current format statement, if any int GetIntegerFromFormat(); // These functions check the next character in format string bool DigitFollows(); bool CharacterFollows(); // This function finish the input field void FinishField(bool haveSlash = false); // Reject width were appropriate void RejectWidth(char type); // The input file IFILE input; // Stacks to keep track of nested parenthesis IntArray bracketStack; IntArray bracketCount; IntArray bracketCounter; int lastBracket; int lastCount; // Buffer for reading fields String buffer; // Flag that indicates whether we have reached end-of-pattern bool endOfPattern; }; #endif libStatGen-1.0.14/general/Generic.cpp000066400000000000000000000027111254730101300173230ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "Generic.h" #if defined(TEST) #include #include #include #include // // g++ -g -o testGeneric -DTEST Generic.cpp // int main(int argc, const char **argv) { std::vector a; std::vector< std::pair > b; std::pair c; std::vector::iterator i; a.push_back(0); a.push_back(1); a.push_back(2); a.push_back(3); std::cout << a; c.first = 10; c.second = 20; b.push_back(c); b.push_back(c); b.push_back(c); std::cout << b; i = a.begin(); std::list > l; l.push_back(c); std::cout << l; // std::cout << "iterator i: " << i << std::endl; // std::cout << argv; } #endif libStatGen-1.0.14/general/Generic.h000077500000000000000000000132531254730101300167760ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #if !defined(_GENERIC_H) #define _GENERIC_H #include #include #include #include #include #include #include template inline T abs(T x) { return (x < 0) ? -x : x; } // // this is safe for signed/unsigned: // template inline T absDiff(T x, T y) { return (x < y) ? (y - x) : (x - y); } // // template inline T in(T x, T y, T z) { return (x >= y && x < z); } // // These overloaded operators and functions are largely // for diagnostic debug printing. The underlying problem // is that gdb is unable to decipher any STL use, let alone // complex STL use. 
printf debugging is a poor second choice, // but these functions at least make it practical to do rapidly. // // // Write a std::pair to a stream // template std::ostream &operator << (std::ostream &stream, std::pair p) { stream << "(" << p.first << ", " << p.second << ")"; return stream; } // // generic vector print -- in normal use, you should // be able to simply do foostream << somevector, and get // sane results, provided that the vector elements themselves // can be written to the stream. // // Example code is in Generic.cpp // template std::ostream &operator << (std::ostream &stream, std::vector const &v) { typename std::vector::const_iterator i; for (i = v.begin(); i != v.end(); i++) { stream << (i - v.begin()) << ": " << *i << std::endl; } return stream; } // // same overload as above, except for std::list // template std::ostream &operator << (std::ostream &stream, std::list const &l) { typename std::list::const_iterator i; int j = 0; for (i = l.begin(); i != l.end(); i++, j++) { stream << j << ": " << *i << std::endl; } return stream; } template void check(int &returnCode, TITLE title, ITEM item, EXPECT expect, GOT got) { if (expect!=got) { std::cout << "Test " << title << ": expect " << item << " = '" << expect << "', but got '" << got << "'." << std::endl; returnCode += 1; } } // // specialization of template below: // load a set of lines from a file into a vector of strings. // inline std::istream &operator >> (std::istream &stream, std::vector &vec) { std::string val; while (true) { if (!stream.good()) break; getline(stream, val); stream >> val; vec.push_back(val); } return stream; } // // read values from a stream, appending to the provided // vec. stops when the stream is consumed. 
// template std::istream &operator >> (std::istream &stream, std::vector &vec) { T val; while (true) { if (!stream.good()) break; stream >> val; vec.push_back(val); } return stream; } #if 0 // // generic vector of iterators print // template std::ostream &operator << ( std::ostream &stream, std::vector< std::pair< std::vector::iterator , std::vector< typename T>::iterator > > v ) { typename IteratorType i; typename std::vector::iterator i; for (i = v.begin(); i != v.end(); i++) { stream << *i << std::endl; } return stream; } #endif // // These are packed set/get functions for dealing with // packed 1, 2 and 4 bit unsigned values inside of arbitrary // arrays of data (char */std::vector whatever). // template inline uint32_t PackedAccess_1Bit(T byteSequence, uint32_t bitIndex) { return (((byteSequence)[bitIndex>>3] >> (bitIndex&0x7)) & 0x1); } template inline void PackedAssign_1Bit(T byteSequence, uint32_t bitIndex, uint32_t value) { (byteSequence)[bitIndex>>3] = ((byteSequence)[bitIndex>>3] & ~(1<<(bitIndex&0x07))) | ((value&0x01)<<(bitIndex&0x7)); } inline size_t Packed1BitElementCount2Bytes(uint32_t i) { return (size_t)(i+7)/8; } template inline uint32_t PackedAccess_2Bit(T byteSequence, uint32_t index) { return (((byteSequence)[index>>2] >> ((index&0x3)<<1)) & 0x3); } template inline void PackedAssign_2Bit(T byteSequence, uint32_t index, uint32_t value) { (byteSequence)[index>>2] = ((byteSequence)[index>>2] & ~(3<<((index&0x03)<<1))) | ((value&0x03)<<((index&0x3)<<1)); } inline size_t Packed2BitElementCount2Bytes(uint32_t i) { return (size_t)(i+3)/4; } template inline uint32_t PackedAccess_4Bit(T byteSequence, uint32_t index) { return (((byteSequence)[index>>1] >> ((index&0x1)<<2)) & 0xf); } template inline void PackedAssign_4Bit(T byteSequence, uint32_t index, uint32_t value) { (byteSequence)[index>>1] = ((byteSequence)[index>>1] & ~(7<<((index&0x01)<<2))) | ((value&0x0f)<<((index&0x1)<<2)); } inline size_t Packed4BitElementCount2Bytes(uint32_t i) { return 
(size_t)(i+1)/2; } #endif libStatGen-1.0.14/general/GenomeSequence.cpp000077500000000000000000001330521254730101300206600ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "assert.h" #include "ctype.h" #include "stdio.h" #include "Error.h" #include "Generic.h" #include "GenomeSequence.h" #include #include #include #include #include #if defined(_WIN32) #include #ifndef R_OK #define R_OK 4 #endif #endif // not general use: #include "CSG_MD5.h" // // given a read in a string, pack it into the vector of // bytes coded as two bases per byte. // // The goal is to allow us to more rapidly compare against // the genome, which is itself packed 2 bases per byte. // // Unfortunately, the match position may be odd or even, // so as a result, we also need to be able to prepad // with 'N' bases to get the byte alignment the same, so // padWithNCount may be a positive number indicating how // many N bases to prepend. // void PackedRead::set(const char *rhs, int padWithNCount) { clear(); // pad this packed read with 'N' bases two at a time while (padWithNCount>1) { packedBases.push_back( BaseAsciiMap::base2int[(int) 'N'] << 4 | BaseAsciiMap::base2int[(int) 'N'] ); padWithNCount -= 2; length+=2; } // when we have only one base, pack one 'N' base with // the first base in rhs if there is one. 
if (padWithNCount) { // NB: *rhs could be NUL, which is ok here - just keep // the length straight. packedBases.push_back( BaseAsciiMap::base2int[(int) *rhs] << 4 | BaseAsciiMap::base2int[(int) 'N'] ); // two cases - have characters in rhs or we don't: if (*rhs) { length+=2; // pad byte plus one byte from rhs rhs++; } else { length++; } padWithNCount--; // should now be zero, so superfluous. assert(padWithNCount==0); } // pad pairs of bases from rhs, two at a time: while (*rhs && *(rhs+1)) { packedBases.push_back( BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 | BaseAsciiMap::base2int[(int) *(rhs+0)] ); rhs+=2; length+=2; } // if there is an odd base left at the end, put it // in a byte all its own (low 4 bits == 0): if (*rhs) { packedBases.push_back( BaseAsciiMap::base2int[(int) *(rhs+0)] ); length++; } return; } std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const { std::string sequence(""); for (unsigned int i = 0; i < wordsize; i ++) sequence += "N"; unsigned int clearHigherBits = ~(3U << (wordsize<<1)); // XXX refactor - this appears several places if (n > clearHigherBits) error("%d needs to be a non-negative integer < clearHigherBits\n", n); for (unsigned int i = 0; i < wordsize; i++) { sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3]; n >>= 2; } return sequence; } GenomeSequence::GenomeSequence() { constructorClear(); } void GenomeSequence::constructorClear() { _debugFlag = 0; _progressStream = NULL; _colorSpace = false; _createOverwrite = false; } void GenomeSequence::setup(const char *referenceFilename) { setReferenceName(referenceFilename); if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush; if (open(false)) { std::cerr << "Failed to open reference genome " << referenceFilename << std::endl; std::cerr << errorStr << std::endl; exit(1); } prefetch(); if (_progressStream) *_progressStream << "done." 
<< std::endl << std::flush; } GenomeSequence::~GenomeSequence() { // free up resources: _umfaFile.close(); } // // mapped open. // // if the file exists, map in into memory, and fill in a few useful // fields. // bool GenomeSequence::open(bool isColorSpace, int flags) { bool rc; if (isColorSpace) { _umfaFilename = _baseFilename + "-cs.umfa"; } else { _umfaFilename = _baseFilename + "-bs.umfa"; } if(access(_umfaFilename.c_str(), R_OK) != 0) { // umfa file doesn't exist, so try to create it. if(create(isColorSpace)) { // Couldon't access or create the umfa. std::cerr << "GenomeSequence::open: failed to open file " << _umfaFilename << " also failed creating it." << std::endl; return true; } } rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags); if (rc) { std::cerr << "GenomeSequence::open: failed to open file " << _umfaFilename << std::endl; return true; } _colorSpace = header->_colorSpace; return false; } void GenomeSequence::sanityCheck(MemoryMap &fasta) const { unsigned int i; unsigned int genomeIndex = 0; for (i=0; i': while (fasta[i]!='\n' && fasta[i]!='\r') i++; break; case '\n': case '\r': break; default: assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] == BaseAsciiMap::base2int[(int) fasta[i]]); genomeIndex++; break; } } } #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix)) // // referenceFilename is either a fasta or a UM fasta (.fa or .umfa) // filename. In both cases, the suffix gets removed and the // base name is kept for later use depending on context. 
// @return always return false // bool GenomeSequence::setReferenceName(std::string referenceFilename) { if (HAS_SUFFIX(referenceFilename, ".fa")) { _referenceFilename = referenceFilename; _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3); } else if (HAS_SUFFIX(referenceFilename, ".umfa")) { _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5); } else if (HAS_SUFFIX(referenceFilename, "-cs.umfa")) { _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8); } else if (HAS_SUFFIX(referenceFilename, "-bs.umfa")) { _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8); } else { _baseFilename = referenceFilename; } _fastaFilename = _baseFilename + ".fa"; if (HAS_SUFFIX(referenceFilename, ".fasta")) { _referenceFilename = referenceFilename; _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6); _fastaFilename = _baseFilename + ".fasta"; } return false; } // // this works in lockstep with ::create to populate // the per chromosome header fields size and md5 // checksum. // // It relies on header->elementCount being set to // the data length loaded so far ... not the ultimate // reference length. 
// bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome) { if (whichChromosome>=header->_chromosomeCount) return true; ChromosomeInfo *c = &header->_chromosomes[whichChromosome]; c->size = header->elementCount - c->start; MD5_CTX md5Context; uint8_t md5Signature[MD5_DIGEST_LENGTH]; // // it's easier to ensure we do this right if we just do it // in one big chunk: // char *md5Buffer = (char *) malloc(c->size); MD5Init(&md5Context); for (genomeIndex_t i = 0; i < c->size; i ++) { md5Buffer[i] = (*this)[c->start + i]; } MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size); MD5Final((unsigned char *) &md5Signature, &md5Context); free(md5Buffer); for (int i=0; imd5+2*i, "%02x", md5Signature[i]); } // redundant, strictly speaking due to sprintf NUL terminating // it's output strings, but put it here anyway. c->md5[2*MD5_DIGEST_LENGTH] = '\0'; return false; } // // Given a buffer with a fasta format contents, count // the number of chromosomes in it and return that value. // static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount) { chromosomeCount = 0; baseCount = 0; bool atLineStart = true; // // loop over the fasta file, essentially matching for the // pattern '^>.*$' and counting them. 
// for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++) { switch (fastaData[fastaIndex]) { case '\n': case '\r': atLineStart = true; break; case '>': { if (!atLineStart) break; chromosomeCount++; // // eat the rest of the line // while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r') { fastaIndex++; } break; } default: baseCount++; atLineStart = false; break; } } return false; } class PackedSequenceData : public std::vector { std::vector m_packedBases; size_t m_baseCount; void set(size_t index, uint8_t value) { m_packedBases[index>>1] = (m_packedBases[index>>1] // original value & ~(7<<((index&0x01)<<2))) // logical AND off the original value | ((value&0x0f)<<((index&0x1)<<2)); // logical OR in the new value } public: void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);} size_t size() {return m_baseCount;} void clear() {m_packedBases.clear(); m_baseCount = 0;} uint8_t operator [](size_t index) { return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf; } void push_back(uint8_t base); }; // // Load a fasta format file from filename into the buffer // provided by the caller. // While parsing the fasta file, record each chromosome name, // its start location, and its size. // // NB: the caller must implement the logic to determine how // large the sequence data is. There is no correct way to do // this, because we can't reliably estimate here how much sequence // data is contained in a compressed file. // // To safely pre-allocate space in sequenceData, use the reserve() method // before calling this function. 
// bool loadFastaFile(const char *filename, std::vector &sequenceData, std::vector &chromosomeNames) { InputFile inputStream(filename, "r", InputFile::DEFAULT); if(!inputStream.isOpen()) { std::cerr << "Failed to open file " << filename << "\n"; return true; } int whichChromosome = -1; chromosomeNames.clear(); char ch; while((ch = inputStream.ifgetc()) != EOF) { switch (ch) { case '\n': case '\r': break; case '>': { std::string chromosomeName = ""; // // pull out the chromosome new name // while (!isspace((ch = inputStream.ifgetc())) && ch != EOF) { chromosomeName += ch; // slow, but who cares } // // eat the rest of the line // do { ch = inputStream.ifgetc(); } while(ch != EOF && ch != '\n' && ch != '\r'); // // save the Chromosome name and index into our // header so we can use them later. // chromosomeNames.push_back(chromosomeName); whichChromosome++; break; } default: // we get here for sequence data. // // save the base value // Note: invalid characters come here as well, but we // let ::set deal with mapping them. break; } } return false; } // // recreate the umfa file from a reference fasta format file // // The general format of a FASTA file is best described // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format // // The format parsed here is a simpler subset, and is // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml // bool GenomeSequence::create(bool isColor) { setColorSpace(isColor); if (_baseFilename=="") { std::cerr << "Base reference filename is empty." << std::endl; return true; } if (isColorSpace()) { _umfaFilename = _baseFilename + "-cs.umfa"; } else { _umfaFilename = _baseFilename + "-bs.umfa"; } if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0) { std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl; return true; } MemoryMap fastaFile; if (fastaFile.open(_fastaFilename.c_str())) { std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." 
<< std::endl; return true; } std::cerr << "Creating FASTA " << (isColorSpace() ? "color space " : "") << "binary cache file '" << _umfaFilename << "'." << std::endl; std::cerr << std::flush; // // simple ptr to fasta data -- just treat the memory map // as an array of fastaDataSize characters... // const char *fasta = (const char *) fastaFile.data; size_t fastaDataSize = fastaFile.length(); uint32_t chromosomeCount = 0; uint64_t baseCount = 0; getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount); if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount)) { std::cerr << "failed to create '" << _umfaFilename << "'." << std::endl; perror(""); return true; } header->elementCount = 0; header->_colorSpace = isColorSpace(); header->setApplication(_application.c_str()); header->_chromosomeCount = chromosomeCount; // // clear out the variable length chromosome info array // for (uint32_t i=0; i_chromosomeCount; i++) header->_chromosomes[i].constructorClear(); std::string chromosomeName; // // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N') signed char lastBase = BaseAsciiMap::base2int[(int) 'N']; bool terminateLoad = false; int percent = -1, newPercent; uint32_t whichChromosome = 0; for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++) { if (_progressStream) { newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100; if (newPercent>percent) { *_progressStream << "\r" << newPercent << "% "; *_progressStream << std::flush; percent = newPercent; } } switch (fasta[fastaIndex]) { case '\n': case '\r': break; case '>': { chromosomeName = ""; fastaIndex++; // skip the > char // // pull out the chromosome new name // while (!isspace(fasta[fastaIndex])) { chromosomeName += fasta[fastaIndex++]; // slow, but who cares } // // eat the rest of the line // while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r') { fastaIndex++; } // // save the Chromosome name and index into our // 
header so we can use them later. // ChromosomeInfo *c = &header->_chromosomes[whichChromosome]; c->setChromosomeName(chromosomeName.c_str()); c->start = header->elementCount; // c->size gets computed at the next '>' line or at the EOF if (whichChromosome>0) { // // compute md5 checksum for the chromosome that we just // loaded (if there was one) - note that on the last // chromosome, we have to duplicate this code after // the end of this loop // setChromosomeMD5andLength(whichChromosome - 1); } whichChromosome++; if (whichChromosome > header->_chromosomeCount) { std::cerr << "BUG: Exceeded computed chromosome count (" << header->_chromosomeCount << ") - genome is now truncated at chromosome " << header->_chromosomes[header->_chromosomeCount-1].name << " (index " << header->_chromosomeCount << ")." << std::endl; terminateLoad = true; } break; } default: // save the base pair value // Note: invalid characters come here as well, but we // let ::set deal with mapping them. if (isColorSpace()) { // // anything outside these values represents an invalid base // base codes: 0-> A, 1-> C, 2-> G, 3-> T // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red // const char fromBase2CS[] = { /* 0000 */ 0, // A->A /* 0001 */ 1, // A->C /* 0010 */ 2, // A->G /* 0011 */ 3, // A->T /* 0100 */ 1, // C->A /* 0101 */ 0, // C->C /* 0110 */ 3, // C->G /* 0111 */ 2, // C->T /* 1000 */ 2, // G->A /* 1001 */ 3, // G->C /* 1010 */ 0, // G->G /* 1011 */ 1, // G->T /* 1100 */ 3, // T->A /* 1101 */ 2, // T->C /* 1110 */ 1, // T->G /* 1111 */ 0, // T->T }; // // we are writing color space values on transitions, // so we don't write a colorspace value when we // get the first base value. 
// // On second and subsequent bases, write based on // the index table above // char thisBase = BaseAsciiMap::base2int[(int)(fasta[fastaIndex])]; if (lastBase>=0) { char color; if (lastBase>3 || thisBase>3) color=4; else color = fromBase2CS[(int)(lastBase<<2 | thisBase)]; // re-use the int to base, because ::set expects a base char (ATCG), not // a color code (0123). It should only matter on final output. set(header->elementCount++, BaseAsciiMap::int2base[(int) color]); } lastBase = thisBase; } else { set(header->elementCount++, toupper(fasta[fastaIndex])); } break; } // // slightly awkward exit handling when we exceed the fixed // number of chromosomes // if (terminateLoad) break; } // // also slightly awkward code to handle the last dangling chromosome... // all we should need to do is compute the md5 checksum // if (whichChromosome==0) { fastaFile.close(); throw std::runtime_error("No chromosomes found - aborting!"); } else { setChromosomeMD5andLength(whichChromosome-1); } fastaFile.close(); if (_progressStream) *_progressStream << "\r"; std::cerr << "FASTA binary cache file '" << _umfaFilename << "' created." << std::endl; // // leave the umfastaFile open in case caller wants to use it // return false; } int GenomeSequence::getChromosomeCount() const { return header->_chromosomeCount; } //return chromosome index: 0, 1, ... 24; int GenomeSequence::getChromosome(genomeIndex_t position) const { if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX; if (header->_chromosomeCount == 0) return INVALID_CHROMOSOME_INDEX; int start = 0; int stop = header->_chromosomeCount - 1; // eliminate case where position is in the last chromosome, since the loop // below falls off the end of the list if it in the last one. 
if (position > header->_chromosomes[stop].start) return (stop); while (start <= stop) { int middle = (start + stop) / 2; if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start) return middle; if (position == header->_chromosomes[middle + 1].start) return (middle + 1); if (position > header->_chromosomes[middle + 1].start) start = middle + 1; if (position < header->_chromosomes[middle].start) stop = middle - 1; } return -1; } // // Given a chromosome name and 1-based chromosome index, return the // genome index (0 based) into sequence for it. // // NB: the header->chromosomes array contains zero based genome positions // genomeIndex_t GenomeSequence::getGenomePosition( const char *chromosomeName, unsigned int chromosomeIndex) const { genomeIndex_t i = getGenomePosition(chromosomeName); if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX; return i + chromosomeIndex - 1; } genomeIndex_t GenomeSequence::getGenomePosition( int chromosome, unsigned int chromosomeIndex) const { if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX; genomeIndex_t i = header->_chromosomes[chromosome].start; if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX; return i + chromosomeIndex - 1; } // // return the genome index (0 based) of the start of the named // chromosome. If none is found, INVALID_GENOME_INDEX is returned. // // XXX may need to speed this up - and smarten it up with some // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or // other schemes. 
// genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const { int chromosome = getChromosome(chromosomeName); if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX; return header->_chromosomes[chromosome].start; } int GenomeSequence::getChromosome(const char *chromosomeName) const { unsigned int i; for (i=0; i_chromosomeCount; i++) { if (strcmp(header->_chromosomes[i].name, chromosomeName)==0) { return i; } } return INVALID_CHROMOSOME_INDEX; } // // Given a read, reverse the string and swap the base // pairs for the reverse strand equivalents. // void GenomeSequence::getReverseRead(std::string &read) { std::string newRead; if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--) { newRead.push_back(BasePair(read[i])); } read = newRead; } void GenomeSequence::getReverseRead(String& read) { int i = 0; int j = read.Length()-1; char temp; while (i < j) { temp = read[j]; read[j] = read[i]; read[i] = temp; } } #define ABS(x) ( (x) > 0 ? (x) : -(x) ) int GenomeSequence::debugPrintReadValidation( std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse ) { int validateSumQ = 0; int validateMismatchCount = 0; int rc = 0; std::string genomeData; for (uint32_t i=0; i= (read.size() - 24))) validateMismatchCount++; genomeData.push_back(tolower((*this)[readLocation + i])); } else { genomeData.push_back(toupper((*this)[readLocation + i])); } } assert(validateSumQ>=0); if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount) { printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n", genomeData.c_str(), read.c_str(), validateSumQ, sumQuality ); rc++; } else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount) { printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n", genomeData.c_str(), read.c_str(), validateMismatchCount, mismatchCount ); rc++; } else if 
(validateSumQ != sumQuality && validateMismatchCount != mismatchCount) { printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n", genomeData.c_str(), read.c_str(), validateSumQ, sumQuality, validateMismatchCount, mismatchCount ); rc++; } if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2) { printf("large mismatch difference, trying reverse strand: "); std::string reverseRead = read; std::string reverseQuality = quality; getReverseRead(reverseRead); reverse(reverseQuality.begin(), reverseQuality.end()); rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false); } return rc; } #undef ABS bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const { for (uint32_t i = 0; i_chromosomeCount; i++) { file << "@SQ" << "\tSN:" << header->_chromosomes[i].name // name << "\tLN:" << header->_chromosomes[i].size // number of bases << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3 << "\tM5:" << header->_chromosomes[i].md5 << "\tUR:" << header->_chromosomes[i].uri << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens << std::endl; } } void GenomeSequence::dumpHeaderTSV(std::ostream &file) const { file << "# Reference: " << _baseFilename << std::endl; file << "# SN: sample name - must be unique" << std::endl; file << "# AS: assembly name" << std::endl; file << "# SP: species" << std::endl; file << "# LN: chromosome/contig length" << std::endl; file << "# M5: chromosome/contig MD5 checksum" << std::endl; file << "# LN and M5 are only printed for informational purposes." << std::endl; file << "# Karma will only set those values when creating the index." 
<< std::endl; file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl; for (unsigned int i=0; i_chromosomeCount; i++) { file << header->_chromosomes[i].name // name << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3 << "\t" << header->_chromosomes[i].uri << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens << "\t" << header->_chromosomes[i].size // number of bases << "\t" << header->_chromosomes[i].md5 << std::endl; } } void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const { // // calculate the genome index for the lazy caller... // genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1; getString(str, genomeIndex, baseCount); } void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const { std::string string; this-> getString(string, chromosome, index, baseCount); str = string.c_str(); } void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const { str.clear(); if (baseCount > 0) { for (int i=0; i 0) { for (int i=0; i index) continue; if (index + i + read.size() >= getNumberBases()) continue; if (quality=="") { newScore = this->getMismatchCount(read, index + i); } else { newScore = this->getSumQ(read, quality, index + i); } if (newScore < bestScore) { bestScore = newScore; bestMatchLocation = index + i; } } return bestMatchLocation; } std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h) { stream << (MemoryMapArrayHeader &) h; stream << "chromosomeCount: " << h._chromosomeCount << std::endl; stream << "isColorSpace: " << h._colorSpace << std::endl; stream << "chromosomeCount: " << h._chromosomeCount << std::endl; uint64_t totalSize = 0; for (uint32_t i=0; i < h._chromosomeCount; i++) { totalSize += h._chromosomes[i].size; stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl; stream << "Chromosome 
Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl; stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl; stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl; stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl; stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl; stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl; } stream << "Total Genome Size: " << totalSize << " bases."<< std::endl; if (totalSize != h.elementCount) { stream << "Total Genome Size: does not match elementCount!\n"; } stream << std::endl; return stream; } void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const { int whichChromosome = 0; whichChromosome = getChromosome(i); if (whichChromosome == INVALID_CHROMOSOME_INDEX) { s = "invalid genome index"; // TODO include the index in error } else { std::ostringstream buf; genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1; buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex; #if 0 buf << " (GenomeIndex " << i << ")"; #endif s = buf.str(); } return; } void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const { std::string ss; getChromosomeAndIndex(ss, i); s = ss.c_str(); return; } // // This is intended to be a helper routine to get dbSNP files // loaded. In some cases, we will load into an mmap() file (ie // when we are creating it), in others, we will simply be loading // an existing dbSNP file into RAM (when the binary file does not // exist or when we are running with useMemoryMapFlag == false. // // Assume that dbSNP exists, is writable, and is the right size. // // Using the dbSNPFilename given, mark each dbSNP position // with a bool true. 
// // Return value: // True: if populateDBSNP() succeed // False: if not succeed bool GenomeSequence::populateDBSNP( mmapArrayBool_t &dbSNP, IFILE inputFile) const { assert(dbSNP.getElementCount() == getNumberBases()); if(inputFile == NULL) { // FAIL, file not opened. return(false); } std::string chromosomeName; std::string position; genomeIndex_t chromosomePosition1; // 1-based uint64_t ignoredLineCount = 0; // Read til the end of the file. char* postPosPtr = NULL; while(!inputFile->ifeof()) { chromosomeName.clear(); position.clear(); // Read the chromosome if(inputFile->readTilTab(chromosomeName) <= 0) { // hit either eof or end of line, check if // it is a header. if(chromosomeName.size()>0 && chromosomeName[0]=='#') { // header, so just continue. continue; } // Not the header, so this line is poorly formatted. ++ignoredLineCount; // Continue to the next line. continue; } // Check if it is a header line. if(chromosomeName.size()>0 && chromosomeName[0]=='#') { // did not hit eof or end of line, // so discard the rest of the line. inputFile->discardLine(); continue; } // Not a header, so read the position. if(inputFile->readTilTab(position) > 0) { // Additional data on the line, so discard it. inputFile->discardLine(); } // Convert the position to a string. chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0); if(postPosPtr == position.c_str()) { ++ignoredLineCount; continue; } // 1-based genome index. genomeIndex_t genomeIndex = getGenomePosition(chromosomeName.c_str(), chromosomePosition1); // if the genome index is invalid, ignore it if((genomeIndex == INVALID_GENOME_INDEX) || (genomeIndex > getNumberBases())) { ignoredLineCount++; continue; } dbSNP.set(genomeIndex, true); } if (ignoredLineCount > 0) { std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." 
<< std::endl; return false; } return true; } bool GenomeSequence::loadDBSNP( mmapArrayBool_t &dbSNP, const char *inputFileName) const { // // the goal in this section of code is to allow the user // to either specify a valid binary version of the SNP file, // or the original text file that it gets created from. // // To do this, we basically open, sniff the error message, // and if it claims it is not a binary version of the file, // we go ahead and treat it as the text file and use the // GenomeSequence::populateDBSNP method to load it. // // Further checking is really needed to ensure users don't // mix a dbSNP file for a different reference, since it is really // easy to do. // if (strlen(inputFileName)!=0) { std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush; if (dbSNP.open(inputFileName, O_RDONLY)) { // // failed to open, possibly due to bad magic. // // this is really awful ... need to have a return // code that is smart enough to avoid this ugliness: // if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos) { std::cerr << "Error: " << dbSNP.getErrorString() << std::endl; exit(1); } // // we have a file, assume we can load it as a text file // IFILE inputFile = ifopen(inputFileName, "r"); if(inputFile == NULL) { std::cerr << "Error: failed to open " << inputFileName << std::endl; exit(1); } std::cerr << "(as text file) "; // anonymously (RAM resident only) create: dbSNP.create(getNumberBases()); // now load it into RAM populateDBSNP(dbSNP, inputFile); ifclose(inputFile); } else { std::cerr << "(as binary mapped file) "; } std::cerr << "DONE!" 
<< std::endl; return false; } else { return true; } } #if defined(TEST) void simplestExample(void) { GenomeSequence reference; genomeIndex_t index; // a particular reference is set by: // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa") // // In the above example, the suffix .fa is stripped and replaced with .umfa, // which contains the actual file being opened. // if (reference.open()) { perror("GenomeSequence::open"); exit(1); } index = 1000000000; // 10^9 // // Write the base at the given index. Here, index is 0 based, // and is across the whole genome, as all chromosomes are sequentially // concatenated, so the allowed range is // // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last)) // // (where int last = reference.getChromosomeCount() - 1;) // std::cout << "base[" << index << "] = " << reference[index] << std::endl; // // Example for finding chromosome and one based chromosome position given // and absolute position on the genome in 'index': // int chr = reference.getChromosome(index); genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl; // // Example for finding an absolute genome index position when the // chromosome name and one based position are known: // const char *chromosomeName = "5"; chr = reference.getChromosome(chromosomeName); // 0-based chrIndex = 100000; // 1-based index = reference.getChromosomeStart(chr) + chrIndex - 1; std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl; reference.close(); } void testGenomeSequence(void) { GenomeSequence reference; #if 0 std::string referenceName = "someotherreference"; if (reference.setFastaName(referenceName)) { std::cerr << "failed to open reference file " << referenceName << std::endl; exit(1); } #endif std::cerr << 
"open and prefetch the reference genome: "; // open it if (reference.open()) { exit(1); } std::cerr << "done!" << std::endl; // // For the human genome, genomeIndex ranges from 0 to 3.2x10^9 // genomeIndex_t genomeIndex; // 0 based unsigned int chromosomeIndex; // 1 based unsigned int chromosome; // 0..23 or so std::string chromosomeName; // // Here we'll start with a chromosome name, then obtain the genome // index, and use it to find the base we want: // chromosomeName = "2"; chromosomeIndex = 1234567; // this call is slow (string search for chromsomeName): genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex); assert(genomeIndex!=INVALID_GENOME_INDEX); std::cout << "Chromosome " << chromosomeName << ", index "; std::cout << chromosomeIndex << " contains base " << reference[genomeIndex]; std::cout << " at genome index position " << genomeIndex << std::endl; // // now reverse it - given a genomeIndex from above, find the chromosome // name and index: // // slow (binary search on genomeIndex): chromosome = reference.getChromosome(genomeIndex); unsigned int newChromosomeIndex; // not slow: newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1; assert(chromosomeIndex == newChromosomeIndex); // more testing... 
at least test and use PackedRead: // PackedRead pr; pr.set("ATCGATCG", 0); assert(pr.size()==8); assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']); assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']); assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']); assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']); pr.set("ATCGATCG", 1); assert(pr.size()==9); pr.set("", 0); assert(pr.size()==0); pr.set("", 1); assert(pr.size()==1); pr.set("", 2); assert(pr.size()==2); pr.set("", 3); assert(pr.size()==3); assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']); assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']); assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']); pr.set("C", 1); assert(pr.size()==2); assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']); assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']); } // // After I build libcsg, I compile and run this test code using: // // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl // you also may need -fno-rtti // ./try // int main(int argc, const char **argv) { #if 1 simplestExample(); #else testGenomeSequence(); #endif exit(0); } #endif libStatGen-1.0.14/general/GenomeSequence.h000066400000000000000000000475411254730101300203310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef _GENOME_SEQUENCE_H #define _GENOME_SEQUENCE_H #include #include #if !defined(MD5_DIGEST_LENGTH) #define MD5_DIGEST_LENGTH 16 #endif #include #include "MemoryMapArray.h" #include "BaseAsciiMap.h" // Goncalo's String class #include "StringArray.h" #include // stdint.h will define this, but only if __STDC_LIMIT_MACROS was // defined prior to the first include of stdint.h #ifndef UINT32_MAX #define UINT32_MAX 0xFFFFFFFF #endif typedef uint32_t genomeIndex_t; #define INVALID_GENOME_INDEX UINT32_MAX // chromosome index is just a signed int, so this is ok here: #define INVALID_CHROMOSOME_INDEX -1 #include "GenomeSequenceHelpers.h" #define UMFA_COOKIE 0x1b7933a1 // unique cookie id #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout typedef MemoryMapArray< uint32_t, genomeIndex_t, UMFA_COOKIE, UMFA_VERSION, PackedAccess_4Bit, PackedAssign_4Bit, Packed4BitElementCount2Bytes, genomeSequenceMmapHeader > genomeSequenceArray; // std::string &operator = (std::string &lhs, const PackedRead &rhs); // //! Create/Access/Modify/Load Genome Sequences stored as binary mapped files. /*! GenomeSequence is designed to be a high performance shared access reference object. It is implemented as a MemoryMapArray template object with unsigned 8 bit ints, each of which stores two bases. Although 2 bits could be used, most references have more than four symbols (usually at least including 'N', indicating an unknown or masked out base). 
Normal use of this class follows these steps: -# create the reference -# instantiate the GenomeSequence class object -# create the actual file (memory mapped) that is to hold the data -# populate the data using GenomeSequence::set -# use the reference -# use the reference by instantiating a GenomeSequence object -# either use the constructor with the reference filename -# or use GenomeSequence::setReferenceName() followed by ::open -# access the bases via the overloaded array operator [] -# check sequence length by using GenomeSequence::getNumberBases() -# accessing chromosomes in the reference -# you typically will need to know about the chromosomes in the sequence -# see methods and docs with prefix 'getChromosome' Sharing is accomplished using the mmap() function via the MemoryMap base class. This allows a potentially large genome reference to be shared among a number of simultaneously executing instances of one or more programs sharing the same reference. */ class GenomeSequence : public genomeSequenceArray { private: int _debugFlag; std::ostream *_progressStream; bool _colorSpace; // Whether or not to overwrite an existing file when creating a umfa file (via create). 
bool _createOverwrite; std::string _baseFilename; // for later use by WordIndex create and open std::string _referenceFilename; std::string _fastaFilename; std::string _umfaFilename; std::string _application; // only used in ::create() MemoryMap _umfaFile; void setup(const char *referenceFilename); public: /// Simple constructor - no implicit file open GenomeSequence(); void constructorClear(); /// \brief attempt to open an existing sequence /// /// \param referenceFilename the name of the reference fasta file to open /// \param debug if true, additional debug information is printed GenomeSequence(std::string &referenceFilename) { constructorClear(); setup(referenceFilename.c_str()); } /// Smarter constructor - attempt to open an existing sequence /// /// \param referenceFilename the name of the reference fasta file to open /// \param debug if true, additional debug information is printed GenomeSequence(const char *referenceFilename) { constructorClear(); setup(referenceFilename); } /// Close the file if open and destroy the object ~GenomeSequence(); /// open the reference specified using GenomeSequence::setReferenceName /// /// \param isColorSpace open the color space reference /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents) /// \return false for success, true otherwise bool open(bool isColorSpace = false, int flags = O_RDONLY); /// open the given file as the genome (no filename munging occurs). /// /// \param filename the name of the file to open /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents) /// \return false for success, true otherwise bool open(const char *filename, int flags = O_RDONLY) { _umfaFilename = filename; // TODO - should this method be doing something??? return false; } private: bool _searchCommonFileSuffix; public: bool create(bool isColor = false); // NEW API? 
// load time modifiers: /// if set, then show progress when creating and pre-fetching void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;} void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; } void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;} // Set whether or not to overwrite a umfa file when calling create. void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;} bool loadFastaData(const char *filename); /// set the reference name that will be used in open() /// \param referenceFilename the name of the reference fasta file to open /// \return false for success, true otherwise /// /// \sa open() bool setReferenceName(std::string referenceFilename); /// set the application name in the binary file header /// /// \param application name of the application void setApplication(std::string application) { _application = application; // used in ::create() to set application name } const std::string &getFastaName() const { return _fastaFilename; } const std::string &getReferenceName() const { return _referenceFilename; } /// tell us if we are a color space reference or not /// \return true if colorspace, false otherwise bool isColorSpace() const { return _colorSpace; } /// return the number of bases represented in this reference /// \return count of bases genomeIndex_t getNumberBases() const { return getElementCount(); } /// given a whole genome index, get the chromosome it is located in /// /// This is done via a binary search of the chromosome table in the /// header of the mapped file, so it is O(log(N)) /// /// \param 0-based position the base in the genome /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error int getChromosome(genomeIndex_t position) const; /// given a chromosome name, return the chromosome index /// /// This is done via a linear search of the chromosome table in the /// header of the 
mapped file, so it is O(N) /// /// \param chromosomeName the name of the chromosome - exact match only /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error int getChromosome(const char *chromosomeName) const; /// Return the number of chromosomes in the genome /// \return number of chromosomes in the genome int getChromosomeCount() const; /// given a chromosome, return the genome base it starts in /// /// \param 0-based chromosome index /// \return 0-based genome index of the base that starts the chromosome genomeIndex_t getChromosomeStart(int chromosomeIndex) const { if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX; return header->_chromosomes[chromosomeIndex].start; } /// given a chromosome, return its size in bases /// /// \param 0-based chromosome index /// \return size of the chromosome in bases genomeIndex_t getChromosomeSize(int chromosomeIndex) const { if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0; return header->_chromosomes[chromosomeIndex].size; } /// given a chromosome name and position, return the genome position /// /// \param chromosomeName name of the chromosome - exact match only /// \param chromosomeIndex 1-based chromosome position /// \return genome index of the above chromosome position genomeIndex_t getGenomePosition( const char *chromosomeName, unsigned int chromosomeIndex) const; /// given a chromosome index and position, return the genome position /// /// \param chromosome index of the chromosome /// \param chromosomeIndex 1-based chromosome position /// \return genome index of the above chromosome position genomeIndex_t getGenomePosition( int chromosome, unsigned int chromosomeIndex) const; /// given the chromosome name, get the corresponding 0 based genome index /// for the start of that chromosome genomeIndex_t getGenomePosition(const char *chromosomeName) const; genomeIndex_t getGenomePosition(int chromosomeIndex) const; const std::string &getBaseFilename() const { return 
_baseFilename; } const char *getChromosomeName(int chromosomeIndex) const { return header->_chromosomes[chromosomeIndex].name; } void setDebugFlag(bool d) { _debugFlag = d; } genomeIndex_t sequenceLength() const { return (genomeIndex_t) header->elementCount; } const char *chromosomeName(int chr) const { return header->_chromosomes[chr].name; } void sanityCheck(MemoryMap &fasta) const; // TODO - this will be moved somewhere else and be made a static method. std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const; bool wordMatch(unsigned int index, std::string &word) const; bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const; // TODO - this will be moved somewhere else and be made a static method. char BasePair(char c) const { return BaseAsciiMap::base2complement[(int) c]; } void dumpSequenceSAMDictionary(std::ostream&) const; void dumpHeaderTSV(std::ostream&) const; /// /// Return the bases in base space or color space for within range index, ot /// @param index the array-like index (0 based). /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid. /// For color space, index i represents the transition of base at position (i-1) to base at position i /// /// NB: bounds checking here needs to be deprecated - do not assume it /// will exist - the call must clip reads so that this routine is never /// called with a index value larger than the genome. /// /// The reason for this is simply that this routine gets called hundreds /// of billions of time in one run of karma, which will absolutely kill /// performance. Every single instruction here matters a great, great deal. /// // // 3.5% improvement for color space matching: // I guess the compiler isn't inlining 3 functions deep. // #if 0 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // The following code does not work even in the base space, // since the memory layout has changed. USE IT WITH CAUTIOUS! 
// // This block of code is a functional duplicate of the following // code - leave this here for reference and possibly later // performance testing as well as compiler evaluation. inline char operator[](genomeIndex_t index) const { return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]]; } #endif inline char operator[](genomeIndex_t index) const { uint8_t val; if (index < getNumberBases()) { if ((index&1)==0) { val = ((uint8_t *) data)[index>>1] & 0xf; } else { val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4; } } else { val = BaseAsciiMap::baseNIndex; } val = isColorSpace() ? BaseAsciiMap::int2colorSpace[val] : BaseAsciiMap::int2base[val]; return val; } /// given a chromosome name and 1-based position, return the reference base. /// \param chromosomeName name of the chromosome - exact match only /// \param chromosomeIndex 1-based chromosome position /// \return reference base at the above chromosome position inline char getBase(const char *chromosomeName, unsigned int chromosomeIndex) const { genomeIndex_t index = getGenomePosition(chromosomeName, chromosomeIndex); if(index == INVALID_GENOME_INDEX) { // Invalid position, so return 'N' return('N'); } return((*this)[index]); } inline uint8_t getInteger(genomeIndex_t index) const { return (*((genomeSequenceArray*) this))[index]; } inline void set(genomeIndex_t index, char value) { genomeSequenceArray::set(index, BaseAsciiMap::base2int[(uint8_t) value]); } /// obtain the pointer to the raw data for other access methods /// /// this is a fairly ugly hack to reach into the /// raw genome vector, get the byte that encodes /// two bases, and return it. This is used by /// karma ReadIndexer::getSumQ to compare genome /// matchines by byte (two bases at a time) to speed /// it up. 
/// uint8_t *getDataPtr(genomeIndex_t index) { return ((uint8_t *) data + index/2); } private: /// /// when creating the genome mapped file, we call this to set /// the MD5 checksum of the chromosome sequence and length so that /// we can write the SAM SQ headers properly. /// NB: operates on the last fully loaded chromosome. bool setChromosomeMD5andLength(uint32_t whichChromosome); public: // TODO - this will be moved somewhere else and be made a static method. // replace read with the reversed one void getReverseRead(std::string &read); // TODO - this will be moved somewhere else and be made a static method. void getReverseRead(String& read); // debug the given read - print nice results int debugPrintReadValidation( std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse = true ); // // get the sequence from this GenomeSequence using the specified chromosome and 0-based position. // if baseCount < 0, get the reverse complement // that starts at index (but do not reverse the string?) void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const; void getString(String &str, int chromosome, uint32_t index, int baseCount) const; // // get the sequence from this GenomeSequence. // if baseCount < 0, get the reverse complement // that starts at index (but do not reverse the string?) // void getString(std::string &str, genomeIndex_t index, int baseCount) const; void getString(String &str, genomeIndex_t index, int baseCount) const; void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const; void print30(genomeIndex_t) const; // for debugging, not for speed: genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const; // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference. 
// They are here for alignment and should be moved to the aligner (karma). // OR they should optionally take a CIGAR and use that if specified.... // maybe they should be helper methods that are found somewhere else /// Return the mismatch count, disregarding CIGAR strings /// /// \param read is the sequence we're counting mismatches in /// \param location is where in the genmoe we start comparing /// \param exclude is a wildcard character (e.g. '.' or 'N') /// /// \return number of bases that don't match the reference, except those that match exclude int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const { int mismatchCount = 0; for (uint32_t i=0; i0 on entry, is checked against the computed sumQ /// \param insertions count of insertions found in /// /// bool checkRead( std::string &read, std::string &qualities, std::string &cigar, int &sumQ, // input and output int &gapOpenCount, // output only int &gapExtendCount, // output only int &gapDeleteCount, // output only std::string &result ) const; bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const; /// user friendly dbSNP loader. /// /// \param inputFileName may be empty, point to a text file or a dbSNP vector file /// /// In all cases, dbSNP is returned the same length as this genome. /// /// When no SNPs are loaded, all values are false. /// /// When a text file is given, the file is parsed with two space /// separated columns - the first column is the chromosome name, and /// the second is the 1-based chromosome position of the SNP. 
/// /// \return false if a dbSNP file was correctly loaded, true otherwise /// bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const; }; #endif libStatGen-1.0.14/general/GenomeSequenceHelpers.h000066400000000000000000000116631254730101300216500ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef _GENOME_SEQUENCE_HELPERS_H #define _GENOME_SEQUENCE_HELPERS_H #if !defined(MD5_DIGEST_LENGTH) #define MD5_DIGEST_LENGTH 16 #endif #include "MemoryMapArray.h" #include // // ChromosomeInfo represents the per chromosome information // necessary to write out SAM/BAM records. In addition, it // contains a single internal index used to point to the vector // offset where the chromosome bases start. // // This is mildly non-optimal for larger collections of chromosomes // or contigs - one use case described having millions of contigs, // in which case, this structure alone would take a gigabyte or more. 
// struct ChromosomeInfo { static const int MAX_GENOME_INFO_STRING=128; void constructorClear() { memset(this,0, sizeof(*this)); } void setChromosomeName(const char *n) { strncpy(name, n, sizeof(name)-1); name[sizeof(name)-1] = '\0'; } genomeIndex_t start; // internal offset to combined genome vector genomeIndex_t size; // SAM SQ:LN value char md5[2*MD5_DIGEST_LENGTH + 1]; // 32 chars plus NUL, SAM SQ:M5 value char name[MAX_GENOME_INFO_STRING]; // SAM SQ:SN value char assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value char uri[MAX_GENOME_INFO_STRING]; // SAM SQ:UR value char species[MAX_GENOME_INFO_STRING]; // SAM SQ:SP value // handy setting methods: void setAssemblyID(const char *newID) { strncpy(assemblyID, newID, sizeof(assemblyID)-1); name[sizeof(name)-1] = '\0'; } void setSpecies(const char *newSpecies) { strncpy(species, newSpecies, sizeof(species)-1); species[sizeof(species)-1] = '\0'; } void setURI(const char *newURI) { strncpy(uri, newURI, sizeof(uri)-1); uri[sizeof(uri)-1] = '\0'; } }; class genomeSequenceMmapHeader : public MemoryMapArrayHeader { friend class GenomeSequence; friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &); private: uint32_t _chromosomeCount; bool _colorSpace; ChromosomeInfo _chromosomes[0]; public: // // getHeaderSize is special in that it must not access any // member variables, since it is called when the header has // not been created yet. // static size_t getHeaderSize(int chromosomeCount) { return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount; } // // below methods return TRUE if it failed, false otherwise (primarily // a length check). // }; std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h); // // define the genomeSequence array type: // // NB the access/set routines use the encoded base values in the range // 0-15, not the corresponding base pair character. 
// inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index) { if ((index&1)==0) { return ((uint8_t *) base)[index>>1] & 0xf; } else { return (((uint8_t *) base)[index>>1] & 0xf0) >> 4; } }; inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v) { if ((index&1)==0) { ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v; } else { ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4; } } inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i) { return sizeof(uint8_t) * i / 2; } class PackedRead { void set(int index, int val) { packedBases[index>>1] = (packedBases[index>>1] // original value & ~(7<<((index&0x01)<<2))) // logical AND off the original value | ((val&0x0f)<<((index&0x1)<<2)); // logical OR in the new value } public: std::vector packedBases; uint32_t length; int size() { return length; } void clear() { packedBases.clear(); length=0; } void set(const char *rhs, int padWithNCount = 0); uint8_t operator [](int index) { return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf; } }; #endif libStatGen-1.0.14/general/GenotypeLists.cpp000066400000000000000000000371561254730101300205730ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "GenotypeLists.h" // When the next line is uncommented, the genotype elimination routines // produce a lot of output useful for debugging // #define DEBUG_ELIMINATOR GenotypeList::GenotypeList() { ignore = false; } bool GenotypeList::EliminateGenotypes(Pedigree & ped, Family * family, int marker) { // First, allocate a genotype list for the family GenotypeList * list = new GenotypeList [family->count]; // Next, update the possible allele lists for each individual InitializeList(list, ped, family, marker); // Then, do multiple rounds of elimination until a problem is found // or no changes are made #ifdef DEBUG_ELIMINATOR Print(list, ped, family, marker); #endif while (PairwiseCheck(list, ped, family) || FamilyCheck(list, ped, family)) #ifdef DEBUG_ELIMINATOR Print(list, ped, family, marker) #endif ; for (int i = 0; i < family->count; i++) if (!list[i].ignore && list[i].allele1.Length() == 0) { printf("%s - Family %s has a subtle genotype inconsistency\n", (const char *) ped.markerNames[marker], (const char *) family->famid); delete [] list; return false; } delete [] list; return true; } void GenotypeList::InitializeList(GenotypeList * list, Pedigree & ped, Family * family, int marker) { for (int i = family->count - 1; i >= 0; i--) { Person & person = ped[family->path[i]]; int id = person.traverse; bool maleX = person.sex == SEX_MALE && ped.chromosomeX; #ifdef DEBUG_ELIMINATOR printf("Initializing genotype list for %s ...\n", (const char *) person.pid); #endif // If an individual is genotyped ... if (person.markers[marker].isKnown()) { // Their genotype list starts with just one entry! 
list[id].Dimension(1); list[id].SetGenotype(0, person.markers[marker][0], person.markers[marker][1]); list[id].alleles.Clear(); list[id].alleles.Push(person.markers[marker][0]); list[id].alleles.PushIfNew(person.markers[marker][1]); list[id].ignore = false; // "Heterozygous" males have no possible genotypes if (maleX && person.markers[marker].isHeterozygous()) list[id].Dimension(0); } else if (list[id].alleles.Length()) if (person.sex == SEX_MALE && ped.chromosomeX) { // Males only carry one X chromosome list[id].Dimension(list[id].alleles.Length() + 1); for (int i = 0, out = 0; i < list[id].alleles.Length(); i++) list[id].SetGenotype(out++, list[id].alleles[i], list[id].alleles[i]); list[id].SetGenotype(list[id].alleles.Length(), -1, -1); list[id].ignore = false; } else { // Build the genotype list based on the available allele lists int count = list[id].alleles.Length() * (list[id].alleles.Length() + 3) / 2 + 1; list[id].Dimension(count); for (int i = 0, out = 0; i < list[id].alleles.Length(); i++) { // Allow for all pairs of "transmitted" alleles for (int j = 0; j <= i; j++) list[id].SetGenotype(out++, list[id].alleles[i], list[id].alleles[j]); // Allow for an unstransmitted allele list[id].SetGenotype(out++, list[id].alleles[i], -1); } // Allow for a pair of untransmitted alleles list[id].SetGenotype(count - 1, -1, -1); list[id].ignore = false; } else list[id].ignore = true; // If the individual is a founder this is all there is to it if (i < family->founders) continue; // If the individual is not a founder, update the parental genotype lists... 
int fatid = person.father->traverse; int motid = person.mother->traverse; for (int i = 0; i < list[id].alleles.Length(); i++) { list[motid].alleles.PushIfNew(list[id].alleles[i]); if (!maleX) list[fatid].alleles.PushIfNew(list[id].alleles[i]); } } } bool GenotypeList::PairwiseCheck(GenotypeList * list, Pedigree & ped, Family * family) { #ifdef DEBUG_ELIMINATOR printf("Checking Relative Pairs ...\n"); #endif bool changed = false; for (int i = family->count - 1; i >= family->founders; i--) { Person & person = ped[family->path[i]]; int id = person.traverse; int fatid = person.father->traverse; int motid = person.mother->traverse; bool maleX = person.sex == SEX_MALE && ped.chromosomeX; if (list[id].ignore) continue; // Check if genotypes are consistent with paternal genotypes for (int i = 0; i < list[id].allele1.Length(); i++) { int al1 = list[id].allele1[i]; int al2 = list[id].allele2[i]; // Remove offspring genotypes incompatible with parental genotypes if ((maleX && !list[motid].Matches(al1) && al1 != -1) || (!maleX && !(al1 == -1 && al2 == -1) && !(list[fatid].Matches(al1) && (al2 == -1 || list[motid].Matches(al2))) && !((al2 == -1 || list[fatid].Matches(al2)) && list[motid].Matches(al1)))) { list[id].Delete(i--); changed = true; } } // The offspring genotype list allows for a wild-card untransmitted allele // so any single parental genotype is possible if (list[id].Matches(-1)) continue; // Check if genotypes are consistent with offspring genotypes for (int i = 0; i < list[motid].allele1.Length(); i++) { int al1 = list[motid].allele1[i]; int al2 = list[motid].allele2[i]; // Remove genotypes incompatible with offspring genotypes if (!list[id].Matches(al1) && !list[id].Matches(al2)) { list[motid].Delete(i--); changed = true; } } // Males don't affect genotype lists for their fathers if (maleX) continue; // Check if genotypes are consistent with offspring genotypes for (int i = 0; i < list[fatid].allele1.Length(); i++) { int al1 = list[fatid].allele1[i]; int al2 = 
list[fatid].allele2[i]; // Remove genotypes incompatible with offspring genotypes if (!list[id].Matches(al1) && !list[id].Matches(al2)) { list[fatid].Delete(i--); changed = true; } } #ifdef DEBUG_ELIMINATOR printf("Done checking individual %s\n", (const char *) person.pid); Print(list, ped, family, 0); #endif } return changed; } bool GenotypeList::FamilyCheck(GenotypeList * list, Pedigree & ped, Family * family) { #ifdef DEBUG_ELIMINATOR printf("Checking Nuclear Families ...\n"); #endif bool changed = false; for (int i = family->count - 1; i >= family->founders; i--) { Person & person = ped[family->path[i]]; int fatid = person.father->traverse; int motid = person.mother->traverse; // Only go through the loop once per sibship if (person.sibs[0] != &person || list[fatid].ignore || list[motid].ignore) continue; #ifdef DEBUG_ELIMINATOR printf("Checking Sibship with %s ...\n", (const char *) person.pid); #endif // Reset checked genotypes for the mother, father and child list[fatid].checked = 0; list[motid].checked = 0; for (int i = 0; i < person.sibCount; i++) list[person.sibs[i]->traverse].checked = 0; // Go through each of the paternal genotypes changed |= TrimParent(list, person, fatid, motid); // Go through each of maternal genotypes changed |= TrimParent(list, person, motid, fatid); // Sort out the unchecked offspring genotypes ... 
for (int i = 0; i < person.sibCount; i++) { int sibid = person.sibs[i]->traverse; bool maleX = person.sibs[i]->sex == SEX_MALE && ped.chromosomeX; // For dealing with male X chromosomes, the pairwise check is all we need if (maleX) continue; for (int j = list[sibid].checked; j < list[sibid].allele1.Length(); j++) changed |= Cleanup(list, person, motid, fatid, sibid, j); } #ifdef DEBUG_ELIMINATOR // Print(list, ped, family, 0); #endif } return changed; } bool GenotypeList::Matches(int genotype, int allele) { return allele1[genotype] == allele || allele2[genotype] == allele; } bool GenotypeList::Matches(int allele) { return allele1.Find(allele) != -1 || allele2.Find(allele) != -1; } int GenotypeList::SaveGenotype(int genotype) { if (checked > genotype) return genotype; if (checked != genotype) { allele1.Swap(genotype, checked); allele2.Swap(genotype, checked); } return checked++; } bool GenotypeList::CheckTrio(GenotypeList * list, int fatid, int motid, int child, int i, int j, int k) { // TODO: add tests for this code... 
return (list[fatid].Matches(i, list[child].allele1[k]) && (list[motid].Matches(j, list[child].allele2[k]) || list[child].allele2[k] == -1)) || ((list[fatid].Matches(i, list[child].allele2[k]) || list[child].allele2[k] == -1) && list[motid].Matches(j, list[child].allele1[k])) || (list[child].allele1[k] == -1 && list[child].allele2[k] == -1); } void GenotypeList::Dimension(int genotypes) { allele1.Dimension(genotypes); allele2.Dimension(genotypes); } void GenotypeList::SetGenotype(int genotype, int al1, int al2) { allele1[genotype] = al1; allele2[genotype] = al2; } void GenotypeList::Delete(int genotype) { allele1.Delete(genotype); allele2.Delete(genotype); } bool GenotypeList::TrimParent(GenotypeList * list, Person & person, int motid, int fatid) { bool trimmed = false; while (list[motid].checked < list[motid].allele1.Length()) { int current = list[motid].allele1.Length() - 1; bool saved = false; // Pair it with each possible paternal genotype for (int i = list[fatid].allele1.Length() - 1; i >= 0; i--) { int matches = 0; // Find out if the pairing is compatible with at least one genotype for each child for (int j = 0; j < person.sibCount; j++) { int sibid = person.sibs[j]->traverse; int maleX = person.sibs[j]->sex == SEX_MALE && person.chromosomeX; // Since we have done the pairwise check, there is nothing more // to do for males ... 
if (list[sibid].ignore || maleX) { matches++; continue; } for (int k = list[sibid].allele1.Length() - 1; k >= 0; k--) if (CheckTrio(list, motid, fatid, sibid, current, i, k)) { matches++; break; } if (matches != j + 1) break; } // Save maternal and paternal genotypes, mark all compatible sibling genotypes if (matches == person.sibCount) { for (int j = 0; j < person.sibCount; j++) { int sibid = person.sibs[j]->traverse; for (int k = list[sibid].checked; k < list[sibid].allele1.Length(); k++) if (CheckTrio(list, motid, fatid, sibid, current, i, k)) list[sibid].SaveGenotype(k); } list[motid].SaveGenotype(current); list[fatid].SaveGenotype(i); saved = true; break; } } if (!saved) { list[motid].Delete(current); trimmed = true; } } return trimmed; } bool GenotypeList::Cleanup(GenotypeList * list, Person & person, int motid, int fatid, int child, int geno) { for (int current = 0; current < list[motid].allele1.Length(); current++) for (int i = list[fatid].allele1.Length() - 1; i >= 0; i--) if (CheckTrio(list, motid, fatid, child, current, i, geno)) { int matches = 0; // Find out if the pairing is compatible with at least one genotype for each child for (int j = 0; j < person.sibCount; j++) { int sibid = person.sibs[j]->traverse; int maleX = person.sibs[j]->sex == SEX_MALE && person.chromosomeX; // After completing the pairwise check, all males are guaranteed // to be compatible with their mothers if (list[sibid].ignore || maleX) { matches++; continue; } for (int k = list[sibid].allele1.Length() - 1; k >= 0; k--) if (CheckTrio(list, motid, fatid, sibid, current, i, k)) { matches++; break; } if (matches != j + 1) break; } // Update list of compatible sibling genotypes if (matches == person.sibCount) for (int j = 0; j < person.sibCount; j++) { int sibid = person.sibs[j]->traverse; for (int k = list[sibid].checked; k < list[sibid].allele1.Length(); k++) if (CheckTrio(list, motid, fatid, sibid, current, i, k)) list[sibid].SaveGenotype(k); return false; } } 
list[child].Delete(geno); return true; } void GenotypeList::Print(GenotypeList * list, Pedigree & ped, Family * family, int marker) { MarkerInfo * info = ped.GetMarkerInfo(marker); for (int i = 0; i < family->count; i++) { printf("%s - ", (const char *) ped[family->path[i]].pid); for (int j = 0; j < list[i].allele1.Length(); j++) { if (list[i].allele1[j] == -1) printf("*/"); else printf("%s/", (const char *) info->GetAlleleLabel(list[i].allele1[j])); if (list[i].allele2[j] == -1) printf("* "); else printf("%s ", (const char *) info->GetAlleleLabel(list[i].allele2[j])); } printf("\n"); } printf("\n"); } libStatGen-1.0.14/general/GenotypeLists.h000066400000000000000000000036311254730101300202270ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

#ifndef __GENOTYPE_ELIMINATION__
#define __GENOTYPE_ELIMINATION__

#include "Pedigree.h"

// List of the genotypes still considered possible for one individual at one
// marker, together with the static routines that prune such lists across a
// family (see EliminateGenotypes).
class GenotypeList
{
public:
    // Parallel arrays: genotype g is the allele pair (allele1[g], allele2[g]).
    // The value -1 is used by the implementation as a wild-card allele.
    IntArray allele1, allele2;
    // Distinct alleles from which this individual's genotypes are built.
    IntArray alleles;

    // Set by the implementation when no allele information is available for
    // the individual; such lists are skipped during elimination.
    bool ignore;
    // Number of leading genotypes already saved by SaveGenotype().
    int checked;

    // Creates an empty list.
    GenotypeList();

    // Runs genotype elimination for `family` at `marker`; returns false when
    // some individual is left without any possible genotype.
    static bool EliminateGenotypes(Pedigree & ped, Family * family, int marker);

    // Resizes both allele arrays to hold `genotypes` entries.
    void Dimension(int genotypes);
    // Removes the genotype at index `genotype`.
    void Delete(int genotype);

    // True when genotype number `genotype` carries `allele`.
    bool Matches(int genotype, int allele);
    // True when any genotype in the list carries `allele`.
    bool Matches(int allele);

    // Moves `genotype` into the saved region at the front of the list and
    // returns its resulting index.
    int SaveGenotype(int genotype);

    // Stores the allele pair (al1, al2) at index `genotype`.
    void SetGenotype(int genotype, int al1, int al2);

private:
    // Builds the initial genotype list for every family member.
    static void InitializeList(GenotypeList * list, Pedigree & p, Family * f, int marker);
    // Parent/offspring pair elimination; returns true if anything was removed.
    static bool PairwiseCheck(GenotypeList * list, Pedigree & p, Family * f);
    // Nuclear-family elimination; returns true if anything was removed.
    static bool FamilyCheck(GenotypeList * list, Pedigree & p, Family * f);

    // True when child genotype k is compatible with genotype i of the first
    // parent and genotype j of the second.
    static bool CheckTrio(GenotypeList * list, int fatid, int motid, int child, int i, int j, int k);
    // Trims the list of the first parent given; the definition names these two
    // parameters (motid, fatid) and is called once for each parent.
    static bool TrimParent(GenotypeList * list, Person & person, int fatid, int motid);
    // Deletes child genotype `geno` unless some parental pairing supports it;
    // the definition names the parent parameters (motid, fatid).
    static bool Cleanup(GenotypeList * list, Person & person, int fatid, int motid, int child, int geno);

    // Debug dump of all lists in a family.
    static void Print(GenotypeList * List, Pedigree & p, Family * f, int marker);
};

#endif
libStatGen-1.0.14/general/GreedyTupleAligner.h000066400000000000000000000230321254730101300211460ustar00rootroot00000000000000/*
 * Copyright (C) 2010 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ #ifndef _GREEDY_TUPLE_H #define _GREEDY_TUPLE_H #include #include #include #include #include #include "Generic.h" #include "CigarRoller.h" /* * TODO: 1. how to efficiently find insertion? */ /** * Weight includes various penalties(e.g. gap open) used in local alignment */ struct Weight { public: Weight() { gapOpen = gapExtend = -1; // here we do not use affine gap penalty for simlicity. mismatch = -1; match= 2; }; int gapOpen; int gapExtend; int mismatch; int match; }; // // tuple number is 3, arbitrary number from my guess! // another reason // template class GreedyTupleAligner { public: GreedyTupleAligner(Weight& wt): weight(wt) {/* */} /** * Match 'query' to the 'reference' from 'searchStartIndex' up * to 'searchSize', store matched length to 'matchedLength' * and number of mismatch to 'mismatch' * @param query input query * @param queryLength length of query * @param reference reference sequence * @param searchStartIndex the positino where search starts * @param searchSize the total length in reference sequence that will be examine * @param matchedLength store how many bases are matched * @param mismatch store how many bases are mismatched * @return -1 for unsuccess return */ int MatchTuple( const QueryType query, const int queryLength, const ReferenceType reference, const ReferenceIndex searchStartIndex, const int searchSize, int& matchedLength, int& mismatch) { // now use naive search, // TODO: will incorportate KMP serach later // TODO: adjust tolerance of mismatches const int MAX_MISMATCH=2; int bestPos = 0, bestMismatch = queryLength, bestMatchedLength = 0, bestScore=-1; #if defined(DEBUG_GREEDY_ALIGNER) cout << "searchStartIndex == " << searchStartIndex << ", searchSize == " << searchSize << std::endl; #endif // here i is the matching position (inclusive) // j is the matched length for (int i = 0; i <= searchSize - tupleSize; i++) { int j = 0; mismatch = 0; while (j < queryLength) { if (searchStartIndex + i + j >= reference.getNumberBases()) break; if 
(query[j] != reference[searchStartIndex + i + j]) { mismatch++; if (mismatch >= MAX_MISMATCH) break; } j++; } if (j>0 && (j==queryLength)) j--; while (searchStartIndex +i +j < reference.getNumberBases() && ((j+1) > mismatch) && mismatch>0 && query[j] != reference[searchStartIndex + i+j]) { // if pattern matching goes beyong the preset mismatch cutoff, // we will have to go backwards j--; mismatch--; } int score = j - mismatch; if (score > bestScore) { bestPos = i; bestScore = score; bestMismatch = mismatch; bestMatchedLength = j+1; } } if (bestScore > 0) { mismatch = bestMismatch; matchedLength = bestMatchedLength; return bestPos; } return -1; } /** * Core local alignment algorithm * @param query input query * @param queryLength length of query * @param reference reference genome * @param searchStartIndex matching starts here * @param searchSize how far we will search * @param cigarRoller store alignment results here * @param matchPosition store match position */ void Align( QueryType query, int queryLength, ReferenceType reference, ReferenceIndex searchStartIndex, int searchSize, CigarRoller& cigarRoller, ReferenceIndex& matchPosition) { // Algorithm: // finished align? (should we try different align position?) // if not, try next tuple // is the tuple aligned? 
// yes, extend to previous, mark unmatched part mismatch or gap // extend to next matched part int r1 = 0; // a start index: reference starting from r1 (inclusive) will be used int queryMatchCount = 0; // query matched # of bases int q1 = 0; // to align int pos = -1; int lastR1 = -1; // index: record last cigarRoller.clear(); matchPosition = -1; while (queryMatchCount < queryLength) { if (r1 == searchSize - 1) // touched ref right boundary { cigarRoller.Add(CigarRoller::softClip, queryLength-queryMatchCount); break; } if (queryLength - q1 < tupleSize) { // XXX this needs to do something more sane // printf("some bases left!\n"); // a simple fix: treat all left-over bases as mismatches/matches cigarRoller.Add(CigarRoller::mismatch, queryLength - queryMatchCount); break; } int mismatch = 0; int matchedLen = 0; if ((pos = MatchTuple(query+q1, queryLength-q1, reference, searchStartIndex + r1, searchSize - r1, matchedLen, mismatch)) // found match position for tuple >= 0) { // found match position for tuple if (lastR1<0) matchPosition = pos; // // deal with left // if (lastR1>=0) // have previously aligned part of the query to the reference genome yet { if (pos > 0) { cigarRoller.Add(CigarRoller::del, pos); } } else { lastR1 = pos; } r1 += pos; r1 += matchedLen; q1 += matchedLen; // // deal with right // cigarRoller.Add(CigarRoller::match, matchedLen); queryMatchCount = q1; lastR1 = r1; continue; } // end if // // try insertion // maximum insert ? 
say 2 // for (int i = 1; i < queryLength - q1 - tupleSize; i++) { int mismatch = 0; int matchedLen = 0; // check reference genome broundary if (searchStartIndex + r1 >= reference.getNumberBases()) return; if ((pos = MatchTuple(query+q1 + i , queryLength - q1 -i , reference, searchStartIndex + r1, searchSize - r1, matchedLen, mismatch)) // found match position for tuple >= 0) { if (matchPosition < 0) matchPosition = pos + q1 + i ; } queryMatchCount += i; q1 += i; cigarRoller.Add(CigarRoller::insert, i); lastR1 = r1 + pos; r1 += pos + tupleSize; q1 += tupleSize; // deal with right while (searchStartIndex + r1 < reference.getNumberBases() && query[q1]==reference[searchStartIndex + r1] && q1 < queryLength) { r1++; q1++; } if (q1 < queryLength) { cigarRoller.Add(CigarRoller::match, q1 - queryMatchCount); queryMatchCount = q1; } else { cigarRoller.Add(CigarRoller::match, queryLength - queryMatchCount); queryMatchCount = queryLength ; break ; } } r1++; q1++; // try next } // end while (queryMatchCount < queryLength) } private: static const int tupleSize = 3; Weight weight; }; #endif libStatGen-1.0.14/general/GzipFileType.cpp000066400000000000000000000027431254730101300203270ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "GzipFileType.h" #include #include #ifdef __ZLIB_AVAILABLE__ GzipFileType::GzipFileType(const char * filename, const char * mode) { // If the file is for write and is '-', then write to stdout. if(((mode[0] == 'w') || (mode[0] == 'W')) && ((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0))) { // Write to stdout. gzHandle = gzdopen(fileno(stdout), mode); } else if(((mode[0] == 'r') || (mode[0] == 'R')) && ((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0))) { // read from stdin gzHandle = gzdopen(fileno(stdin), mode); } else { // Open the file. gzHandle = gzopen(filename, mode); } }; #endif libStatGen-1.0.14/general/GzipFileType.h000066400000000000000000000100001254730101300177550ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GZFILETYPE_H__ #define __GZFILETYPE_H__ #ifdef __ZLIB_AVAILABLE__ #if defined(_WIN32) #include // for NULL! #endif #include #include #include "FileType.h" //#include class GzipFileType : public FileType { public: GzipFileType() { gzHandle = NULL; } virtual ~GzipFileType() { } GzipFileType(const char * filename, const char * mode); bool operator == (void * rhs) { // No two file pointers are the same, so if rhs is not NULL, then // the two pointers are different (false). 
if (rhs != NULL) return false; return (gzHandle == rhs); } bool operator != (void * rhs) { // No two file pointers are the same, so if rhs is not NULL, then // the two pointers are different (true). if (rhs != NULL) return true; return (gzHandle != rhs); } // Close the file. inline int close() { int result = gzclose(gzHandle); gzHandle = NULL; return result; } // Reset to the beginning of the file. inline void rewind() { // Just call rewind to move to the beginning of the file. gzrewind(gzHandle); } // Check to see if we have reached the EOF. inline int eof() { // check the file for eof. return gzeof(gzHandle); } // Check to see if the file is open. virtual inline bool isOpen() { if (gzHandle != NULL) { // gzHandle is not null, so the file is open. return(true); } return(false); } // Write to the file inline unsigned int write(const void * buffer, unsigned int size) { return gzwrite(gzHandle, buffer, size); } // Read into a buffer from the file. Since the buffer is passed in and // this would bypass the fileBuffer used by this class, this method must // be protected. inline int read(void * buffer, unsigned int size) { unsigned int numBytes = gzread(gzHandle, buffer, size); // if(numBytes != size) // { // std::cerr << "Error reading. Read "<< numBytes << " instead of "<< size<. */ #include "GzipHeader.h" #include #include // Constructor to initialize member data to 0. GzipHeader::GzipHeader() { // clear the union via memset: memset(headerBuffer, 0, sizeof(headerBuffer)); } // Desctructor - nothing to do. GzipHeader::~GzipHeader() { } // Method to read the gzip header from a file. // Returns true if the file is a gzip file, false, otherwise. bool GzipHeader::readHeader(FILE* filePtr) { bool isGzip = false; // If the file is not already open, return false. if (filePtr == NULL) { // File is not open, so return false - not a gzip file. return(false); } // Try to read a header from the file. 
// if(144 == fread(buffer, 1, 144, filePtr)) if (GZIP_HEADER_SIZE == fread(buffer, 1, GZIP_HEADER_SIZE, filePtr)) { memcpy(headerBuffer, buffer, GZIP_HEADER_SIZE); // Successfully read enough bytes, so check to see if it is a GzipFile. if (isGzipFile()) { // It is a gzip file. isGzip = true; } } return isGzip; } // Method to read the gzip header from a file. // Returns true if the file is a gzip file, false, otherwise. bool GzipHeader::readHeader(UncompressedFileType& file) { bool isGzip = false; // If the file is not already open, return false. if (!file.isOpen()) { // File is not open, so return false - not a gzip file. return(false); } // Try to read a header from the file. // if(144 == file.read(buffer, 1, 144, filePtr)) if ((int)GZIP_HEADER_SIZE == file.read(buffer, GZIP_HEADER_SIZE)) { memcpy(headerBuffer, buffer, GZIP_HEADER_SIZE); // Successfully read enough bytes, so check to see if it is a GzipFile. if (isGzipFile()) { // It is a gzip file. isGzip = true; } } return isGzip; } // Determine if the file is a gzip file. bool GzipHeader::isGzipFile() { if ((id1 == 31) && (id2 == 139)) { return true; } return false; } // Determine if the file is a BGZF compressed file. bool GzipHeader::isBgzfFile() { if (isGzipFile() && (si1 == 66) && (si2 == 67)) { return true; } return false; } libStatGen-1.0.14/general/GzipHeader.h000077500000000000000000000036541254730101300174500ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GZIPHEADER_H__ #define __GZIPHEADER_H__ #include #include #include "UncompressedFileType.h" class GzipHeader { public: GzipHeader(); ~GzipHeader(); // Method to read the gzip header from a file. // Returns true if the file is a gzip file, false, otherwise. bool readHeader(FILE* filePtr); // Method to read the gzip header from a file of UncompresedFileType. // Returns true if the file is a gzip file, false, otherwise. bool readHeader(UncompressedFileType& file); // Determine if the file is a gzip file. bool isGzipFile(); // Determine if the file is a BGZF compressed file. bool isBgzfFile(); private: static const unsigned int GZIP_HEADER_SIZE = 18; union { struct { uint8_t id1; uint8_t id2; uint8_t cm; uint8_t flg; uint32_t mtime; uint8_t xfl; uint8_t os; uint16_t xlen; uint8_t si1; uint8_t si2; uint16_t slen; uint16_t bsize; }; char headerBuffer[GzipHeader::GZIP_HEADER_SIZE]; }; char buffer[GZIP_HEADER_SIZE]; }; #endif libStatGen-1.0.14/general/Hash.cpp000066400000000000000000000111661254730101300166360ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Hash.h" #include // ******************************************************** // // This code is based on the original by Robert Jenkins. 
// // http://burtleburtle.net/bob/hash/doobs.html // // ******************************************************** #define MIX_INTEGERS(a,b,c) \ { \ a -= b; a -= c; a ^= (c>>13); \ b -= c; b -= a; b ^= (a<<8); \ c -= a; c -= b; c ^= (b>>13); \ a -= b; a -= c; a ^= (c>>12); \ b -= c; b -= a; b ^= (a<<16); \ c -= a; c -= b; c ^= (b>>5); \ a -= b; a -= c; a ^= (c>>3); \ b -= c; b -= a; b ^= (a<<10); \ c -= a; c -= b; c ^= (b>>15); \ } #define ui (unsigned int) unsigned int hash(const unsigned char * key, unsigned int length, unsigned int initval) { unsigned int a = 0x9e3779b9; unsigned int b = 0x9e3779b9; unsigned int c = initval; unsigned int len = length; /*---------------------------------------- handle most of the key */ while (len >= 12) { a += (key[0] +(ui(key[1])<<8) +(ui(key[2])<<16) +(ui(key[3])<<24)); b += (key[4] +(ui(key[5])<<8) +(ui(key[6])<<16) +(ui(key[7])<<24)); c += (key[8] +(ui(key[9])<<8) +(ui(key[10])<<16)+(ui(key[11])<<24)); MIX_INTEGERS(a,b,c); key += 12; len -= 12; } /*------------------------------------- handle the last 11 bytes */ c += length; switch (len) /* all the case statements fall through */ { case 11: c+=(ui(key[10])<<24); case 10: c+=(ui(key[9])<<16); case 9 : c+=(ui(key[8])<<8); /* the first byte of c is reserved for the length */ case 8 : b+=(ui(key[7])<<24); case 7 : b+=(ui(key[6])<<16); case 6 : b+=(ui(key[5])<<8); case 5 : b+=key[4]; case 4 : a+=(ui(key[3])<<24); case 3 : a+=(ui(key[2])<<16); case 2 : a+=(ui(key[1])<<8); case 1 : a+=key[0]; /* case 0: nothing left to add */ } MIX_INTEGERS(a,b,c); /*-------------------------------------------- report the result */ return c; } unsigned int hash_no_case(const unsigned char * key, unsigned int length, unsigned int initval) { unsigned int a = 0x9e3779b9; unsigned int b = 0x9e3779b9; unsigned int c = initval; unsigned int len = length; /*---------------------------------------- handle most of the key */ while (len >= 12) { a += (toupper(key[0]) +(ui(toupper(key[1]))<<8) 
+(ui(toupper(key[2]))<<16) +(ui(toupper(key[3]))<<24)); b += (toupper(key[4]) +(ui(toupper(key[5]))<<8) +(ui(toupper(key[6]))<<16) +(ui(toupper(key[7]))<<24)); c += (toupper(key[8]) +(ui(toupper(key[9]))<<8) +(ui(toupper(key[10]))<<16)+(ui(toupper(key[11]))<<24)); MIX_INTEGERS(a,b,c); key += 12; len -= 12; } /*------------------------------------- handle the last 11 bytes */ c += length; switch (len) /* all the case statements fall through */ { case 11: c+=(ui(toupper(key[10]))<<24); case 10: c+=(ui(toupper(key[9]))<<16); case 9 : c+=(ui(toupper(key[8]))<<8); /* the first byte of c is reserved for the length */ case 8 : b+=(ui(toupper(key[7]))<<24); case 7 : b+=(ui(toupper(key[6]))<<16); case 6 : b+=(ui(toupper(key[5]))<<8); case 5 : b+=toupper(key[4]); case 4 : a+=(ui(toupper(key[3]))<<24); case 3 : a+=(ui(toupper(key[2]))<<16); case 2 : a+=(ui(toupper(key[1]))<<8); case 1 : a+=toupper(key[0]); /* case 0: nothing left to add */ } MIX_INTEGERS(a,b,c); /*-------------------------------------------- report the result */ return c; } libStatGen-1.0.14/general/Hash.h000066400000000000000000000017161254730101300163030ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __HASH_H__ #define __HASH_H__ unsigned int hash(const unsigned char * key, unsigned int length, unsigned int initval); unsigned int hash_no_case(const unsigned char * key, unsigned int length, unsigned int initval); #endif libStatGen-1.0.14/general/IndexBase.cpp000066400000000000000000000163571254730101300176240ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "IndexBase.h" #include Chunk SortedChunkList::pop() { Chunk newChunk = chunkList.begin()->second; chunkList.erase(chunkList.begin()); return(newChunk); } bool SortedChunkList::insert(const Chunk& chunkToInsert) { std::pair::iterator, bool> insertRes; // Insert the passed in chunk. insertRes = chunkList.insert(std::pair(chunkToInsert.chunk_beg, chunkToInsert)); if(!insertRes.second) { // Failed to insert the chunk. std::cerr << "Failed to insert into the SortedChunkList.\n"; std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex << insertRes.first->second.chunk_beg << "\tend = " << insertRes.first->second.chunk_end << "\nnew chunk:\tbeg = " << std::hex << chunkToInsert.chunk_beg << "\tend = " << chunkToInsert.chunk_end << std::endl; } // return the result that comes from insertRes. 
return(insertRes.second); } void SortedChunkList::clear() { chunkList.clear(); } bool SortedChunkList::empty() { return(chunkList.empty()); } // Merge overlapping chunks found in this list. bool SortedChunkList::mergeOverlapping() { // Start at the beginning of the list and iterate through. std::map::iterator currentPos = chunkList.begin(); std::map::iterator nextPos = chunkList.begin(); if(nextPos != chunkList.end()) { ++nextPos; } // Loop until the end is reached. while(nextPos != chunkList.end()) { // If the next chunk is completely contained within the current // chunk (its end is less than the current chunk's end), // delete it since its position is already covered. if(nextPos->second.chunk_end < currentPos->second.chunk_end) { chunkList.erase(nextPos); nextPos = currentPos; ++nextPos; continue; } // If the next chunk's start position's BGZF block is less than or // equal to the BGZF block of the current chunk's end position, // combine the two chunks into the current chunk. if((nextPos->second.chunk_beg >> 16) <= (currentPos->second.chunk_end >> 16)) { currentPos->second.chunk_end = nextPos->second.chunk_end; // nextPos has now been included in the current pos, so // remove it. chunkList.erase(nextPos); nextPos = currentPos; ++nextPos; continue; } else { // Nothing to combine. So try combining at the next currentPos = nextPos; ++nextPos; } } return(true); } IndexBase::IndexBase() : n_ref(0) { myRefs.clear(); } IndexBase::~IndexBase() { } // Reset the member data for a new index file. void IndexBase::resetIndex() { n_ref = 0; // Clear the references. myRefs.clear(); } // Get the number of references in this index. int32_t IndexBase::getNumRefs() const { // Return the number of references. return(myRefs.size()); } // The basic logic is from samtools reg2bins and the samtools format specification pdf. // Set bins in the region to 1 and all other bins to 0. 
void IndexBase::getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1]) { for(uint32_t index = 0; index < MAX_NUM_BINS+1; index++) { binMap[index] = false; } uint32_t binNum = 0; --end; // Check if beg/end go too high, set to max position. if(start > MAX_POSITION) { start = MAX_POSITION; } if(end > MAX_POSITION) { end = MAX_POSITION; } // Turn on bins. binMap[binNum] = true; for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum) binMap[binNum] = true; for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum) binMap[binNum] = true; for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum) binMap[binNum] = true; for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum) binMap[binNum] = true; for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum) binMap[binNum] = true; } // Returns the minimum offset of records that cross the 16K block that // contains the specified position for the given reference id. bool IndexBase::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position, uint64_t& minOffset) const { int32_t linearIndex = position >> LINEAR_INDEX_SHIFT; minOffset = 0; if(refID > n_ref) { // out of range of the references, return false. return(false); } // Check to see if the position is out of range of the linear index. int32_t linearOffsetSize = myRefs[refID].n_intv; // If there are no entries in the linear index, return false. // Or if the linear index is not large enough to include // the start block, then there can be no records that cross // our region, so return false. if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize)) { return(false); } // The linear index is specified for this block, so return that value. minOffset = myRefs[refID].ioffsets[linearIndex]; // If the offset is 0, go to the previous block that has an offset. // This is due to a couple of bugs in older sam tools indexes. 
// 1) they add one to the index location (so when reading those, you // may be starting earlier than necessary) // 2) (the bigger issue) They did not include bins 4681-37449 in // the linear index. while((minOffset == 0) && (--linearIndex >= 0)) { minOffset = myRefs[refID].ioffsets[linearIndex]; } // If the minOffset is still 0 when moving forward, // check later indices to find a non-zero since we don't want to return // an offset of 0 since the record can't start at 0 we want to at least // return the first record position for this reference. linearIndex = 0; while((minOffset == 0) && (linearIndex < linearOffsetSize)) { minOffset = myRefs[refID].ioffsets[linearIndex]; linearIndex++; } if(minOffset == 0) { // Could not find a valid start position for this reference. return(false); } return(true); } libStatGen-1.0.14/general/IndexBase.h000066400000000000000000000115471254730101300172650ustar00rootroot00000000000000/* * Copyright (C) 2011-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __INDEX_BASE_H__ #define __INDEX_BASE_H__ #include #include #include #include #include "InputFile.h" #include "StatGenStatus.h" class Chunk { public: uint64_t chunk_beg; // offset of the start of the chunk uint64_t chunk_end; // offset of the end of the chunk static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL; bool operator< (const Chunk& otherChunk) const { return(this->chunk_beg < otherChunk.chunk_beg); } }; // This class contains chunks that are sorted by the beginning position. // This class hides how the chunks are actually stored (map, list ,etc), // so they can be interchanged. class SortedChunkList { public: // Returns the first chunk in the list and removes it. Chunk pop(); bool insert(const Chunk& chunkToInsert); void clear(); bool empty(); bool mergeOverlapping(); private: std::map chunkList; }; class IndexBase { public: IndexBase(); virtual ~IndexBase(); /// Reset the member data for a new index file. virtual void resetIndex(); // Read & parse the specified index file. /// \param filename the bam index file to be read. /// \return the status of the read. virtual StatGenStatus::Status readIndex(const char* filename) = 0; /// Get the number of references in this index. /// \return number of references int32_t getNumRefs() const; // Returns the minimum offset of records that cross the 16K block that // contains the specified position for the given reference id. bool getMinOffsetFromLinearIndex(int32_t refID, uint32_t position, uint64_t& minOffset) const; protected: const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins // Maximum allowed position (inclusive 512MB - 1) // NOTE: CSI index may not have this same max position. const static uint32_t MAX_POSITION = 536870911; // Number of bits in 1 linear index - how much to shift a position by // to determine which offset into the linear index to look for it. 
const static uint32_t LINEAR_INDEX_SHIFT = 14; class Bin { public: Bin(){chunks = NULL; reset();} ~Bin() {reset();} void reset() { if(chunks != NULL) { free(chunks); chunks = NULL; } n_chunk = 0; bin = NOT_USED_BIN; } uint32_t bin; // The bin id. int32_t n_chunk; // The number of chunks. Chunk* chunks; // The chunks for this bin. static const uint32_t NOT_USED_BIN = 0xFFFFFFFF; }; class Reference { // Add one to the max since there may now be an extra bin containing // the mapped/unmapped counts. public: static const int32_t UNKNOWN_MAP_INFO = -1; Reference(){ioffsets = NULL; reset();} ~Reference(){reset();} void reset() { bins.clear(); if(ioffsets != NULL) { free(ioffsets); ioffsets = NULL; } n_bin = 0; n_intv = 0; minChunkOffset = UNSET_MIN_CHUNK_OFFSET; maxChunkOffset = 0; n_mapped = UNKNOWN_MAP_INFO; n_unmapped = UNKNOWN_MAP_INFO; } int32_t n_bin; // The number of bins. int32_t n_intv; // Number of intervals. std::vector bins; // The bins for this reference. uint64_t* ioffsets; // Offsets of intervals first alignments uint64_t minChunkOffset; uint64_t maxChunkOffset; int32_t n_mapped; // Number of mapped reads. int32_t n_unmapped; // Number of unmapped reads. static const uint64_t UNSET_MIN_CHUNK_OFFSET = 0xFFFFFFFFFFFFFFFFULL; }; // Set bins in the region to 1 and all other bins to 0. // start is incluive, end is exclusive. static void getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1]); // Number of reference sequences. int32_t n_ref; // The references. std::vector myRefs; }; #endif libStatGen-1.0.14/general/InplaceMerge.cpp000066400000000000000000000025231254730101300203030ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "InplaceMerge.h" #if defined(TEST) #include "iostream" #include "Generic.h" int main(int argc, const char **argv) { int a[] = {1,2,3,4,5}; int b[] = {2,4,6,7,10}; int c[] = {3,5,8,10,11}; std::vector z(15); std::copy(a, a+5, z.begin()); std::copy(b, b+5, z.begin() + 5); std::copy(c, c+5, z.begin() + 10); std::vector indeces, counts; indeces.push_back(0); indeces.push_back(5); indeces.push_back(10); counts.push_back(5); counts.push_back(5); counts.push_back(5); inplace_merge(indeces, counts, 0, 3, z); std::cout << z; } #endif libStatGen-1.0.14/general/InplaceMerge.h000066400000000000000000000052331254730101300177510ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef _INPLACE_MERGE_H #define _INPLACE_MERGE_H #include #if defined(DEBUG_INPLACE_MERGE) #include "Generic.h" #include #endif #include #include // // given a partially ordered vector of values, use // inplace_merge to merge the ordered subsets together in some // reasonable fashion. 
// // On output, values is sorted in ascending order. // // the counts vector is also modified, the result being // undefined, except that counts[0] == values.size() at final exit. // template void inplace_merge( std::vector &indeces, std::vector &counts, int first, int last, std::vector &values) { if (first == (last)) return; // empty set -> no merge if (first == (last-1)) return; // only one set present -> no merge // here we see if we have non-adjacent sets to merge, // if so, do them independently, then we can do a final // merge next if (first != (last - 2)) { int middle = (first + last) / 2; inplace_merge(indeces, counts, middle, last, values); #if defined(DEBUG_INPLACE_MERGE) std::cout << values; #endif inplace_merge(indeces, counts, first, middle, values); #if defined(DEBUG_INPLACE_MERGE) std::cout << values; #endif // get ready to drop through to below code which will // merge our two merged subsets last = middle + 1; } // inplace_merge just two adjacent sets typename std::vector::iterator startIterator = values.begin()+indeces[first]; typename std::vector::iterator middleIterator = values.begin() + indeces[last-1]; typename std::vector::iterator endIterator = values.begin() + indeces[last-1] + counts[last - 1]; std::inplace_merge(startIterator, middleIterator, endIterator); counts[first] += counts[last - 1]; #if defined(DEBUG_INPLACE_MERGE) std::cout << values; #endif return; } #endif libStatGen-1.0.14/general/Input.cpp000066400000000000000000000074471254730101300170610ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Input.h" #include "Error.h" #include "Constant.h" #include #include int InputPromptWidth = 25; static bool safe_gets(char * buffer, int n) { buffer[0] = 0; bool success = (fgets(buffer, n, stdin) != NULL); for (char * ptr = buffer; *ptr != 0; ptr++) if (*ptr == '\n') *ptr = 0; return success; } void Input(const char * prompt, int & n, int _default) { char buffer[BUFSIZE]; int success; do { printf("%*s [%8d]: ", InputPromptWidth, prompt, _default); safe_gets(buffer, BUFSIZE); success = sscanf(buffer, "%d", &n); if (success == EOF) n = _default; } while (success == 0); } void Input(const char * prompt, char & ch, char _default) { char buffer[BUFSIZE]; int success; do { printf("%*s [%8c]: ", InputPromptWidth, prompt, _default); safe_gets(buffer, BUFSIZE); success = sscanf(buffer, "%c", &ch); if (success == EOF) ch = _default; } while (success == 0); } void Input(const char * prompt, double & d, double _default) { char buffer[BUFSIZE]; int success; do { printf("%*s [%8.2f]: ", InputPromptWidth, prompt, _default); safe_gets(buffer, BUFSIZE); success = sscanf(buffer, "%lf", &d); if (success == EOF) d = _default; } while (success == 0); } void Input(const char * prompt, bool & b, bool _default) { char buffer[BUFSIZE]; int success; char c; do { printf("%*s [%8s]: ", InputPromptWidth, prompt, _default ? 
"Y/n" : "y/N"); safe_gets(buffer, BUFSIZE); success = sscanf(buffer, "%c", &c); if (success == EOF) b = _default; else switch (c) { case 'y' : case 'Y' : b = true; break; case 'n' : case 'N' : b = false; break; default : success = 0; } } while (success == 0); } void Input(const char * prompt, char * s, const char * _default) { char buffer[BUFSIZE]; int success; do { printf("%*s [%8s]: ", InputPromptWidth, prompt, _default); safe_gets(buffer, BUFSIZE); success = sscanf(buffer, " %[^\n]", s); if (success == EOF) strcpy(s, _default); } while (success == 0); } void InputBounds(const char * prompt, int & n, int min, int max, int _default) { Input(prompt, n, _default); while ((n < min) || (n > max)) { printf("\n*** Input value must be between %d and %d ***\n", min, max); Input(prompt, n, _default); } } void InputBounds(const char * prompt, double & d, double min, double max, double _default) { Input(prompt, d, _default); while ((d < min) || (d > max)) { printf("\n*** Input value must be between %.2f and %.2f ***\n", min, max); Input(prompt, d, _default); } } libStatGen-1.0.14/general/Input.h000066400000000000000000000025131254730101300165130ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

#ifndef __INPUT_H__
#define __INPUT_H__

// Interactive prompts.  Each Input overload takes the prompt text, a
// reference (or buffer) that receives the answer, and a default value.
void Input(const char * prompt, int & n, int _default = 0);
void Input(const char * prompt, double & d, double _default = 0.0);
void Input(const char * prompt, char & c, char _default = 'A');
void Input(const char * prompt, char * s, const char * _default = "");
// Note: unlike the other overloads, the bool version has no default
// argument, so the caller must always supply _default.
void Input(const char * prompt, bool & b, bool _default);

// Variants of Input that additionally take a min and max bound.
void InputBounds(const char * prompt, int & n,
                 int min, int max, int _default = 0);
void InputBounds(const char * prompt, double & d,
                 double min, double max, double _default = 0);

// Prompt width setting shared with the implementation file.
extern int InputPromptWidth;

#endif
libStatGen-1.0.14/general/InputFile.cpp000066400000000000000000000273211254730101300176520ustar00rootroot00000000000000/*
 * Copyright (C) 2010-2012 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ #include "InputFile.h" #include "StringBasics.h" #include "GzipHeader.h" #include "BgzfFileType.h" #include "BgzfFileTypeRecovery.h" #include "GzipFileType.h" #include "UncompressedFileType.h" #include InputFile::InputFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode) { // XXX duplicate code myAttemptRecovery = false; myFileTypePtr = NULL; myBufferIndex = 0; myCurrentBufferSize = 0; myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; myFileBuffer = new char[myAllocatedBufferSize]; myFileName.clear(); openFile(filename, mode, compressionMode); } int InputFile::readTilChar(const std::string& stopChars, std::string& stringRef) { int charRead = 0; size_t pos = std::string::npos; // Loop until the character was not found in the stop characters. while(pos == std::string::npos) { charRead = ifgetc(); // First Check for EOF. If EOF is found, just return -1 if(charRead == EOF) { return(-1); } // Try to find the character in the stopChars. pos = stopChars.find(charRead); if(pos == std::string::npos) { // Didn't find a stop character and it is not an EOF, // so add it to the string. stringRef += charRead; } } return(pos); } int InputFile::readTilChar(const std::string& stopChars) { int charRead = 0; size_t pos = std::string::npos; // Loop until the character was not found in the stop characters. while(pos == std::string::npos) { charRead = ifgetc(); // First Check for EOF. If EOF is found, just return -1 if(charRead == EOF) { return(-1); } // Try to find the character in the stopChars. pos = stopChars.find(charRead); } return(pos); } int InputFile::discardLine() { int charRead = 0; // Loop until the character was not found in the stop characters. while((charRead != EOF) && (charRead != '\n')) { charRead = ifgetc(); } // First Check for EOF. 
If EOF is found, just return -1 if(charRead == EOF) { return(-1); } return(0); } int InputFile::readLine(std::string& line) { int charRead = 0; while(!ifeof()) { charRead = ifgetc(); if(charRead == EOF) { return(-1); } if(charRead == '\n') { return(0); } line += charRead; } // Should never get here. return(-1); } int InputFile::readTilTab(std::string& field) { int charRead = 0; while(!ifeof()) { charRead = ifgetc(); if(charRead == EOF) { return(-1); } if(charRead == '\n') { return(0); } if(charRead == '\t') { return(1); } field += charRead; } return(-1); } #ifdef __ZLIB_AVAILABLE__ // Open a file. Called by the constructor. // Returns true if the file was successfully opened, false otherwise. bool InputFile::openFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode) { // // if recovering, we don't want to issue big readaheads, since // that interferes with the decompression - we only want to // decompress one at a time, and handle the exceptions immediately // rather than at some indeterminate point in time. // if(myAttemptRecovery) { bufferReads(1); } // If a file is for write, just open a new file. if (mode[0] == 'w' || mode[0] == 'W') { openFileUsingMode(filename, mode, compressionMode); } else { // Check if reading from stdin. if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0)) { // Reading from stdin, open it based on the // compression mode. openFileUsingMode(filename, mode, compressionMode); } else { // Not from stdin, so determine the file type. // Open the file read only to determine file type. UncompressedFileType file(filename, "r"); // If the file could not be opened, either create a new one or // return failure. if (!file.isOpen()) { // If the mode is for read, then the file must exist, otherwise, // create a new file. if (mode[0] == 'r' || mode[0] == 'R') { // File must exist. if (myFileTypePtr != NULL) { delete myFileTypePtr; myFileTypePtr = NULL; } // Return false, was not opened. 
return false; } else { openFileUsingMode(filename, mode, compressionMode); } } else { // File was successfully opened, so try to determine the // filetype from the file. // Read the file to see if it a gzip file. GzipHeader gzipHeader; bool isGzip = gzipHeader.readHeader(file); // The file header has been read, so close the file, so it can // be re-opened as the correct type. file.close(); if (isGzip) { // This file is a gzip file. // Check to see if it is BGZF Compression. if (gzipHeader.isBgzfFile()) { // This file has BGZF Compression, so set the file // pointer. if(myAttemptRecovery) { // NB: this reader will throw std::runtime_error when it recovers myFileTypePtr = new BgzfFileTypeRecovery(filename, mode); } else { // use the standard bgzf reader (samtools) myFileTypePtr = new BgzfFileType(filename, mode); } } else { // Not BGZF, just a normal gzip. myFileTypePtr = new GzipFileType(filename, mode); } } else { // The file is a uncompressed, uncompressed file, // so set the myFileTypePtr accordingly. myFileTypePtr = new UncompressedFileType(filename, mode); } } } } if(myFileTypePtr == NULL) { return(false); } if (!myFileTypePtr->isOpen()) { // The file was not opened, so delete the pointer and set to null. delete myFileTypePtr; myFileTypePtr = NULL; return false; } if(myAllocatedBufferSize == 1) { myFileTypePtr->setBuffered(false); } else { myFileTypePtr->setBuffered(true); } myFileName = filename; return true; } // Open a file. This method will open a file with the specified name and // mode with the fileTypePtr associated with the specified compressionMode. void InputFile::openFileUsingMode(const char * filename, const char * mode, ifileCompression compressionMode) { switch (compressionMode) { case GZIP: // Gzipped. myFileTypePtr = new GzipFileType(filename, mode); break; case BGZF: // // BGZF compression - recovery is possible, so use // Bgzf recovery reader if asked. 
// if(myAttemptRecovery && ((mode[0] == 'r') || (mode[0] == 'R'))) { // NB: this reader will throw std::runtime_error when it recovers myFileTypePtr = new BgzfFileTypeRecovery(filename, mode); } else { myFileTypePtr = new BgzfFileType(filename, mode); } break; case UNCOMPRESSED: myFileTypePtr = new UncompressedFileType(filename, mode); break; case InputFile::DEFAULT: default: // Check the extension. If it is ".gz", treat as gzip. // otherwise treat it as UNCOMPRESSED. int lastchar = 0; while (filename[lastchar] != 0) lastchar++; if ((lastchar >= 3 && filename[lastchar - 3] == '.' && filename[lastchar - 2] == 'g' && filename[lastchar - 1] == 'z')) { // .gz files files should be gzipped. myFileTypePtr = new GzipFileType(filename, mode); } else { // Create an uncompressed file. myFileTypePtr = new UncompressedFileType(filename, mode); } break; } if(myFileTypePtr == NULL) { return; } if(myAllocatedBufferSize == 1) { myFileTypePtr->setBuffered(false); } else { myFileTypePtr->setBuffered(true); } } #else // No zlib, so just treat all files as std files. // Open a file. Called by the constructor. // Returns true if the file was successfully opened, false otherwise. bool InputFile::openFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode) { // No zlib, so it is a uncompressed, uncompressed file. myFileTypePtr = new UncompressedFileType(filename, mode); if(myFileTypePtr == NULL) { return(false); } if (!myFileTypePtr->isOpen()) { // The file was not opened, so delete the pointer and set to null. delete myFileTypePtr; myFileTypePtr = NULL; return false; } if(myAllocatedBufferSize == 1) { myFileTypePtr->setBuffered(false); } else { myFileTypePtr->setBuffered(true); } myFileName = filename; return true; } #endif InputFile::~InputFile() { delete myFileTypePtr; myFileTypePtr = NULL; if(myFileBuffer != NULL) { delete[] myFileBuffer; myFileBuffer = NULL; } } int ifprintf(IFILE output, const char * format, ...) 
{ String buffer; va_list ap; va_start(ap, format); buffer.vprintf(format, ap); va_end(ap); return ::ifwrite(output, (const char *) buffer, buffer.Length()); } InputFile& operator << (InputFile& stream, double num) { String val; val = num; stream << val; return(stream); } InputFile& operator << (InputFile& stream, int num) { String val; val = num; stream << val; return(stream); } InputFile& operator << (InputFile& stream, unsigned int num) { String val; val = num; stream << val; return(stream); } libStatGen-1.0.14/general/InputFile.h000066400000000000000000000712321254730101300173170ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /*! \file */ #ifndef __INPUTFILE_H__ #define __INPUTFILE_H__ #include #include #include #include #include "FileType.h" /// Class for easily reading/writing files without having to worry about /// file type (uncompressed, gzip, bgzf) when reading. /// It hides the low level file operations/structure from the user, allowing /// them to generically open and operate on a file using the same /// interface without knowing the file format (standard uncompressed, /// gzip, or bgzf). For writing, the user must specify the file type. /// There is a typedef IFILE which is InputFile* and setup to mimic FILE /// including global methods that take IFILE as a parameter. 
class InputFile { bool myAttemptRecovery; // use recovery techniques if possible public: /// Compression to use when writing a file & decompression used when /// reading a file from stdin. Any other read checks the file to determine /// how to uncompress it. enum ifileCompression { DEFAULT, ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED. UNCOMPRESSED, ///< uncompressed file. GZIP, ///< gzip file. BGZF ///< bgzf file. }; /// Default constructor InputFile() { myAttemptRecovery = false; myFileTypePtr = NULL; myBufferIndex = 0; myCurrentBufferSize = 0; // Default to buffer. myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; myFileBuffer = new char[myAllocatedBufferSize]; myFileName.clear(); } /// Destructor ~InputFile(); /// Constructor for opening a file. /// \param filename file to open /// \param mode same format as fopen: "r" for read & "w" for write. /// \param compressionMode set the type of file to open for writing or /// for reading from stdin (when reading files, the compression type is /// determined by reading the file). InputFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode = InputFile::DEFAULT); /// Set the buffer size for reading from files so that bufferSize bytes /// are read at a time and stored until accessed by another read call. /// This improves performance over reading the file small bits at a time. /// Buffering reads disables the tell call for bgzf files. /// Any previous values in the buffer will be deleted. /// \param bufferSize number of bytes to read/buffer at a time, /// turn off read buffering by setting bufferSize = 1; inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE) { // If the buffer size is the same, do nothing. if(bufferSize == myAllocatedBufferSize) { return; } // Delete the previous buffer. 
if(myFileBuffer != NULL) { delete[] myFileBuffer; } myBufferIndex = 0; myCurrentBufferSize = 0; // The buffer size must be at least 1 so one character can be // read and ifgetc can just assume reading into the buffer. if(bufferSize < 1) { bufferSize = 1; } myFileBuffer = new char[bufferSize]; myAllocatedBufferSize = bufferSize; if(myFileTypePtr != NULL) { if(bufferSize == 1) { myFileTypePtr->setBuffered(false); } else { myFileTypePtr->setBuffered(true); } } } /// Disable read buffering. inline void disableBuffering() { bufferReads(1); if(myFileTypePtr != NULL) { myFileTypePtr->setBuffered(false); } } /// Close the file. /// \return status of the close (0 is success). inline int ifclose() { if (myFileTypePtr == NULL) { return EOF; } int result = myFileTypePtr->close(); delete myFileTypePtr; myFileTypePtr = NULL; myFileName.clear(); return result; } /// Read size bytes from the file into the buffer. /// \param buffer pointer to memory at least size bytes big to write the /// data into. /// \param size number of bytes to be read /// \return number of bytes read, if it is not equal to size, /// there was either an error or the end of the file was reached, use /// ifeof to determine which case it was. inline int ifread(void * buffer, unsigned int size) { // There are 2 cases: // 1) There are already size available bytes in buffer. // 2) There are not size bytes in buffer. // Determine the number of available bytes in the buffer. unsigned int availableBytes = myCurrentBufferSize - myBufferIndex; int returnSize = 0; // Case 1: There are already size available bytes in buffer. if (size <= availableBytes) { // Just copy from the buffer, increment the index and return. memcpy(buffer, myFileBuffer+myBufferIndex, size); // Increment the buffer index. myBufferIndex += size; returnSize = size; } // Case 2: There are not size bytes in buffer. else { // Check to see if there are some bytes in the buffer. 
if (availableBytes > 0) { // Size > availableBytes > 0 // Copy the available bytes into the buffer. memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes); } // So far availableBytes have been copied into the read buffer. returnSize = availableBytes; // Increment myBufferIndex by what was read. myBufferIndex += availableBytes; unsigned int remainingSize = size - availableBytes; // Check if the remaining size is more or less than the // max buffer size. if(remainingSize < myAllocatedBufferSize) { // the remaining size is not the full buffer, but read // a full buffer worth of data anyway. myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize); // Check for an error. if(myCurrentBufferSize <= 0) { // No more data was successfully read, so check to see // if any data was copied to the return buffer at all. if( returnSize == 0) { // No data has been copied at all into the // return read buffer, so just return the value // returned from readFromFile. returnSize = myCurrentBufferSize; // Otherwise, returnSize is already set to the // available bytes that was already copied (so no // else statement is needed). } // Set myBufferIndex & myCurrentBufferSize to 0. myCurrentBufferSize = 0; myBufferIndex = 0; } else { // Successfully read more data. // Check to see how much was copied. int copySize = remainingSize; if(copySize > myCurrentBufferSize) { // Not the entire requested amount was read // (either from EOF or there was a partial read due to // an error), so set the copySize to what was read. copySize = myCurrentBufferSize; } // Now copy the rest of the bytes into the buffer. memcpy((char*)buffer+availableBytes, myFileBuffer, copySize); // set the buffer index to the location after what we are // returning as read. myBufferIndex = copySize; returnSize += copySize; } } else { // More remaining to be read than the max buffer size, so just // read directly into the output buffer. 
int readSize = readFromFile((char*)buffer + availableBytes, remainingSize); // Already used the buffer, so "clear" it. myCurrentBufferSize = 0; myBufferIndex = 0; if(readSize <= 0) { // No more data was successfully read, so check to see // if any data was copied to the return buffer at all. if(returnSize == 0) { // No data has been copied at all into the // return read buffer, so just return the value // returned from readFromFile. returnSize = readSize; // Otherwise, returnSize is already set to the // available bytes that was already copied (so no // else statement is needed). } } else { // More data was read, so increment the return count. returnSize += readSize; } } } return(returnSize); } /// Read until the specified characters, returning which character was /// found causing the stop, -1 returned for EOF, storing the other read /// characters into the specified string. /// Note: If stopChars is just '\n', readLine is faster and if /// stopChars is just '\n' and '\t', readTilTab is faster. /// \param stopChars characters to stop reading when they are hit. /// \param stringRef reference to a string that the read characters should /// be apppended to (does not include the stopchar). /// \return index of the character in stopChars that caused it to stop /// reading or -1 for EOF. int readTilChar(const std::string& stopChars, std::string& stringRef); /// Read until the specified characters, returning which character was /// found causing the stop, -1 returned for EOF, dropping all read chars. /// Note: If stopChars is just '\n', discardLine is faster. /// \param stopChars characters to stop reading when they are hit. /// \return index of the character in stopChars that caused it to stop /// reading or -1 for EOF. int readTilChar(const std::string& stopChars); /// Read until the end of the line, discarding the characters, /// returning -1 returned for EOF and returning 0 if the end of the line /// was found. 
/// \return 0 if the end of the line was found before EOF or -1 for EOF. int discardLine(); /// Read, appending the characters into the specified string until new /// line or EOF is found, returning -1 if EOF is found first and 0 if new /// line is found first. The new line and EOF are not written into the /// specified string. /// \param line reference to a string that the read characters should /// be apppended to (does not include the new line or eof). /// \return 0 if new line and -1 for EOF. int readLine(std::string& line); /// Read, appending the characters into the specified string until tab, new /// line, or EOF is found, returning -1 if EOF is found first, 0 if new /// line is found first, or 1 if a tab is found first. The tab, new line, /// and EOF are not written into the specified string. /// \param field reference to a string that the read characters should /// be apppended to (does not include the tab, new line, or eof). /// \return 1 if tab is found, 0 if new line, and -1 for EOF. int readTilTab(std::string& field); /// Get a character from the file. Read a character from the internal /// buffer, or if the end of the buffer has been reached, read from the /// file into the buffer and return index 0. /// \return character that was read or EOF. inline int ifgetc() { if (myBufferIndex >= myCurrentBufferSize) { // at the last index, read a new buffer. myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize); myBufferIndex = 0; // If the buffer index is still greater than or equal to the // myCurrentBufferSize, then we failed to read the file - return EOF. // NB: This only needs to be checked when myCurrentBufferSize // is changed. Simplify check - readFromFile returns zero on EOF if (myCurrentBufferSize == 0) { return(EOF); } } return(myFileBuffer[myBufferIndex++]); } /// Get a line from the file. 
/// \param buffer the buffer into which data is to be placed /// \param max the maximum size of the buffer, in bytes /// \return true if the last character read was an EOF inline bool ifgetline(void *voidBuffer, size_t max) { int ch; char *buffer = (char *) voidBuffer; while( (ch=ifgetc()) != '\n' && ch != EOF) { *buffer++ = ch; if((--max)<2) { // truncate the line, so drop remainder while( (ch=ifgetc()) && ch != '\n' && ch != EOF) { } break; } } *buffer++ = '\0'; return ch==EOF; } /// Reset to the beginning of the file. inline void ifrewind() { // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate // clearing the buffer and call rewind to move to the beginning of the // file. if (myFileTypePtr == NULL) { // No pointer, so nothing to rewind. return; } myCurrentBufferSize = 0; myBufferIndex = 0; myFileTypePtr->rewind(); } /// Check to see if we have reached the EOF. /// \return 0 if not EOF, any other value means EOF. inline int ifeof() const { // Not EOF if we are not at the end of the buffer. if (myBufferIndex < myCurrentBufferSize) { // There are still available bytes in the buffer, so NOT EOF. return false; } else { if (myFileTypePtr == NULL) { // No myFileTypePtr, so not eof (return 0). return 0; } // exhausted our buffer, so check the file for eof. return myFileTypePtr->eof(); } } /// Write the specified buffer into the file. /// \param buffer buffer containing size bytes to write to the file. /// \param size number of bytes to write /// \return number of bytes written /// We do not buffer the write call, so just leave this as normal. inline unsigned int ifwrite(const void * buffer, unsigned int size) { if (myFileTypePtr == NULL) { // No myFileTypePtr, so return 0 - nothing written. return 0; } return myFileTypePtr->write(buffer, size); } /// Returns whether or not the file was successfully opened. /// \return true if the file is open, false if not. inline bool isOpen() const { // It is open if the myFileTypePtr is set and says it is open. 
if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen()) { return true; } // File was not successfully opened. return false; } /// Get current position in the file. /// \return current position in the file, -1 indicates an error. inline int64_t iftell() { if (myFileTypePtr == NULL) { // No myFileTypePtr, so return false - could not seek. return -1; } int64_t pos = myFileTypePtr->tell(); pos -= (myCurrentBufferSize - myBufferIndex); return(pos); } /// Seek to the specified offset from the origin. /// \param offset offset into the file to move to (must be from a tell call) /// \param origin can be any of the following: /// Note: not all are valid for all filetypes. /// SEEK_SET - Beginning of file /// SEEK_CUR - Current position of the file pointer /// SEEK_END - End of file /// \return true on successful seek and false on a failed seek. inline bool ifseek(int64_t offset, int origin) { if (myFileTypePtr == NULL) { // No myFileTypePtr, so return false - could not seek. return false; } // TODO - may be able to seek within the buffer if applicable. // Reset buffering since a seek is being done. myBufferIndex = 0; myCurrentBufferSize = 0; return myFileTypePtr->seek(offset, origin); } /// Get the filename that is currently opened. /// \return filename associated with this class const char* getFileName() const { return(myFileName.c_str()); } /// Enable (default) or disable recovery. /// /// When true, we can attach a myFileTypePtr /// that implements a recovery capable decompressor. /// This requires that the caller be able to catch /// the exception XXX "blah blah blah". /// void setAttemptRecovery(bool flag = false) { myAttemptRecovery = flag; } bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length) { if(myFileTypePtr==NULL) return false; return myFileTypePtr->attemptRecoverySync(checkSignature, length); } // Open a file. Called by the constructor. // Returns true if the file was successfully opened, false otherwise. 
bool openFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode); protected: // Read into a buffer from the file. Since the buffer is passed in and // this would bypass the myFileBuffer used by this class, this method must // be protected. inline int readFromFile(void * buffer, unsigned int size) { // If no myFileTypePtr, return 0 - nothing read. if (myFileTypePtr == NULL) { return 0; } return myFileTypePtr->read(buffer, size); } #ifdef __ZLIB_AVAILABLE__ // Only necessary with zlib to determine what file type on a new // file. Without zlib, there are only uncompressed files, so a special // method is not needed to determine the type of file to open. // Open a file. This method will open a file with the specified name and // mode with the fileTypePtr associated with the specified compressionMode. void openFileUsingMode(const char* filename, const char* mode, InputFile::ifileCompression compressionMode); #endif // The size of the buffer used by this class. static const unsigned int DEFAULT_BUFFER_SIZE = 65536; // Pointer to a class that interfaces with different file types. FileType* myFileTypePtr; unsigned int myAllocatedBufferSize; // Buffer used to do large reads rather than 1 by 1 character reads // from the file. The class is then managed to iterate through the buffer. char* myFileBuffer; // Current index into the buffer. Used to track where we are in reading the // file from the buffer. int myBufferIndex; // Current number of entries in the buffer. Used to ensure that // if a read did not fill the buffer, we stop before hitting the // end of what was read. int myCurrentBufferSize; std::string myFileName; }; /// Define IFILE as a pointer to an InputFile object. typedef InputFile* IFILE; /// Open a file with the specified name and mode, using a filename of "-" to /// indicate stdin/stdout. /// \param filename file to open ("-" meands stdin/stdout) /// \param mode same format as fopen: "r" for read & "w" for write. 
/// \param compressionMode set the type of file to open for writing or /// for reading from stdin (when reading files not from stdin, the compression /// type is determined by reading the file). /// \return IFILE - pointer to the InputFile object that has been opened. inline IFILE ifopen(const char * filename, const char * mode, InputFile::ifileCompression compressionMode = InputFile::DEFAULT) { IFILE file = new InputFile(filename, mode, compressionMode); if (!file->isOpen()) { // Not open, so delete the file, and return null. delete file; file = NULL; } return file; } /// Close the file. /// \param file file to be closed - IFILE is a pointer to an InputFile object /// \return status of the close (0 is success or if NULL is passed in). inline int ifclose(IFILE &file) { if(file == NULL) { // NULL Pointer passed in, so return 0, since no file is open, so // does not need to be closed. return(0); } int result = file->ifclose(); delete file; file = NULL; return(result); } /// Read up to size bytes from the file into the buffer. /// \param file file to be read - IFILE is a pointer to an InputFile object /// \param buffer pointer to memory at least size bytes big to write the /// data into. /// \param size number of bytes to be read /// \return number of bytes read inline unsigned int ifread(IFILE file, void * buffer, unsigned int size) { if(file == NULL) { // No file was passed in, so 0 bytes were read. return(0); } return(file->ifread(buffer, size)); } /// Get a character from the file. Read a character from the internal /// buffer, or if the end of the buffer has been reached, read from the /// file into the buffer and return index 0. /// \param file file to be read - IFILE is a pointer to an InputFile object /// \return character that was read or EOF. inline int ifgetc(IFILE file) { if(file == NULL) { // return eof since there is no file. return(EOF); } return(file->ifgetc()); } /// Get a line from the file. 
/// \param file file to be read - IFILE is a pointer to an InputFile object /// \param buffer the buffer into which data is to be placed /// \param max the maximum size of the buffer, in bytes /// \return true if the last character read was an EOF inline bool ifgetline(IFILE file, void *buffer, size_t max) { if(file == NULL) { // return eof since there is no file. return(true); } return(file->ifgetline(buffer, max)); } /// Reset to the beginning of the file (cannot be done for stdin/stdout). /// \param file file to be rewound - IFILE is a pointer to an InputFile object inline void ifrewind(IFILE file) { if(file == NULL) { return; } file->ifrewind(); } /// Check to see if we have reached the EOF (returns 0 if not EOF). /// \param file file to be checked - IFILE is a pointer to an InputFile object /// \return 0 if not EOF, any other value means EOF. inline int ifeof(IFILE file) { if(file == NULL) { // No file, so that is considered to be EOF, so return 1. return(1); } return(file->ifeof()); } /// Write the specified number of bytes from the specified buffer into the file. /// \param file file to write to - IFILE is a pointer to an InputFile object /// \param buffer buffer containing size bytes to write to the file. /// \param size number of bytes to write /// \return number of bytes written inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size) { if(file == NULL) { // No file specified, so retun 0 bytes written. return(0); } return(file->ifwrite(buffer, size)); } /// Get current position in the file. Can be fed back into ifseek. /// \param file file to perform tell on - IFILE is a pointer to an InputFile object /// \return current position in the file, -1 indicates an error. inline int64_t iftell(IFILE file) { if(file == NULL) { return(-1); } return (file->iftell()); } /// Seek to the specified position (result from an iftell), but cannot /// be done for stdin/stdout. 
/// \param file file to perform seek on - IFILE is a pointer to an InputFile object /// \param offset offset into the file to move to (must be from a tell call) /// \param origin can be any of the following: /// Note: not all are valid for all filetypes. /// SEEK_SET - Beginning of file /// SEEK_CUR - Current position of the file pointer /// SEEK_END - End of file /// \return true on successful seek and false on a failed seek. inline bool ifseek(IFILE file, int64_t offset, int origin) { if(file == NULL) { // Could not see since no file was specified. return(false); } return (file->ifseek(offset, origin)); } /// Write to a file using fprintf format. /// \param file file to write to - IFILE is a pointer to an InputFile object /// \param format printf format for writing, followed by parameters. /// \return number of bytes written int ifprintf(IFILE output, const char * format, ...); /// Read a line from a file using streaming. /// Will not fail when the file hits EOF, so do not do: while(iFile >> iStr) /// unless within your loop you check for ifeof and break. /// Instead, do something like: /// while(!iFile->ifeof() && iFile >> iStr) /// \param stream file to read from - IFILE is a pointer to an InputFile object /// \param str output string containing the line read from the file. inline IFILE operator >> (IFILE stream, std::string &str) { str.clear(); int ch; // not safe... newline handling? while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch); return stream; } /// Write to a file using streaming. /// \param stream file to write to - IFILE is a pointer to an InputFile object /// \param str string containing what should be written to the file. 
inline InputFile& operator << (InputFile& stream, const std::string& str)
{
    // Hand the whole string to ifwrite and complain on stderr if fewer
    // bytes were written than asked for.
    const unsigned int wanted = str.length();
    const unsigned int written = stream.ifwrite(str.c_str(), wanted);
    if(written != wanted)
    {
        std::cerr << "Failed to stream to IFILE, expected "
                  << wanted << " but only wrote "
                  << written << std::endl;
    }
    return stream;
}

/// Write a C string to a file using streaming.
/// A short write is reported on stderr.
/// \param stream file to write to
/// \param str NUL-terminated string that should be written to the file.
/// \return the stream that was passed in.
inline InputFile& operator << (InputFile& stream, const char* str)
{
    const unsigned int wanted = strlen(str);
    const unsigned int written = stream.ifwrite(str, wanted);
    if(written != wanted)
    {
        std::cerr << "Failed to stream to IFILE, expected "
                  << wanted << " but only wrote "
                  << written << std::endl;
    }
    return stream;
}


/// Write a number to a file using streaming.
/// \param stream file to write to
/// \param num number that should be written to the file.
InputFile& operator << (InputFile& stream, double num);

/// Write a number to a file using streaming.
/// \param stream file to write to
/// \param num number that should be written to the file.
InputFile& operator << (InputFile& stream, int num);

/// Write a number to a file using streaming.
/// \param stream file to write to
/// \param num number that should be written to the file.
InputFile& operator << (InputFile& stream, unsigned int num);

/// Write to a file using streaming.
/// \param stream file to write to - IFILE is a pointer to an InputFile object
/// \param ch character that should be written to the file.
inline InputFile& operator << (InputFile& stream, char ch) { unsigned int numWritten = stream.ifwrite(&ch, 1); if(1 != numWritten) { std::cerr << "Failed to stream to IFILE, expected 1, but only wrote " << numWritten << std::endl; } return(stream); } #endif libStatGen-1.0.14/general/IntArray.cpp000066400000000000000000000204161254730101300175020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "IntArray.h" #include "Error.h" #include "Hash.h" #include "Sort.h" #include int IntArray::alloc = 4; IntArray::IntArray(int start_size) { count = start_size; size = (count + alloc) / alloc * alloc; items = new int [size]; } IntArray::IntArray(const IntArray & source) { count = source.count; size = source.size; items = new int [size]; for (int i = 0; i < count; i++) items[i] = source.items[i]; } IntArray::~IntArray() { delete [] items; } void IntArray::Grow(int new_size) { if (new_size > size) { if ((new_size >> 1) >= size) size = (new_size + alloc) / alloc * alloc; else { size = alloc; while (size <= new_size) size *= 2; } int * new_items = new int [size]; for (int i = 0; i < count; i++) new_items[i] = items[i]; delete [] items; items = new_items; } } int IntArray::Append(int value) { Grow(count + 1); items[count++] = value; return count; } int IntArray::Append(const IntArray & rhs) { Grow(count + rhs.count); for (int i = 0; i < rhs.count; i++) items[count + i] = rhs.items[i]; count += rhs.count; return count; } void IntArray::Set(int value) { for (int i = 0; i < count; i++) items[i] = value; } void IntArray::SetSequence(int start, int increment) { for (int i = 0; i < count; i++, start += increment) items[i] = start; } int IntArray::Delete(int index) { count--; if (count - index) memmove(items + index, items + index + 1, sizeof(int) *(count - index)); return count; } void IntArray::InsertAt(int index, int value) { Grow(count + 1); if (count - index) memmove(items + index + 1, items + index, sizeof(int) *(count - index)); items[index] = value; count++; } IntArray & IntArray::operator = (const IntArray & rhs) { Grow(rhs.count); count = rhs.count; for (int i = 0; i < count; i++) items[i] = rhs.items[i]; return *this; } int IntArray::Sum(int start, int end) const { int result = 0; for (int i = start; i <= end; i++) result += items[i]; return result; } double IntArray::dSum(int start, int end) const { double result = 0; for (int i = start; i <= end; i++) 
result += items[i]; return result; } int IntArray::Max(int start, int end) const { if (start >= count) return 0; int result = items[start]; for (int i = start + 1; i <= end; i++) if (result < items[i]) result = items[i]; return result; } int IntArray::Min(int start, int end) const { if (start >= count) return 0; int result = items[start]; for (int i = start + 1; i <= end; i++) if (result > items[i]) result = items[i]; return result; } int IntArray::Find(int value) const { for (int i = 0; i < count; i++) if (value == items[i]) return i; return -1; } int IntArray::BinarySearch(int value) const { int start = 0; int stop = count - 1; while (start <= stop) { int mid = (start + stop) / 2; if (items[mid] == value) return mid; if (items[mid] > value) stop = mid - 1; else start = mid + 1; } return -1; } void IntArray::Zero() { for (int i = 0; i < count; i++) items[i] = 0; } int IntArray::Compare(int * a, int * b) { return *a - *b; } void IntArray::Sort() { QuickSort(items, count, sizeof(int), COMPAREFUNC Compare); } void IntArray::Sort(IntArray & freeRider) { QuickSort2(items, freeRider.items, count, sizeof(int), COMPAREFUNC Compare); } void IntArray::Reverse() { for (int i = 0, j = count - 1; i < j; i++, j--) Swap(i, j); } int IntArray::CountIfGreater(int threshold) const { int result = 0; for (int i = 0; i < count; i++) if (items[i] > threshold) result++; return result; } int IntArray::CountIfGreaterOrEqual(int treshold) const { int result = 0; for (int i = 0; i < count; i++) if (items[i] >= treshold) result++; return result; } void IntArray::Add(int term) { for (int i = 0; i < count; i++) items[i] += term; } void IntArray::Multiply(int factor) { for (int i = 0; i < count; i++) items[i] *= factor; } void IntArray::Divide(int denominator) { for (int i = 0; i < count; i++) items[i] /= denominator; } void IntArray::Stack(const IntArray & a) { int end = count; Dimension(count + a.count); for (int i = 0; i < a.count; i++) items[i + end] = a[i]; } bool IntArray::operator == 
(const IntArray & rhs) const { if (count != rhs.count) return false; for (int i = 0; i < rhs.count; i++) if (items[i] != rhs.items[i]) return false; return true; } bool IntArray::operator != (const IntArray & rhs) const { return !(*this == rhs); } // Check if all values are in ascending or descending order // bool IntArray::isAscending() { for (int i = 1; i < count; i++) if (items[i] < items[i - 1]) return false; return true; } bool IntArray::isDescending() { for (int i = 1; i < count; i++) if (items[i] > items[i - 1]) return false; return true; } void IntArray::Add(const IntArray & v) { if (Length() != v.Length()) error("IntArray::Add - vectors have different lengths\n" "IntArrays - Left[%d] += Right[%d] ", Length(), v.Length()); for (int i = 0; i < Length(); i++) items[i] += v[i]; } int IntArray::InnerProduct(IntArray & v) { if (Length() != v.Length()) error("IntArray::InnerProduct - vectors have different dimensions\n" "IntArrays - Left[%d] * Right[%d] ", Length(), v.Length()); int sum = 0; for (int i = 0; i < Length(); i++) sum += items[i] * v[i]; return sum; } void IntArray::Swap(IntArray & rhs) { int * temp = rhs.items; rhs.items = items; items = temp; int swap = rhs.count; rhs.count = count; count = swap; swap = rhs.size; rhs.size = size; size = swap; } void IntArray::Print(FILE * output) { Print(output, "Array of Integers"); } void IntArray::Print(FILE * output, const char * label) { fprintf(output, "%s [%d elements]: ", label, count); for (int i = 0; i < count; i++) fprintf(output, "%d ", items[i]); fprintf(output, "\n"); } void IntArray::PushIfNew(int value) { for (int i = 0; i < count; i++) if (items[i] == value) return; Push(value); } int IntArray::Product() { int product = 1; for (int i = 0; i < count; i++) product *= items[i]; return product; } double IntArray::DoubleProduct() { double product = 1.0; for (int i = 0; i < count; i++) product *= items[i]; return product; } int IntArray::Hash(int initval) { return hash((unsigned char *) items, sizeof(int) 
* count, initval); } int IntArray::SumProduct(const IntArray & weight) const { if (count != weight.count) error("IntArray::SumProduct called with different sized arrays\n"); int sum = 0; for (int i = 0; i < count; i++) sum += items[i] * weight[i]; return sum; } double IntArray::dSumProduct(const IntArray & weight) const { if (count != weight.count) error("IntArray::dSumProduct called with different sized arrays\n"); double sum = 0.0; for (int i = 0; i < count; i++) sum += items[i] * weight[i]; return sum; } libStatGen-1.0.14/general/IntArray.h000066400000000000000000000123451254730101300171510ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __INTARRAY_H__ #define __INTARRAY_H__ #include class IntArray { private: int * items; int size, count; void Grow(int new_size); static int Compare(int * a, int * b); public: static int alloc; IntArray(int start_size = 0); IntArray(const IntArray & source); ~IntArray(); IntArray & operator = (const IntArray & rhs); int & operator [](int index) { return items[index]; } int operator [](int index) const { return items[index]; } // Suggested by Anthony Berno, 12/28/06, to avoid "ambiguities" that // Visual Studio encountered when handling implicit conversions ... 
int & operator [](char index) { return items[int(index)]; } int operator [](char index) const { return items[int(index)]; } // ... who knows whether Visual Studio makes C++ annoying to encourage C#? int & operator [](double fraction) { return items[(int)(count * fraction)]; } int operator [](double fraction) const { return items[(int)(count * fraction)]; } int Append(int value); int Append(const IntArray & rhs); void Push(int value) { Append(value); } int Pop() { return items[--count]; } int Peek() const { return items[count - 1]; } int &Last() const { return items[count - 1]; } void PushIfNew(int value); // used for maintaining list without duplicates int Delete(int index); void InsertAt(int index, int value); int Find(int value) const; int FastFind(int value) const { return BinarySearch(value); } int BinarySearch(int value) const; void Sort(); void Sort(IntArray & freeRider); // Sorts two arrays simultaneously void Zero(); void Set(int value); void SetSequence(int start = 0, int increment = 1); int Length() const { return count; } void Dimension(int new_count) { Grow(new_count); count = new_count; } void Clear() { count = 0; } int Sum() const { return Sum(0, count - 1); } int Sum(int start) const { return Sum(start, count - 1); } int Sum(int start, int end) const; double dSum() const { return dSum(0, count - 1); } double dSum(int start) const { return dSum(start, count - 1); } double dSum(int start, int end) const; int SumProduct(const IntArray & weight) const; double dSumProduct(const IntArray & weight) const; int Max() const { return Max(0, count - 1); } int Max(int start) const { return Max(start, count - 1); } int Max(int start, int end) const; int Min() const { return Min(0, count - 1); } int Min(int start) const { return Min(start, count - 1); } int Min(int start, int end) const; int Count() const { return count; } int CountIfGreater(int treshold) const; int CountIfGreaterOrEqual(int treshold) const; void Swap(int i, int j) { int tmp = items[i]; items[i] = 
items[j]; items[j] = tmp; } void Reverse(); operator int *() { return items; } void Add(int term); void Subtract(int term) { Add(-term); } void Multiply(int factor); void Divide(int denominator); void Add(const IntArray & rhs); IntArray & operator += (int rhs) { Add(rhs); return *this; } IntArray & operator += (const IntArray & rhs) { Add(rhs); return *this; } IntArray & operator *= (int rhs) { Multiply(rhs); return *this; } IntArray & operator -= (int rhs) { Add(-rhs); return *this; } IntArray & operator /= (int rhs) { Divide(rhs); return *this; } int InnerProduct(IntArray & v); bool operator == (const IntArray & rhs) const; bool operator != (const IntArray & rhs) const; bool isAscending(); bool isDescending(); void Stack(const IntArray & rhs); void Swap(IntArray & rhs); void Print() { Print(stdout); } void Print(const char * label) { Print(stdout, label); } void Print(FILE * output); void Print(FILE * output, const char * label); int Product(); double DoubleProduct(); int Hash(int initval = 0); }; #endif libStatGen-1.0.14/general/IntHash.cpp000066400000000000000000000073171254730101300173140ustar00rootroot00000000000000/* * Copyright (C) 2000-2007 Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "IntHash.h" #include "Error.h" #include IntHash::IntHash(int startsize) { count = 0; size = startsize; mask = startsize - 1; // In this implementation, the size of hash tables must be a power of two if (startsize & mask) error("IntHash: Hash table size must be a power of two.\n"); objects = new bool [size]; keys = new unsigned int [size]; for (unsigned int i = 0; i < size; i++) { objects[i] = false; } }; IntHash::~IntHash() { delete [] objects; delete [] keys; } void IntHash::Clear() { // printf("Clearing...\n"); count = 0; if (size > 16) SetSize(16); for (unsigned int i = 0; i < size; i++) objects[i] = false; } void IntHash::SetSize(int newsize) { int newmask = newsize - 1; bool * newobjects = new bool [newsize]; unsigned int * newkeys = new unsigned int [newsize]; for (int i = 0; i < newsize; i++) { newobjects[i] = false; } if (count) for (unsigned int i = 0; i < size; i++) if (objects[i] != false) { unsigned int key = keys[i]; unsigned int h = key & newmask; while (newobjects[h] != false && newkeys[h] != h) h = (h + 1) & newmask; newkeys[h] = key; newobjects[h] = objects[i]; } delete [] objects; delete [] keys; objects = newobjects; keys = newkeys; size = newsize; mask = newmask; } int IntHash::Add(int key, bool object) { if (count * 2 > size) Grow(); unsigned int h = Iterate(key); while ((objects[h] != false) && (objects[h] != object)) h = ReIterate(key, h); if (objects[h] == false) { // printf("At position %d, inserted %x\n", h, key); keys[h] = key; count++; } objects[h] = object; return h; } int IntHash::Find(int key) { int h = Iterate(key); return objects[h] == false ? -1 : h; } int IntHash::Rehash(int key, int h) { h = ReIterate(key, h); return objects[h] == false ? 
-1 : h; } void IntHash::Delete(unsigned int index) { if (index >= size || objects[index] == false) return; objects[index] = false; count--; if (count * 8 < size && size > 32) Shrink(); else { // rehash the next entries until we find empty slot index = (index + 1) & mask; while (objects[index] != false) { if ((keys[index] & mask) != index) { unsigned int h = Iterate(keys[index]); while ((objects[h] != false) && (objects[h] != objects[index])) h = ReIterate(keys[index], h); if (h != (unsigned int) index) { keys[h] = keys[index]; objects[h] = objects[index]; objects[index] = false; } } index = (index + 1) & mask; } } } libStatGen-1.0.14/general/IntHash.h000066400000000000000000000054451254730101300167610ustar00rootroot00000000000000/* * Copyright (C) 2000-2007 Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ ////////////////////////////////////////////////////////////////////// // libsrc/IntHash.h // (c) 2000-2007 Goncalo Abecasis // // This file is distributed as part of the MaCH source code package // and may not be redistributed in any form, without prior written // permission from the author. Permission is granted for you to // modify this file for your own personal use, but modified versions // must retain this copyright notice and must not be distributed. // // Permission is granted for you to use this file to compile MaCH. // // All computer programs have bugs. Use this file at your own risk. 
// // Monday October 29, 2007 // #ifndef __INTHASH_H__ #define __INTHASH_H__ #include class IntHash { protected: bool * objects; unsigned int * keys; unsigned int count, size; unsigned int mask; public: IntHash(int startsize = 32); virtual ~IntHash(); void Grow() { SetSize(size * 2); } void Shrink() { SetSize(size / 2); } void SetSize(int newsize); void Clear(); int Capacity() const { return size; } int Entries() const { return count; } bool Object(int i) const { return objects[i]; } void SetObject(int i, bool object) { objects[i] = object; } int Add(int key, bool object = true); int Find(int key); int Rehash(int key, int h); IntHash & operator = (const IntHash & rhs); bool operator [](int i) const { return objects[i]; } void Delete(unsigned int index); bool SlotInUse(int index) { return objects[index] != false; } private: unsigned int Iterate(unsigned int key) const { unsigned int h = key & mask; while (objects[h] != false && keys[h] != key) h = (h + 1) & mask; return h; } unsigned int ReIterate(unsigned int key, unsigned int h) const { h = (h + 1) & mask; while (objects[h] != false && keys[h] != key) h = (h + 1) & mask; return h; } }; #endif libStatGen-1.0.14/general/LICENSE.twister000066400000000000000000000034261254730101300177540ustar00rootroot00000000000000Mersenne twister code is included in the file Random.cpp COPYRIGHT NOTICE FOR MERSENNE TWISTER ===================================== Mersenne twister coded by Takuji Nishimura and Makoto Matsumoto. Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. libStatGen-1.0.14/general/LICENSE.txt000066400000000000000000001045141254730101300170720ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. 
We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. 
The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. 
Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 
The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. 
Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. 
You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. 
Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 
If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 
When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. 
If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. 
If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. 
For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>. libStatGen-1.0.14/general/LongHash.h000066400000000000000000000147501254730101300171250ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __LONGHASH_H__ #define __LONGHASH_H__ #include "Error.h" #include #ifdef UINT_MAX #define LH_NOTFOUND (UINT_MAX) #else #define LH_NOTFOUND 0xFFFFFFFF #endif template class LongHash { protected: ObjectT * objects; long long * keys; bool * occupancy; unsigned int count, size; unsigned int mask; bool allowDuplicates; public: LongHash(int startsize = 32) { count = 0; size = startsize; mask = startsize - 1; // In this implementation, the size of hash tables must be a power of two if (startsize & mask) error("LongHash: Hash table size must be a power of two.\n"); occupancy = new bool [size]; objects = new ObjectT [size]; keys = new long long [size]; allowDuplicates = false; for (unsigned int i = 0; i < size; i++) { occupancy[i] = false; } }; ~LongHash() { delete [] occupancy; delete [] objects; delete [] keys; } void Grow() { SetSize(size * 2); } void Shrink() { SetSize(size / 2); } void SetSize(int newsize) { int newmask = newsize - 1; bool * newoccupancy = new bool [newsize]; ObjectT * newobjects = new ObjectT [newsize]; long long * newkeys = new long long [newsize]; for (int i = 0; i < newsize; i++) newoccupancy[i] = false; if (count) for (unsigned int i = 0; i < size; i++) if (occupancy[i] != false) { long long key = keys[i]; unsigned int h = newmask & (unsigned int) key; while (newoccupancy[h] == true && (newkeys[h] != key || allowDuplicates)) h = (h + 1) & newmask; if (newoccupancy[h]) count--; newkeys[h] = key; newobjects[h] = objects[i]; newoccupancy[h] = true; } delete [] occupancy; delete [] objects; delete [] keys; occupancy = newoccupancy; objects = newobjects; keys = newkeys; size = newsize; mask = 
newmask; } void Clear() { count = 0; if (size > 32) SetSize(32); for (unsigned int i = 0; i < size; i++) occupancy[i] = false; } int Capacity() const { return size; } int Entries() const { return count; } ObjectT Object(int i) const { return objects[i]; } ObjectT & Object(int i) { return objects[i]; } void SetObject(int i, ObjectT object) { objects[i] = object; } unsigned int Add(long long key, ObjectT object) { if (count * 2 > size) Grow(); unsigned int h = Iterate(key); while (allowDuplicates && occupancy[h] && objects[h] != object) h = ReIterate(key, h); if (!occupancy[h]) { occupancy[h] = true; keys[h] = key; count++; } objects[h] = object; return h; } unsigned int Find(long long key) { unsigned int h = Iterate(key); return occupancy[h] ? h : LH_NOTFOUND; } unsigned int Rehash(long long key, unsigned int h) { h = ReIterate(key, h); return occupancy[h] ? h : LH_NOTFOUND; } LongHash & operator = (const LongHash & rhs); ObjectT operator [](int i) const { return objects[i]; } ObjectT operator [](unsigned int i) const { return objects[i]; } void Delete(unsigned int index) { if (index >= size || !occupancy[index]) return; occupancy[index] = false; count--; if (count * 8 < size && size > 32) Shrink(); else { // rehash the next entries until we find empty slot index = (index + 1) & mask; while (occupancy[index]) { if ((keys[index] & mask) != index) { unsigned int h = Iterate(keys[index]); while (occupancy[h] && objects[h] != objects[index]) h = ReIterate(keys[index], h); if (h != (unsigned int) index) { keys[h] = keys[index]; occupancy[h] = true; objects[h] = objects[index]; occupancy[index] = false; } } index = (index + 1) & mask; } } } bool SlotInUse(int index) const { return occupancy[index] == true; } bool SlotInUse(unsigned int index) const { return occupancy[index] == true; } // Accessor to get a key. 
long long GetKey(int index) const { return keys[index]; } long long GetKey(const unsigned int index) const { return keys[index]; } void SetAllowDuplicateKeys(bool toggle) { allowDuplicates = toggle; if (count && !allowDuplicates) SetSize(size); } private: unsigned int Iterate(long long key) const { unsigned int h = mask & (unsigned int) key; while (occupancy[h] == true && keys[h] != key) h = (h + 1) & mask; return h; } unsigned int ReIterate(long long key, unsigned int h) const { h = (h + 1) & mask; while (occupancy[h] == true && keys[h] != key) h = (h + 1) & mask; return h; } }; #endif libStatGen-1.0.14/general/LongInt.h000066400000000000000000000121101254730101300167600ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __LONGINT_H__ #define __LONGINT_H__ #ifdef __USE_LONGINT #ifndef __USE_LONG_INT #define __USE_LONG_INT #endif #endif #ifndef __USE_LONG_INT /* longints not enabled */ #define NOTZERO ~0 #define NOTONE ~1 typedef int longint; #else /* longints enabled */ /* GNU C supports long long ... */ #ifdef __GNUC__ #define __USE_LONG_LONG__ #endif /* And so does the Intel Compiler ... */ #ifdef __INTEL_COMPILER #define __USE_LONG_LONG__ #endif /* And the SUN Pro Compiler ... */ #ifdef __SUNPRO_CC #define __USE_LONG_LONG__ #endif /* And the Digital Mars Compiler ... 
*/ #ifdef __DMC__ #ifdef _INTEGRAL_MAX_BITS #if (_INTEGRAL_MAX_BITS >= 64) #define __USE_LONG_LONG__ #endif #endif #endif /* Check for other compilers that support the C99 standard */ #include #ifdef __LLONG_MAX #define __USE_LONG_LONG__ #endif #ifdef __USE_LONG_LONG__ /* If the long long type is supported natively */ #define NOTZERO ~(0ULL) #define NOTONE ~(1ULL) typedef long long longint; #else /* Define a home brew long integer type */ #define NOTZERO longint (~0,~0) #define NOTONE longint (~0,~1) class longint { public: longint() {} longint(unsigned int low) { lo = low; hi = 0; } longint(unsigned int high, unsigned int low) { hi = high; lo = low; } longint(const longint & source) { hi = source.hi; lo = source.lo; } operator int() { return lo; } operator bool() { return lo != 0 || hi != 0; } longint operator ~() { return longint(~hi, ~lo); } longint operator ^(const longint & rhs) { return longint(hi ^ rhs.hi, lo ^ rhs.lo); } longint operator & (const longint & rhs) { return longint(hi & rhs.hi, lo & rhs.lo); } longint operator | (const longint & rhs) { return longint(hi | rhs.hi, lo | rhs.lo); } bool operator != (const longint & rhs) { return lo != rhs.lo || hi != rhs.hi; } bool operator != (unsigned int rhs) { return lo != rhs || hi != 0; } bool operator != (int rhs) { return lo != (unsigned int) rhs || hi != 0; } bool operator == (const longint & rhs) const { return lo == rhs.lo && hi == rhs.hi; } bool operator == (const unsigned int rhs) const { return lo == rhs && hi == 0; } bool operator == (const int rhs) const { return lo == (unsigned int) rhs && hi == 0; } longint & operator = (const longint & rhs) { lo = rhs.lo; hi = rhs.hi; return *this; } longint & operator = (unsigned int rhs) { lo = rhs; hi = 0; return *this; } longint & operator = (int rhs) { lo = rhs; hi = 0; return *this; } longint & operator ^= (const longint & rhs) { hi ^= rhs.hi; lo ^= rhs.lo; return *this; } longint & operator |= (const longint & rhs) { hi |= rhs.hi; lo |= rhs.lo; return 
*this; } longint operator &= (const longint & rhs) { hi &= rhs.hi; lo &= rhs.lo; return *this; } longint operator << (int bits) { longint result(*this); result <<= bits; return result; } longint & operator <<= (int bits) { if (bits <= 0) return *this; else { hi = (hi << 1) + ((lo & 0x80000000) != 0); lo <<= 1; return *this <<= bits - 1; } } longint operator >> (int bits) { longint result(*this); result >>= bits; return result; } longint & operator >>= (int bits) { if (bits <= 0) return *this; else { lo = (lo >> 1) + (hi & 1 ? 0x80000000 : 0); hi >>= 1; return *this >>= bits - 1; } } longint operator - (unsigned int rhs) { int high = (rhs > lo) ? hi - 1 : hi; return longint(high, lo - rhs); } longint operator - (int rhs) { int high = ((unsigned int) rhs > lo) ? hi - 1 : hi; return longint(high, lo - rhs); } private: unsigned int hi, lo; }; #endif /* __GNUC__ */ #endif /* __USE_LONG_INT */ #endif /* __LONGINT_H__ */ libStatGen-1.0.14/general/LongLongCounter.cpp000066400000000000000000000027051254730101300210310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "LongLongCounter.h" LongCounter::LongCounter() : LongHash() { SetAllowDuplicateKeys(false); } void LongCounter::IncrementCount(long long key) { unsigned int slot = Find(key); if (slot == LH_NOTFOUND) Add(key, 1); else if (Object(slot) == -1) Delete(slot); else Object(slot)++; } void LongCounter::DecrementCount(long long key) { unsigned int slot = Find(key); if (slot == LH_NOTFOUND) Add(key, -1); else if (Object(slot) == 1) Delete(slot); else Object(slot)--; } int LongCounter::GetCount(long long key) { unsigned int slot = Find(key); if (slot == LH_NOTFOUND) return 0; else return Object(slot)--; } libStatGen-1.0.14/general/LongLongCounter.h000066400000000000000000000017731254730101300205020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __LONGLONGCOUNTER_H_ #define __LONGLONGCOUNTER_H_ #include "LongHash.h" class LongCounter : public LongHash { public: LongCounter(); void IncrementCount(long long key); void DecrementCount(long long key); int GetCount(long long key); }; #endif libStatGen-1.0.14/general/Makefile000077500000000000000000000045001254730101300167040ustar00rootroot00000000000000UNAME := $(shell uname) ifeq ($(UNAME), Linux) # For Linux, add a couple extra warnings # # trying to strike the right balance on error/warning # handling is difficult, but here we make most everthing # an error, but allow unused results and variables for the # time being. # USER_WARNINGS ?= -Werror $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) #-Wno-strict-overflow # -Wno-unused-variable $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-unused-result" ; fi) endif ifeq ($(UNAME), Darwin) # Mac OS, nothing to add. # -Wall contain -Wunused-variable and -Wunused-result from 'man gcc' endif #USER_COMPILE_VARS = -D_NO_PHONEHOME TOOLBASE=\ BaseAsciiMap \ BaseQualityHelper \ BaseUtilities \ BasicHash \ BgzfFileType \ BgzfFileTypeRecovery \ CharBuffer \ Chromosome \ Cigar \ CigarRoller \ Error \ ErrorHandler \ FileType \ FortranFormat \ GenomeSequence \ GenotypeLists \ glfHandler \ GzipFileType \ GzipHeader \ Hash \ IndexBase \ Input \ InputFile \ IntArray \ IntHash \ LongLongCounter \ MapFunction \ MathMatrix \ MathVector \ MemoryAllocators \ MemoryInfo \ MemoryMapArray \ MemoryMap \ MiniDeflate \ NonOverlapRegions \ Parameters \ PedigreeAlleleFreq \ Pedigree \ PedigreeDescription \ PedigreeFamily \ PedigreeGlobals \ PedigreePerson \ PhoneHome \ QuickIndex \ Random \ ReferenceSequence \ SmithWaterman \ Sort \ STLUtilities \ StatGenStatus \ StringAlias \ StringArray \ StringBasics \ StringHash \ StringMap \ Tabix \ UncompressedFileType SRCONLY=\ PedigreeLoader.cpp \ PedigreeTrim.cpp \ PedigreeTwin.cpp HDRONLY= \ Constant.h \ CSG_MD5.h \ Generic.h \ 
GenomeSequenceHelpers.h \
  GreedyTupleAligner.h \
  InplaceMerge.h \
  LongHash.h \
  LongInt.h \
  MathConstant.h \
  PackedVector.h \
  PedigreeAlleles.h \
  Performance.h \
  ReusableVector.h \
  SimpleStats.h \
  TrimSequence.h \
  UnitTest.h

include ../Makefiles/Makefile.lib

# Source files that contain an embedded self test, i.e. mention
# 'if defined(TEST)'.
CPP_TESTS := $(shell grep -l 'if defined(TEST)' *cpp)

# Build each self-test source with -DTEST and run the resulting binary.
# The binary is run only if the compile succeeded, so a stale test_*
# executable is never run in its place.  The first failure ends the recipe
# with a non-zero status so that make itself reports the error.
.PHONY: selftest
selftest:
	@for i in $(CPP_TESTS); do \
	   if [ "XXX$$i" = XXX ] ;\
	   then \
	       continue; \
	   fi;\
	   (echo "building in self test $$i"; g++ -DTEST -D__STDC_LIMIT_MACROS -o test_$$i $$i -L. -lcsg -lm -lz -lssl && ./test_$$i) ; \
	   if [ $$? -ne 0 ] ; \
	   then \
	       echo "make stopped because of errors." ; \
	       exit 1 ; \
	   fi \
	done
libStatGen-1.0.14/general/Makefile.depends000066400000000000000000000750141254730101300203320ustar00rootroot00000000000000# DO NOT DELETE $(OBJDIR_OPT)/BaseAsciiMap.o: BaseAsciiMap.h StringBasics.h InputFile.h $(OBJDIR_OPT)/BaseAsciiMap.o: FileType.h $(OBJDIR_OPT)/BaseQualityHelper.o: BaseQualityHelper.h $(OBJDIR_OPT)/BaseUtilities.o: BaseUtilities.h BaseAsciiMap.h StringBasics.h $(OBJDIR_OPT)/BaseUtilities.o: InputFile.h FileType.h $(OBJDIR_OPT)/BasicHash.o: BasicHash.h Error.h $(OBJDIR_OPT)/BgzfFileType.o: BgzfFileType.h ../include/bgzf.h FileType.h $(OBJDIR_OPT)/BgzfFileTypeRecovery.o: BgzfFileTypeRecovery.h FileType.h $(OBJDIR_OPT)/CharBuffer.o: CharBuffer.h InputFile.h FileType.h $(OBJDIR_OPT)/Chromosome.o: Chromosome.h GenomeSequence.h MemoryMapArray.h $(OBJDIR_OPT)/Chromosome.o: Generic.h MemoryMap.h BaseAsciiMap.h $(OBJDIR_OPT)/Chromosome.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/Chromosome.o: StringArray.h GenomeSequenceHelpers.h $(OBJDIR_OPT)/Cigar.o: Cigar.h Generic.h StringBasics.h InputFile.h $(OBJDIR_OPT)/Cigar.o: FileType.h STLUtilities.h $(OBJDIR_OPT)/CigarRoller.o: CigarRoller.h Cigar.h Generic.h StringBasics.h $(OBJDIR_OPT)/CigarRoller.o: InputFile.h FileType.h $(OBJDIR_OPT)/Error.o: Error.h PhoneHome.h StringBasics.h InputFile.h $(OBJDIR_OPT)/Error.o: FileType.h $(OBJDIR_OPT)/ErrorHandler.o: ErrorHandler.h PhoneHome.h
StringBasics.h $(OBJDIR_OPT)/ErrorHandler.o: InputFile.h FileType.h $(OBJDIR_OPT)/FileType.o: FileType.h $(OBJDIR_OPT)/FortranFormat.o: FortranFormat.h StringBasics.h InputFile.h $(OBJDIR_OPT)/FortranFormat.o: FileType.h IntArray.h Error.h $(OBJDIR_OPT)/GenomeSequence.o: Error.h Generic.h GenomeSequence.h $(OBJDIR_OPT)/GenomeSequence.o: MemoryMapArray.h MemoryMap.h BaseAsciiMap.h $(OBJDIR_OPT)/GenomeSequence.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/GenomeSequence.o: StringArray.h GenomeSequenceHelpers.h $(OBJDIR_OPT)/GenomeSequence.o: CSG_MD5.h $(OBJDIR_OPT)/GenotypeLists.o: GenotypeLists.h Pedigree.h Constant.h $(OBJDIR_OPT)/GenotypeLists.o: PedigreeAlleles.h LongInt.h PedigreePerson.h $(OBJDIR_OPT)/GenotypeLists.o: PedigreeGlobals.h StringArray.h StringBasics.h $(OBJDIR_OPT)/GenotypeLists.o: InputFile.h FileType.h StringHash.h Hash.h $(OBJDIR_OPT)/GenotypeLists.o: IntArray.h MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/GenotypeLists.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_OPT)/glfHandler.o: glfHandler.h InputFile.h FileType.h $(OBJDIR_OPT)/glfHandler.o: StringBasics.h BaseQualityHelper.h $(OBJDIR_OPT)/GzipFileType.o: GzipFileType.h FileType.h $(OBJDIR_OPT)/GzipHeader.o: GzipHeader.h $(OBJDIR_OPT)/Hash.o: Hash.h $(OBJDIR_OPT)/IndexBase.o: IndexBase.h InputFile.h FileType.h StatGenStatus.h $(OBJDIR_OPT)/IndexBase.o: ErrorHandler.h $(OBJDIR_OPT)/Input.o: Input.h Error.h Constant.h $(OBJDIR_OPT)/InputFile.o: InputFile.h FileType.h StringBasics.h GzipHeader.h $(OBJDIR_OPT)/InputFile.o: BgzfFileType.h ../include/bgzf.h $(OBJDIR_OPT)/InputFile.o: BgzfFileTypeRecovery.h GzipFileType.h $(OBJDIR_OPT)/InputFile.o: UncompressedFileType.h $(OBJDIR_OPT)/IntArray.o: IntArray.h Error.h Hash.h Sort.h Constant.h $(OBJDIR_OPT)/IntHash.o: IntHash.h Error.h $(OBJDIR_OPT)/LongLongCounter.o: LongLongCounter.h LongHash.h Error.h $(OBJDIR_OPT)/MapFunction.o: MapFunction.h MathConstant.h $(OBJDIR_OPT)/MathMatrix.o: MathMatrix.h MathVector.h StringBasics.h 
$(OBJDIR_OPT)/MathMatrix.o: InputFile.h FileType.h Error.h MathConstant.h $(OBJDIR_OPT)/MathMatrix.o: Sort.h Constant.h $(OBJDIR_OPT)/MathVector.o: MathVector.h StringBasics.h InputFile.h $(OBJDIR_OPT)/MathVector.o: FileType.h MathMatrix.h Error.h MathConstant.h $(OBJDIR_OPT)/MathVector.o: Sort.h Constant.h $(OBJDIR_OPT)/MemoryAllocators.o: MemoryAllocators.h $(OBJDIR_OPT)/MemoryInfo.o: MemoryInfo.h StringBasics.h InputFile.h $(OBJDIR_OPT)/MemoryInfo.o: FileType.h $(OBJDIR_OPT)/MemoryMapArray.o: MemoryMapArray.h Generic.h MemoryMap.h $(OBJDIR_OPT)/MemoryMap.o: MemoryMap.h $(OBJDIR_OPT)/MiniDeflate.o: MiniDeflate.h $(OBJDIR_OPT)/NonOverlapRegions.o: NonOverlapRegions.h $(OBJDIR_OPT)/Parameters.o: Parameters.h StringMap.h StringBasics.h $(OBJDIR_OPT)/Parameters.o: InputFile.h FileType.h PhoneHome.h Constant.h $(OBJDIR_OPT)/Parameters.o: MathConstant.h Error.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: PedigreeAlleleFreq.h Pedigree.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: Constant.h PedigreeAlleles.h LongInt.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: PedigreePerson.h PedigreeGlobals.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: PedigreeDescription.h QuickIndex.h $(OBJDIR_OPT)/PedigreeAlleleFreq.o: StringMap.h Error.h $(OBJDIR_OPT)/Pedigree.o: Pedigree.h Constant.h PedigreeAlleles.h LongInt.h $(OBJDIR_OPT)/Pedigree.o: PedigreePerson.h PedigreeGlobals.h StringArray.h $(OBJDIR_OPT)/Pedigree.o: StringBasics.h InputFile.h FileType.h StringHash.h $(OBJDIR_OPT)/Pedigree.o: Hash.h IntArray.h MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/Pedigree.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_OPT)/Pedigree.o: GenotypeLists.h MemoryInfo.h Error.h Sort.h $(OBJDIR_OPT)/PedigreeDescription.o: PedigreeDescription.h PedigreeGlobals.h $(OBJDIR_OPT)/PedigreeDescription.o: Constant.h 
StringArray.h StringBasics.h $(OBJDIR_OPT)/PedigreeDescription.o: InputFile.h FileType.h StringHash.h $(OBJDIR_OPT)/PedigreeDescription.o: Hash.h IntArray.h MathVector.h $(OBJDIR_OPT)/PedigreeDescription.o: PedigreePerson.h PedigreeAlleles.h $(OBJDIR_OPT)/PedigreeDescription.o: LongInt.h MapFunction.h FortranFormat.h $(OBJDIR_OPT)/PedigreeDescription.o: Error.h $(OBJDIR_OPT)/PedigreeFamily.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_OPT)/PedigreeFamily.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_OPT)/PedigreeFamily.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_OPT)/PedigreeFamily.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_OPT)/PedigreeFamily.o: MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/PedigreeFamily.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_OPT)/PedigreeFamily.o: MathConstant.h Error.h $(OBJDIR_OPT)/PedigreeGlobals.o: PedigreeGlobals.h Constant.h StringArray.h $(OBJDIR_OPT)/PedigreeGlobals.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/PedigreeGlobals.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_OPT)/PedigreeGlobals.o: Sort.h Error.h $(OBJDIR_OPT)/PedigreePerson.o: PedigreePerson.h Constant.h PedigreeAlleles.h $(OBJDIR_OPT)/PedigreePerson.o: LongInt.h PedigreeGlobals.h StringArray.h $(OBJDIR_OPT)/PedigreePerson.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/PedigreePerson.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_OPT)/PedigreePerson.o: Error.h $(OBJDIR_OPT)/PhoneHome.o: PhoneHome.h StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/PhoneHome.o: ../include/knetfile.h $(OBJDIR_OPT)/QuickIndex.o: QuickIndex.h MathVector.h StringBasics.h $(OBJDIR_OPT)/QuickIndex.o: InputFile.h FileType.h StringArray.h StringHash.h $(OBJDIR_OPT)/QuickIndex.o: Constant.h Hash.h IntArray.h StringMap.h Error.h $(OBJDIR_OPT)/Random.o: Random.h MathConstant.h Error.h $(OBJDIR_OPT)/ReferenceSequence.o: Error.h Generic.h ReferenceSequence.h $(OBJDIR_OPT)/ReferenceSequence.o: BaseAsciiMap.h 
StringBasics.h InputFile.h $(OBJDIR_OPT)/ReferenceSequence.o: FileType.h PackedVector.h $(OBJDIR_OPT)/SmithWaterman.o: SmithWaterman.h CigarRoller.h Cigar.h $(OBJDIR_OPT)/SmithWaterman.o: Generic.h StringBasics.h InputFile.h $(OBJDIR_OPT)/SmithWaterman.o: FileType.h $(OBJDIR_OPT)/Sort.o: Sort.h Constant.h Error.h $(OBJDIR_OPT)/STLUtilities.o: STLUtilities.h $(OBJDIR_OPT)/StatGenStatus.o: StatGenStatus.h ErrorHandler.h $(OBJDIR_OPT)/StringAlias.o: StringAlias.h StringArray.h StringBasics.h $(OBJDIR_OPT)/StringAlias.o: InputFile.h FileType.h StringHash.h Constant.h $(OBJDIR_OPT)/StringAlias.o: Hash.h $(OBJDIR_OPT)/StringArray.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_OPT)/StringArray.o: FileType.h Sort.h Constant.h Error.h $(OBJDIR_OPT)/StringBasics.o: StringBasics.h InputFile.h FileType.h Error.h $(OBJDIR_OPT)/StringBasics.o: Constant.h MathConstant.h $(OBJDIR_OPT)/StringHash.o: StringHash.h StringBasics.h InputFile.h $(OBJDIR_OPT)/StringHash.o: FileType.h Constant.h Hash.h Error.h $(OBJDIR_OPT)/StringMap.o: StringMap.h StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/Tabix.o: Tabix.h IndexBase.h InputFile.h FileType.h $(OBJDIR_OPT)/Tabix.o: StatGenStatus.h ErrorHandler.h StringBasics.h $(OBJDIR_OPT)/UncompressedFileType.o: UncompressedFileType.h FileType.h $(OBJDIR_OPT)/PedigreeLoader.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_OPT)/PedigreeLoader.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_OPT)/PedigreeLoader.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_OPT)/PedigreeLoader.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_OPT)/PedigreeLoader.o: MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/PedigreeLoader.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_OPT)/PedigreeLoader.o: FortranFormat.h Error.h $(OBJDIR_OPT)/PedigreeTrim.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_OPT)/PedigreeTrim.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_OPT)/PedigreeTrim.o: StringArray.h StringBasics.h InputFile.h 
$(OBJDIR_OPT)/PedigreeTrim.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_OPT)/PedigreeTrim.o: MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/PedigreeTrim.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_OPT)/PedigreeTwin.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_OPT)/PedigreeTwin.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_OPT)/PedigreeTwin.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_OPT)/PedigreeTwin.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_OPT)/PedigreeTwin.o: MathVector.h PedigreeFamily.h $(OBJDIR_OPT)/PedigreeTwin.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_OPT)/PedigreeTwin.o: Error.h $(OBJDIR_DEBUG)/BaseAsciiMap.o: BaseAsciiMap.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/BaseAsciiMap.o: FileType.h $(OBJDIR_DEBUG)/BaseQualityHelper.o: BaseQualityHelper.h $(OBJDIR_DEBUG)/BaseUtilities.o: BaseUtilities.h BaseAsciiMap.h $(OBJDIR_DEBUG)/BaseUtilities.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/BasicHash.o: BasicHash.h Error.h $(OBJDIR_DEBUG)/BgzfFileType.o: BgzfFileType.h ../include/bgzf.h FileType.h $(OBJDIR_DEBUG)/BgzfFileTypeRecovery.o: BgzfFileTypeRecovery.h FileType.h $(OBJDIR_DEBUG)/CharBuffer.o: CharBuffer.h InputFile.h FileType.h $(OBJDIR_DEBUG)/Chromosome.o: Chromosome.h GenomeSequence.h MemoryMapArray.h $(OBJDIR_DEBUG)/Chromosome.o: Generic.h MemoryMap.h BaseAsciiMap.h $(OBJDIR_DEBUG)/Chromosome.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/Chromosome.o: StringArray.h GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/Cigar.o: Cigar.h Generic.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/Cigar.o: FileType.h STLUtilities.h $(OBJDIR_DEBUG)/CigarRoller.o: CigarRoller.h Cigar.h Generic.h StringBasics.h $(OBJDIR_DEBUG)/CigarRoller.o: InputFile.h FileType.h $(OBJDIR_DEBUG)/Error.o: Error.h PhoneHome.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/Error.o: FileType.h $(OBJDIR_DEBUG)/ErrorHandler.o: ErrorHandler.h PhoneHome.h StringBasics.h $(OBJDIR_DEBUG)/ErrorHandler.o: InputFile.h 
FileType.h $(OBJDIR_DEBUG)/FileType.o: FileType.h $(OBJDIR_DEBUG)/FortranFormat.o: FortranFormat.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/FortranFormat.o: FileType.h IntArray.h Error.h $(OBJDIR_DEBUG)/GenomeSequence.o: Error.h Generic.h GenomeSequence.h $(OBJDIR_DEBUG)/GenomeSequence.o: MemoryMapArray.h MemoryMap.h BaseAsciiMap.h $(OBJDIR_DEBUG)/GenomeSequence.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/GenomeSequence.o: StringArray.h GenomeSequenceHelpers.h $(OBJDIR_DEBUG)/GenomeSequence.o: CSG_MD5.h $(OBJDIR_DEBUG)/GenotypeLists.o: GenotypeLists.h Pedigree.h Constant.h $(OBJDIR_DEBUG)/GenotypeLists.o: PedigreeAlleles.h LongInt.h PedigreePerson.h $(OBJDIR_DEBUG)/GenotypeLists.o: PedigreeGlobals.h StringArray.h $(OBJDIR_DEBUG)/GenotypeLists.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/GenotypeLists.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/GenotypeLists.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_DEBUG)/GenotypeLists.o: PedigreeAlleleFreq.h $(OBJDIR_DEBUG)/glfHandler.o: glfHandler.h InputFile.h FileType.h $(OBJDIR_DEBUG)/glfHandler.o: StringBasics.h BaseQualityHelper.h $(OBJDIR_DEBUG)/GzipFileType.o: GzipFileType.h FileType.h $(OBJDIR_DEBUG)/GzipHeader.o: GzipHeader.h $(OBJDIR_DEBUG)/Hash.o: Hash.h $(OBJDIR_DEBUG)/IndexBase.o: IndexBase.h InputFile.h FileType.h $(OBJDIR_DEBUG)/IndexBase.o: StatGenStatus.h ErrorHandler.h $(OBJDIR_DEBUG)/Input.o: Input.h Error.h Constant.h $(OBJDIR_DEBUG)/InputFile.o: InputFile.h FileType.h StringBasics.h $(OBJDIR_DEBUG)/InputFile.o: GzipHeader.h BgzfFileType.h ../include/bgzf.h $(OBJDIR_DEBUG)/InputFile.o: BgzfFileTypeRecovery.h GzipFileType.h $(OBJDIR_DEBUG)/InputFile.o: UncompressedFileType.h $(OBJDIR_DEBUG)/IntArray.o: IntArray.h Error.h Hash.h Sort.h Constant.h $(OBJDIR_DEBUG)/IntHash.o: IntHash.h Error.h $(OBJDIR_DEBUG)/LongLongCounter.o: LongLongCounter.h LongHash.h Error.h $(OBJDIR_DEBUG)/MapFunction.o: MapFunction.h MathConstant.h $(OBJDIR_DEBUG)/MathMatrix.o: 
MathMatrix.h MathVector.h StringBasics.h $(OBJDIR_DEBUG)/MathMatrix.o: InputFile.h FileType.h Error.h MathConstant.h $(OBJDIR_DEBUG)/MathMatrix.o: Sort.h Constant.h $(OBJDIR_DEBUG)/MathVector.o: MathVector.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/MathVector.o: FileType.h MathMatrix.h Error.h MathConstant.h $(OBJDIR_DEBUG)/MathVector.o: Sort.h Constant.h $(OBJDIR_DEBUG)/MemoryAllocators.o: MemoryAllocators.h $(OBJDIR_DEBUG)/MemoryInfo.o: MemoryInfo.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/MemoryInfo.o: FileType.h $(OBJDIR_DEBUG)/MemoryMapArray.o: MemoryMapArray.h Generic.h MemoryMap.h $(OBJDIR_DEBUG)/MemoryMap.o: MemoryMap.h $(OBJDIR_DEBUG)/MiniDeflate.o: MiniDeflate.h $(OBJDIR_DEBUG)/NonOverlapRegions.o: NonOverlapRegions.h $(OBJDIR_DEBUG)/Parameters.o: Parameters.h StringMap.h StringBasics.h $(OBJDIR_DEBUG)/Parameters.o: InputFile.h FileType.h PhoneHome.h Constant.h $(OBJDIR_DEBUG)/Parameters.o: MathConstant.h Error.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: PedigreeAlleleFreq.h Pedigree.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: Constant.h PedigreeAlleles.h LongInt.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: PedigreePerson.h PedigreeGlobals.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: StringArray.h StringBasics.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: InputFile.h FileType.h StringHash.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_DEBUG)/PedigreeAlleleFreq.o: QuickIndex.h StringMap.h Error.h $(OBJDIR_DEBUG)/Pedigree.o: Pedigree.h Constant.h PedigreeAlleles.h LongInt.h $(OBJDIR_DEBUG)/Pedigree.o: PedigreePerson.h PedigreeGlobals.h StringArray.h $(OBJDIR_DEBUG)/Pedigree.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/Pedigree.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/Pedigree.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_DEBUG)/Pedigree.o: PedigreeAlleleFreq.h GenotypeLists.h MemoryInfo.h $(OBJDIR_DEBUG)/Pedigree.o: Error.h Sort.h 
$(OBJDIR_DEBUG)/PedigreeDescription.o: PedigreeDescription.h $(OBJDIR_DEBUG)/PedigreeDescription.o: PedigreeGlobals.h Constant.h $(OBJDIR_DEBUG)/PedigreeDescription.o: StringArray.h StringBasics.h $(OBJDIR_DEBUG)/PedigreeDescription.o: InputFile.h FileType.h StringHash.h $(OBJDIR_DEBUG)/PedigreeDescription.o: Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/PedigreeDescription.o: PedigreePerson.h PedigreeAlleles.h $(OBJDIR_DEBUG)/PedigreeDescription.o: LongInt.h MapFunction.h $(OBJDIR_DEBUG)/PedigreeDescription.o: FortranFormat.h Error.h $(OBJDIR_DEBUG)/PedigreeFamily.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_DEBUG)/PedigreeFamily.o: LongInt.h PedigreePerson.h $(OBJDIR_DEBUG)/PedigreeFamily.o: PedigreeGlobals.h StringArray.h $(OBJDIR_DEBUG)/PedigreeFamily.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/PedigreeFamily.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/PedigreeFamily.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_DEBUG)/PedigreeFamily.o: PedigreeAlleleFreq.h MathConstant.h Error.h $(OBJDIR_DEBUG)/PedigreeGlobals.o: PedigreeGlobals.h Constant.h StringArray.h $(OBJDIR_DEBUG)/PedigreeGlobals.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/PedigreeGlobals.o: StringHash.h Hash.h IntArray.h $(OBJDIR_DEBUG)/PedigreeGlobals.o: MathVector.h Sort.h Error.h $(OBJDIR_DEBUG)/PedigreePerson.o: PedigreePerson.h Constant.h $(OBJDIR_DEBUG)/PedigreePerson.o: PedigreeAlleles.h LongInt.h $(OBJDIR_DEBUG)/PedigreePerson.o: PedigreeGlobals.h StringArray.h $(OBJDIR_DEBUG)/PedigreePerson.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/PedigreePerson.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/PedigreePerson.o: Error.h $(OBJDIR_DEBUG)/PhoneHome.o: PhoneHome.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/PhoneHome.o: FileType.h ../include/knetfile.h $(OBJDIR_DEBUG)/QuickIndex.o: QuickIndex.h MathVector.h StringBasics.h $(OBJDIR_DEBUG)/QuickIndex.o: InputFile.h FileType.h StringArray.h 
$(OBJDIR_DEBUG)/QuickIndex.o: StringHash.h Constant.h Hash.h IntArray.h $(OBJDIR_DEBUG)/QuickIndex.o: StringMap.h Error.h $(OBJDIR_DEBUG)/Random.o: Random.h MathConstant.h Error.h $(OBJDIR_DEBUG)/ReferenceSequence.o: Error.h Generic.h ReferenceSequence.h $(OBJDIR_DEBUG)/ReferenceSequence.o: BaseAsciiMap.h StringBasics.h $(OBJDIR_DEBUG)/ReferenceSequence.o: InputFile.h FileType.h PackedVector.h $(OBJDIR_DEBUG)/SmithWaterman.o: SmithWaterman.h CigarRoller.h Cigar.h $(OBJDIR_DEBUG)/SmithWaterman.o: Generic.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/SmithWaterman.o: FileType.h $(OBJDIR_DEBUG)/Sort.o: Sort.h Constant.h Error.h $(OBJDIR_DEBUG)/STLUtilities.o: STLUtilities.h $(OBJDIR_DEBUG)/StatGenStatus.o: StatGenStatus.h ErrorHandler.h $(OBJDIR_DEBUG)/StringAlias.o: StringAlias.h StringArray.h StringBasics.h $(OBJDIR_DEBUG)/StringAlias.o: InputFile.h FileType.h StringHash.h Constant.h $(OBJDIR_DEBUG)/StringAlias.o: Hash.h $(OBJDIR_DEBUG)/StringArray.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/StringArray.o: FileType.h Sort.h Constant.h Error.h $(OBJDIR_DEBUG)/StringBasics.o: StringBasics.h InputFile.h FileType.h Error.h $(OBJDIR_DEBUG)/StringBasics.o: Constant.h MathConstant.h $(OBJDIR_DEBUG)/StringHash.o: StringHash.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/StringHash.o: FileType.h Constant.h Hash.h Error.h $(OBJDIR_DEBUG)/StringMap.o: StringMap.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/StringMap.o: FileType.h $(OBJDIR_DEBUG)/Tabix.o: Tabix.h IndexBase.h InputFile.h FileType.h $(OBJDIR_DEBUG)/Tabix.o: StatGenStatus.h ErrorHandler.h StringBasics.h $(OBJDIR_DEBUG)/UncompressedFileType.o: UncompressedFileType.h FileType.h $(OBJDIR_DEBUG)/PedigreeLoader.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_DEBUG)/PedigreeLoader.o: LongInt.h PedigreePerson.h $(OBJDIR_DEBUG)/PedigreeLoader.o: PedigreeGlobals.h StringArray.h $(OBJDIR_DEBUG)/PedigreeLoader.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/PedigreeLoader.o: StringHash.h Hash.h 
IntArray.h MathVector.h $(OBJDIR_DEBUG)/PedigreeLoader.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_DEBUG)/PedigreeLoader.o: PedigreeAlleleFreq.h FortranFormat.h $(OBJDIR_DEBUG)/PedigreeLoader.o: Error.h $(OBJDIR_DEBUG)/PedigreeTrim.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_DEBUG)/PedigreeTrim.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_DEBUG)/PedigreeTrim.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/PedigreeTrim.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_DEBUG)/PedigreeTrim.o: MathVector.h PedigreeFamily.h $(OBJDIR_DEBUG)/PedigreeTrim.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_DEBUG)/PedigreeTwin.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_DEBUG)/PedigreeTwin.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_DEBUG)/PedigreeTwin.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_DEBUG)/PedigreeTwin.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_DEBUG)/PedigreeTwin.o: MathVector.h PedigreeFamily.h $(OBJDIR_DEBUG)/PedigreeTwin.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_DEBUG)/PedigreeTwin.o: Error.h $(OBJDIR_PROFILE)/BaseAsciiMap.o: BaseAsciiMap.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/BaseAsciiMap.o: FileType.h $(OBJDIR_PROFILE)/BaseQualityHelper.o: BaseQualityHelper.h $(OBJDIR_PROFILE)/BaseUtilities.o: BaseUtilities.h BaseAsciiMap.h $(OBJDIR_PROFILE)/BaseUtilities.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/BasicHash.o: BasicHash.h Error.h $(OBJDIR_PROFILE)/BgzfFileType.o: BgzfFileType.h ../include/bgzf.h FileType.h $(OBJDIR_PROFILE)/BgzfFileTypeRecovery.o: BgzfFileTypeRecovery.h FileType.h $(OBJDIR_PROFILE)/CharBuffer.o: CharBuffer.h InputFile.h FileType.h $(OBJDIR_PROFILE)/Chromosome.o: Chromosome.h GenomeSequence.h $(OBJDIR_PROFILE)/Chromosome.o: MemoryMapArray.h Generic.h MemoryMap.h $(OBJDIR_PROFILE)/Chromosome.o: BaseAsciiMap.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/Chromosome.o: FileType.h StringArray.h 
$(OBJDIR_PROFILE)/Chromosome.o: GenomeSequenceHelpers.h $(OBJDIR_PROFILE)/Cigar.o: Cigar.h Generic.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/Cigar.o: FileType.h STLUtilities.h $(OBJDIR_PROFILE)/CigarRoller.o: CigarRoller.h Cigar.h Generic.h $(OBJDIR_PROFILE)/CigarRoller.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/Error.o: Error.h PhoneHome.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/Error.o: FileType.h $(OBJDIR_PROFILE)/ErrorHandler.o: ErrorHandler.h PhoneHome.h StringBasics.h $(OBJDIR_PROFILE)/ErrorHandler.o: InputFile.h FileType.h $(OBJDIR_PROFILE)/FileType.o: FileType.h $(OBJDIR_PROFILE)/FortranFormat.o: FortranFormat.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/FortranFormat.o: FileType.h IntArray.h Error.h $(OBJDIR_PROFILE)/GenomeSequence.o: Error.h Generic.h GenomeSequence.h $(OBJDIR_PROFILE)/GenomeSequence.o: MemoryMapArray.h MemoryMap.h $(OBJDIR_PROFILE)/GenomeSequence.o: BaseAsciiMap.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/GenomeSequence.o: FileType.h StringArray.h $(OBJDIR_PROFILE)/GenomeSequence.o: GenomeSequenceHelpers.h CSG_MD5.h $(OBJDIR_PROFILE)/GenotypeLists.o: GenotypeLists.h Pedigree.h Constant.h $(OBJDIR_PROFILE)/GenotypeLists.o: PedigreeAlleles.h LongInt.h $(OBJDIR_PROFILE)/GenotypeLists.o: PedigreePerson.h PedigreeGlobals.h $(OBJDIR_PROFILE)/GenotypeLists.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/GenotypeLists.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_PROFILE)/GenotypeLists.o: MathVector.h PedigreeFamily.h $(OBJDIR_PROFILE)/GenotypeLists.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_PROFILE)/glfHandler.o: glfHandler.h InputFile.h FileType.h $(OBJDIR_PROFILE)/glfHandler.o: StringBasics.h BaseQualityHelper.h $(OBJDIR_PROFILE)/GzipFileType.o: GzipFileType.h FileType.h $(OBJDIR_PROFILE)/GzipHeader.o: GzipHeader.h $(OBJDIR_PROFILE)/Hash.o: Hash.h $(OBJDIR_PROFILE)/IndexBase.o: IndexBase.h InputFile.h FileType.h $(OBJDIR_PROFILE)/IndexBase.o: StatGenStatus.h ErrorHandler.h 
$(OBJDIR_PROFILE)/Input.o: Input.h Error.h Constant.h $(OBJDIR_PROFILE)/InputFile.o: InputFile.h FileType.h StringBasics.h $(OBJDIR_PROFILE)/InputFile.o: GzipHeader.h BgzfFileType.h ../include/bgzf.h $(OBJDIR_PROFILE)/InputFile.o: BgzfFileTypeRecovery.h GzipFileType.h $(OBJDIR_PROFILE)/InputFile.o: UncompressedFileType.h $(OBJDIR_PROFILE)/IntArray.o: IntArray.h Error.h Hash.h Sort.h Constant.h $(OBJDIR_PROFILE)/IntHash.o: IntHash.h Error.h $(OBJDIR_PROFILE)/LongLongCounter.o: LongLongCounter.h LongHash.h Error.h $(OBJDIR_PROFILE)/MapFunction.o: MapFunction.h MathConstant.h $(OBJDIR_PROFILE)/MathMatrix.o: MathMatrix.h MathVector.h StringBasics.h $(OBJDIR_PROFILE)/MathMatrix.o: InputFile.h FileType.h Error.h MathConstant.h $(OBJDIR_PROFILE)/MathMatrix.o: Sort.h Constant.h $(OBJDIR_PROFILE)/MathVector.o: MathVector.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/MathVector.o: FileType.h MathMatrix.h Error.h $(OBJDIR_PROFILE)/MathVector.o: MathConstant.h Sort.h Constant.h $(OBJDIR_PROFILE)/MemoryAllocators.o: MemoryAllocators.h $(OBJDIR_PROFILE)/MemoryInfo.o: MemoryInfo.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/MemoryInfo.o: FileType.h $(OBJDIR_PROFILE)/MemoryMapArray.o: MemoryMapArray.h Generic.h MemoryMap.h $(OBJDIR_PROFILE)/MemoryMap.o: MemoryMap.h $(OBJDIR_PROFILE)/MiniDeflate.o: MiniDeflate.h $(OBJDIR_PROFILE)/NonOverlapRegions.o: NonOverlapRegions.h $(OBJDIR_PROFILE)/Parameters.o: Parameters.h StringMap.h StringBasics.h $(OBJDIR_PROFILE)/Parameters.o: InputFile.h FileType.h PhoneHome.h Constant.h $(OBJDIR_PROFILE)/Parameters.o: MathConstant.h Error.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: PedigreeAlleleFreq.h Pedigree.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: Constant.h PedigreeAlleles.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: LongInt.h PedigreePerson.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: PedigreeGlobals.h StringArray.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: StringHash.h 
Hash.h IntArray.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: MathVector.h PedigreeFamily.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: PedigreeDescription.h QuickIndex.h $(OBJDIR_PROFILE)/PedigreeAlleleFreq.o: StringMap.h Error.h $(OBJDIR_PROFILE)/Pedigree.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_PROFILE)/Pedigree.o: LongInt.h PedigreePerson.h PedigreeGlobals.h $(OBJDIR_PROFILE)/Pedigree.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/Pedigree.o: FileType.h StringHash.h Hash.h IntArray.h $(OBJDIR_PROFILE)/Pedigree.o: MathVector.h PedigreeFamily.h $(OBJDIR_PROFILE)/Pedigree.o: PedigreeDescription.h PedigreeAlleleFreq.h $(OBJDIR_PROFILE)/Pedigree.o: GenotypeLists.h MemoryInfo.h Error.h Sort.h $(OBJDIR_PROFILE)/PedigreeDescription.o: PedigreeDescription.h $(OBJDIR_PROFILE)/PedigreeDescription.o: PedigreeGlobals.h Constant.h $(OBJDIR_PROFILE)/PedigreeDescription.o: StringArray.h StringBasics.h $(OBJDIR_PROFILE)/PedigreeDescription.o: InputFile.h FileType.h StringHash.h $(OBJDIR_PROFILE)/PedigreeDescription.o: Hash.h IntArray.h MathVector.h $(OBJDIR_PROFILE)/PedigreeDescription.o: PedigreePerson.h PedigreeAlleles.h $(OBJDIR_PROFILE)/PedigreeDescription.o: LongInt.h MapFunction.h $(OBJDIR_PROFILE)/PedigreeDescription.o: FortranFormat.h Error.h $(OBJDIR_PROFILE)/PedigreeFamily.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_PROFILE)/PedigreeFamily.o: LongInt.h PedigreePerson.h $(OBJDIR_PROFILE)/PedigreeFamily.o: PedigreeGlobals.h StringArray.h $(OBJDIR_PROFILE)/PedigreeFamily.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreeFamily.o: StringHash.h Hash.h IntArray.h $(OBJDIR_PROFILE)/PedigreeFamily.o: MathVector.h PedigreeFamily.h $(OBJDIR_PROFILE)/PedigreeFamily.o: PedigreeDescription.h $(OBJDIR_PROFILE)/PedigreeFamily.o: PedigreeAlleleFreq.h MathConstant.h $(OBJDIR_PROFILE)/PedigreeFamily.o: Error.h $(OBJDIR_PROFILE)/PedigreeGlobals.o: PedigreeGlobals.h Constant.h $(OBJDIR_PROFILE)/PedigreeGlobals.o: StringArray.h StringBasics.h 
InputFile.h $(OBJDIR_PROFILE)/PedigreeGlobals.o: FileType.h StringHash.h Hash.h $(OBJDIR_PROFILE)/PedigreeGlobals.o: IntArray.h MathVector.h Sort.h Error.h $(OBJDIR_PROFILE)/PedigreePerson.o: PedigreePerson.h Constant.h $(OBJDIR_PROFILE)/PedigreePerson.o: PedigreeAlleles.h LongInt.h $(OBJDIR_PROFILE)/PedigreePerson.o: PedigreeGlobals.h StringArray.h $(OBJDIR_PROFILE)/PedigreePerson.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreePerson.o: StringHash.h Hash.h IntArray.h $(OBJDIR_PROFILE)/PedigreePerson.o: MathVector.h Error.h $(OBJDIR_PROFILE)/PhoneHome.o: PhoneHome.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/PhoneHome.o: FileType.h ../include/knetfile.h $(OBJDIR_PROFILE)/QuickIndex.o: QuickIndex.h MathVector.h StringBasics.h $(OBJDIR_PROFILE)/QuickIndex.o: InputFile.h FileType.h StringArray.h $(OBJDIR_PROFILE)/QuickIndex.o: StringHash.h Constant.h Hash.h IntArray.h $(OBJDIR_PROFILE)/QuickIndex.o: StringMap.h Error.h $(OBJDIR_PROFILE)/Random.o: Random.h MathConstant.h Error.h $(OBJDIR_PROFILE)/ReferenceSequence.o: Error.h Generic.h ReferenceSequence.h $(OBJDIR_PROFILE)/ReferenceSequence.o: BaseAsciiMap.h StringBasics.h $(OBJDIR_PROFILE)/ReferenceSequence.o: InputFile.h FileType.h PackedVector.h $(OBJDIR_PROFILE)/SmithWaterman.o: SmithWaterman.h CigarRoller.h Cigar.h $(OBJDIR_PROFILE)/SmithWaterman.o: Generic.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/SmithWaterman.o: FileType.h $(OBJDIR_PROFILE)/Sort.o: Sort.h Constant.h Error.h $(OBJDIR_PROFILE)/STLUtilities.o: STLUtilities.h $(OBJDIR_PROFILE)/StatGenStatus.o: StatGenStatus.h ErrorHandler.h $(OBJDIR_PROFILE)/StringAlias.o: StringAlias.h StringArray.h StringBasics.h $(OBJDIR_PROFILE)/StringAlias.o: InputFile.h FileType.h StringHash.h $(OBJDIR_PROFILE)/StringAlias.o: Constant.h Hash.h $(OBJDIR_PROFILE)/StringArray.o: StringArray.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/StringArray.o: FileType.h Sort.h Constant.h Error.h $(OBJDIR_PROFILE)/StringBasics.o: StringBasics.h InputFile.h 
FileType.h $(OBJDIR_PROFILE)/StringBasics.o: Error.h Constant.h MathConstant.h $(OBJDIR_PROFILE)/StringHash.o: StringHash.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/StringHash.o: FileType.h Constant.h Hash.h Error.h $(OBJDIR_PROFILE)/StringMap.o: StringMap.h StringBasics.h InputFile.h $(OBJDIR_PROFILE)/StringMap.o: FileType.h $(OBJDIR_PROFILE)/Tabix.o: Tabix.h IndexBase.h InputFile.h FileType.h $(OBJDIR_PROFILE)/Tabix.o: StatGenStatus.h ErrorHandler.h StringBasics.h $(OBJDIR_PROFILE)/UncompressedFileType.o: UncompressedFileType.h FileType.h $(OBJDIR_PROFILE)/PedigreeLoader.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_PROFILE)/PedigreeLoader.o: LongInt.h PedigreePerson.h $(OBJDIR_PROFILE)/PedigreeLoader.o: PedigreeGlobals.h StringArray.h $(OBJDIR_PROFILE)/PedigreeLoader.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreeLoader.o: StringHash.h Hash.h IntArray.h $(OBJDIR_PROFILE)/PedigreeLoader.o: MathVector.h PedigreeFamily.h $(OBJDIR_PROFILE)/PedigreeLoader.o: PedigreeDescription.h $(OBJDIR_PROFILE)/PedigreeLoader.o: PedigreeAlleleFreq.h FortranFormat.h $(OBJDIR_PROFILE)/PedigreeLoader.o: Error.h $(OBJDIR_PROFILE)/PedigreeTrim.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_PROFILE)/PedigreeTrim.o: LongInt.h PedigreePerson.h $(OBJDIR_PROFILE)/PedigreeTrim.o: PedigreeGlobals.h StringArray.h $(OBJDIR_PROFILE)/PedigreeTrim.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreeTrim.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_PROFILE)/PedigreeTrim.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_PROFILE)/PedigreeTrim.o: PedigreeAlleleFreq.h $(OBJDIR_PROFILE)/PedigreeTwin.o: Pedigree.h Constant.h PedigreeAlleles.h $(OBJDIR_PROFILE)/PedigreeTwin.o: LongInt.h PedigreePerson.h $(OBJDIR_PROFILE)/PedigreeTwin.o: PedigreeGlobals.h StringArray.h $(OBJDIR_PROFILE)/PedigreeTwin.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreeTwin.o: StringHash.h Hash.h IntArray.h MathVector.h 
$(OBJDIR_PROFILE)/PedigreeTwin.o: PedigreeFamily.h PedigreeDescription.h $(OBJDIR_PROFILE)/PedigreeTwin.o: PedigreeAlleleFreq.h Error.h libStatGen-1.0.14/general/MapFunction.cpp000066400000000000000000000024361254730101300201760ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "MapFunction.h" #include "MathConstant.h" #include double DistanceToRecombination(double distance) { return (1.0 - exp(-2.0 * distance)) * 0.5; } double RecombinationToDistance(double recombination) { return (log(max(1.0 - 2 * recombination, 1e-7)) * -0.5); } double KosambiDistanceToRecombination(double distance) { double e_to_4x = exp(4.0 * distance); return (0.5 *(e_to_4x - 1.0) / (e_to_4x + 1.0)); } double RecombinationToKosambiDistance(double theta) { return 0.25 * log((1.0 + 2*theta) / max(1.0 - 2.0*theta, 1e-7)); } libStatGen-1.0.14/general/MapFunction.h000066400000000000000000000016071254730101300176420ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __MAPFUNCTION_H__ #define __MAPFUNCTION_H__ double DistanceToRecombination(double distance); double RecombinationToDistance(double recombination); #endif libStatGen-1.0.14/general/MathConstant.h000066400000000000000000000040211254730101300200130ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __MATHCONSTANT_H__ #define __MATHCONSTANT_H__ #ifdef _MSC_VER #define _USE_MATH_DEFINES #endif #include #include // Constants for numerical routines // #define TINY 1.0e-30 // A small number #define ITMAX 200 // Maximum number of iterations #define EPS 3.0e-7 // Relative accuracy #define ZEPS 3.0e-10 // Precision around zero #define FPMIN 1.0e-30 // Number near the smallest representable number #define FPMAX 1.0e+100 // Number near the largest representable number #define TOL 1.0e-6 // Zero SVD values below this #define GOLD 0.61803399 // Golden ratio #define CGOLD 0.38196601 // Complement of golden ratio inline double square(double a) { return a * a; } inline double sign(double a, double b) { return b >= 0 ? fabs(a) : -fabs(a); } inline double min(double a, double b) { return a < b ? a : b; } inline double max(double a, double b) { return a > b ? a : b; } inline int square(int a) { return a * a; } inline int sign(int a, int b) { return b >= 0 ? abs(a) : -abs(a); } inline int min(int a, int b) { return a < b ? a : b; } inline int max(int a, int b) { return a > b ? a : b; } // Useful integer quantities // #define THIRTY_BIT_MASK 0x3FFFFFFF #endif libStatGen-1.0.14/general/MathMatrix.cpp000066400000000000000000000406511254730101300200320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "MathMatrix.h" #include "MathVector.h" #include "MathConstant.h" #include "Sort.h" #include "Error.h" #include #include #include int Matrix::alloc = 2; Matrix::~Matrix() { // printf("Deleting Matrix %s...\n", (const char *) label); for (int i=0; i extraSize) { int newSize = (n + alloc) / alloc * alloc; ColumnExtras * newExtras = new ColumnExtras [newSize]; if (extras != NULL) for (int i = 0; i < extraSize; i++) newExtras[i] = extras[i]; if (extraSize) delete [] extras; extraSize = newSize; extras = newExtras; } if (m > size) { int newSize = (m + alloc) / alloc * alloc; Vector ** newData = new Vector * [newSize]; if (data != NULL) for (int i = 0; i < size; i++) newData[i] = data[i]; for (int i = size; i < newSize; i++) newData[i] = new Vector(n); if (size) delete [] data; size = newSize; data = newData; } if (cols != n) for (int i = 0; i < rows; i++) data[i]->Dimension(n); if (rows != m) for (int i = rows; i < m; i++) data[i]->Dimension(n); rows = m; cols = n; } void Matrix::Dimension(int m, int n, double value) { int originalRows = rows; int originalColumns = cols; Dimension(m, n); if (rows > originalRows) for (int i = originalRows; i < rows; i++) data[i]->Set(value); if (cols > originalColumns) for (int i = 0; i < originalRows; i++) for (int j = originalColumns; j < cols; j++) data[i]->data[j] = value; } void Matrix::Zero() { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] = 0.0; } void Matrix::Identity() { if (rows != cols) error("Matrix.Identity - Identity matrices must be square"); for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) if (i == j) (*(data[i]))[j] = 1.0; else (*(data[i]))[j] = 0.0; } void Matrix::Set(double k) { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] = k; } void Matrix::Negate() { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] = -(*(data[i]))[j]; } void Matrix::Copy(const Matrix & m) { Dimension(m.rows, m.cols); if (m.data != 
NULL) for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*this)[i][j] = m[i][j]; } void Matrix::Transpose(const Matrix & m) { Dimension(m.cols, m.rows); for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] = m[j][i]; } void Matrix::Add(double k) { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] += k; } void Matrix::Multiply(double k) { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] *= k; } void Matrix::Add(const Matrix & m) { if ((rows != m.rows) && (cols != m.cols)) error("Matrix.Add - Attempted to add incompatible matrices\n" "Matrices - %s [%d, %d] + %s [%d, %d]\n", (const char *) label, rows, cols, (const char *) m.label, m.rows, m.cols); for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] += m[i][j]; } void Matrix::AddMultiple(double k, const Matrix & m) { if ((rows != m.rows) && (cols != m.cols)) error("Matrix.AddMultiple - Attempted to add incompatible matrices\n" "Matrices - %s [%d, %d] + k * %s [%d, %d]\n", (const char *) label, rows, cols, (const char *) m.label, m.rows, m.cols); for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] += k * m[i][j]; } void Matrix::Product(const Matrix & l, const Matrix & r) { if (l.cols != r.rows) error("Matrix.Multiply - Attempted to multiply incompatible matrices\n" "Matrices - %s [%d, %d] + %s [%d, %d]\n", (const char *) l.label, l.rows, l.cols, (const char *) r.label, r.rows, r.cols); Dimension(l.rows, r.cols); Zero(); for (int k = 0; k < l.cols; k++) for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) (*(data[i]))[j] += l[i][k] * r[k][j]; } void Matrix::AddRows(double k, int r1, int r2) { Vector v(*(data[r1])); v.Multiply(k); data[r2]->Add(v); } void Matrix::MultiplyRow(int r1, double k) { data[r1]->Multiply(k); } void Matrix::AddRows(int r1, int r2) { data[r2]->Add(*(data[r1])); } void Matrix::Reduce(double tol) { double pivot; int pivotr = 0; // Initializing 
pivotr is not necessary, but avoids warnings int r = 0; // the row we are currently reducing for (int j = 0; j < cols; j++) { if (r > rows) return; pivot = 0.0; for (int i = r; i < rows; i++) if (fabs((*this)[i][j]) > pivot) { pivot = fabs((*this)[i][j]); pivotr = i; } if (pivot <= tol) { for (int i = r; i < rows; i++) (*this)[i][j] = 0.0; continue; } SwapRows(pivotr, r); double scale = (*this)[r][j]; (*this)[r][j] = 1.0; for (int k = j+1; k < cols; k++) (*this)[r][k] /= scale; for (int i = r + 1; r < rows; i++) { scale = (*this)[i][j]; (*this)[i][j] = 0.0; for (int k = j+1; k < cols; k++) (*this)[i][k] -= (*this)[r][k] * scale; } r++; } } void Matrix::DeleteRow(int r) { Vector * temp = data[r]; for (int i = r + 1; i < rows; i++) data[i-1] = data[i]; data[rows - 1] = temp; rows--; } void Matrix::DeleteColumn(int c) { for (int i = 0; i < rows; i++) data[i] -> DeleteDimension(c); for (int i = c + 1; i < cols; i++) extras[i-1] = extras[i]; cols--; } void Matrix::SwapColumns(int c1, int c2) { double temp; for (int i = 0; i < rows; i++) { temp = (*data[i])[c1]; (*data[i])[c1] = (*data[i])[c2]; (*data[i])[c2] = temp; } extras[c1].Swap(extras[c2]); } void Matrix::Read(FILE * f) { int r, c; char buffer[100]; int numItems = 0; numItems = fscanf(f, " %s =", buffer); if(numItems != 1) { } buffer[strlen(buffer) - 1] = 0; SetLabel(buffer); numItems = fscanf(f, " [ %d x %d ]", &r, &c); if(numItems != 2) { } Dimension(r, c); for (int c = 0; c < cols; c++) { numItems = fscanf(f, " %s", buffer); if(numItems != 1) { } SetColumnLabel(c, buffer); } for (int r = 0; r < rows; r++) for (int c = 0; c < cols; c++) { numItems = fscanf(f, " %lf", &((*this)[r][c])); if(numItems != 1) { } } } void Matrix::Print(FILE * f, int r, int c) { if (r == -1 || r > rows) r = rows; if (c == -1 || c > cols) c = cols; char dimensions[30]; sprintf(dimensions, "[%d x %d]", r, c); int columnZero = label.Length() > 15 ? 
label.Length() : 15; fprintf(f, "\n%*s =\n%*s ", columnZero, (const char *) label, columnZero, dimensions); int * precision = new int [c + 1]; int * width = new int [c + 1]; for (int j = 0; j < c; j++) { precision[j] = extras[j].GetPrecision(); width[j] = extras[j].GetWidth(); fprintf(f, "%*s ", width[j], (const char *) extras[j].label); } for (int i = 0; i < r; i++) { fprintf(f, "\n%*s ", columnZero, (const char *) data[i]->label); for (int j = 0; j < c; j++) fprintf(f, "%*.*f ", width[j], precision[j], (*this)[i][j]); } fprintf(f, "\n"); delete [] precision; delete [] width; } void Matrix::CopyLabels(Matrix & M) { for (int i = 0; i < rows; i++) if (i < M.rows) data[i]->SetLabel(M[i].label); for (int i = 0; i < cols; i++) if (i < M.cols) SetColumnLabel(i, M.GetColumnLabel(i)); } // ColumnExtras class // void ColumnExtras::Init() { label = "column"; dirty = true; precision = 3; width = 7; } ColumnExtras::~ColumnExtras() { } void ColumnExtras::SetLabel(const char * name) { label = name; } int ColumnExtras::GetWidth() { if (dirty) { if (precision + 2 > width) width = precision + 2; if (label.Length() > width) width = label.Length(); dirty = false; } return width; } void ColumnExtras::Copy(ColumnExtras & c) { width = c.width; precision = c.precision; dirty = c.dirty; label = c.label; } #define SWAP(a,b) {int swap=(a); (a)=(b); (b)=swap;} #define SWAPBOOL(a,b) {bool swap=(a); (a)=(b); (b)=swap;} void ColumnExtras::Swap(ColumnExtras & c) { SWAP(c.width, width); SWAP(c.precision, precision); SWAPBOOL(c.dirty, dirty); c.label.Swap(label); } int Matrix::CompareRows(Vector ** row1, Vector ** row2) { if ((**row1)[0] < (**row2)[0]) return -1; if ((**row1)[0] > (**row2)[0]) return 1; return 0; } void Matrix::Sort() { QuickSort(data, rows, sizeof(Vector *), COMPAREFUNC CompareRows); } bool Matrix::operator == (const Matrix & rhs) const { if (rhs.rows != rows || rhs.cols != cols) return false; for (int i = 0; i < rows; i++) if ((*this)[i] != rhs[i]) return false; return true; } 
void Matrix::StackBottom(const Matrix & m) { if (m.cols != cols) error("Attempted to stack matrices with different number of columns"); int end = rows; Dimension(rows + m.rows, cols); for (int i = 0; i < m.rows; i++) *(data[i + end]) = m[i]; } void Matrix::StackLeft(const Matrix & m) { if (m.rows != rows) error("Attempted to stack matrics with different numbers of rows"); for (int i = 0; i < rows; i++) data[i]->Stack(m[i]); Dimension(rows, cols + m.cols); } void Matrix::Swap(Matrix & m) { label.Swap(m.label); ColumnExtras * tmpExtras = extras; extras = m.extras; m.extras = tmpExtras; int swap; swap = rows; rows = m.rows; m.rows = swap; swap = cols; cols = m.cols; m.cols = swap; swap = size; size = m.size; m.size = swap; swap = extraSize; extraSize = m.extraSize; m.extraSize = swap; Vector ** tmpData = data; data = m.data; m.data = tmpData; } double Matrix::Min() const { if (rows == 0 || cols == 0) return 0.0; double minimum = data[0]->Min(); for (int i = 1; i < rows; i++) minimum = min(data[i]->Min(), minimum); return minimum; } double Matrix::Max() const { if (rows == 0 || cols == 0) return 0.0; double maximum = data[0]->Max(); for (int i = 1; i < rows; i++) maximum = max(data[i]->Max(), maximum); return maximum; } double Matrix::Mean() const { if (rows == 0 || cols == 0) return 0.0; double sum = data[0]->Sum(); for (int i = 1; i < rows; i++) sum += data[i]->Sum(); return sum / (rows * cols); } double Matrix::SafeMin() const { double lo = (rows > 0 && cols > 0) ? _NAN_ : 0.0; int i, j; for (i = 0; i < rows; i++) { for (j = 0; j < cols; j++) if (data[i]->data[j] != _NAN_) { lo = data[i]->data[j]; break; } if (j != cols) break; } for (; i < rows; i++, j = 0) for (; j < cols; j++) if (data[i]->data[j] < lo && data[i]->data[j] != _NAN_) lo = data[i]->data[j]; return lo; } double Matrix::SafeMax() const { double hi = (rows > 0 && cols > 0) ? 
_NAN_ : 0.0; int i, j; for (i = 0; i < rows; i++) { for (j = 0; j < cols; j++) if (data[i]->data[j] != _NAN_) { hi = data[i]->data[j]; break; } if (j != cols) break; } for (; i < rows; i++, j = 0) for (; j < cols; j++) if (data[i]->data[j] > hi && data[i]->data[j] != _NAN_) hi = data[i]->data[j]; return hi; } double Matrix::SafeMean() const { double sum = 0.0; int count = 0; for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) if ((*this)[i][j] != _NAN_) { sum += (*this)[i][j]; count ++; } return (count) ? sum / count : 0.0; } int Matrix::SafeCount() const { int total = 0; for (int i = 0; i < rows; i++) total += data[i]->SafeCount(); return total; } void Matrix::PrintUpper(FILE * f, int r, int c, bool print_diag) { int columnZero; int * precision = NULL, * width = NULL; // Initialization avoids compiler warnings SetupPrint(f, r, c, columnZero, precision, width); int upper = print_diag ? 0 : 1; for (int i = 0; i < r ; i++) { fprintf(f, "\n%*s ", columnZero, (const char *) data[i]->label); for (int j = 0; j < upper; j++) fprintf(f, "%*.*s ", width[j], precision[j], " "); for (int j = upper; j < c; j++) fprintf(f, "%*.*f ", width[j], precision[j], (*this)[i][j]); upper++; } fprintf(f, "\n"); delete [] precision; delete [] width; } void Matrix::PrintLower(FILE * f, int r, int c, bool print_diag) { if (r == -1 || r > rows) r = rows; if (c == -1 || c > cols) c = cols; String dimensions; dimensions.printf("[%d x %d]", r, c); int columnZero = label.Length() > 15 ? label.Length() : 15; fprintf(f, "\n%*s =\n%*s ", columnZero, (const char *) label, columnZero, (const char *) dimensions); int * precision = new int [c + 1]; int * width = new int [c + 1]; for (int j = 0; j < c; j++) { precision[j] = extras[j].GetPrecision(); width[j] = extras[j].GetWidth(); fprintf(f, "%*s ", width[j], (const char *) extras[j].label); } int upper = print_diag ? 
1 : 0; for (int i = 0; i < r ; i++) { fprintf(f, "\n%*s ", columnZero, (const char *) data[i]->label); for (int j = 0; j < upper; j++) fprintf(f, "%*.*f ", width[j], precision[j],(*this)[i][j]); for (int j = upper; j < c; j++) fprintf(f, "%*.*s ", width[j], precision[j]," "); upper++; } fprintf(f, "\n"); delete [] precision; delete [] width; } void Matrix::SetupPrint(FILE *f, int r, int c, int & column_zero, int * precision, int * width) { if (r == -1 || r > rows) r = rows; if (c == -1 || c > cols) c = cols; String dimensions; dimensions.printf("[%d x %d]", r, c); column_zero = label.Length() > 15 ? label.Length() : 15; fprintf(f, "\n%*s =\n%*s ", column_zero, (const char *) label, column_zero, (const char *) dimensions); precision = new int [c + 1]; width = new int [c + 1]; for (int j = 0; j < c; j++) { precision[j] = extras[j].GetPrecision(); width[j] = extras[j].GetWidth(); fprintf(f, "%*s ", width[j], (const char *) extras[j].label); } } libStatGen-1.0.14/general/MathMatrix.h000066400000000000000000000131401254730101300174700ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __MATHMATRIX_H__ #define __MATHMATRIX_H__ #include "MathVector.h" #include "Error.h" #include class ColumnExtras { private: bool dirty; int precision, width; void Init(); void Copy(ColumnExtras & c); public: String label; ColumnExtras() { Init(); } ColumnExtras(ColumnExtras & original) { Init(); Copy(original); } ~ColumnExtras(); void SetLabel(const char * name); void SetPrecision(int p) { precision = p; dirty = true; } void SetWidth(int w) { width = w; dirty = true; } int GetWidth(); int GetPrecision() { return precision; } ColumnExtras & operator = (ColumnExtras & rhs) { Copy(rhs); return (*this); } void Swap(ColumnExtras & rhs); }; class Matrix { public: String label; ColumnExtras * extras; int rows, cols, size, extraSize; Vector ** data; Matrix() { Init(); } Matrix(Matrix & m) { Init(); Copy(m); } Matrix(Matrix & m, const char * name) { Init(); Copy(m); SetLabel(name); } Matrix(int n, int m) { Init(); Dimension(n, m); } Matrix(const char * name) { Init(); SetLabel(name); } Matrix(const char * name, int n, int m) { Init(); Dimension(n, m); SetLabel(name); } ~Matrix(); void Dimension(int m, int n); void Dimension(int m, int n, double value); void GrowTo(int m, int n) { Dimension(m > rows ? m : rows, n > cols ? n : cols); } void GrowTo(int m, int n, double value) { Dimension(m > rows ? m : rows, n > cols ? 
n : cols, value); } void SetLabel(const char * name); void SetColumnLabel(int n, const char * name) { extras[n].SetLabel(name); } const char * GetColumnLabel(int n) { return extras[n].label; } void SetColWidth(int n, int w) { extras[n].SetWidth(w); } void SetColPrecision(int n, int p) { extras[n].SetPrecision(p); } void CopyLabels(Matrix & m); void Negate(); void Identity(); void Zero(); void Set(double k); void Copy(const Matrix & m); void Transpose(const Matrix & m); void Add(const Matrix & m); void AddMultiple(double k, const Matrix & m); void Product(const Matrix & left, const Matrix & right); void Add(double k); void Multiply(double k); // Reduces a matrix to row echelon form, assuming // values smaller than tol are zero void Reduce(double tol = 0.0); Vector & operator [](int i) { assert(i < rows); return *(data[i]); } const Vector & operator [](int i) const { assert(i < rows); return *(data[i]); } void DeleteRow(int r); void DeleteColumn(int c); void SwapRows(int r1, int r2) { Vector * temp = data[r1]; data[r1] = data[r2]; data[r2] = temp; }; void SwapColumns(int c1, int c2); void MultiplyRow(int r1, double k); void AddRows(int r1, int r2); void AddRows(double k, int r1, int r2); // Sort according to numeric values in the first column void Sort(); void Print(FILE * f, int maxRows = -1, int maxCols = -1); void PrintUpper(FILE * f, int maxRows = -1, int maxCols = -1, bool print_diag = false); void PrintLower(FILE * f, int maxRows = -1, int maxCols = -1, bool print_diag = false); void SetupPrint(FILE *f, int r, int c, int & column_zero, int * precision, int * width); void Read(FILE * f); Matrix & operator = (const Matrix & rhs) { Copy(rhs); return *this; } bool operator == (const Matrix & rhs) const; bool operator != (const Matrix & rhs) const { return !(*this == rhs); } Matrix & operator *= (double rhs) { Multiply(rhs); return *this; } Matrix & operator /= (double rhs) { Multiply(1.0/rhs); return *this; } // Stack a matrix to the bottom of the current matrix 
void StackBottom(const Matrix & m); // Stack a matrix to the left of the current matrix void StackLeft(const Matrix & m); // Swap dynamic allocation for two matrices void Swap(Matrix & m); // Functions that calculate basic summary statistics double Min() const; double Max() const; double Mean() const; // Functions that calculate summary statistics in the presence of missing data double SafeMin() const; double SafeMax() const; double SafeMean() const; int SafeCount() const; // Return the last row in matrix Vector & Last() { return *(data[rows - 1]); } private: static int alloc; static int CompareRows(Vector ** row1, Vector ** row2); void Init(); }; #endif libStatGen-1.0.14/general/MathVector.cpp000066400000000000000000000330231254730101300200230ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "MathVector.h" #include "MathMatrix.h" #include "MathConstant.h" #include "Sort.h" #include "Error.h" #ifdef _MSC_VER #define _USE_MATH_DEFINES #endif #include #include int Vector::alloc = 32; void Vector::Init() { dim = size = 0; label = "Unknown"; data = NULL; } Vector::~Vector() { // printf(" Deleting vector %s ...\n", (const char *) label); if (data != NULL) delete [] data; } void Vector::Dimension(int d) { if (d > size) { if (size < 1024) { size = (d + alloc) / alloc * alloc; double * newData = new double [size]; if (data != NULL) { for (int i = 0; i < dim; i++) newData[i] = data[i]; delete [] data; } data = newData; } else { while (size <= d) size *= 2; double * newData = new double [size]; if (data != NULL) { for (int i = 0; i < dim; i++) newData[i] = data[i]; delete [] data; } data = newData; } } dim = d; } void Vector::Dimension(int d, double value) { int original = dim; Dimension(d); for (int i = original; i < dim; i++) data[i] = value; } void Vector::Negate() { for (int i = 0; i < dim; i++) data[i] = -data[i]; } void Vector::Add(double n) { for (int i = 0; i< dim; i++) data[i] += n; } void Vector::Multiply(double k) { for (int i = 0; i < dim; i++) data[i] *= k; } void Vector::Copy(const Vector & v) { Dimension(v.dim); if (v.data != NULL) for (int i=0; i < dim; i++) data[i] = v.data[i]; } Vector & Vector::operator = (const Vector & rhs) { Copy(rhs); return *this; } void Vector::Add(Vector & v) { if (dim != v.dim) error("Vector::Add - vectors have different dimensions\n" "Vectors - %s [%d] + %s [%d] ", (const char *) label, dim, (const char *) v.label, v.dim); for (int i = 0; i < dim; i++) data[i] += v.data[i]; } void Vector::AddMultiple(double k, Vector & v) { if (dim != v.dim) error("Vector::AddMultiple - vectors are incompatible\n" "Vectors - %s [%d] + %s [%d] ", (const char *) label, dim, (const char *) v.label, v.dim); for (int i = 0; i < dim; i++) data[i] += k * v.data[i]; } void Vector::Subtract(Vector & v) { if (dim != v.dim) 
error("Vector::Subtract - vectors have different dimensions\n" "Vectors - %s [%d] + %s [%d] ", (const char *) label, dim, (const char *) v.label, v.dim); for (int i = 0; i < dim; i++) data[i] -= v.data[i]; } void Vector::Zero() { for (int i = 0; i < dim; i++) data[i] = 0.0; } void Vector::Set(double k) { for (int i = 0; i < dim; i++) data[i] = k; } void Vector::SetMultiple(double k, Vector & v) { Dimension(v.dim); for (int i = 0; i < dim; i++) data[i] = k * v[i]; } double Vector::InnerProduct(Vector & v) { if (dim != v.dim) error("Vector::InnerProduct - vectors have different dimensions\n" "Vectors - %s[%d] * %s[%d] ", (const char *) label, dim, (const char *) v.label, v.dim); double sum = 0.0; for (int i = 0; i < dim; i++) sum += data[i] * v.data[i]; return sum; } void Vector::Insert(int n, double value) { Dimension(dim + 1); for (int i = dim - 1; i > n; i--) data[i] = data[i - 1]; data[n] = value; } void Vector::DeleteDimension(int n) { for (int i = n; i < dim - 1; i++) data[i] = data[i + 1]; dim --; } void Vector::Product(Matrix & m, Vector & v) { if (m.cols != v.dim) error("Vector::Product - Cannot Multiply Matrix by Vector\n" "Vectors - %s [%d, %d] * %s [%d]\n", (const char *) m.label, m.rows, m.cols, (const char *) v.label, v.dim); Dimension(m.rows); Zero(); for (int i = 0; i < m.rows; i++) for (int j = 0; j < m.cols; j++) data[i] += m[i][j] * v[j]; } double Vector::Average() const { if (dim == 0) error("Average undefined for null vector %s", (const char *) label); return Sum() / dim; } double Vector::Product() const { double product = 1.0; for (int j = 0; j < dim; j++) product *= data[j]; return product; } double Vector::Sum() const { double sum = 0.0; for (int j=0; j 1) var = (var - ep*ep/dim)/(dim-1); } double Vector::Var() const { double mean, var; AveVar(mean, var); return var; } double Vector::StandardDeviation() const { double var = Var(); if (var < 0.0) var = 0.0; return sqrt(var); } void Vector::Print(FILE * f, int d) { if (d == -1 || d > dim) d = 
dim; fprintf(f, "%.15s : ", (const char *) label); for (int i = 0; i < d; i++) fprintf(f, "%7.3f ", data[i]); fprintf(f, "\n"); } int Vector::CompareDouble(const double * a, const double * b) { if (*a < *b) return -1; if (*a > *b) return 1; return 0; } void Vector::Sort() { QuickSort(data, dim, sizeof(double), COMPAREFUNC CompareDouble); } void Vector::Sort(Vector & freeRider) { QuickSort2(data, freeRider.data, dim, sizeof(double), COMPAREFUNC CompareDouble); } int Vector::BinarySearch(double element) { void * pointer = ::BinarySearch (&element, data, dim, sizeof(double), COMPAREFUNC CompareDouble); if (pointer == NULL) return -1; return ((double *) pointer) - data; } void Vector::RemoveDuplicates() { int out = 0; for (int in = 1; in < Length(); in++) if (data[in] != data[out]) data[++out] = data[in]; Dimension(out + 1); } bool Vector::operator == (const Vector & rhs) const { if (rhs.dim != dim) return false; for (int i = 0; i < dim; i++) if (data[i] != rhs[i]) return false; return true; } // These functions are useful for simulation // int Vector::CountIfGreater(double threshold) const { int count = 0; for (int i = 0; i < dim; i++) if (data[i] > threshold) count++; return count; } int Vector::CountIfGreaterOrEqual(double treshold) const { int count = 0; for (int i = 0; i < dim; i++) if (data[i] >= treshold) count++; return count; } // Min and max functions // double Vector::Min() const { if (dim == 0) return 0.0; double min = data[0]; for (int i = 1; i < dim; i++) if (data[i] < min) min = data[i]; return min; } double Vector::Max() const { if (dim == 0) return 0.0; double max = data[0]; for (int i = 1; i < dim; i++) if (data[i] > max) max = data[i]; return max; } // Push and Pop functions for using vector as a stack // void Vector::Push(double value) { Dimension(dim + 1); data[dim - 1] = value; } void Vector::Stack(const Vector & v) { int end = dim; Dimension(dim + v.dim); for (int i = 0; i < v.dim; i++) data[i + end] = v[i]; } // Check if all values are in 
ascending or descending order // bool Vector::isAscending() { for (int i = 1; i < dim; i++) if (data[i] < data[i - 1]) return false; return true; } bool Vector::isDescending() { for (int i = 1; i < dim; i++) if (data[i] > data[i - 1]) return false; return true; } // VectorFunc class // VectorFunc::VectorFunc() { f = NULL; } VectorFunc::VectorFunc(double(*func)(Vector &)) { f = func; } double VectorFunc::Evaluate(Vector & v) { return f(v); } #ifndef M_SQRT2 #define M_SQRT2 1.41421356 #endif #define MAXROUNDS 10 #define SQRT_HALF (1.0/M_SQRT2) #define TWO (M_SQRT2 * M_SQRT2) void VectorFunc::Derivative(Vector & x, Vector & d, double h_start) { double a[MAXROUNDS][MAXROUNDS]; // Calculate derivatives along each direction ... for (int k = 0; k < x.dim; k++) { double left, right; double save_x = x[k]; double h = h_start; // Evaluate function to the left of x along direction k x[k] = save_x - h; left = Evaluate(x); // Initialize or update dfmin if appropriate... if (k == 0 || left < dfmin) dfmin = left, dpmin = x; // Evaluate function to the right of x along direction k x[k] = save_x + h; right = Evaluate(x); // Update dfmin if (right < dfmin) dfmin = left, dpmin = x; // Initial crude estimate a[0][0] = (right - left) / (2.0 * h); // Initial guess of error is large double err = 1e30; // At each round, update Neville tableau with smaller stepsize and higher // order extrapolation ... for (int i = 1; i < MAXROUNDS; i++) { // Decrease h h *= SQRT_HALF; // Re-evaluate function and update dfmin as required x[k] = save_x - h; left = Evaluate(x); if (left < dfmin) dfmin = left, dpmin = x; x[k] = save_x + h; right = Evaluate(x); if (right < dfmin) dfmin = right, dpmin = x; // Improved estimate of derivative a[0][i] = (right - left) / (2.0 * h); // Calculate extrapolations of various orders ... 
double factor = TWO; for (int j = 1; j <= i; j++) { a[j][i] = (a[j-1][i] * factor - a[j-1][i-1])/(factor - 1.0); factor *= TWO; double error = max(fabs(a[j][i] - a[j-1][i]), fabs(a[j][i] - a[j-1][i-1])); // Did we improve solution? if (error < err) { err = error; d[k] = a[j][i]; } } // Stop if solution is deteriorating ... if (fabs(a[i][i] - a[i-1][i-1]) >= 2.0 * err) { x[k] = save_x; break; } } x[k] = save_x; } } int Vector::SafeCount() const { int nonMissing = dim; for (int i = 0; i < dim; i++) if (data[i] == _NAN_) nonMissing--; return nonMissing; } double Vector::SafeMin() const { double min = _NAN_; int i; for (i = 0; i < dim; i++) if (data[i] != _NAN_) { min = data[i]; break; } for (; i < dim; i++) if (data[i] != _NAN_ && data[i] < min) min = data[i]; return min; } double Vector::SafeMax() const { double max = _NAN_; int i; for (i = 0; i < dim; i++) if (data[i] != _NAN_) { max = data[i]; break; } for (; i < dim; i++) if (data[i] != _NAN_ && data[i] > max) max = data[i]; return max; } void Vector::Reverse() { for (int i = 0, j = dim - 1; i < j; i++, j--) Swap(i, j); } void Vector::InsertInSortedList(int value) { // Skip through large elements int pos = dim - 1; while (pos >= 0 && data[pos] > value) pos--; // If the value is already in the list, we are done if (pos >= 0 && data[pos] == value) return; // Otherwise we need to grow array Dimension(dim + 1); // And then shift larger elements to the right pos++; for (int i = dim - 1; i > pos; i--) data[i] = data[i - 1]; data[pos] = value; } void Vector::Swap(Vector & rhs) { double * temp = rhs.data; rhs.data = data; data = temp; int swap = rhs.dim; rhs.dim = dim; dim = swap; swap = rhs.size; rhs.size = size; size = swap; } double Vector::Average(double returnIfNull) { if (Length() == 0) return returnIfNull; return Average(); } double Vector::Var(double returnIfNull) { if (Length() == 0) return returnIfNull; return Var(); } double Vector::StandardDeviation(double returnIfNull) { if (Length() == 0) return 
returnIfNull; return StandardDeviation(); } libStatGen-1.0.14/general/MathVector.h000066400000000000000000000145631254730101300175000ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __MATHVECTOR_H__ #define __MATHVECTOR_H__ #include "StringBasics.h" #include #include class Matrix; class Vector { public: int dim, size; double * data; String label; Vector() { Init(); } Vector(Vector & v) { Init(); Copy(v); } Vector(int d) { Init(); Dimension(d); } Vector(const char * text) { Init(); label = text; } Vector(const char * text, int d) { Init(); label = text; Dimension(d); } Vector(const char * text, Vector & v) { Init(); label = text; Copy(v); } ~Vector(); void Dimension(int d); void Dimension(int d, double value); void GrowTo(int d) { Dimension(d > dim ? d : dim); } void GrowTo(int d, double value) { Dimension(d > dim ? 
d : dim, value); } int Length() const { return dim; } void SetLabel(const char * text) { label = text; } void Zero(); void Set(double k); void Set(Vector & v) { Copy(v); }; void SetMultiple(double k, Vector & v); void Negate(); void Add(double n); void Multiply(double k); double InnerProduct(Vector & v); void Copy(const Vector & v); void Add(Vector & v); void AddMultiple(double k, Vector & v); void Subtract(Vector & v); void Product(Matrix & m, Vector & v); double & operator [](int n) { assert(n < dim); return data[n]; } double operator [](int n) const { assert(n < dim); return data[n]; } double operator [](double fraction) { return data[(int)(dim * fraction)]; } double & operator [](double fraction) const { return data[(int)(dim * fraction)]; } Vector & operator = (const Vector & v); bool operator == (const Vector & v) const; bool operator != (const Vector & v) const { return !(*this == v); } void Swap(int i, int j) { double swap = data[i]; data[i] = data[j]; data[j] = swap; } void Swap(Vector & rhs); Vector & operator *= (double rhs) { Multiply(rhs); return *this; } Vector & operator += (double rhs) { Add(rhs); return *this; } Vector & operator -= (double rhs) { return *this += -rhs; } Vector & operator /= (double rhs) { return *this *= 1/rhs; } Vector & operator += (Vector & rhs) { Add(rhs); return * this; } Vector & operator -= (Vector & rhs) { Subtract(rhs); return * this; } void DeleteDimension(int n); void Delete(int n) { DeleteDimension(n); } void Insert(int n, double value); // Calculates average and variance void AveVar(double & ave, double & var) const; double Average() const; double Var() const; double StandardDeviation() const; double Average(double returnIfNull); double Var(double returnIfNull); double StandardDeviation(double returnIfNull); // Common descriptive functions double Sum() const; double SumSquares() const; double Product() const; // Find extreme values double Min() const; double Max() const; // Return the number of elements in a subset 
int CountIfGreater(double treshold) const; int CountIfGreaterOrEqual(double treshold) const; // Append another vector to the end void Stack(const Vector & v); void Print(int maxDim = -1) { Print(stdout, maxDim); } void Print(FILE * output, int maxDim = -1); // Routines for creating and searching through sorted vectors void Sort(); void Reverse(); void Sort(Vector & freeRider); int BinarySearch(double value); int FastFind(double value) { return BinarySearch(value); } // Remove consecutive duplicate elements from vector void RemoveDuplicates(); // Query first and last elements // double & First() { return data[0]; } double & Last() { return data[dim - 1]; } // Routines for using a vector as a stack of doubles // void Clear() { dim = 0; } void Push(double value); double Pop() { return data[--dim]; } double Peek() const { return data[dim-1]; } // This routine adds items to a sorted list // void InsertInSortedList(int item); static int alloc; bool isAscending(); bool isDescending(); // Routines for dealing with vectors that include missing data // int SafeCount() const; double SafeMin() const; double SafeMax() const; private: static int CompareDouble(const double * a, const double * b); void Init(); }; class VectorFunc // Wrapper for multi-dimensional functions // so that they can be used as parameters // and keep private data { private: double(*f)(Vector &); public: // Constructors VectorFunc(); VectorFunc(double(*func)(Vector &)); // Virtual destructor ensures that dynamic objects are // handled correctly virtual ~VectorFunc() { } virtual double Evaluate(Vector & v); // Calculate derivatives along each direction. Delta is a guess value // for the initial stepsize in numerical derivation virtual void Derivative(Vector & point, Vector & d, double delta = 1.0); // Minimum function value found while evaluating derivative // and its location... 
double dfmin; Vector dpmin; }; #endif libStatGen-1.0.14/general/MemoryAllocators.cpp000066400000000000000000000117461254730101300212530ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "MemoryAllocators.h" #include char *** AllocateCharCube(int n, int rows, int cols) { char *** cube = new char ** [n]; // Stop early if we are out of memory if (cube == NULL) return NULL; for (int i = 0; i < n; i++) { cube[i] = AllocateCharMatrix(rows, cols); // Safely unravel allocation if we run out of memory if (cube[i] == NULL) { while (i--) FreeCharMatrix(cube[i], rows); delete [] cube; return NULL; } } return cube; } int ** AllocateIntMatrix(int rows, int cols) { int ** matrix = new int * [rows]; // Stop early if we are out of memory if (matrix == NULL) return NULL; for (int i = 0; i < rows; i++) { matrix[i] = new int [cols]; // Safely unravel allocation if we run out of memory if (matrix[i] == NULL) { while (i--) delete [] matrix[i]; delete [] matrix; return NULL; } } return matrix; } char ** AllocateCharMatrix(int rows, int cols) { char ** matrix = new char * [rows]; // Stop early if we are out of memory if (matrix == NULL) return NULL; for (int i = 0; i < rows; i++) { matrix[i] = new char [cols]; // Safely unravel allocation if we run out of memory if (matrix[i] == NULL) { while (i--) delete [] matrix[i]; delete [] matrix; return 
NULL; } } return matrix; } float ** AllocateFloatMatrix(int rows, int cols) { float ** matrix = new float * [rows]; // Stop early if we are out of memory if (matrix == NULL) return NULL; for (int i = 0; i < rows; i++) { matrix[i] = new float [cols]; // Safely unravel allocation if we run out of memory if (matrix[i] == NULL) { while (i--) delete [] matrix[i]; delete [] matrix; return NULL; } } return matrix; } void FreeCharCube(char *** & cube, int n, int rows) { if (cube == NULL) return; for (int i = 0; i < n; i++) FreeCharMatrix(cube[i], rows); delete [] cube; cube = NULL; } void FreeCharMatrix(char ** & matrix, int rows) { if (matrix == NULL) return; for (int i = 0; i < rows; i++) delete [] matrix[i]; delete [] matrix; matrix = NULL; } void FreeFloatMatrix(float ** & matrix, int rows) { if (matrix == NULL) return; for (int i = 0; i < rows; i++) delete [] matrix[i]; delete [] matrix; matrix = NULL; } void FreeIntMatrix(int ** & matrix, int rows) { if (matrix == NULL) return; for (int i = 0; i < rows; i++) delete [] matrix[i]; delete [] matrix; matrix = NULL; } short ** AllocateShortMatrix(int rows, int cols) { short ** matrix = new short * [rows]; // Stop early if we are out of memory if (matrix == NULL) return NULL; for (int i = 0; i < rows; i++) { matrix[i] = new short [cols]; // Safely unravel allocation if we run out of memory if (matrix[i] == NULL) { while (i--) delete [] matrix[i]; delete [] matrix; return NULL; } } return matrix; } void FreeShortMatrix(short ** & matrix, int rows) { if (matrix == NULL) return; for (int i = 0; i < rows; i++) delete [] matrix[i]; delete [] matrix; matrix = NULL; } double ** AllocateDoubleMatrix(int rows, int cols) { double ** matrix = new double * [rows]; // Stop early if we are out of memory if (matrix == NULL) return NULL; for (int i = 0; i < rows; i++) { matrix[i] = new double [cols]; // Safely unravel allocation if we run out of memory if (matrix[i] == NULL) { while (i--) delete [] matrix[i]; delete [] matrix; return 
NULL; } } return matrix; } void FreeDoubleMatrix(double ** & matrix, int rows) { for (int i = 0; i < rows; i++) delete [] matrix[i]; delete [] matrix; matrix = NULL; } libStatGen-1.0.14/general/MemoryAllocators.h000066400000000000000000000052411254730101300207110ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __MEMORY_ALLOCATORS_H__ #define __MEMORY_ALLOCATORS_H__ #include template T** AllocateMatrix(int rows, int cols); template T** AllocateMatrix(int rows, int cols, T value); template void FreeMatrix(T ** & matrix, int rows); char ** AllocateCharMatrix(int rows, int cols); void FreeCharMatrix(char ** & matrix, int rows); float ** AllocateFloatMatrix(int rows, int cols); void FreeFloatMatrix(float ** & matrix, int rows); double ** AllocateDoubleMatrix(int rows, int cols); void FreeDoubleMatrix(double ** & matrix, int rows); int ** AllocateIntMatrix(int rows, int cols); void FreeIntMatrix(int ** & matrix, int rows); short ** AllocateShortMatrix(int rows, int cols); void FreeShortMatrix(short ** & matrix, int rows); char *** AllocateCharCube(int n, int rows, int cols); void FreeCharCube(char *** & matrix, int n, int rows); // Template definitions follow ... 
// template T** AllocateMatrix(int rows, int cols) { T ** matrix = new T * [rows]; // Stop early if we are out of memory if (matrix == NULL) return NULL; for (int i = 0; i < rows; i++) { matrix[i] = new T [cols]; // Safely unravel allocation if we run out of memory if (matrix[i] == NULL) { while (i--) delete [] matrix[i]; delete [] matrix; return NULL; } } return matrix; }; template T** AllocateMatrix(int rows, int cols, T value) { T ** matrix = AllocateMatrix(rows, cols); if (matrix != NULL) for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) matrix[i][j] = value; return matrix; }; template void FreeMatrix(T ** & matrix, int rows) { if (matrix == NULL) return; for (int i = 0; i < rows; i++) delete [] matrix[i]; delete [] matrix; matrix = NULL; }; #endif libStatGen-1.0.14/general/MemoryInfo.cpp000066400000000000000000000024361254730101300200370ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "MemoryInfo.h" String & MemoryInfo(double bytes) { static String info; if (bytes < 1024) return info = "<1.0 kb"; if (bytes < 1024. * 1024.) info.printf("%.1f kb", (bytes + 1023) / 1024.); else if (bytes < 1024. * 1024. * 1024.) info.printf("%.1f mb", (bytes + 1024. * 1024. - 1) / (1024. * 1024.)); else if (bytes < 1024. * 1024. * 1024. * 1024.) info.printf("%.1f gb", bytes / (1024. * 1024. 
* 1024.)); else info.printf("%.1f tb", bytes / (1024. * 1024. * 1024. * 1024.)); return info; } libStatGen-1.0.14/general/MemoryInfo.h000066400000000000000000000015341254730101300175020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __MEMORYINFO_H__ #define __MEMORYINFO_H__ #include "StringBasics.h" String & MemoryInfo(double bytes); #endif libStatGen-1.0.14/general/MemoryMap.cpp000077500000000000000000000210061254730101300176560ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include #include #include #include #include #include "MemoryMap.h" #ifndef _WIN32 #include #include #endif #ifndef MAP_POPULATE #define MAP_POPULATE 0x0000 #endif #ifndef MAP_NONBLOCK #define MAP_NONBLOCK 0x0000 #endif MemoryMap::MemoryMap() { constructor_clear(); #if defined(_WIN32) SYSTEM_INFO sysinfo = {0}; ::GetSystemInfo(&sysinfo); DWORD cbView = sysinfo.dwAllocationGranularity; #else page_size = sysconf(_SC_PAGE_SIZE); #endif } MemoryMap::~MemoryMap() { destructor_clear(); }; void MemoryMap::debug_print() { #if defined(_WIN32) std::cout << "fd = " << file_handle << std::endl; #else std::cout << "fd = " << fd << std::endl; #endif std::cout << "data = 0x" << std::hex << data << std::endl; std::cout << "offset = 0x" << std::hex << offset << std::endl; std::cout << "mapped_length = 0x" << std::hex << mapped_length << std::endl; std::cout << "total_length = 0x" << std::hex << total_length << std::endl; std::cout << "page_size = 0x" << std::hex << page_size << std::endl; }; void MemoryMap::constructor_clear() { #if defined(_WIN32) file_handle = NULL; map_handle = NULL; #else fd = -1; #endif data = (void *) NULL; offset = 0; mapped_length = 0; total_length = 0; useMemoryMapFlag = true; }; void MemoryMap::destructor_clear() { #if defined(_WIN32) if (data!=NULL) { // free windows mapped object ::UnmapViewOfFile((LPVOID) data); } if (map_handle != NULL) ::CloseHandle(map_handle); if (file_handle != NULL) ::CloseHandle(file_handle); #else if (data!=NULL) { // free unix mapped object munmap(data, mapped_length); } // free unix resources if (fd!=-1) { ::close(fd); } #endif constructor_clear(); } bool MemoryMap::allocate() { data = (void *) malloc(mapped_length); if (data == NULL) { #ifdef __WIN32__ ::CloseHandle(file_handle); #else ::close(fd); #endif perror("MemoryMap::open"); constructor_clear(); return true; } #ifdef __WIN32__ DWORD resultSize = 0; ReadFile(file_handle, data, mapped_length, &resultSize, NULL); #else 
size_t resultSize = read(fd, data, mapped_length); #endif if ( resultSize != mapped_length) { #ifdef __WIN32__ ::CloseHandle(file_handle); #else ::close(fd); #endif perror("MemoryMap::open"); constructor_clear(); return true; } return false; } bool MemoryMap::open(const char * file, int flags) { const char * message = "MemoryMap::open - problem opening file %s"; #if defined(_WIN32) file_handle = CreateFile(file, (flags==O_RDONLY) ? GENERIC_READ : (GENERIC_READ | GENERIC_WRITE), FILE_SHARE_READ | FILE_SHARE_WRITE, // subsequent opens may either read or write NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if(file_handle == INVALID_HANDLE_VALUE) { fprintf(stderr, message, file); constructor_clear(); return true; } LARGE_INTEGER file_size = {0}; ::GetFileSizeEx(file_handle, &file_size); mapped_length = total_length = file_size.QuadPart; #else struct stat buf; fd = ::open(file, flags); if ((fd==-1) || (fstat(fd, &buf) != 0)) { fprintf(stderr, message, file); constructor_clear(); return true; } mapped_length = total_length = buf.st_size; #endif if(!useMemoryMapFlag) { return allocate(); } #if defined(_WIN32) assert(offset == 0); map_handle = CreateFileMapping(file_handle, NULL, (flags==O_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, file_size.HighPart, // upper 32 bits of map size file_size.LowPart, // lower 32 bits of map size NULL); if(map_handle == NULL) { ::CloseHandle(file_handle); fprintf(stderr, message, file); constructor_clear(); return true; } data = MapViewOfFile(map_handle, (flags == O_RDONLY) ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS, 0, 0, mapped_length); if (data == NULL) { CloseHandle(map_handle); CloseHandle(file_handle); fprintf(stderr, message, file); constructor_clear(); return true; } #else data = ::mmap(NULL, mapped_length, (flags == O_RDONLY) ? 
PROT_READ : PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset); if (data == MAP_FAILED) { ::close(fd); fprintf(stderr, message, file); constructor_clear(); return true; } #endif return false; } bool MemoryMap::create(const char *file, size_t size) { if (file==NULL) { data = calloc(size, 1); return(data==NULL); } const char * message = "MemoryMap::create - problem creating file %s"; #ifdef __WIN32__ file_handle = CreateFile(file, GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (file_handle == INVALID_HANDLE_VALUE) { fprintf(stderr, message, file); constructor_clear(); return true; } SetFilePointer(file_handle, size - 1, NULL, FILE_BEGIN); char dummy = 0; DWORD check = 0; WriteFile(file_handle, &dummy, 1, &check, NULL); if (check != 0) { CloseHandle(file_handle); DeleteFile(file); fprintf(stderr, message, file); constructor_clear(); return true; } CloseHandle(file_handle); open(file, O_RDWR); #else fd = ::open(file, O_RDWR|O_CREAT|O_TRUNC, 0666); if(fd == -1) { fprintf(stderr, message, file); constructor_clear(); return true; } lseek(fd, (off_t) size - 1, SEEK_SET); char dummy = 0; if(write(fd, &dummy, 1)!=1) { fprintf(stderr, message, file); constructor_clear(); return true; } data = ::mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset); if (data == MAP_FAILED) { ::close(fd); unlink(file); fprintf(stderr, message, file); constructor_clear(); return true; } mapped_length = total_length = size; #endif return false; } bool MemoryMap::create(size_t size) { return create(NULL, size); } bool MemoryMap::close() { destructor_clear(); return false; } void MemoryMap::test() { int result; result = this->open("test/test_memmap_data.txt"); assert(result == 0); assert(data!=NULL); assert(mapped_length == 183); // length of the above file close(); // now try non memory mapped (direct slow file I/O) useMemoryMap(false); result = this->open("test/test_memmap_data.txt"); assert(result == 0); 
assert(data!=NULL); assert(mapped_length == 183); // length of the above file close(); } int MemoryMap::prefetch() { int sum = 0; size_t i; for (i=0; i. */ #ifndef __MEMORYMAP_H #define __MEMORYMAP_H #include #include #if defined(_WIN32) #include #endif /// /// There are a pair of related data structures in the operating system, /// and also a few simple algorithms that explain why your processes are /// waiting forever. /// /// The symptom you have is that they are getting little or no CPU time, /// as shown in the command 'top'. The machine will appear to have /// available CPU time (look at the Cpu(s): parameter - if less than 100%, /// you have available CPU). The real key, however, is to look at the /// 'top' column with the label 'S' - that is the status of the process, /// and crucial to understanding what is going on. /// /// In your instance, the 'S' column for your karma jobs is 'D', which /// means it is waiting for data. This is because the process is doing /// something that is waiting for the filesystem to return data to it. /// Usually, this is because of a C call like read() or write(), but it /// also happens in large processes where memory was copied to disk and /// re-used for other purposes (this is called paging). /// /// So, a bit of background on the operating system... there is a CPU /// secheduler that takes a list of waiting processes, and picks one to /// run - if the job is waiting for the disk, there is no point in picking /// it to run, since it is blocked, waiting for the disk to return data. /// The scheduler marks the process with 'D' and moves on to the next /// process to schedule. /// /// In terms of data structures that we care about for this example, there /// are two that we care about. First is a linear list of disk buffers /// that are stored in RAM and controlled by the operating system. This /// is usually called the disk buffer pool. 
Usually, when a program asks /// for data from the disk, this list can be scanned quickly to see if the /// data is already in RAM - if so, no disk operation needs to take place. /// /// Now in the case of the normal Unix read() and write() calls, when the /// operating system is done finding the page, it copies the data into a /// buffer to be used by the process that requested it (in the case of a /// read() - a write() is the opposite). This copy operation is slow and /// inefficient, but gets the job done. /// /// So overall, you gain some efficiency in a large memory system by /// having this disk buffer pool data structure, since you aren't /// re-reading the disk over and over to get the same data that you /// already have in RAM. However, it is less efficient than it might be /// because of the extra buffer copying. /// /// Now we come to memory mapped files, and karma. The underlying system /// call of interest to us is mmap(), and is in MemoryMap.cpp. What it /// does and how it works are important to understanding the benefits of /// it, and frankly, most people don't care about it because it is /// seemingly complex. /// /// Two things are important to know: firstly, there is a data structure /// in the CPU called the page table, which is mostly contained in the CPU /// hardware itself. All memory accesses for normal user processes like /// karma go through this hardware page table. Secondly, it is very fast /// for the operating system to put together a page table that 'connects' /// a bunch of memory locations in your user programs address space to the /// disk buffer pool pages. /// /// The combination of those two facts mean that you can implement a 'zero /// copy' approach to reading data, which means that the data that is in /// the disk buffer pool is directly readable by the program without the /// operating system ever having to actually copy the data, like it does /// for read() or write(). 
/// /// So the benefit of mmap() is that when the underlying disk pages are /// already in the disk buffer pool, a hardware data structure gets built, /// then the program returns, and the data is available at full processor /// speed with no intervening copy of the data, or waiting for disk or /// anything else. It is as near to instantaneous as you can possibly /// get. This works whether it is 100 bytes or 100 gigabytes. /// /// So, the last part of the puzzle is why your program winds up in 'D' /// (data wait), and what to do about it. /// /// The disk buffer pool is a linear list of blocks ordered by the time /// and date of access. A process runs every once in awhile to take the /// oldest of those pages, and free them, during which it also has to /// update the hardware page tables of any processes referencing them. /// /// So on wonderland, most file access (wget, copy, md5sum, anything else) /// is constantly putting new fresh pages at the front of the list, and /// karma index files, having been opened awhile ago, are prime candidates /// for being paged out. The reason they get paged out as far as I know /// is that in any given second of execution, nowhere near the entire /// index is getting accessed... so at some point, at least one page gets /// sent back to disk (well, flushed from RAM). Once that happens, a /// cascading effect happens, where the longer it waits, the older the /// other pages get, then the more that get reclaimed, and the slower it /// gets, until karma is at a standstill, waiting for pages to be brought /// back into RAM. /// /// Now in an ideal world, karma would rapidly recover, and it can... /// sometimes. The problem is that your karma job is accessing data all /// over that index, and it is essentially looking like a pure random I/O /// to the underlying filesystem. There is about a 10 to 1 performance /// difference between accessing the disk sequentially as compared to /// randomly. 
/// /// So to make karma work better, the first thing I do when starting karma /// is force it to read all of the disk pages in order. This causes the /// entire index to be forced into memory in order, so it is forcing /// sequential reads, which is the best case possible. There are /// problems, for example if three karma jobs start at once, the disk I/O /// is no longer as purely sequential as we would like. Also, if the /// filesystem is busy taking care of other programs, even if karma thinks /// it is forcing sequential I/O, the net result looks more random. This /// happens when the system is starting to break down (thrashing) and it /// will certainly stall, or look very very slow, or crash. /// /// The upshot of all of this is that when a single reference is shared, /// it is more likely that all the pages will be in the disk buffer pool /// to begin with, and thereby reduce startup time to nearly zero. It is /// also the ideal situation in terms of sharing the same reference among /// say 24 copies of karma on wonderland - the only cost is the hardware /// page table that gets set up to point to all of the disk buffers. /// /// As I mentioned a paragraph back, the pages can still get swapped out, /// even with dozens of karma jobs running. A workaround I created is a /// program in utilities called mapfile - it simply repeatedly accesses /// the data in sequential order to help ensure that all of the pages are /// at the head of the disk buffer pool, and therefore less likely to get /// swapped out. /// /// The benefit of such a program (mapfile) is greater on wonderland, /// where a lot of processes are competing for memory and disk buffers. 
/// /// class MemoryMap { #if defined(_WIN32) HANDLE file_handle; HANDLE map_handle; DWORD page_size; #else int fd; size_t page_size; #endif off_t offset; size_t mapped_length; size_t total_length; bool useMemoryMapFlag; public: void *data; MemoryMap(); virtual ~MemoryMap(); void debug_print(); void constructor_clear(); void destructor_clear(); virtual bool allocate(); /// open a previously created mapped vector /// /// useMemoryMapFlag will determine whether it /// uses mmap() or malloc()/read() to populate /// the memory virtual bool open(const char * file, int flags = O_RDONLY); /// create the memory mapped file on disk /// /// a file will be created on disk with the header /// filled in. The caller must now populate elements /// using (*this).set(index, value). // virtual bool create(const char * file, size_t size); /// store in allocated memory (malloc), not mmap: /// /// This is for code that needs to more flexibly /// the case when an mmap() file _might_ be available, /// but if it is not, we want to load it as a convenience /// to the user. GenomeSequence::populateDBSNP does exactly this. // virtual bool create(size_t size); bool close(); void test(); size_t length() { return mapped_length; } char operator[](unsigned int index) { return ((char *)data)[index]; }; int prefetch(); // force pages into RAM // // set or unset use of mmap() call in ::open(). // This flag must be set before ::open() is called, // if it is called afterwards, it has no effect. // void useMemoryMap(bool flag=true) { useMemoryMapFlag = flag; } }; #endif libStatGen-1.0.14/general/MemoryMapArray.cpp000066400000000000000000000111741254730101300206570ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "MemoryMapArray.h" void MemoryMapArrayHeader::debugPrint(FILE *f) { time_t local = creationDate; fprintf(f, "typeCookie = %08x\n", typeCookie); fprintf(f, "typeVersion = %08x\n", typeVersion); fprintf(f, "contentCookie = %08x\n", contentCookie); fprintf(f, "contentVersion = %08x\n", contentVersion); fprintf(f, "Created on %s", asctime(localtime(&local))); fprintf(f, "Created by user %s on host %s for application '%s'.\n", creationUser, creationHost, application); } std::ostream &operator << (std::ostream &stream, MemoryMapArrayHeader &h) { time_t local = h.creationDate; stream << "typeCookie = " << h.typeCookie << "\n"; stream << "typeVersion = " << h.typeVersion << "\n"; stream << "contentCookie = " << h.contentCookie << "\n"; stream << "contentVersion = " << h.contentVersion << "\n"; stream << "headerSize = " << h.headerSize << "\n"; stream << "elementCount = " << h.elementCount << "\n"; stream << "Created on " << asctime(localtime(&local)) << "\n"; stream << "Created by user " << h.creationUser << " on host " << h.creationHost << " for application '" << h.application << "'.\n"; return stream; } #if defined(TEST) #include #include void test32() { mmapArrayUint32_t test; unlink("twinkypie"); assert(test.create("twinkypie", 11)==0); test.set(0,0); test.set(1,1); test.set(2,2); test.set(3,3); test.set(4,4); test.set(5,5); test.set(6,6); test.set(7,7); test.set(8,8); test.set(9,9); test.set(10,10); assert(test[0]==0); assert(test[10]==10); test.close(); assert(test.open("twinkypie")==0); assert(test[0]==0); assert(test[10]==10); test.close(); unlink("twinkypie"); } void testbool() { mmapArrayBool_t 
test; unlink("twinkypie"); assert(test.create("twinkypie", 11)==0); test.set(0,0); test.set(1,1); test.set(2,0); test.set(3,1); test.set(4,0); test.set(5,1); test.set(6,0); test.set(7,1); test.set(8,0); test.set(9,0); test.set(10,1); assert(test[0]==0); assert(test[1]==1); assert(test[10]==1); test.close(); assert(test.open("twinkypie")==0); assert(test[0]==0); assert(test[10]==1); test.close(); unlink("twinkypie"); } void test2bit() { mmapArray2Bit_t test; unlink("twinkypie"); assert(test.create("twinkypie", 11)==0); test.set(0,0); test.set(1,1); test.set(2,2); test.set(3,3); test.set(4,3); test.set(5,2); test.set(6,1); test.set(7,0); test.set(8,2); test.set(9,1); test.set(10,3); test.setApplication("testing 2 bit values!"); assert(test[0]==0); assert(test[1]==1); assert(test[2]==2); assert(test[3]==3); assert(test[4]==3); assert(test[5]==2); assert(test[6]==1); assert(test[7]==0); assert(test[8]==2); assert(test[9]==1); assert(test[10]==3); test.close(); assert(test.open("twinkypie")==0); test.debugPrint(stdout); test.close(); unlink("twinkypie"); } void test4bit() { mmapArray4Bit_t test; unlink("twinkypie"); assert(test.create("twinkypie", 11)==0); test.set(0,0); test.set(1,1); test.set(2,2); test.set(3,3); test.set(4,4); test.set(5,5); test.set(6,6); test.set(7,7); test.set(8,8); test.set(9,9); test.set(10,10); test.setApplication("testing 4 bit values!"); assert(test[0]==0); assert(test[1]==1); assert(test[7]==7); assert(test[10]==10); test.close(); assert(test.open("twinkypie")==0); assert(test[0]==0); assert(test[1]==1); assert(test[7]==7); assert(test[10]==10); test.debugPrint(stdout); test.close(); unlink("twinkypie"); } int main(int argc, char **argv) { test32(); testbool(); test2bit(); test4bit(); exit(0); } #endif libStatGen-1.0.14/general/MemoryMapArray.h000077500000000000000000000252531254730101300203320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it 
and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __MEMORYMAPARRAY_H #define __MEMORYMAPARRAY_H #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif #include #include #include #include #include #ifndef _WIN32 #include // for gethostname() #endif #include #include #include // STL: #include #include #include "Generic.h" #include "MemoryMap.h" // // This file defines a template for generating memory map backed arrays // of different types of values. // // The template assumes that the mapped files are broken into two parts, // first, a header (MemoryMapArrayHeader), then followed by the data // in the array. // // typedefs are used to declare various types of arrays beforehand, // since there will be only a few. // // They are: // mmapArrayUint32_t; // mmapArrayBool_t; // mmapArray4Bit_t; // // XXX consider adding env("USER"), argv[0], date/time creation, etc. 
// class MemoryMapArrayHeader { public: void constructorClear() { memset(this, 0, sizeof(*this)); } uint32_t typeCookie; uint32_t typeVersion; uint32_t contentCookie; uint32_t contentVersion; size_t headerSize; // file generation info time_t creationDate; char creationUser[32]; char creationHost[32]; char application[32]; // now describe the data: size_t elementCount; void debugPrint(FILE *); size_t getHeaderSize(int i) { return sizeof(*this); } void setApplication(const char *s) { strncpy(application, s, sizeof(application)-1); application[sizeof(application)-1] = '\0'; } void setCreationUser(const char *s) { strncpy(creationUser, s, sizeof(creationUser)-1); creationUser[sizeof(creationUser)-1] = '\0'; } void setCreationHost(const char *s) { strncpy(creationHost, s, sizeof(creationHost)-1); creationHost[sizeof(creationHost)-1] = '\0'; } }; // // stream output for header information // std::ostream &operator << (std::ostream &stream, MemoryMapArrayHeader &h); // // This class object represents the application specific information that doesn't // fit in the general header above. Since it is only allocated via an mmap operation, // as part of the mapped file, the destructor must never be called. The virtual // destructor is declared to eliminate gcc warnings. // // For many arrays, this will be empty. // struct MemoryMapGenericHeader { protected: size_t headerSize; // set in ::create and ::open only public: size_t getHeaderSize() { return headerSize; } // other stuff follows... 
}; template < class elementT, typename indexT, unsigned int cookieVal, unsigned int versionVal, elementT accessorFunc(char *base, indexT), void setterFunc(char *base, indexT, elementT), size_t elementCount2BytesFunc(indexT), class arrayHeaderClass> class MemoryMapArray : public MemoryMap { protected: arrayHeaderClass *header; char *data; std::string errorStr; public: void constructorClear() { header = NULL; data = NULL; // errorStr = ""; } MemoryMapArray() { constructorClear(); } ~MemoryMapArray() { if (data) close(); } const std::string &getErrorString() { return errorStr; } arrayHeaderClass &getHeader() { return *header; } void setContentCookie(uint32_t c) { header->contentCookie = c; } void setContentVersion(uint32_t v) { header->contentVersion = v; } // accessing inline elementT operator[](indexT i) { return accessorFunc(data, i); } inline void set(indexT i, elementT v) { setterFunc(data, i, v); } /// Create a vector with elementCount memebers. // /// Does administrative setup of the header and populating this /// class members. User will need to finish populating the /// contents of the metaData and data sections. /// /// If file==NULL, the underlying allocation is done via malloc(), /// so that the results of write access to this vecor are not /// saved in a file. /// /// If file!=NULL, a file will be created on disk, and all /// write accesses done via the method ::set will be persistent /// in that file. 
/// int create(const char *file, indexT elementCount, int optionalHeaderCount = 0) { size_t len = elementCount2BytesFunc(elementCount) + header->getHeaderSize(optionalHeaderCount); int rc; rc = MemoryMap::create(file, len); if (rc) { std::ostringstream buf; buf << file << ": failed to create file"; errorStr = buf.str(); close(); return rc; } header = (arrayHeaderClass *) MemoryMap::data; header->constructorClear(); header->typeCookie = cookieVal; header->typeVersion = versionVal; header->headerSize = header->getHeaderSize(optionalHeaderCount); header->elementCount = elementCount; data = (char *)((char *) MemoryMap::data + header->headerSize); const char *env; char hostname[256]; env = getenv("USER"); if (env) header->setCreationUser(env); header->creationDate = time(NULL); #if defined(_WIN32) hostname[0] = '\0'; #else gethostname(hostname, sizeof(hostname)); #endif header->setCreationHost(hostname); return 0; } /// allow anonymous (malloc) create. /// /// we do this when we don't expect to save the results. /// /// The single use case so far is in GenomeSequence::populateDBSNP. /// int create(indexT elementCount, int optionalHeaderCount = 0) { return create(NULL, elementCount, optionalHeaderCount); } // // Open the given filename. flags may be set to // O_RDONLY or O_RDWR, and allows the file to be // condtionally written to. // // Several sanity checks are done: // compare the expected cookie value to the actual one // compare the expected version value to the actual one // // if either condition is not met, the member errorStr is // set to explain why, and true is returned. // // If there were no errors, false is returned. 
// bool open(const char *file, int flags = O_RDONLY) { int rc = MemoryMap::open(file, flags); if (rc) { std::ostringstream buf; buf << file << ": open() failed (error=" << strerror(errno) << ")."; errorStr = buf.str(); return true; } header = (arrayHeaderClass *) MemoryMap::data; data = (char *)((char *) MemoryMap::data + header->headerSize); if (header->typeCookie!=cookieVal) { std::ostringstream buf; buf << file << ": wrong type of file (expected type " << cookieVal << " but got " << header->typeCookie << ")"; errorStr = buf.str(); // XXX insert better error handling close(); return true; } if (header->typeVersion!=versionVal) { std::ostringstream buf; buf << file << ": wrong version of file (expected version " << versionVal << " but got " << header->typeVersion << ")"; errorStr = buf.str(); // XXX insert better error handling close(); return true; } return false; } bool close() { constructorClear(); return MemoryMap::close(); } void debugPrint(FILE *f) { if (header) header->debugPrint(f); } size_t getElementCount() const { return header->elementCount; } }; struct emptyGenericHeader : public MemoryMapGenericHeader { public: size_t getHeaderSize() { return sizeof(*this); } }; // // define the uint32 array type: // inline uint32_t mmapUint32Access(char *base, uint32_t index) { return ((uint32_t *)base)[index]; } inline void mmapUint32Set(char *base, uint32_t index, uint32_t v) { ((uint32_t *)base)[index] = v; } inline size_t mmapUint32elementCount2Bytes(uint32_t i) { return sizeof(uint32_t) * i; } typedef MemoryMapArray< uint32_t, uint32_t, 0x16b3816c, 20090109, mmapUint32Access, mmapUint32Set, mmapUint32elementCount2Bytes, MemoryMapArrayHeader > mmapArrayUint32_t; // // define the boolean memory mapped array type. 
// NB: it is limited to 2**32 elements // typedef MemoryMapArray< uint32_t, uint32_t, 0xac6c1dc7, 20090109, PackedAccess_1Bit, PackedAssign_1Bit, Packed1BitElementCount2Bytes, MemoryMapArrayHeader > mmapArrayBool_t; // // define the two bit memory mapped array type: // typedef MemoryMapArray< uint32_t, uint32_t, 0x25b3ea5f, 20090109, PackedAccess_2Bit, PackedAssign_2Bit, Packed2BitElementCount2Bytes, MemoryMapArrayHeader > mmapArray2Bit_t; typedef MemoryMapArray< uint32_t, uint32_t, 0x418e1874, 20090109, PackedAccess_4Bit, PackedAssign_4Bit, Packed4BitElementCount2Bytes, MemoryMapArrayHeader > mmapArray4Bit_t; #if 0 // XXX this is example code I want to use to define arrays of genome wide match values class baseRecord { unsigned int base:4; unsigned int qScore:7; unsigned int conflicts:5; // how many cases of poorer matches that disagree }; // // define the baseRecord array type: // inline baseRecord &mmapBaseRecordAccess(void *base, uint32_t index) { return *((baseRecord *)((char *)base + index*sizeof(baseRecord))); } inline void mmapBaseRecordSet(void *base, uint32_t index, baseRecord &v) { mmapBaseRecordAccess(base, index) = v; } inline size_t mmapBaseRecordElementCount2Bytes(uint32_t i) { return sizeof(baseRecord) * i; } typedef MemoryMapArray< baseRecord &, uint32_t, 0x12341234, 0xdeadbeef, &mmapBaseRecordAccess, mmapBaseRecordSet, mmapBaseRecordElementCount2Bytes, MemoryMapArrayHeader > mmapArrayBaseRecord_t; #endif #endif libStatGen-1.0.14/general/MiniDeflate.cpp000066400000000000000000000246621254730101300201410ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "MiniDeflate.h" // Convenient constants and macros // #define EMPTY_KEY 123 #define uchar unsigned char #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) #endif MiniDeflate::MiniDeflate() { buffer = new uchar [BUFFER_SIZE + 5]; hash_keys = new uchar [HASH_SIZE]; hash_values = new uchar * [HASH_SIZE * HASH_DEPTH]; } MiniDeflate::~MiniDeflate() { delete [] buffer; delete [] hash_keys; delete [] hash_values; } void MiniDeflate::EvaluateMatch(unsigned char * in, int len, int hash, unsigned char * & best_pos, int & best_match) { int max = min(len, 0xFFFF + 66); for (int i = HASH_DEPTH; i > 0; i--) // Check each possible match (up to HASH_DEPTH) { uchar * pos = hash_values[hash * HASH_DEPTH + ((hash_keys[hash] + i) % HASH_DEPTH)]; if (pos == NULL || in - pos >= 0x4001) break; int match = 0; while (match < max && pos[match] == in[match]) match++; if (match > best_match) { best_match = match; best_pos = pos; } } // If string seems pretty unique, add to hash table if (best_match < OKAY_MATCH) { int delta = hash_keys[hash] = (uchar)((hash_keys[hash] + 1) & 7); hash_values[hash * 8 + delta] = in; } } void MiniDeflate::QuoteLiterals(unsigned char * & in, int literal, unsigned char * & out, int & buffer_len, FILE * output) { if (buffer_len < 0) { fwrite(buffer, out - buffer, 1, output); buffer_len = BUFFER_SIZE; out = buffer; } while (buffer_len < literal) { literal -= buffer_len; while (buffer_len--) { *out = *in; in++; out++; } fwrite(buffer, BUFFER_SIZE, 1, output); buffer_len = BUFFER_SIZE; out = buffer; } while (literal--) { *out = *in; in++; out++; buffer_len--; } } void MiniDeflate::OutputLiterals(unsigned char * 
& in, int literal, unsigned char * & out, int & buffer_len, FILE * output) { while (literal > 0) if (literal < 16) { *out = (char) literal; out++; buffer_len--; QuoteLiterals(in, literal, out, buffer_len, output); break; } else if (literal < 31) { *out = 15; out++; buffer_len--; QuoteLiterals(in, 15, out, buffer_len, output); *out = (uchar)(literal - 15); out++; buffer_len--; QuoteLiterals(in, literal - 15, out, buffer_len, output); break; } else { int length = min(literal, 0xFFFF + 31); literal -= length; length -= 31; *out = 0; out++; *out = (uchar)(length >> 8); out++; *out = (uchar)(length & 0xFF); out++; buffer_len -= 3; QuoteLiterals(in, length + 31, out, buffer_len, output); } } void MiniDeflate::Deflate(FILE * output, void * void_input, size_t len) { uchar * in = (uchar *) void_input; uchar * out = (uchar *) buffer; int buffer_len = BUFFER_SIZE; for (int i = 0; i < HASH_SIZE; i++) hash_keys[i] = EMPTY_KEY; uchar * in2 = in; while (len > 2) { // Hash the current input value int hash = ((in[0] << 16) | (in[1] << 8) | in[2]) % HASH_SIZE; if (hash_keys[hash] != EMPTY_KEY) // Possible matches in hash table { int best_match = 0; uchar * best_pos = NULL; EvaluateMatch(in, len, hash, best_pos, best_match); // If there are no decent matches if (best_match < 3) { in++; len--; continue; } // Try look ahead if match isn't great while (best_match < OKAY_MATCH && len > 3) { // Peek to see if we could get a better match int next_hash = ((in[1] << 16) | (in[2] << 8) | in[3]) % HASH_SIZE; if (hash_keys[next_hash] == EMPTY_KEY) break; int next_match = 0; uchar * next_pos = NULL; EvaluateMatch(in + 1, len - 1, next_hash, next_pos, next_match); // Didn't find a better match if (next_match <= best_match + 1) break; // Found a better match, so try again in++; len--; best_match = next_match; best_pos = next_pos; } int best_offset = in - best_pos - 1; // This is where we output stuff // Check if we have some literals to output first OutputLiterals(in2, in - in2, out, buffer_len, 
output); in2 = in += best_match; len -= best_match; if (best_match < 17 && best_offset < 0x1000) { *out = (uchar)(((best_match - 1) << 4) | (best_offset >> 8)); out++; *out = (uchar)(best_offset & 0xFF); out++; buffer_len -= 2; } else if (best_match < 66) { *out = (uchar)(16 | (best_offset >> 10)); out++; *out = (uchar)((best_offset >> 2) & 0xFF); out++; *out = (uchar)((best_offset << 6) | (best_match - 2)); out++; buffer_len -= 3; } else { *out = (uchar)(16 | (best_offset >> 10)); out++; *out = (uchar)((best_offset >> 2) & 0xFF); out++; *out = (uchar)(best_offset << 6); out++; best_match -= 66; *out = (uchar)(best_match >> 8); out++; *out = (uchar)(best_match & 0xFF); out++; buffer_len -= 5; } if (buffer_len <= 0) { fwrite(buffer, out - buffer, 1, output); buffer_len = BUFFER_SIZE; out = buffer; } } // Never seen this sequence before else { hash_keys[hash] = 0; for (int i = 1; i < HASH_DEPTH; i++) hash_values[hash * 8 + i] = NULL; hash_values[hash * 8] = in; in++; len--; } } // Check if we have some trailing literals to output in += len; OutputLiterals(in2, in - in2, out, buffer_len, output); // Flush output if (out != buffer) fwrite(buffer, out - buffer, 1, output); } void MiniDeflate::CiteLiteral(unsigned char * & out, int literal, unsigned char * & in, int & buffer_len, FILE * input) { while (buffer_len < literal) { literal -= buffer_len; while (buffer_len--) { *out = *in; in++; out++; } buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input); in = buffer + 5; } while (literal--) { *out = *in; in++; out++; buffer_len--; } } void MiniDeflate::Inflate(FILE * input, void * void_output, size_t len) { uchar * out = (uchar *) void_output; uchar * in = (uchar *) buffer + 5; int buffer_len = BUFFER_SIZE; buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input); while (len) { int match_len = *in >> 4; // Matching a literal if (match_len == 0) { match_len = *in & 0x0F; in++, buffer_len--; // If match_len == 0 then string is longer than 30 characters // Strings of 16 - 30 
characters are encoded as two short strings if (match_len == 0) { match_len = (in[0] << 8) + in[1] + 31; in += 2; buffer_len -= 2; } CiteLiteral(out, match_len, in, buffer_len, input); len -= match_len; } // Long match, 14 bit offset else if (match_len == 1) { int offset = (((in[0] & 0x0F) << 10) | (in[1] << 2) | (in[2] >> 6)) + 1; match_len = (in[2] & 0x3F) + 2; in += 3; buffer_len -= 3; if (match_len == 2) { match_len = ((in[0] << 8) | in[1]) + 66; in += 2; buffer_len -= 2; } uchar * match_pos = out - offset; len -= match_len; while (match_len--) { *out = *match_pos; out++, match_pos++; } } // Typical short match else { int offset = (((in[0] & 0x0F) << 8) | in[1]) + 1; in += 2; buffer_len -= 2; uchar * match_pos = out - offset; len -= ++match_len; while (match_len--) { *out = *match_pos; out++, match_pos++; } } if (buffer_len < 5) { uchar * in2 = (uchar *) buffer + 5 - buffer_len; while (in2 != buffer + 5) { *in2 = *in; in2++; in++; } in = buffer + 5 - buffer_len; buffer_len += fread(buffer + 5, 1, BUFFER_SIZE, input); } } if (buffer_len) fseek(input, -buffer_len, SEEK_CUR); } libStatGen-1.0.14/general/MiniDeflate.h000066400000000000000000000074431254730101300176040ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __MINIDEFLATE_H__ #define __MINIDEFLATE_H__ #include // MiniDeflate reads and writes files in a simple Deflate like format // A quick overview of this format follows, at the bottom of this file // // Performance tuning constants // // Hash table size is HASH_SIZE (a prime) #define HASH_SIZE 4093 // Hash table depth is HASH_DEPTH (a power of 2) #define HASH_DEPTH 8 // Matches that are not at least OKAY_MATCH chars are added to hash table #define OKAY_MATCH 32 // Buffer size for FILE I/O #define BUFFER_SIZE (32 * 1024) class MiniDeflate { public: MiniDeflate(); ~MiniDeflate(); void Deflate(FILE * output, void * input, size_t bytes); void Inflate(FILE * input, void * ouput, size_t bytes); private: unsigned char * buffer; unsigned char * hash_keys; unsigned char ** hash_values; // Inline functions used during file compression inline void EvaluateMatch(unsigned char * in, int len, int hash, unsigned char * & best_pos, int & best_match); inline void QuoteLiterals(unsigned char * & in, int literal, unsigned char * & out, int & buffer_len, FILE * output); inline void OutputLiterals(unsigned char * & in, int literal, unsigned char * & out, int & buffer_len, FILE * output); inline void CiteLiteral(unsigned char * & out, int literal, unsigned char * & in, int & buffer_len, FILE * input); }; // Format specification for deflate files // // A compressed file is a sequence of bytes {0 .. N}. // Each byte is a sequence of bits [0 .. 7] with 0 as the Most Significant Bit. 
// // The following tokens are recognized: // // Literal quotes -- refer to unique strings // // BYTE0 BYTE1 BYTE2 Description // 0 HI LO Quote of 31 bytes of more // Followed by (HI << 8 + LO + 31) quoted chars // 0:4|LEN Quote of up to 1-15 bytes // Followed by LEN quoted chars // // String matches -- refer to previous strings in the input stream // // BYTE0 BYTE1 BYTE2 BYTE3 BYTE4 Description // 1:4|OFF OFF1 OFF2:2|0 HI LO Long match of > 66 bytes // Offset of OFF|OFF1|OFF2 + 1 // Length of HI|LO + 66 // 1:4|OFF OFF1 OFF2:2|LEN Distant match of < 66 bytes // Offset of OFF|OFF1|OFF2 + 1 // Length of LEN + 2 // LEN|OFF OFF1 Nearby short match // Offset OFF|OFF1 + 1 // Length LEN // // NOTE: When partitioning bytes, I use the notation X:n|Y so that // X takes the n MSB bits of byte and Y takes the remaining bits. #endif libStatGen-1.0.14/general/NonOverlapRegions.cpp000066400000000000000000000162551254730101300213710ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ ////////////////////////////////////////////////////////////////////////// #include "NonOverlapRegions.h" #include NonOverlapRegions::NonOverlapRegions() : myRegions() { } NonOverlapRegions::~NonOverlapRegions() { myRegions.clear(); } void NonOverlapRegions::add(const char* chrom, int32_t start, int32_t end) { // Add the region. 
myRegions[chrom].add(start, end); } bool NonOverlapRegions::inRegion(const char* chrom, int32_t pos) { // Return whether or not the position was found within a region. // Note, this will create a NonOverlapRegion for this chrom if it // did not already exist, but it won't have any regions. return(myRegions[chrom].inRegion(pos)); } NonOverlapRegionPos::NonOverlapRegionPos() : myRegions() { myRegionIter = myRegions.begin(); myTmpIter = myRegions.begin(); } NonOverlapRegionPos::NonOverlapRegionPos(const NonOverlapRegionPos& reg) : myRegions() { myRegionIter = myRegions.begin(); myTmpIter = myRegions.begin(); } NonOverlapRegionPos::~NonOverlapRegionPos() { myRegionIter = myRegions.begin(); myTmpIter = myRegions.begin(); myRegions.clear(); } void NonOverlapRegionPos::add(int32_t start, int32_t end) { // Check to see if the start/end are valid in relation. if(start >= end) { std::cerr << "NonOverlapRegionPos::add: Invalid Range, " << "start must be < end, but " << start << " >= " << end << std::endl; return; } bool added = false; // Locate the correct position in the region list for this start/end. if(inRegion(start)) { // Check if the region end needs to be updated. if(end > myRegionIter->second) { myRegionIter->second = end; } added = true; } else { // Check to see if we are at the end. if(myRegionIter != myRegions.end()) { // Not at the end. // Check to see if the region overlaps the current region. if(end >= myRegionIter->first) { // Overlaps, so update the start. myRegionIter->first = start; // Check if the end needs to be updated. if(myRegionIter->second < end) { myRegionIter->second = end; } added = true; } } } // If we already added the record, check to see if the end of the // new region overlaps any additional regions (know that myRegionIter is // not at the end. if(added) { // Check to see if any other regions were overlapped by this record. 
myTmpIter = myRegionIter; ++myTmpIter; while(myTmpIter != myRegions.end()) { // If the region starts before the end of this one, consume it. if(myTmpIter->first <= end) { if(myTmpIter->second > myRegionIter->second) { // Update this region with the new end. myRegionIter->second = myTmpIter->second; } myTmpIter = myRegions.erase(myTmpIter); } else { // This region is not overlapped by the new region, so stop. break; } } } else { // Add the region. myRegionIter = myRegions.insert(myRegionIter, std::make_pair(start, end)); } } bool NonOverlapRegionPos::inRegion(int32_t pos) { // Return whether or not the position was found within a region. // If it is found within the region, myRegionIter will point to the region // otherwise myRegionIter will point to the region after the position // or to the end if the position is after the last region. // Determine if it needs to search to the left // a) it is at the end // b) the region starts after the position. if(myRegionIter == myRegions.end()) { // If the iterator is at the end, search to the left. return(findLeft(pos)); } else if(pos < myRegionIter->first) { // Not at the end, so search left if the position is less // than this region's start. return(findLeft(pos)); } else { return(findRight(pos)); } } bool NonOverlapRegionPos::findRight(int32_t pos) { // Keep looping until the end or until the position is found. while(myRegionIter != myRegions.end()) { // Check to see if we have passed the position. if(pos < myRegionIter->first) { // stop here, position comes before this region, // so myRegionIter is pointing to just after it, // but was not found in the region. return(false); } else if(pos < myRegionIter->second) { // the position is in the region, so return true. return(true); } else { // The position is after this region, so increment. ++myRegionIter; } } // exited because we are at the end of the regions and the position was // not found. 
return(false); } bool NonOverlapRegionPos::findLeft(int32_t pos) { if(myRegionIter == myRegions.end()) { if(myRegionIter == myRegions.begin()) { // There is nothing in this list, so just return. return(false); } // Move 1 lower than the end. --myRegionIter; } while(myRegionIter->first > pos) { // This region is before our position, so move to the previous region // unless this is the first region in the list. if(myRegionIter == myRegions.begin()) { // Did not find the position and the iter is at the element // just after the position. return(false); } // Not yet to the beginning of the list, so decrement. --myRegionIter; } // At this point, the regions iter points to a region that starts // before the position. // Determine if the position is in the region by checking if it is // less than the end of the region. if(pos < myRegionIter->second) { // in the region. return(true); } // This region ends before this position. The iterator needs to point // to the region after the position, so increment it. ++myRegionIter; return(false); } libStatGen-1.0.14/general/NonOverlapRegions.h000077500000000000000000000103521254730101300210310ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ ////////////////////////////////////////////////////////////////////////// #ifndef __NONOVERLAP_REGIONS_H__ #define __NONOVERLAP_REGIONS_H__ #include #include #include #include /// This class contains a list of non-overlapping regions, just positions, not /// including chromosomes (see NonOverlapRegions for chromosomes and positions). /// When regions are added that overlap, it merges them. After adding regions, /// you can check to see if a position is found in one of the regions. It is /// designed to work fastest if you make calls in sequential order. class NonOverlapRegionPos { public: friend class NonOverlapRegionsTest; NonOverlapRegionPos(); /// Copy constructor, does not copy, but initializes with an empty /// region list. NonOverlapRegionPos(const NonOverlapRegionPos& reg); ~NonOverlapRegionPos(); /// End position is not included in the region. /// If this region overlaps another region(s), they will be merged into /// one region. void add(int32_t start, int32_t end); /// Return whether or not the position was found within a region. /// If it is found within the region, myRegionIter will point to the region /// otherwise myRegionIter will point to the region after the position /// or to the end if the position is after the last region. bool inRegion(int32_t pos); private: // True if pos found in the region pointed to by myRegionIter or to // the right of myRegionIter. If the position is found in a region, // myRegionIter will point to the region containing the position. // If the position is not found in a region, myRegionIter will point // to the region after the position, or to the end if the position is // after the last region. bool findRight(int32_t pos); // True if pos found in the region pointed to by myRegionIter or to // the left of myRegionIter. If the position is found in a region, // myRegionIter will point to the region containing the position. 
// If the position is not found in a region, myRegionIter will point // to the region after the position, or to the end if the position is // after the last region. bool findLeft(int32_t pos); std::list< std::pair > myRegions; std::list< std::pair >::iterator myRegionIter; std::list< std::pair >::iterator myTmpIter; }; /// This class contains a list of non-overlapping regions. When regions are /// added that overlap, it merges them. After adding regions, you can check /// to see if a position is found in one of the regions. It is designed to /// work fastest if you make calls in sequential order. class NonOverlapRegions { public: friend class NonOverlapRegionsTest; NonOverlapRegions(); ~NonOverlapRegions(); /// End position is not included in the region. /// If this region overlaps another region(s), they will be merged into /// one region. void add(const char* chrom, int32_t start, int32_t end); /// Return whether or not the position was found within a region. /// If it is found within the region, myRegionIter will point to the region /// otherwise myRegionIter will point to the region after the position /// or to the end if the position is after the last region. bool inRegion(const char* chrom, int32_t pos); private: // Copy Constructor - unimplimented. NonOverlapRegions(const NonOverlapRegions& reg); std::map myRegions; }; #endif libStatGen-1.0.14/general/PackedVector.h000066400000000000000000000065641254730101300200000ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __PACKEDVECTOR_H #define __PACKEDVECTOR_H // STL: #include #include #include #include "Generic.h" // // This file implements a packed vector template based on the // getter/setter code used in MemoryMapArray.h // template < uint32_t accessorFunc(std::vector &base, uint32_t index), void setterFunc(std::vector &base, uint32_t index, uint32_t value), size_t elementCount2BytesFunc(uint32_t elementCount) > class PackedVector { protected: std::vector m_data; size_t m_elementCount; double m_growthRateMultiplier; double m_growthRateAdder; public: PackedVector() : m_elementCount(0), m_growthRateMultiplier(1.20), m_growthRateAdder(128) {;} // accessing inline uint32_t operator[](uint32_t i) { return accessorFunc(m_data, i); } inline void set(uint32_t i, uint32_t v) { setterFunc(m_data, i, v); } size_t getElementCount() const { return m_elementCount; } double getUtilization() { return elementCount2BytesFunc(m_elementCount) / (double) m_data.capacity(); } void reserve(uint32_t reserveElements) { m_data.reserve(elementCount2BytesFunc(reserveElements)); } size_t size() {return m_elementCount;} void resize(uint32_t newSize) { m_elementCount = newSize; m_data.resize(elementCount2BytesFunc(m_elementCount)); } // it's a bit of a challenge to optimize this... 
void push_back(uint32_t value) { m_elementCount++; if(elementCount2BytesFunc(m_elementCount) >= m_data.size()) { if( (elementCount2BytesFunc(m_elementCount)) > m_data.capacity()) { size_t newCapacity = (size_t) (m_data.capacity() * m_growthRateMultiplier); // for small capacities, small fractional multipliers don't work, // so we check and do a linear increase in those cases: if(newCapacity == m_data.capacity()) { newCapacity = (size_t) (m_data.capacity() + m_growthRateAdder); } m_data.reserve(newCapacity); } } m_data.resize(elementCount2BytesFunc(m_elementCount)); set(m_elementCount-1, value); } }; typedef PackedVector< PackedAccess_1Bit, PackedAssign_1Bit, Packed1BitElementCount2Bytes > PackedVectorBool_t; typedef PackedVector< PackedAccess_2Bit, PackedAssign_2Bit, Packed2BitElementCount2Bytes > PackedVector2Bit_t; typedef PackedVector< PackedAccess_4Bit, PackedAssign_4Bit, Packed4BitElementCount2Bytes > PackedVector4Bit_t; #endif libStatGen-1.0.14/general/Parameters.cpp000066400000000000000000000553711254730101300200640ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "Parameters.h" #include "Constant.h" #include "MathConstant.h" #include "Error.h" #include "PhoneHome.h" #include #include #include #include #include int Parameter::nameCol = 30; int Parameter::statusCol = 15; Parameter::Parameter(char c, const char * desc, void * v) { ch = (char) tolower(c); description = new char [strlen(desc) + 1]; strcpy(description, desc); var = v; warnings = NULL; myNoPhoneHome = true; myVersion.Clear(); } bool Parameter::Read(int , char ** argv, int argn) { int p = 0; char c = (char) tolower(argv[argn][p]); if ((c == '-') || (c == '/')) { p++; c = (char) tolower(argv[argn][p]); } if (c == ch) { Translate(&(argv[argn][++p])); return true; } return false; } bool Parameter::TranslateExtras(const char * , const char *) { return false; } void Parameter::warning(const char * format, ...) { String buffer; va_list ap; va_start(ap, format); buffer.vprintf(format, ap); va_end(ap); if (warnings == NULL) ::warning(buffer); else (*warnings) += buffer; } void IntParameter::Translate(const char * value) { *(int *) var = atoi(value); } bool IntParameter::TranslateExtras(const char * value, const char * extras) { if (value[0] != 0 || !CheckInteger(extras)) return false; Translate(extras); return true; } void IntParameter::Status() { fprintf(stderr, "%*s : %*d (-%c9999)\n", nameCol, description, statusCol, *(int *) var, ch); } void SwitchParameter::Translate(const char * value) { switch (*value) { case '+' : *(bool *) var = true; break; case '-' : *(bool *) var = false; break; case 0 : *(bool *) var = ! * (bool *) var; break; default : warning("Command line parameter -%c%s: the option '%c' has no meaning\n", ch, value, value[0]); } } void SwitchParameter::Status() { fprintf(stderr, "%*s : %*s (-%c[+|-])\n", nameCol, description, statusCol, *(bool *) var == false ? 
"OFF" : "ON", ch); } DoubleParameter::DoubleParameter(char c, const char * desc, double & v) : Parameter(c, desc, &v) { precision = 2; } void DoubleParameter::Translate(const char * value) { if (value[0]) *(double *) var = atof(value); else *(double *) var = _NAN_; } bool DoubleParameter::TranslateExtras(const char * value, const char * extras) { if (value[0] != 0 || !CheckDouble(extras)) return false; Translate(extras); return true; } void DoubleParameter::Status() { double absolute_value = fabs(* (double *) var); if (*(double *) var == _NAN_) fprintf(stderr, "%*s : %*s (-%c99.999)\n", nameCol, description, statusCol, "NAN", ch); else if (absolute_value >= 0.00095) fprintf(stderr, "%*s : % *.*f (-%c99.999)\n", nameCol, description, statusCol, precision, * (double *) var, ch); else if (absolute_value <= 1e-15) fprintf(stderr, "%*s : % *.0f (-%c99.999)\n", nameCol, description, statusCol, * (double *) var, ch); else fprintf(stderr, "%*s : %*.0e (-%c99.999)\n", nameCol, description, statusCol, *(double *) var, ch); } void StringParameter::Translate(const char * value) { String * s = (String *) var; *s = value; } bool StringParameter::TranslateExtras(const char * value, const char * extras) { if ((value[0] != 0) || ((!required) && (extras[0] == '-'))) return false; String * s = (String *) var; *s = extras; return true; } void StringParameter::Status() { fprintf(stderr, "%*s : %*s (-%cname)\n", nameCol, description, statusCol, (const char *)(*(String *) var), ch); } void ListParameter::Status() { OptionList * l; for (l = options; l->ch != 0; l++) if (l->code == *((int *)var)) break; fprintf(stderr, "%*s : %*s (-%c[%s])\n", nameCol, description, statusCol, l->description, ch, (const char *) key); } void ListParameter::Translate(const char * value) { OptionList * l; for (l = options; l->ch != 0; l++) if (tolower(l->ch) == tolower(value[0])) break; if (l->ch == 0 && tolower(value[0]) != 0) warning("Command line parameter -%c%s: the option '%c' has no meaning\n", ch, 
value, value[0], (const char *) key); *((int*) var) = l->code; } ListParameter::ListParameter(char c, const char * desc, int & v, OptionList * opt) : Parameter(c, desc, &v) { options = opt; for (OptionList * l = options; l->ch != 0; l++) { key += l->ch; key += '|'; } key.SetLength(key.Length() - 1); } SetParameter::SetParameter(char c, const char * desc, int & v, OptionList * opt) : Parameter(c, desc, &v) { options = opt; for (OptionList * l = options; l->ch != 0; l++) { key += l->ch; key += '|'; } key.SetLength(key.Length() - 1); } void SetParameter::Status() { bool first = 0; int temp = * (int *) var; for (OptionList * l = options; l->ch != 0; l++) if ((l->code & temp) || (l->code == *(int *) var)) { if (!first) fprintf(stderr, "%*s : %*s (-%c{%s})\n", nameCol, description, statusCol, l->description, ch, (const char *) key); else fprintf(stderr, "%*s & %*s\n", nameCol, "", statusCol, l->description); first = true; temp &= ~l->code; } } void SetParameter::Translate(const char * value) { *(int*)var = 0; for (const char * chr = value; *chr != 0; chr++) { int valid = false; for (OptionList * l = options; l->ch != 0; l++) if (tolower(l->ch) == tolower(*chr)) { *((int*) var) |= l->code; valid = true; } if (!valid) warning("Command line parameter -%c%s: the option '%c' has no meaning\n", ch, value, *chr); } } LongParameters::LongParameters(const char * desc, LongParameterList * lst) : Parameter('-', desc, NULL) { list = lst; index.Clear(); legacyIndex.Clear(); group_len = 0; LongParameterList * ptr = list + 1; while (ptr->description != NULL) { if (ptr->type == LP_LEGACY_PARAMETERS) break; if(ptr->type == LP_PHONEHOME_VERSION) { // Phone home is turned on, so add // the parameter for the user to turn it off. 
myNoPhoneHome = false; myVersion = ptr->description; ptr->description = "noPhoneHome"; ptr->value = &myNoPhoneHome; ptr->type = LP_BOOL_PARAMETER; index.Add(ptr->description, ptr); } else { if (ptr->value != NULL) index.Add(ptr->description, ptr); else group_len = max(strlen(ptr->description), group_len); } ptr++; } while (ptr->description != NULL) { if(ptr->type == LP_PHONEHOME_VERSION) { // Phone home is turned on, so add // the parameter for the user to turn it off. myNoPhoneHome = false; myVersion = ptr->description; ptr->description = "noPhoneHome"; ptr->value = &myNoPhoneHome; ptr->type = LP_BOOL_PARAMETER; legacyIndex.Add(ptr->description, ptr); } else { if (ptr->value != NULL) legacyIndex.Add(ptr->description, ptr); } ptr++; } precision = 2; } void LongParameters::ExplainAmbiguity(const char * cstr) { String value(cstr); int p = value.FastFindChar(':'); String stem = p == -1 ? value : value.Left(p); String matches; for (int i = 0; i < index.Length(); i++) if (index[i].SlowCompareToStem(stem) == 0) { if (matches.Length() + index[i].Length() > 50) { matches += " ..."; break; } matches.catprintf(" --%s", (const char *) index[i]); } warning("Ambiguous --%s matches%s\n", (const char *) value, (const char *) matches); } void LongParameters::Translate(const char * cstr) { String value(cstr); int p = value.FastFindChar(':'); int option = p == -1 ? index.FindStem(value) : index.FindStem(value.Left(p)); if (option == -2) { ExplainAmbiguity(cstr); return; } LongParameterList * ptr; if (option >= 0) ptr = (LongParameterList *) index.Object(option); else { int alternate = p == -1 ? 
legacyIndex.FindFirstStem(value) : legacyIndex.FindFirstStem(value.Left(p)); if (alternate < 0) { warning("Command line parameter --%s is undefined\n", (const char *) value); return; } ptr = (LongParameterList *) legacyIndex.Object(alternate); ptr->touched = true; } ptr->touched = true; if (ptr->type == LP_BOOL_PARAMETER) { if (p == -1) * (bool *) ptr->value ^= true; else *(bool *) ptr->value = value.SubStr(p + 1).SlowCompare("ON") == 0; // In exclusive groups, only one option may be selected if (ptr->exclusive) { for (int i = -1; ptr[i].exclusive; i--) *(bool *)ptr[i].value = false; for (int i = 1; ptr[i].exclusive; i++) *(bool *)ptr[i].value = false; } } else if (ptr->type == LP_INT_PARAMETER) if (p == -1) * (int *) ptr->value = * (int *) ptr->value ? 0 : 1; else *(int *) ptr->value = value.SubStr(p + 1).SlowCompare("ON") == 0 ? 1 : value.SubStr(p + 1).AsInteger(); else if (ptr->type == LP_DOUBLE_PARAMETER) { if (p != -1) * (double *) ptr->value = value.SubStr(p + 1).AsDouble(); } else if (ptr->type == LP_STRING_PARAMETER) { if (p != -1) * (String *) ptr->value = value.SubStr(p + 1); } } bool LongParameters::TranslateExtras(const char * cstr, const char * extras) { if (strchr(cstr, ':') != NULL) return false; int option = index.FindStem(cstr); if (option == -2) { // No need to explain ambiguity here ... 
will be handle by later call // to Translate() // ExplainAmbiguity(cstr); return false; } LongParameterList * ptr; if (option >= 0) ptr = (LongParameterList *) index.Object(option); else { option = legacyIndex.FindFirstStem(cstr); if (option < 0) return false; ptr = (LongParameterList *) legacyIndex.Object(option); ptr->touched = true; } if (ptr->type == LP_INT_PARAMETER && CheckInteger(extras)) { *(int *) ptr->value = atoi(extras); ptr->touched = true; return true; } else if (ptr->type == LP_DOUBLE_PARAMETER && CheckDouble(extras)) { *(double *) ptr->value = atof(extras); ptr->touched = true; return true; } else if (ptr->type == LP_STRING_PARAMETER) { *(String *) ptr->value = extras; ptr->touched = true; return true; } return false; } void LongParameters::Status(LongParameterList * ptr, int & line_len, bool & need_a_comma) { String state; int line_start = group_len ? group_len + 5 : 0; if (ptr->value == NULL) { fprintf(stderr, "%s %*s :", need_a_comma ? "\n" : "", group_len + 2, ptr->description); need_a_comma = false; line_len = line_start; } else { if (ptr->type == LP_BOOL_PARAMETER) state = * (bool *) ptr->value ? " [ON]" : ""; else if (ptr->type == LP_INT_PARAMETER) if (((* (int *) ptr->value == 1) && (ptr->exclusive)) || (* (int *) ptr->value == 0)) state = * (int *) ptr->value ? " [ON]" : ""; else state = " [", state += * (int *) ptr->value, state += ']'; else if (ptr->type == LP_DOUBLE_PARAMETER) if (* (double *) ptr->value != _NAN_) { double value = * (double *) ptr->value; state = " ["; if (value == 0.0 || value >= 0.01) state.catprintf("%.*f", precision, value); else state.catprintf("%.1e", value); state += ']'; } else state = ""; else if (ptr->type == LP_STRING_PARAMETER) state = " [" + * (String *) ptr->value + "]"; int item_len = 3 + strlen(ptr->description) + need_a_comma + state.Length(); if (item_len + line_len > 78 && line_len > line_start) { line_len = line_start; fprintf(stderr, "%s\n%*s", need_a_comma ? 
"," : "", line_len, ""); need_a_comma = 0; item_len -= 1; } fprintf(stderr, "%s --%s%s", need_a_comma ? "," : (need_a_comma = true, ""), ptr->description, (const char *) state); need_a_comma = true; line_len += item_len; } } void LongParameters::Status() { if (description != NULL && description[0] != 0) fprintf(stderr, "\n%s\n", description); bool need_a_comma = false; int line_len = 0; bool legacy_parameters = false; bool legacy_count = 0; for (LongParameterList * ptr = list + 1; ptr->description != NULL; ptr++) if (ptr->type == LP_LEGACY_PARAMETERS) legacy_parameters = true; else if (legacy_parameters == false) Status(ptr, line_len, need_a_comma); else if (ptr->touched) { if (legacy_count == 0) { fprintf(stderr, "\n\nAdditional Options:\n %*s ", group_len + 3, ""); line_len = group_len + 5; need_a_comma = false; } Status(ptr, line_len, need_a_comma); legacy_count++; } fprintf(stderr, "\n"); } void LongParameters::addParamsToString(String& params) { for (LongParameterList * ptr = list + 1; ptr->description != NULL; ptr++) { if (ptr->touched) { if(!params.IsEmpty()) { params += PARAM_STR_SEP; } params += ptr->description; } } } void ParameterList::Add(Parameter * p) { if (count + 1 >= size) error("Parameter list size should be increased"); p->SetWarningBuffer(warnings); pl[count++] = p; }; void ParameterList::Read(int argc, char ** argv, int start) { MakeString(argc, argv, start); for (int i=start; i < argc; i++) { bool success = false; if (argv[i][0] == '-' && argv[i][1]) for (int j=0; jch; if (success) { if ((i+1 < argc) && pl[j]->TranslateExtras(argv[i]+2, argv[i+1])) i++; else if (argv[i][2] == 0 && (i+1 < argc) && (argv[i + 1][0] != '-')) pl[j]->Translate(argv[++i]); else pl[j]->Translate(argv[i] + 2); break; } } if (!success) { String warning; warning.printf("Command line parameter %s (#%d) ignored\n", argv[i], i); warnings += warning; } } if (warnings.Length()) { ::warning("Problems encountered parsing command line:\n\n%s", (const char *) warnings); 
warnings.Clear(); } HandlePhoneHome(argc, argv, start); } int ParameterList::ReadWithTrailer(int argc, char ** argv, int start) { MakeString(argc, argv, start); int last_success = start - 1; bool split = false; for (int i=start; i < argc; i++) { bool success = false; if (argv[i][0] == '-' && argv[i][1]) for (int j=0; jch; if (success) { if ((i+1 < argc) && pl[j]->TranslateExtras(argv[i]+2, argv[i+1])) split = true; else if (argv[i][2] == 0 && (i+1 < argc) && (argv[i + 1][0] != '-')) pl[j]->Translate(argv[i + 1]), split = true; else pl[j]->Translate(argv[i] + 2); break; } } if (success) for (last_success++; last_success < i; last_success++) warnings.printf("Command line parameter %s (#%d) ignored\n", argv[last_success], last_success); if (split) { split = false; last_success++; i++; } } if (warnings.Length()) { ::warning("Problems encountered parsing command line:\n\n%s", (const char *) warnings); warnings.Clear(); } HandlePhoneHome(argc, argv, start); return last_success; }; void ParameterList::Status() { for (int i=0; iStatus(); fprintf(stderr, "\n"); if (messages.Length()) fprintf(stderr, "NOTES:\n%s\n", (const char *) messages); } void ParameterList::MakeString(int argc, char ** argv, int start) { int len = 0; for (int i=start; iaddParamsToString(params); // Check if phonehome is enabled. if(!pl[i]->myVersion.IsEmpty() && (!pl[i]->myNoPhoneHome)) { // Version specified & phoneHome enabled, so // phonehome. 
version = pl[i]->myVersion; } } if(!version.IsEmpty()) { PhoneHome::checkVersion(programName.c_str(), version.c_str(), params.c_str()); } } ParameterList::~ParameterList() { for (int i = 0; i < count; i++) delete pl[i]; delete [] pl; delete [] string; }; bool Parameter::CheckInteger(const char * value) { if (value[0] != '+' && value[0] != '-' && (value[0] < '0' || value[0] > '9')) return false; int pos = 1; while (value[pos] != 0) if (value[pos] < '0' || value[pos] > '9') return false; else pos++; return true; } bool Parameter::CheckDouble(const char * value) { if (value[0] != '+' && value[0] != '-' && value[0] != '.' && (value[0] < '0' || value[0] > '9')) { return false; } bool decimal = value[0] == '.'; for (int pos = 1; value[pos] != 0; pos++) { if (value[pos] < '0' || value[pos] > '9') { if (!decimal && value[pos] == '.') { decimal = true; } else if (value[pos] == 'e' || value[pos] == 'E') { return CheckInteger(value + pos + 1); } } } return true; } void ParameterList::Enforce(bool & var, bool value, const char * format, ...) { if (var == value) return; var = value; String buffer; va_list ap; va_start(ap, format); buffer.vprintf(format, ap); va_end(ap); messages += buffer; } void ParameterList::Enforce(int & var, int value, const char * format, ...) { if (var == value) return; var = value; String buffer; va_list ap; va_start(ap, format); buffer.vprintf(format, ap); va_end(ap); messages += buffer; } void ParameterList::Enforce(double & var, double value, const char * format, ...) { if (var == value) return; var = value; String buffer; va_list ap; va_start(ap, format); buffer.vprintf(format, ap); va_end(ap); messages += buffer; } void ParameterList::Enforce(String & var, const char * value, const char * format, ...) 
{ if (var.SlowCompare(value) == 0) return; var = value; String buffer; va_list ap; va_start(ap, format); buffer.vprintf(format, ap); va_end(ap); messages += buffer; } LongParamContainer::LongParamContainer() : myEndIndex(0) { // Add the first (also adds ending) indicators. add(NULL, NULL, false, 0, 0); } LongParamContainer::~LongParamContainer() { } void LongParamContainer::add(const char * label, void * val, bool excl, int paramType, bool touch) { if(myEndIndex+1 < MAX_PARAM_ARRAY_SIZE) { // Overwrite the previous end record. myArray[myEndIndex].description = label; myArray[myEndIndex].value = val; myArray[myEndIndex].exclusive = excl; myArray[myEndIndex].type = paramType; myArray[myEndIndex].touched = touch; ++myEndIndex; // Add a new empty entry to the end. myArray[myEndIndex].description = NULL; myArray[myEndIndex].value = NULL; myArray[myEndIndex].exclusive = false; myArray[myEndIndex].type = 0; myArray[myEndIndex].touched = 0; } else { throw std::runtime_error("Tool Error: trying to add more parameters than allowed in LongParamContainer.\n"); } } libStatGen-1.0.14/general/Parameters.h000066400000000000000000000240531254730101300175220ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __PARAMETERS_H__ #define __PARAMETERS_H__ #include "StringMap.h" #include "PhoneHome.h" #include #include class ParameterList; class Parameter { protected: static const char PARAM_STR_SEP = ','; char ch; char * description; void * var; static int nameCol; static int statusCol; virtual void Translate(const char * value) = 0; virtual bool TranslateExtras(const char * value, const char * extras); static bool CheckInteger(const char * value); static bool CheckDouble(const char * value); String * warnings; bool myNoPhoneHome; String myVersion; public: Parameter(char c, const char * desc, void * v); virtual ~Parameter() { delete [] description; } virtual bool Read(int argc, char ** argv, int argn); virtual void Status() = 0; virtual void addParamsToString(String& params) { if(var != NULL) { if(!params.IsEmpty()) { params += PARAM_STR_SEP; } params += description; } } static void SetNameLen(int len) { nameCol = len; } static void SetStatusLen(int len) { statusCol = len; } void SetWarningBuffer(String & buffer) { warnings = &buffer; } void warning(const char * format, ...); friend class ParameterList; }; class IntParameter : public Parameter { public: IntParameter(char c, const char * desc, int & v) : Parameter(c, desc, &v) {} virtual void Status(); protected: virtual void Translate(const char * value); virtual bool TranslateExtras(const char * value, const char * extras); }; class HiddenInteger : public IntParameter { public: HiddenInteger(char c, const char * desc, int & v) : IntParameter(c, desc, v) {} virtual void Status() { } }; class SwitchParameter : public Parameter { public: SwitchParameter(char c, const char * desc, bool & v) : Parameter(c, desc, &v) {} virtual void Status(); protected: virtual void Translate(const char * value); }; class HiddenSwitch : public SwitchParameter { public: HiddenSwitch(char c, const char * desc, bool & v) : SwitchParameter(c, desc, v) {} virtual void Status() { } }; class DoubleParameter : public Parameter { public: 
DoubleParameter(char c, const char * desc, double & v); virtual void Status(); DoubleParameter & SetPrecision(int precision) { this->precision = precision; return *this; } protected: virtual void Translate(const char * value); virtual bool TranslateExtras(const char * value, const char * extras); int precision; }; class HiddenDouble : public DoubleParameter { public: HiddenDouble(char c, const char * desc, double &v) : DoubleParameter(c, desc, v) {} virtual void Status() { } }; class StringParameter : public Parameter { public: StringParameter(char c, const char * desc, String & v, bool allowBlank = true) : Parameter(c, desc, &v) { required = !allowBlank; } virtual void Status(); protected: bool required; virtual void Translate(const char * value); virtual bool TranslateExtras(const char * value, const char * extras); }; class HiddenString : public StringParameter { public: HiddenString(char c, const char * desc, String & v) : StringParameter(c, desc, v) {} virtual void Status() { } }; struct OptionList { char ch; char * description; int code; }; #define BEGIN_OPTION_LIST(name) ; OptionList name[] = { #define END_OPTION_LIST(none) , {0, none, 0} }; class ListParameter : public Parameter { public: ListParameter(char c, const char * desc, int & v, OptionList * opt); virtual void Status(); protected: String key; OptionList * options; virtual void Translate(const char * value); }; class SetParameter : public Parameter { public: SetParameter(char c, const char * desc, int & v, OptionList * opt); virtual void Status(); protected: String key; OptionList * options; virtual void Translate(const char * value); }; struct LongParameterList { const char * description; void * value; bool exclusive; int type; bool touched; }; #define LP_BOOL_PARAMETER 1 #define LP_INT_PARAMETER 2 #define LP_DOUBLE_PARAMETER 3 #define LP_STRING_PARAMETER 4 #define LP_LEGACY_PARAMETERS 99 #define LP_PHONEHOME_VERSION 98 #define BEGIN_LONG_PARAMETERS(array) LongParameterList array[] = {\ { NULL, 
NULL, false, 0, 0}, #define LONG_PARAMETER_GROUP(label) { label, NULL, false, 0, 0}, #define LONG_PARAMETER(label,boolptr) { label, boolptr, false, 1, 0}, #define EXCLUSIVE_PARAMETER(label,boolptr) { label, boolptr, true, 1, 0}, #define LONG_INTPARAMETER(label,intptr) { label, intptr, false, 2, 0}, #define LONG_SMARTINTPARAMETER(label,intptr) { label, intptr, true, 2, 0}, #define LONG_DOUBLEPARAMETER(label,doubleptr) { label, doubleptr, false, 3, 0}, #define LONG_STRINGPARAMETER(label,stringptr) { label, stringptr, false, 4, 0}, #define LONG_PHONEHOME(version) { "PhoneHome", NULL, false, 0, 0}, { version, NULL, false, LP_PHONEHOME_VERSION, 0}, {"phoneHomeThinning", &PhoneHome::allThinning, false, LP_INT_PARAMETER, 0}, #define BEGIN_LEGACY_PARAMETERS() { "$$$", NULL, false, 99, 0}, #define END_LONG_PARAMETERS() { NULL, NULL, false, 0, 0}}; class LongParameters : public Parameter { public: LongParameters(const char * desc, LongParameterList * list); virtual void Status(); virtual void addParamsToString(String& params); LongParameters * SetPrecision(int precision) { this->precision = precision; return this; } protected: StringMap index; StringMap legacyIndex; LongParameterList * list; int group_len; int precision; virtual void Translate(const char * value); virtual bool TranslateExtras(const char * value, const char * extras); void ExplainAmbiguity(const char * value); void Status(LongParameterList * ptr, int & line_len, bool & need_a_comma); }; class ParameterList { protected: Parameter ** pl; int count; int size; void MakeString(int argc, char ** argv, int start = 1); void HandlePhoneHome(int argc, char ** argv, int start); public: char * string; ParameterList(int s = 36) { size = s; count = 0; pl = new Parameter * [size]; string = NULL; } virtual ~ParameterList(); void Add(Parameter * p); // Tries to process all command line arguments virtual void Read(int argc, char ** argv, int start = 1); // Allows for trailing, unprocessed, filenames in the command line // The 
number of translated argv[] items is returned virtual int ReadWithTrailer(int argc, char ** argv, int start = 1); // Outputs summary of parameter switches and settings virtual void Status(); // Keeps track of warnings generated during parameter processing String warnings; String messages; // Functions that gracefully enforce parameter settings void Enforce(bool & var, bool value, const char * reason, ...); void Enforce(int & var, int value, const char * reason, ...); void Enforce(double & var, double value, const char * reason, ...); void Enforce(String & var, const char * value, const char * reason, ...); }; // Container for holding the long parameter list. // Allows parameters to be added. // Allows users to not have to use BEGIN_LONG_PARAMETERS or to understand // the details of a LongParameterList. class LongParamContainer { public: LongParamContainer(); ~LongParamContainer(); // Get a pointer to the LongParameterList. inline LongParameterList* getLongParameterList() { return(myArray); } void add(const char * label, void * val, bool excl, int paramType, bool touch = 0); inline void addGroup(const char * label) { add(label, NULL, false, 0, 0); } inline void addBool(const char * label, void * boolptr) { add(label, boolptr, false, LP_BOOL_PARAMETER, 0); } inline void addExclusiveBool(const char * label, void * boolptr) { add(label, boolptr, true, LP_BOOL_PARAMETER, 0); } inline void addInt(const char * label, void * intptr) { add(label, intptr, false, LP_INT_PARAMETER, 0); } inline void addSmartInt(const char * label, void * intptr) { add(label, intptr, true, LP_INT_PARAMETER, 0); } inline void addDouble(const char * label, void * doubleptr) { add(label, doubleptr, false, LP_DOUBLE_PARAMETER, 0); } inline void addString(const char * label, void * stringptr) { add(label, stringptr, false, LP_STRING_PARAMETER, 0); } inline void addPhoneHome(const char* version) { add("PhoneHome", NULL, false, 0, 0); add(version, NULL, false, LP_PHONEHOME_VERSION, 0); 
add("phoneHomeThinning", &PhoneHome::allThinning, false, LP_INT_PARAMETER, 0); } inline void startLegacyParams() { add("$$$", NULL, false, 99, 0); } private: // At most 100 parameters are allowed. static const int MAX_PARAM_ARRAY_SIZE = 100; LongParameterList myArray[MAX_PARAM_ARRAY_SIZE]; int myEndIndex; }; #endif libStatGen-1.0.14/general/Pedigree.cpp000066400000000000000000001006471254730101300175020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Pedigree.h" #include "GenotypeLists.h" #include "MemoryInfo.h" #include "Constant.h" #include "Error.h" #include "Sort.h" #include bool Pedigree::sexAsCovariate = false; String Pedigree::missing("-99.999"); Pedigree::Pedigree() : pd() { haveTwins = count = 0; size = 10000; persons = new Person *[size]; familyCount = 0; families = new Family * [1]; multiPd = NULL; multiFileCount = 0; } Pedigree::~Pedigree() { for (int i = 0; i < count; i++) delete persons[i]; for (int i = 0; i < familyCount; i++) delete families[i]; delete [] families; delete [] persons; if (multiPd != NULL) delete [] multiPd; } void Pedigree::Sort() { QuickSort(persons, count, sizeof(Person *), COMPAREFUNC Pedigree::ComparePersons); haveTwins = 0; // Check for structural problems in input pedigree bool problem = false; // Check that we have no duplicates... 
for (int i = 1; i < count; i++) if (ComparePersons((const Person **) &persons[i-1], (const Person **) &persons[i]) == 0) { printf("Family %s: Person %s is duplicated\n", (const char *) persons[i]->famid, (const char *) persons[i]->pid); problem = true; do { i++; } while (i < count && ComparePersons((const Person **) &persons[i-1], (const Person **) &persons[i]) == 0); } // Assign parents... for (int i = 0; i < count; i++) { persons[i]->serial = i; persons[i]->father = FindPerson(persons[i]->famid, persons[i]->fatid); persons[i]->mother = FindPerson(persons[i]->famid, persons[i]->motid); problem |= !persons[i]->CheckParents(); persons[i]->AssessStatus(); // Check if we have any twins... haveTwins |= persons[i]->zygosity; } if (problem) error("Please correct problems with pedigree structure\n"); MakeSibships(); MakeFamilies(); } void Pedigree::MakeSibships() { Person ** sibs = new Person * [count]; for (int i = 0; i < count; i++) sibs[i] = persons[i]; QuickSort(sibs, count, sizeof(Person *), COMPAREFUNC Pedigree::CompareParents); for (int first = 0; first < count; first++) if (!sibs[first]->isFounder()) { int last = first + 1; while (last < count) if (sibs[first]-> mother != sibs[last]->mother || sibs[first]-> father != sibs[last]->father) break; else last++; last --; for (int j = first; j <= last; j++) { if (sibs[j]->sibCount) delete [] sibs[j]->sibs; sibs[j]->sibCount = last - first + 1; sibs[j]->sibs = new Person * [sibs[j]->sibCount]; for (int k = first; k <= last; k++) sibs[j]->sibs[k - first] = sibs[k]; } first = last; } delete [] sibs; } void Pedigree::MakeFamilies() { for (int i = 0; i < familyCount; i++) delete families[i]; delete [] families; familyCount = 0; families = new Family * [count]; for (int first=0; first < count; first++) { int last = first; while (last < count) if (SlowCompare(persons[first]->famid, persons[last]->famid) == 0) last++; else break; families[familyCount] = new Family(*this, first, --last, familyCount); first = last; familyCount++; 
} } // Utility functions for finding a person in a pedigree struct PedigreeKey { const char * famid; const char * pid; }; int CompareKeyToPerson(PedigreeKey * key, Person ** p) { int result = SlowCompare(key->famid, (**p).famid); if (result != 0) return result; return SlowCompare(key->pid, (**p).pid); } int CompareKeyToFamily(PedigreeKey * key, Family ** f) { return SlowCompare(key->famid, (**f).famid); } Person * Pedigree::FindPerson(const char * famid, const char * pid) { PedigreeKey key; key.famid = famid; key.pid = pid; Person ** result = (Person **) BinarySearch (&key, persons, count, sizeof(Person *), COMPAREFUNC CompareKeyToPerson); return (result == NULL) ? (Person *) NULL : *result; } Person * Pedigree::FindPerson(const char *famid, const char *pid, int universe) { PedigreeKey key; key.famid = famid; key.pid = pid; Person ** result = (Person **) BinarySearch (&key, persons, universe, sizeof(Person *), COMPAREFUNC CompareKeyToPerson); return (result == NULL) ? (Person *) NULL : *result; } Family * Pedigree::FindFamily(const char * famid) { PedigreeKey key; key.famid = famid; Family ** result = (Family **) BinarySearch (&key, families, familyCount, sizeof(Family *), COMPAREFUNC CompareKeyToFamily); return (result == NULL) ? 
(Family *) NULL : *result; } int Pedigree::CountAlleles(int marker) { return ::CountAlleles(*this, marker); } void Pedigree::LumpAlleles(double min, bool reorder) { if (min > 0.0) printf("Lumping alleles with frequencies of %.2f or less...\n\n", min); for (int m=0; m < markerCount; m++) ::LumpAlleles(*this, m, min, reorder); } void Pedigree::EstimateFrequencies(int estimator, bool quiet) { bool estimated = false; int line = 3; const char * estimators[] = { "using all genotypes", "using founder genotypes", "assumed equal" }; bool condensed = markerCount > 100; int grain = markerCount / 50, estimates = 0; for (int m=0; m < markerCount; m++) if (::EstimateFrequencies(*this, m, estimator)) if (!quiet) { if (!estimated) printf("Estimating allele frequencies... [%s]\n ", estimators[estimator]), estimated = true; if (condensed) { if (estimates++ % grain == 0) { printf("."); fflush(stdout); } continue; } if (line + markerNames[m].Length() + 1 > 79) printf("\n "), line = 3; printf("%s ", (const char *) markerNames[m]); line += markerNames[m].Length() + 1; } if (estimated) printf(condensed ? 
"\nDone estimating frequencies for %d markers\n\n" : "\n\n", estimates); } int Pedigree::ComparePersons(const Person ** p1, const Person ** p2) { int result = SlowCompare((**p1).famid, (**p2).famid); if (result != 0) return result; return SlowCompare((**p1).pid, (**p2).pid); } int Pedigree::CompareParents(const Person ** p1, const Person ** p2) { int result = SlowCompare((**p1).famid, (**p2).famid); if (result) return result; result = SlowCompare((**p1).fatid, (**p2).fatid); if (result) return result; return SlowCompare((**p1).motid, (**p2).motid); } void Pedigree::Grow() { size *= 2; Person ** temp = new Person * [size]; if (temp == NULL) error("Out of memory"); for (int i=0; iCopy(rhs); } void Pedigree::WriteDataFile(FILE * output) { // write in the following order: // markers, traits, affections, covariates if (haveTwins) fprintf(output, " Z Zygosity\n"); for (int m = 0; m < markerCount; m++) fprintf(output, " M %s\n", (const char *) markerNames[m]); for (int t = 0; t < traitCount; t++) fprintf(output, " T %s\n", (const char *) traitNames[t]); for (int a = 0; a < affectionCount; a++) fprintf(output, " A %s\n", (const char *) affectionNames[a]); for (int c = 0; c < covariateCount; c++) fprintf(output, " C %s\n", (const char *) covariateNames[c]); for (int s = 0; s < stringCount; s++) fprintf(output, " $ %s\n", (const char *) stringNames[s]); fprintf(output, " E END-OF-DATA \n"); } void Pedigree::WritePedigreeFile(FILE * output) { MarkerInfo ** info = new MarkerInfo * [markerCount]; for (int i = 0; i < markerCount; i++) info[i] = GetMarkerInfo(i); for (int i = 0; i < count; i++) WriteRecodedPerson(output, i, info); fprintf(output, "end\n"); delete [] info; } void Pedigree::WritePerson(FILE * output, int person, const char * famid, const char * pid, const char * fatid, const char * motid) { WriteRecodedPerson(output, person, NULL, famid, pid, fatid, motid); } void Pedigree::WriteRecodedPerson( FILE * output, int person, MarkerInfo ** markerInfo, const char * famid, 
const char * pid, const char * fatid, const char * motid) { Person * p = persons[person]; if (famid == NULL) famid = p->famid; if (pid == NULL) pid = p->pid; if (fatid == NULL) fatid = p->fatid; if (motid == NULL) motid = p->motid; // write in the following order: // markers, traits, affections, covariates fprintf(output, "%s\t%s\t%s\t%s\t%d\t", famid, pid, fatid, motid, p->sex); const char * twinCodes[] = {"0", "MZ", "DZ"}; if (haveTwins) { if (p->zygosity <= 2) fprintf(output, "%s\t", twinCodes[p->zygosity]); else fprintf(output, "%d\t", p->zygosity); } for (int m = 0; m < markerCount; m++) if (markerInfo == NULL) fprintf(output, markerCount < 20 ? "%3d/%3d\t" : "%d/%d\t", p->markers[m][0], p->markers[m][1]); else fprintf(output, markerCount < 20 ? "%3s/%3s\t" : "%s/%s\t", (const char *) markerInfo[m]->GetAlleleLabel(p->markers[m][0]), (const char *) markerInfo[m]->GetAlleleLabel(p->markers[m][1])); for (int t = 0; t < traitCount; t++) if (p->isPhenotyped(t)) fprintf(output, "%.3f\t", p->traits[t]); else fprintf(output, "x\t"); for (int a = 0; a < affectionCount; a++) if (p->isDiagnosed(a)) fprintf(output, "%d\t", p->affections[a]); else fprintf(output, "x\t"); for (int c = 0; c < covariateCount; c++) if (p->isControlled(c)) fprintf(output, "%.3f\t", p->covariates[c]); else fprintf(output, "x\t"); for (int s = 0; s < stringCount; s++) if (!p->strings[s].IsEmpty()) fprintf(output, "%s\t", (const char *) p->strings[s]); else fprintf(output, ".\t"); fprintf(output, "\n"); } void Pedigree::WriteDataFile(const char * output) { FILE * f = fopen(output, "wt"); if (f == NULL) error("Couldn't open data file %s", output); WriteDataFile(f); fclose(f); } void Pedigree::WritePedigreeFile(const char * output) { FILE * f = fopen(output, "wt"); if (f == NULL) error("Couldn't open pedigree file %s", output); WritePedigreeFile(f); fclose(f); } void Pedigree::PrepareDichotomization() { for (int t = 0; t < traitCount; t++) { String new_affection = traitNames[t] + "*"; 
GetAffectionID(new_affection); } } int Pedigree::Dichotomize(int t, double mean) { String new_affection = traitNames[t] + "*"; int af = GetAffectionID(new_affection); if (mean == _NAN_) { mean = 0.0; double dcount = 0; for (int i = 0; i < count; i++) if (persons[i]->isPhenotyped(t) && !persons[i]->isFounder()) { mean += persons[i]->traits[t]; dcount ++; } if (!dcount) return af; mean /= dcount; } printf("Dichotomizing %s around mean of %.3f ...\n", (const char *) traitNames[t], mean); for (int i = 0; i < count; i++) if (persons[i]->isPhenotyped(t) && !persons[i]->isFounder()) persons[i]->affections[af] = persons[i]->traits[t] > mean ? 2 : 1; else persons[i]->affections[af] = 0; Sort(); return af; } void Pedigree::DichotomizeAll(double mean) { for (int t = 0; t < traitCount; t++) Dichotomize(t, mean); } bool Pedigree::InheritanceCheck(bool abortIfInconsistent) { bool fail = false; if (haveTwins) fail |= TwinCheck(); if (chromosomeX) fail |= SexLinkedCheck(); else fail |= AutosomalCheck(); if (fail && abortIfInconsistent) error("Mendelian inheritance errors detected\n"); return !fail; } bool Pedigree::AutosomalCheck() { // Arrays indicating which alleles and homozygotes occur IntArray haplos, genos, counts, failedFamilies; bool fail = false; // For each marker ... 
for (int m = 0; m < markerCount; m++) { MarkerInfo * info = GetMarkerInfo(m); // Summary for marker int alleleCount = CountAlleles(m); int genoCount = alleleCount * (alleleCount + 1) / 2; // Initialize arrays haplos.Dimension(alleleCount + 1); haplos.Set(-1); genos.Dimension(genoCount + 1); genos.Set(-1); failedFamilies.Dimension(familyCount); failedFamilies.Zero(); counts.Dimension(alleleCount + 1); for (int f = 0; f < familyCount; f++) for (int i = families[f]->first; i <= families[f]->last; i++) if (!persons[i]->isFounder() && persons[i]->sibs[0] == persons[i]) { // This loop runs once per sibship Alleles fat = persons[i]->father->markers[m]; Alleles mot = persons[i]->mother->markers[m]; bool fgeno = fat.isKnown(); bool mgeno = mot.isKnown(); // Number of alleles, homozygotes and genotypes in this sibship int haplo = 0, homo = 0, diplo = 0; // No. of different genotypes per allele counts.Zero(); // In general, there should be no more than 3 genotypes per allele bool too_many_genos = false; for (int j = 0; j < persons[i]->sibCount; j++) if (persons[i]->sibs[j]->isGenotyped(m)) { Alleles geno = persons[i]->sibs[j]->markers[m]; int fat1 = fat.hasAllele(geno.one); int fat2 = fat.hasAllele(geno.two); int mot1 = mot.hasAllele(geno.one); int mot2 = mot.hasAllele(geno.two); if ((fgeno && mgeno && !((fat1 && mot2) || (fat2 && mot1))) || (fgeno && !(fat1 || fat2)) || (mgeno && !(mot1 || mot2))) { printf("%s - Fam %s: Child %s [%s/%s] has ", (const char *) markerNames[m], (const char *) persons[i]->sibs[j]->famid, (const char *) persons[i]->sibs[j]->pid, (const char *) info->GetAlleleLabel(geno.one), (const char *) info->GetAlleleLabel(geno.two)); if (!fgeno || !mgeno) printf("%s [%s/%s]\n", fgeno ? "father" : "mother", (const char *) info->GetAlleleLabel(fgeno ? fat.one : mot.one), (const char *) info->GetAlleleLabel(fgeno ? 
fat.two : mot.two)); else printf("parents [%s/%s]*[%s/%s]\n", (const char *) info->GetAlleleLabel(fat.one), (const char *) info->GetAlleleLabel(fat.two), (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); fail = true; failedFamilies[f] = true; } else { if (haplos[geno.one] != i) { haplo++; haplos[geno.one] = i; }; if (haplos[geno.two] != i) { haplo++; haplos[geno.two] = i; }; int index = geno.SequenceCoded(); if (genos[index] != i) { genos[index] = i; diplo++; counts[geno.one]++; if (geno.isHomozygous()) homo++; else counts[geno.two]++; if (counts[geno.one] > 2) too_many_genos = true; if (counts[geno.two] > 2) too_many_genos = true; } } } if (fgeno) { if (haplos[fat.one] != i) { haplo++; haplos[fat.one] = i; } if (haplos[fat.two] != i) { haplo++; haplos[fat.two] = i; } homo += fat.isHomozygous(); } if (mgeno) { if (haplos[mot.one] != i) { haplo++; haplos[mot.one] = i; } if (haplos[mot.two] != i) { haplo++; haplos[mot.two] = i; } homo += mot.isHomozygous(); } if (diplo > 4 || haplo + homo > 4 || (haplo == 4 && too_many_genos)) { printf("%s - Fam %s: ", (const char *) markerNames[m], (const char *) persons[i]->famid); if (persons[i]->father->markers[m].isKnown()) printf("Father %s [%s/%s] has children [", (const char *) persons[i]->father->pid, (const char *) info->GetAlleleLabel(fat.one), (const char *) info->GetAlleleLabel(fat.two)); else if (persons[i]->mother->markers[m].isKnown()) printf("Mother %s [%s/%s] has children [", (const char *) persons[i]->mother->pid, (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); else printf("Couple %s * %s has children [", (const char *) persons[i]->mother->pid, (const char *) persons[i]->father->pid); for (int j = 0; j < persons[i]->sibCount; j++) printf("%s%s/%s", j == 0 ? 
"" : " ", (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].one), (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].two)); printf("]\n"); fail = true; failedFamilies[f] = true; } } for (int f = 0; f < familyCount; f++) if (!failedFamilies[f] && (families[f]->count > families[f]->founders + 1) && !families[f]->isNuclear()) fail |= !GenotypeList::EliminateGenotypes(*this, families[f], m); } if (fail) printf("\nMendelian inheritance errors detected\n"); return fail; } bool Pedigree::SexLinkedCheck() { bool fail = false; // Keep track of what families fail the basic inheritance check, // so that we can run later run genotype elimination check on the remainder IntArray failedFamilies(familyCount); // For each marker ... for (int m = 0; m < markerCount; m++) { MarkerInfo * info = GetMarkerInfo(m); failedFamilies.Zero(); // Check for homozygous males for (int f = 0; f < familyCount; f++) for (int i = families[f]->first; i <= families[f]->last; i++) if (persons[i]->sex == SEX_MALE && persons[i]->markers[m].isKnown() && !persons[i]->markers[m].isHomozygous()) { printf("%s - Fam %s: Male %s has two X alleles [%s/%s]\n", (const char *) markerNames[m], (const char *) persons[i]->famid, (const char *) persons[i]->pid, (const char *) info->GetAlleleLabel(persons[i]->markers[m].one), (const char *) info->GetAlleleLabel(persons[i]->markers[m].two)); // Wipe this genotype so we don't get cascading errors below persons[i]->markers[m][0] = persons[i]->markers[m][1] = 0; fail = true; failedFamilies[f] = true; } // Check full sibships for errors // TODO -- We could do better by grouping male half-sibs for (int f = 0; f < familyCount; f++) for (int i = families[f]->first; i <= families[f]->last; i++) if (!persons[i]->isFounder() && persons[i]->sibs[0] == persons[i]) { // This loop runs once per sibship Alleles fat = persons[i]->father->markers[m]; Alleles mot = persons[i]->mother->markers[m]; bool fgeno = fat.isKnown(); bool mgeno = mot.isKnown(); 
Alleles inferred_mother = mot; Alleles first_sister; Alleles inferred_father; bool mother_ok = true; int sisters = 0; for (int j = 0; j < persons[i]->sibCount; j++) if (persons[i]->sibs[j]->isGenotyped(m)) { Alleles geno = persons[i]->sibs[j]->markers[m]; bool fat1 = fat.hasAllele(geno.one); bool fat2 = fat.hasAllele(geno.two); bool mot1 = mot.hasAllele(geno.one); bool mot2 = mot.hasAllele(geno.two); int sex = persons[i]->sibs[j]->sex; if (sex == SEX_MALE) { if (mgeno && !mot1) { printf("%s - Fam %s: Child %s [%s/Y] has mother [%s/%s]\n", (const char *) markerNames[m], (const char *) persons[i]->famid, (const char *) persons[i]->sibs[j]->pid, (const char *) info->GetAlleleLabel(geno.one), (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); fail = true; failedFamilies[f] = true; } else mother_ok &= inferred_mother.AddAllele(geno.one); } if (sex == SEX_FEMALE) { if ((fgeno && mgeno && !((fat1 && mot2) || (fat2 && mot1))) || (fgeno && !(fat1 || fat2)) || (mgeno && !(mot1 || mot2))) { printf("%s - Fam %s: Child %s [%s/%s] has ", (const char *) markerNames[m], (const char *) persons[i]->famid, (const char *) persons[i]->sibs[j]->pid, (const char *) info->GetAlleleLabel(geno.one), (const char *) info->GetAlleleLabel(geno.two)); if (!fgeno) printf("mother [%s/%s]\n", (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); else if (!mgeno) printf("father [%s/Y]\n", (const char *) info->GetAlleleLabel(fat.one)); else printf("parents [%s/Y]*[%s/%s]\n", (const char *) info->GetAlleleLabel(fat.one), (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); fail = true; failedFamilies[f] = true; } else { if (!sisters++) inferred_father = first_sister = geno; else if (first_sister != geno) { inferred_father.Intersect(geno); mother_ok &= inferred_mother.AddAllele( geno.otherAllele(inferred_father.one)); mother_ok &= inferred_mother.AddAllele( 
first_sister.otherAllele(inferred_father.one)); } if (!fgeno && (mot1 ^ mot2)) inferred_father.Intersect(mot1 ? geno.two : geno.one); if (!mgeno && (fat1 ^ fat2)) mother_ok &= inferred_mother.AddAllele(fat1 ? geno.two : geno.one); } } } if (!mother_ok || (sisters && !inferred_father.isKnown())) { printf("%s - Fam %s: ", (const char *) markerNames[m], (const char *) persons[i]->famid); if (fgeno) printf("Father %s [%s/Y] has children [", (const char *) persons[i]->father->pid, (const char *) info->GetAlleleLabel(fat.one)); else if (mgeno) printf("Mother %s [%s/%s] has children [", (const char *) persons[i]->mother->pid, (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); else printf("Couple %s * %s has children [", (const char *) persons[i]->mother->pid, (const char *) persons[i]->father->pid); for (int j = 0; j < persons[i]->sibCount; j++) printf( persons[i]->sibs[j]->sex == SEX_MALE ? "%s%s/Y" : "%s%s/%s", j == 0 ? "" : " ", (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].one), (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].two)); printf("]\n"); fail = true; failedFamilies[f] = true; } } for (int f = 0; f < familyCount; f++) if (!failedFamilies[f] && (families[f]->count > families[f]->founders + 1) && !families[f]->isNuclear()) fail |= !GenotypeList::EliminateGenotypes(*this, families[f], m); } if (fail) printf("\nMendelian inheritance errors detected\n"); return fail; } void Pedigree::ExtractFamily(int id, Pedigree & single_fam_ped) { for (int i = families[id]->first; i <= families[id]->last; i++) single_fam_ped.Add(*persons[i]); single_fam_ped.Sort(); } void Pedigree::ExtractOnAffection(int a, Pedigree & new_ped, int target_status) { for (int i = 0; i < count; i++) if (persons[i]->affections[a] == target_status) new_ped.Add(*persons[i]); else { Person blank_person; blank_person.CopyIDs(*persons[i]); new_ped.Add(blank_person); } new_ped.Sort(); } void Pedigree::Filter(IntArray & 
filter) { if (filter.Length() != count) error("Pedigree:Size of pedigree filter doesn't match number of persons in pedigree"); for (int i = 0; i < count; i++) if (filter[i] == 1) { persons[i]->WipePhenotypes(); persons[i]->filter = true; } } void Pedigree::AddPerson(const char * famid, const char * pid, const char * fatid, const char * motid, int sex, bool delay_sort) { if (count == size) Grow(); persons[count] = new Person; persons[count]->famid = famid; persons[count]->pid = pid; persons[count]->fatid = fatid; persons[count]->motid = motid; persons[count]->sex = sex; count++; if (!delay_sort) Sort(); } void Pedigree::ShowMemoryInfo() { unsigned int bytes = 0; for (int i = 0; i < count; i++) bytes += persons[i]->famid.BufferSize() + persons[i]->pid.BufferSize() + persons[i]->fatid.BufferSize() + persons[i]->motid.BufferSize(); bytes += count * (markerCount * sizeof(Alleles) + traitCount * sizeof(double) + covariateCount * sizeof(double) + affectionCount * sizeof(char) + sizeof(Person)); printf(" %40s %s\n", "Pedigree file ...", (const char *) MemoryInfo(bytes)); } libStatGen-1.0.14/general/Pedigree.h000066400000000000000000000127601254730101300171450ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef _PEDIGREE_H_ #define _PEDIGREE_H_ #include "Constant.h" #include #include "PedigreeAlleles.h" #include "PedigreePerson.h" #include "PedigreeGlobals.h" #include "PedigreeFamily.h" #include "PedigreeDescription.h" #include "PedigreeAlleleFreq.h" class Pedigree : public PedigreeGlobals { public: static bool sexAsCovariate; static String missing; int size; int count; Person ** persons; int familyCount; Family ** families; int haveTwins; PedigreeDescription pd; PedigreeDescription *multiPd; int multiFileCount; Pedigree(); ~Pedigree(); void Prepare(IFILE & input); // Read pedigree parameters from data file void Load(IFILE & input); // Read pedigree from pedigree file void LoadMendel(IFILE & input); // Read pedigree in Mendel format void Prepare(const char * input); // Read pedigree parameters from named file // Read pedigree parameters from named file, stop program on failure // depending on setting of allow failures void Load(const char * input, bool allowFailures = false); // I/O related utility functions int TranslateSexCode(const char * code, bool & failure); void PrepareDichotomization(); // Register dummy affections for each trait int Dichotomize(int trait, double mean = _NAN_); void DichotomizeAll(double mean = _NAN_); void WriteDataFile(FILE * output); // Write data file void WritePedigreeFile(FILE * output); // Write pedigree file void WriteDataFile(const char * output); // Write named data file void WritePedigreeFile(const char * output); // Write named pedigree file void WritePerson(FILE * output, int who, // Write a single person const char * famid = NULL, // if supplied, famid, pid, const char * pid = NULL, // fatid and motid allow a const char * fatid = NULL, // pedigree or person to const char * motid = NULL); // be renamed / restructured void WriteRecodedPerson( // Like write person, but uses FILE * output, int who, // user supplied markerInfo MarkerInfo ** markerInfo, // array to recode marker const char * famid = NULL, // alleles as they are 
written const char * pid = NULL, const char * fatid = NULL, const char * motid = NULL); void Sort(); // Sorts the pedigree items Family * FindFamily(const char * famid); // Find a family Person * FindPerson(const char * famid, // Find an individual const char * pid); // functions dealing with genetic markers // Counts the alleles at a marker int CountAlleles(int marker); // Lumps together rare alleles and, depending on reorder flag, // sorts alleles so the most common allele has the lowest index void LumpAlleles(double treshold, bool reorder = true); // Calculate allele frequencies void EstimateFrequencies(int estimator, bool quiet = false); // shorthand operators Person & operator [](int i) { return *(persons[i]); } // Perform a basic inheritance check bool InheritanceCheck(bool abortIfInconsistent = true); bool AutosomalCheck(); bool SexLinkedCheck(); bool TwinCheck(); // Merge twins into a single individual void MergeTwins(); // Remove individuals with no data from pedigree void Trim(bool quiet = false, int * informative = NULL); // Add a single individual to a pedigree void AddPerson(const char * famid, const char * pid, const char * fatid, const char * motid, int sex, bool delay_sort = false); // Add all individuals in family with famid = id to new_ped void ExtractFamily(int id, Pedigree & new_ped); // Add individuals with affection status target_status for affection a to new_ped void ExtractOnAffection(int a, Pedigree & new_ped, int target_status = 2); // Remove all covariate, affection and genotype information from persons for which filter[i] = 0 void Filter(IntArray & filter); // Reports memory usage for storing the pedigree void ShowMemoryInfo(); private: void Grow(); void Add(Person & rhs); static int ComparePersons(const Person ** p1, const Person ** p2); static int CompareParents(const Person ** p1, const Person ** p2); void MakeSibships(); void MakeFamilies(); Person * FindPerson(const char * famid, const char * pid, int universe); void 
ShowTrimHeader(bool & flag); }; #endif libStatGen-1.0.14/general/PedigreeAlleleFreq.cpp000066400000000000000000000176441254730101300214430ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "PedigreeAlleleFreq.h" #include "QuickIndex.h" #include "Error.h" #include int CountAlleles(Pedigree & /* ped */, int marker) { // With automatic recoding in the pedigree loader there // is no need to iterate through the pedigree ... MarkerInfo * info = Pedigree::GetMarkerInfo(marker); return info->CountAlleles(); } void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder) { // find out how many alleles there are int alleles = ped.CountAlleles(marker); if (alleles < 2) return; MarkerInfo * info = PedigreeGlobals::GetMarkerInfo(marker); if (alleles < info->freq.Length()) alleles = info->freq.Length() - 1; IntArray counts(alleles + 1); counts.Zero(); // Count number of occurrences for each allele for (int i = 0; i < ped.count; i++) { counts[int(ped[i].markers[marker][0])]++; counts[int(ped[i].markers[marker][1])]++; } // Calculate treshold for lumping alleles int total = 0; for (int i = 1; i <= alleles; i++) total += counts[i]; int thresh = int(total * threshold); // If threshold is set at zero, we artificially increase // counts for alleles that do not appear in the pedigree // but whose frequencies are set > 0.0. 
This ensures that // allele frequency data does not get discarded when simply // recoding alleles (vs. lumping) if (thresh == 0) for (int i = 1; i < info->freq.Length(); i++) if (counts[i] == 0 && info->freq[i] > 0.0) counts[i] = 1, total++; // If allele reordering is disabled, put in dummy allele // counts so as to ensure that allele have desired ordering if (!reorder) { QuickIndex index(info->alleleLabels); index.Reverse(); for (int i = 0; i < index.Length(); i++) counts[index[i]] = i + 1; total = counts.Sum(1, counts.Length() - 1); } // Order all alleles according to their frequency // Zero, which corresponds to missing values, stays put! counts[0] = total + 1; QuickIndex index(counts); index.Reverse(); // recode alleles // all alleles where frequency < thresh are labelled N // use counts array to keep track of labels int N = 0; bool rare = false; for (int i = 0; i <= alleles; i++) if (counts[index[i]] > thresh) { counts[index[i]] = i; N++; } else { if (counts[index[i]] > 0) rare = true; counts[index[i]] = N; } // This loop does the recoding for (int i = 0; i < ped.count; i++) { Alleles & current = ped[i].markers[marker]; current[0] = counts[current[0]]; current[1] = counts[current[1]]; } StringArray oldLabels(info->alleleLabels); String label; info->alleleLabels.Clear(); info->alleleNumbers.Clear(); for (int i = 0; i < N; i++) { if (oldLabels.Length() <= index[i]) info->alleleLabels.Push(label = index[i]); else info->alleleLabels.Push(oldLabels[index[i]]); if (i) info->alleleNumbers.SetInteger(info->alleleLabels.Last(), i); } // Reorder allele frequencies if necessary if (info->freq.Length()) { Vector freq(info->freq); info->freq.Dimension(N); info->freq[0] = 0.0; for (int i = 1; i < N; i++) { info->freq[i] = freq[index[i]]; freq[index[i]] = 0; } if ((1.0 - info->freq.Sum()) > 1e-10) rare = true; if (rare) { info->freq.Dimension(N + 1); info->freq[N] = 1.0 - info->freq.Sum(); } } if (rare) { info->alleleLabels.Push("OTHER"); 
info->alleleNumbers.SetInteger("OTHER", info->alleleLabels.Length()); } } bool EstimateFrequencies(Pedigree & ped, int marker, int estimator) { int alleleCount = CountAlleles(ped, marker); IntArray founder(alleleCount + 1); IntArray all(alleleCount + 1); founder.Zero(); all.Zero(); for (int i = 0; i < ped.count; i++) { // When counting alleles, note that males only carry one X chromosome // and are arbitrarily scored as homozygous. all[ped[i].markers[marker][0]]++; if (!ped.chromosomeX || ped[i].sex != SEX_MALE) all[ped[i].markers[marker][1]]++; if (!ped[i].isFounder()) continue; founder[ped[i].markers[marker][0]]++; if (!ped.chromosomeX || ped[i].sex != SEX_MALE) founder[ped[i].markers[marker][1]]++; } MarkerInfo * info = ped.GetMarkerInfo(marker); if (info->freq.dim > 0) { // previous allele frequency information is available if (alleleCount >= info->freq.dim) error("For marker %s, input files define %d alleles, but at least\n" "one other allele (named '%s') occurs in the pedigree\n", (const char *) info->name, info->freq.dim - 1, (const char *) info->GetAlleleLabel(alleleCount)); for (int i = 1; i <= alleleCount; i++) if (all[i] > 0 && info->freq[i] <= 0.0) error("Although allele %s for marker %s has frequency zero,\n" "it occurs %d times in the pedigree", (const char *) info->GetAlleleLabel(i), (const char *) info->name, all[i]); return false; } else { if (alleleCount < 1) { // If no one is genotyped, default to two equifrequent allele // since some programs do not like monomorphic markers info->freq.Dimension(3); info->freq[0] = 0.0; info->freq[1] = 0.99999; info->freq[2] = 0.00001; return true; } info->freq.Dimension(alleleCount + 1); info->freq.Zero(); if (estimator == FREQ_FOUNDERS && founder.Sum() > founder[0]) { // Make sure the frequency of alleles occuring in the pedigree // is never zero for (int i = 1; i <= alleleCount; i++) if (founder[i] == 0 && all[i] > 0) founder[i] = 1; // To get frequencies, just multiply counts by 1 / total_counts double factor 
= 1.0 / (founder.Sum() - founder[0]); for (int i = 1; i <= alleleCount; i++) info->freq[i] = founder[i] * factor; } else if (estimator == FREQ_ALL || estimator == FREQ_FOUNDERS) { // To get frequencies, just multiply counts by 1 / total_counts double factor = 1.0 / (all.Sum() - all[0]); for (int i = 1; i <= alleleCount; i++) info->freq[i] = all[i] * factor; } else if (estimator == FREQ_EQUAL) // Assume all alleles have equal frequency { // Count the number of observed alleles all[0] = 0; int alleles = all.CountIfGreater(0); double freq = 1.0 / alleles; // Set equal frequencies for all occuring alleles for (int i = 0; i <= alleleCount; i++) info->freq[i] = all[i] ? freq : 0.0; } } return true; } libStatGen-1.0.14/general/PedigreeAlleleFreq.h000066400000000000000000000022441254730101300210760ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __ALLELEFREQUENCIES_H__ #define __ALLELEFREQUENCIES_H__ #include "Pedigree.h" int CountAlleles(Pedigree & ped, int marker); void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder); #define FREQ_ALL 0 #define FREQ_FOUNDERS 1 #define FREQ_EQUAL 2 // Returns true if frequencies estimated, false if previous information okay bool EstimateFrequencies(Pedigree & ped, int marker, int estimator); #endif libStatGen-1.0.14/general/PedigreeAlleles.h000066400000000000000000000074701254730101300204510ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __PEDALLELES_H__ #define __PEDALLELES_H__ #include "LongInt.h" class Alleles { public: char one; char two; Alleles() { one = two = 0; } char & operator [](int i) { return (i == 1) ? one : two; } // is the genotype fully defined? bool isKnown() { return (one * two) != 0; } bool isHeterozygous() { return isKnown() && (one != two); } bool isHomozygous() { return isKnown() && (one == two); } bool hasAllele(int a) { return (one == a) || (two == a); } // in a bi-allelic system (a, NOT a) bool isHeterozygousFor(int a) { return isHeterozygous() && hasAllele(a); } bool isHomozygousFor(int a) { return !(isHeterozygousFor(a)); } // how may alleles a in this genotype? int countAlleles(int a) { return ((one == a) ? 1 : 0) + ((two == a) ? 
1 : 0); } // what is the other allele, assuming genotype is (a, X) int otherAllele(int a) { return ((one == a) ? two : one); } // are two unordered genotypes identical? int identicalTo(Alleles & al) { return ((al.one == one) && (al.two == two)) || ((al.two == one) && (al.one == two)); } // how many alleles are identical by state int countIBS(Alleles & al) { return (one == al.one) ? ((two == al.two) ? 2 : 1) : ((one == al.two) ? ((two == al.one) ? 2 : 1) : (((two == al.one) || (two == al.two)) ? 1 : 0)); } int operator == (Alleles & rhs) { return identicalTo(rhs); } int operator != (Alleles & rhs) { return !identicalTo(rhs); } char Hi() { return one > two ? one : two; } char Lo() { return one > two ? two : one; } int SequenceCoded() { return isKnown() ? Hi() *(Hi() - 1) / 2 + Lo() : 0; } longint BinaryCoded() { if (isKnown()) { longint allele1(1); longint allele2(1); allele1 <<= one - 1; allele2 <<= two - 1; return allele1 | allele2; } else return NOTZERO; } void Intersect(Alleles & geno) { char a1 = Lo(), a2 = Hi(); char b1 = geno.Lo(), b2 = geno.Hi(); if (a1 == b1 && a2 == b2) return; if (a1 == b1 || a1 == b2) one = two = a1; else if (a2 == b1 || a2 == b2) one = two = a2; else one = two = 0; } void Intersect(char allele) { if (one != allele && two != allele) one = two = 0; else one = two = allele; } bool AddAllele(char allele) { if (one == allele || two == allele) return true; if (one != 0 && two != 0) return false; if (one == 0) one = allele; else two = allele; return true; } void Wipe() { one = two = 0; } }; #endif libStatGen-1.0.14/general/PedigreeDescription.cpp000066400000000000000000000726421254730101300217110ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "PedigreeDescription.h" #include "MapFunction.h" #include "MathVector.h" #include "Constant.h" #include "FortranFormat.h" #include "Error.h" #include #include #include #include PedigreeDescription::PedigreeDescription() { columnCount = 0; mendelFormat = false; } PedigreeDescription::~PedigreeDescription() { }; PedigreeDescription & PedigreeDescription::operator = (PedigreeDescription & rhs) { columnCount = rhs.columnCount; columns = rhs.columns; columnHash = rhs.columnHash; return *this; }; void PedigreeDescription::Load(IFILE & input, bool warnIfLinkage) { // Check if we are dealing with a linkage format data file String buffer; StringArray tokens; mendelFormat = false; ReadLineHelper(input, buffer, tokens); ifrewind(input); if (tokens.Length() == 4 && isdigit(tokens[0][0])) { if (warnIfLinkage) printf("Data file looks like a LINKAGE format file...\n\n"); LoadLinkageDataFile(input); return; } if (buffer.Length() > 18 && (buffer.SubStr(8,8).SlowCompare("AUTOSOME") == 0 || buffer.SubStr(8,8).SlowCompare("X-LINKED") == 0) && (isdigit(buffer[16]) || isdigit(buffer[17])) && (isdigit(buffer[18]) || isdigit(buffer[19]) || (buffer.Length() > 19 && isdigit(buffer[20])))) { printf("Data file looks like a MENDEL format file...\n" " Activating EXPERIMENTAL support for this format\n\n"); LoadMendelDataFile(input); return; } // Reset things ifrewind(input); int done = 0; int line = 0; columns.Clear(); columnHash.Clear(); columnCount = 0; while (!ifeof(input) && !done) { int i; buffer.ReadLine(input); line++; tokens.Clear(); tokens.AddTokens(buffer, WHITESPACE); if (tokens.Length() < 1) continue; if (tokens.Length() == 
1) error("Problem reading data file:\n" "Item #%d (of type %s) has no name.", columnCount+1, (const char *) tokens[0]); switch (toupper(tokens[0][0])) { case 'A' : columnHash.Push(GetAffectionID(tokens[1])); columns.Push(pcAffection); columnCount++; break; case 'M' : columnHash.Push(GetMarkerID(tokens[1])); columns.Push(pcMarker); columnCount++; break; case 'T' : columnHash.Push(GetTraitID(tokens[1])); columns.Push(pcTrait); columnCount++; break; case 'C' : columnHash.Push(GetCovariateID(tokens[1])); columns.Push(pcCovariate); columnCount++; break; case '$' : columnHash.Push(GetStringID(tokens[1])); columns.Push(pcString); columnCount++; break; case 'S' : i = (int) tokens[0].SubStr(1); i = i > 0 ? i : 1; while (i--) { columns.Push(pcSkip); columnHash.Push(0); columnCount++; } break; case 'Z' : columnHash.Push(0); columns.Push(pcZygosity); columnCount++; break; case 'V' : GetMarkerID(tokens[1]); break; case 'E' : done = 1; break; case 'U' : if (toupper(tokens[0][1]) == 'T' && toupper(tokens[0][2]) == 'C') { int c = GetCovariateID(tokens[1]); int t = GetTraitID(tokens[1]); if (c >= 32767 || t >= 32767) error("Internal error processing data file\n"); columnHash.Push(t * 32768 + c); columns.Push(pcUndocumentedTraitCovariate); columnCount++; break; } default : error("Problem in data file (line %d):\n%s\n", line, (const char *) buffer); } } columns.Push(pcEnd); columnHash.Push(0); }; void PedigreeDescription::Load(const char * iFilename, bool warnIfLinkage) { IFILE f = ifopen(iFilename, "rb"); if (f == NULL) error( "The datafile %s cannot be opened\n\n" "Common causes for this problem are:\n" " * You might not have used the correct options to specify input file names,\n" " please check the program documentation for information on how to do this\n\n" " * The file doesn't exist or the filename might have been misspelt\n\n" " * The file exists but it is being used by another program which you will need\n" " to close before continuing\n\n" " * The file is larger than 2GB and 
you haven't compiled this application with\n" " large file support.\n\n", iFilename); Load(f, warnIfLinkage); ifclose(f); filename = iFilename; }; void PedigreeDescription::LoadMap(const char * iFilename) { IFILE f = ifopen(iFilename, "rb"); if (f == NULL) error( "The mapfile %s cannot be opened\n\n" "Please check that the file exists and is not being used by another program\n" "To find out how to set input filenames, check the documentation\n", iFilename); LoadMap(f); ifclose(f); }; void PedigreeDescription::LoadMap(IFILE & input) { columns.Clear(); columnHash.Clear(); columnCount = 0; int lastposition = 0; String buffer; StringArray tokens; buffer.ReadLine(input); tokens.AddTokens(buffer, WHITESPACE); while (tokens.Length() == 0 && !ifeof(input)) { buffer.ReadLine(input); tokens.AddTokens(buffer, WHITESPACE); } if (tokens.Length() != 3) error("Error reading map file header, which has %d columns.\n" "Three columns were expected, corresponding to\n" "MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION\n" "The offending header is transcribed below:\n\n" "%s", tokens.Length(), (const char *) buffer); else printf("Map file column labels\n" " -- COLUMN 1, Expecting MARKER_ID, Read %s\n" " -- COLUMN 2, Expecting MARKER_NAME, Read %s\n" " -- COLUMN 3, Expection BASE_PAIR_POSITION, Read %s\n\n", (const char *)(tokens[0]), (const char *)(tokens[1]), (const char *)(tokens[2])); int line = 1; while (!ifeof(input)) { int serial; long position; buffer.ReadLine(input); line++; tokens.Clear(); tokens.AddTokens(buffer, WHITESPACE); if (tokens.Length() < 1) continue; if (tokens.Length() != 3) error("Each line in the map file should have 3 tokens, corresponding\n" "to MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION respectively\n" "However, there are %d tokens in line %d, transcribed below:\n\n" "%s", tokens.Length(), line, (const char *) buffer); serial = (int) tokens[0]; if (serial != columnCount + 1) error("Reading Marker Index from Map File...\n" "Markers should be indexed 
consecutively starting at 1\n" "Marker %d does not fit this pattern\n", columnCount + 1); position = (int) tokens[2]; if (position < lastposition) error("Reading Marker Position from Map File...\n" "Marker position should be in base-pairs\n" "and markers should be in map order\n"); // TODO -- store marker locations somewhere! lastposition = position; columnHash.Push(GetMarkerID(tokens[1])); columns.Push(pcMarker); columnCount++; GetMarkerInfo(tokens[1])->position = position * 1e-8; } columns.Push(pcEnd); columnHash.Push(0); }; int PedigreeDescription::CountTextColumns() { int count = 0; for (int i = 0; i < columnCount; i++, count++) if (columns[i] == pcMarker) count++; return count; } void PedigreeDescription::LoadLinkageDataFile(const char * iFilename) { IFILE f = ifopen(iFilename, "rb"); if (f == NULL) error( "The linkage format datafile %s cannot be opened\n\n" "Please check that the file exists and is not being used by another program\n" "To find out how to set input filenames, check the documentation\n", iFilename); LoadLinkageDataFile(f); ifclose(f); filename = iFilename; }; void PedigreeDescription::LoadLinkageDataFile(IFILE & input) { columns.Clear(); columnHash.Clear(); columnCount = 0; String buffer, label; StringArray tokens; ReadLineHelper(input, buffer, tokens); if (tokens.Length() != 4 || tokens[2].AsInteger() != (int) chromosomeX || tokens[0].AsInteger() < 0) error("Cannot handle first line of data file\n\n" "Expecting four (4) numeric values, which correspond to:\n" " num-loci -- number of loci in the pedigree\n" " this value must be positive\n" " risk-locus -- locus for which risks should be calculated\n" " this value will be ignored\n" " sex-link -- are the loci sex linked [0 - No, 1 - Yes]\n" " %s\n" " program -- which LINKAGE program do you want to use?\n" " this value will also be ignored\n\n" "The actual input read:\n%s\n", chromosomeX ? 
"expecting X-linked data, so this value must be ONE (1)" : "expecting autosomal data, so this must be ZERO (0)", (const char *) buffer); int numloci = tokens[0]; ReadLineHelper(input, buffer, tokens); if (tokens.Length() != 4 || tokens[0].AsInteger() != 0 || tokens[3].AsInteger() != 0) error("Cannot handle second line of data file\n\n" "Expecting four (4) numeric values, which correspond to:\n" " mutation-model -- must be zero, corresponding to no mutation\n" " male-mutation-rate -- ignored\n" " female-mutation-rate -- ignored\n" " linkage-disequilibrium -- must be zero, may be used in the future to\n" " read haplotype frequencies\n\n" "The actual input read:\n%s\n", (const char *) buffer); StringArray markerOrder; int unknown = 0; ReadLineHelper(input, buffer, markerOrder); if (markerOrder.Length() > numloci) error("The third line of the data file lists marker order\n\n" "Although %d loci are defined [in the first line],\n" "this line includes %d values:\n%s\n", numloci, markerOrder.Length(), (const char *) buffer); IntArray locus; bool need_blank_line = false; while (!ifeof(input) && numloci--) { if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly"); if (tokens.Length() < 2) error("Incomplete locus information in data file\n" "Information for each locus should include 2 or more fiels\n" "The expected fields are:\n" " field_type -- indicator of locus type (trait, marker,...)\n" " alleles -- number of alleles\n" " name -- locus name, preceded by hash (#) sign\n\n" "The actual input read:\n%s\n", (const char *) buffer); int locus_type = (int) tokens[0]; int alleles = (int) tokens[1]; String locus_name("LOCUS"); locus_name += ++unknown; if (tokens.Length() > 2 && tokens[2][0] == '#') { if (tokens[2][1] != 0) locus_name = tokens[2].SubStr(1); else if (tokens.Length() > 3) locus_name = tokens[3]; } if ((locus_type == 4 && alleles == 0) || (locus_type == 4 && alleles == 1)) { columnHash.Push(GetCovariateID(locus_name)); 
columns.Push(pcCovariate); columnCount++; continue; } if (locus_type == 0 && alleles == 0) { columnHash.Push(GetTraitID(locus_name)); columns.Push(pcTrait); columnCount++; continue; } if (ReadLineHelper(input, buffer, tokens) != alleles) error("Expecting %d allele frequencies, but input has %d columns:\n" "%s\n", alleles, tokens.Length(), (const char *) buffer); Vector frequencies(alleles + 1); frequencies[0] = 0.0; for (int i = 1; i <= alleles; i++) frequencies[i] = (double) tokens[i - 1]; double sum = frequencies.Sum(); if (sum <= 0.0) error("Allele frequencies at %s sum to %f, which doesn't make sense\n", (const char *) locus_name, sum); if (fabs(sum - 1.0) > 1.2e-5) { printf("Allele frequencies at %s sum to %f, adjusted to 1.0\n", (const char *) locus_name, sum); need_blank_line = true; } if (sum != 1.0) frequencies *= 1.0 / sum; switch (locus_type) { case 1 : { // Affection columnHash.Push(GetAffectionID(locus_name)); columns.Push(pcAffection); columnCount++; // Read number of liability classes if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly\n"); // Skip liability class data int classes = tokens[0]; if (classes > 1) { columnHash.Push(0); columns.Push(pcSkip); columnCount++; } // Separate liability class rows for males and females for X-linked data if (chromosomeX) classes *= 2; while (classes--) if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly\n"); // Ignore map location for quantitative variables locus.Push(-1); } break; case 3 : { columnHash.Push(GetMarkerID(locus_name)); columns.Push(pcMarker); columnCount++; // Store allele frequencies MarkerInfo * info = GetMarkerInfo(locus_name); info->freq = frequencies; // Initialize allele labels info->alleleLabels.Clear(); for (int i = 0; i < frequencies.Length(); i++) info->alleleLabels.Push(label = i); info->IndexAlleles(); // Store marker id, so that we can track map location locus.Push(GetMarkerID(locus_name)); } break; case 0 : 
{ // Read number of quantitative variables if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly\n"); // Add each quantitative variable to pedigree // Discard information on means for (int vars = tokens[0], i = 0; i < vars; i++) { if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly\n"); String trait_name(locus_name); if (i) { trait_name += "."; trait_name += i + 1; } columnHash.Push(GetTraitID(trait_name)); columns.Push(pcTrait); columnCount++; } // Skip var-covar matrix if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly\n"); // Skip heterozygote scaling factor for var-covar matrix if (ReadLineHelper(input, buffer, tokens) == 0) error("Linkage data file ends unexpectedly\n"); // Ignore map location for quantitative variables locus.Push(-1); } break; case 2 : error("The data file includes binary factors\n" "Regretably, loci of this type are not supported\n\n"); break; default : error("Unsupported locus type [%d] in data file", locus_type); break; } } if (need_blank_line) printf("\n"); columns.Push(pcEnd); columnHash.Push(0); ReadLineHelper(input, buffer, tokens); int sexDifference = tokens.Length() ? 
tokens[0].AsInteger() : -1; if (tokens.Length() != 2 || (sexDifference != 0 && sexDifference != 2) || tokens[1].AsInteger() != 0) error("Error retrieving recombination information\n\n" "Expecting two (2) numeric values, which correspond to:\n" " sex-difference -- must be zero (no difference) or two (sex specific recombination)\n" " map-function -- must be zero, that is, no interference\n" "The actual input read:\n%s\n", (const char *) buffer); Vector distances[2]; bool distance_in_centimorgans = false; for (int r = 0; r <= sexDifference; r += 2) { ReadLineHelper(input, buffer, tokens); if (tokens.Length() != markerOrder.Length() - 1) error("Error retrieving recombination information\n\n" "Expecting %d recombination fractions (current map includes %d loci)\n" "Instead the following line was input:\n%s\n", markerOrder.Length() - 1, markerOrder.Length(), (const char *) buffer); distances[r >> 1].Dimension(tokens.Length()); for (int i = 0; i < tokens.Length(); i++) distances[r >> 1][i] = (double) tokens[i]; if (distances[r >> 1].Min() < 0.0) error("Linkage datafile specifies negative recombination fractions"); bool centimorgans = distances[r >> 1].Max() > 0.5; if (centimorgans && !distance_in_centimorgans) printf(" Some recombination fractions in datafile are greater than 0.5,\n" " so recombination fractions will be interpreted as cM distances\n\n"); distance_in_centimorgans |= centimorgans; } double position = 0.0, positionMale = 0.0; for (int i = 0, moving = false; i < markerOrder.Length(); i++) { int m = markerOrder[i].AsInteger() - 1; if (m < 0 || m >= locus.Length()) error("The marker order in the linkage datafile is invalid\n"); m = locus[m]; if (m != -1) { MarkerInfo * info = GetMarkerInfo(m); info->chromosome = chromosomeX ? 
9999 : 0; if (sexDifference == 2) info->position = (position + positionMale) * 0.5, info->positionFemale = position, info->positionMale = positionMale; else info->position = info->positionMale = info->positionFemale = position; moving = true; } if (i < markerOrder.Length() - 1 && moving) position += distance_in_centimorgans ? 0.01 * distances[0][i] : RecombinationToDistance(distances[0][i]); if (sexDifference == 2 && i < markerOrder.Length() - 1 && moving) positionMale += distance_in_centimorgans ? 0.01 * distances[1][i] : RecombinationToDistance(distances[1][i]); } } int PedigreeDescription::ReadLineHelper(IFILE & input, String & buffer, StringArray & tokens) { do { // Read Line buffer.ReadLine(input); buffer.Trim(); // Strip comments marked with >> int pos = buffer.FastFind(">>"); if (pos == -1) pos = buffer.FastFind("<<"); if (pos == -1) pos = buffer.Length() + 1; if (buffer[0] == '#') pos = 0; // Find space/tab delimited tokens tokens.Clear(); tokens.AddTokens(buffer.Left(pos - 1), WHITESPACE); } while (tokens.Length() == 0 && !ifeof(input)); return tokens.Length(); } void PedigreeDescription::LoadMendelDataFile(const char * iFilename) { IFILE f = ifopen(iFilename, "rb"); if (f == NULL) error( "The MENDEL format datafile %s cannot be opened\n\n" "Please check that the file exists and is not being used by another program\n" "To find out how to set input filenames, check the documentation\n", iFilename); LoadMendelDataFile(f); ifclose(f); }; void PedigreeDescription::LoadMendelDataFile(IFILE & file) { // Processes mendel format file mendelFormat = true; // Codominant markers are mapped to markers // Non-codominant markers are mapped into multiple "affection status" // (Y/N) variables columns.Clear(); columnHash.Clear(); columnCount = 0; FortranFormat parser; // Variables for storing parsed input String locusName; String locusType; String alleleLabel; String alleleFreq; String phenotype; String genotype; int phenoCount; int alleleCount; while (!ifeof(file)) { // 
Cycle through headers for each locus parser.SetInputFile(file); parser.SetFormat("(2A8,I2,I3)"); // After retrieving locus name, check that we haven't tried to // read past the end-of-file parser.GetNextField(locusName); parser.GetNextField(locusType); alleleCount = parser.GetNextInteger(); phenoCount = parser.GetNextInteger(); if (locusName.IsEmpty() && locusType.IsEmpty() && alleleCount == 0 && phenoCount == 0 && ifeof(file)) break; // Only recognize autosomal and x-linked loci if (locusType.Compare("AUTOSOME") != 0 && locusType.Compare("X-LINKED")) error("Unrecognized locus type '%s' in Mendel data file\n\n" "Recognized locus types are \"AUTOSOME\" and \"X-LINKED\".", (const char *) locusType); if (locusType.Compare("AUTOSOME") == 0 && chromosomeX) error("The data file indicates that locus %s is AUTOSOMAL, but\n" "X-LINKED loci were expected as input\n", (const char *) locusName); if (locusType.Compare("X-LINKED") == 0 && !chromosomeX) error("The data file indicates that locus %s is X-LINKED, but\n" "AUTOSOMAL loci were expected as input\n", (const char *) locusName); if (locusName.IsEmpty()) error("Blank locus name encountered in data file\n"); if (phenoCount == 0) { // Co-dominant marker columns.Push(pcMarker); columnHash.Push(GetMarkerID(locusName)); columnCount++; // Update marker info with allele labels and frequencies MarkerInfo * info = GetMarkerInfo(locusName); info->alleleLabels.Clear(); info->alleleLabels.Push(""); info->freq.Clear(); parser.SetFormat("(2A8)"); // Mendel allows allele names to be specified with frequencies // left blank for (int i = 0; i < alleleCount; i++) { parser.GetNextField(alleleLabel); parser.GetNextField(alleleFreq); if (alleleLabel.IsEmpty()) error("Locus %s is missing allele label for allele #%d\n", (const char *) locusName, i+1); info->alleleLabels.Push(alleleLabel); if (!alleleFreq.IsEmpty()) { if (info->freq.Length() == 0) info->freq.Push(0.0); info->freq.Push(alleleFreq.AsDouble()); } } info->IndexAlleles(); if 
(info->alleleLabels.Length() != info->freq.Length() && info->freq.Length() != 0) error("Locus %s is missing allele frequency information for %d alleles\n", (const char *) locusName, info->alleleLabels.Length() - info->freq.Length()); } else { // Non-codominant marker, which we decompose into multiple traits... parser.SetFormat("(2A8)"); // First skip allele frequency information for (int i = 0; i < alleleCount; i++) { parser.GetNextField(alleleLabel); parser.GetNextField(alleleFreq); } // Then read in each phenotype for (int i = 0; i < alleleCount; i++) { parser.SetFormat("(A8,I3)"); parser.GetNextField(phenotype); int genoCount = parser.GetNextInteger(); parser.SetFormat("(A17)"); for (int j = 0; j < genoCount; j++) parser.GetNextField(genotype); columns.Push(pcAffection); columnHash.Push(GetAffectionID(locusName + "->" + phenotype)); columnCount++; } } } columns.Push(pcEnd); columnHash.Push(0); } int PedigreeDescription::CountColumns(int type) { int count = 0; for (int i = 0; i < columns.Length(); i++) if (columns[i] == type) count++; return count; } const char * PedigreeDescription::ColumnSummary(String & string) { string.Clear(); UpdateSummary(string, pcMarker, " markers [x2 cols]"); UpdateSummary(string, pcTrait, " traits"); UpdateSummary(string, pcAffection, " discrete traits"); UpdateSummary(string, pcCovariate, " covariates"); UpdateSummary(string, pcString, " strings"); UpdateSummary(string, pcZygosity, " zygosity"); UpdateSummary(string, pcSkip, " skipped"); return string; } void PedigreeDescription::UpdateSummary(String & string, int type, const char * label) { int count = CountColumns(type); if (count) { if (string.Length()) string += ", "; string += count; string += label; } } void PedigreeDescription::AddMarkerColumn(const char * markerName) { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(GetMarkerID(markerName)); columns.Push(pcMarker); columnCount++; } void PedigreeDescription::AddCovariateColumn(const char * 
covariateName) { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(GetCovariateID(covariateName)); columns.Push(pcCovariate); columnCount++; } void PedigreeDescription::AddTraitColumn(const char * traitName) { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(GetCovariateID(traitName)); columns.Push(pcTrait); columnCount++; } void PedigreeDescription::AddAffectionColumn(const char * affectionName) { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(GetAffectionID(affectionName)); columns.Push(pcAffection); columnCount++; } void PedigreeDescription::AddStringColumn(const char * stringName) { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(GetStringID(stringName)); columns.Push(pcString); columnCount++; } void PedigreeDescription::AddZygosityColumn() { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(0); columns.Push(pcZygosity); columnCount++; } void PedigreeDescription::AddSkippedColumn() { if (columns.Last() == pcEnd) { columns.Pop(); columnHash.Pop(); } columnHash.Push(0); columns.Push(pcSkip); columnCount++; } libStatGen-1.0.14/general/PedigreeDescription.h000066400000000000000000000050711254730101300213460ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __PEDDESCRIBE_H__ #define __PEDDESCRIBE_H__ #include "PedigreeGlobals.h" #include "PedigreePerson.h" #include "StringArray.h" #include "IntArray.h" #include // Possible pedigree columns #define pcSkip 0 #define pcMarker 1 #define pcTrait 2 #define pcAffection 3 #define pcCovariate 4 #define pcString 5 #define pcZygosity 6 #define pcEnd 7 // Undocumented pedigree column types -- not recommended #define pcUndocumentedTraitCovariate 1001 class PedigreeDescription : public PedigreeGlobals { public: int columnCount; IntArray columns, columnHash; PedigreeDescription(); ~PedigreeDescription(); void Load(IFILE & Input, bool warnIfLinkage = false); void Load(const char * filename, bool warnIfLinkage = false); void LoadLinkageDataFile(IFILE & input); void LoadLinkageDataFile(const char * filename); void LoadMendelDataFile(IFILE & input); void LoadMendelDataFile(const char * filename); void LoadMap(IFILE & Input); void LoadMap(const char * filename); PedigreeDescription & operator = (PedigreeDescription & rhs); int CountTextColumns(); // returns a string summarizing column contents const char * ColumnSummary(String & string); // Flag specifying Mendel format bool mendelFormat; String filename; void AddMarkerColumn(const char * markerName); void AddTraitColumn(const char * traitName); void AddAffectionColumn(const char * affectionName); void AddCovariateColumn(const char * covariateName); void AddStringColumn(const char * stringName); void AddZygosityColumn(); void AddSkippedColumn(); private: int ReadLineHelper(IFILE & input, String & buffer, StringArray & tokens); int CountColumns(int type); void UpdateSummary(String & string, int type, const char * label); }; #endif libStatGen-1.0.14/general/PedigreeFamily.cpp000066400000000000000000000206421254730101300206400ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General 
Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Pedigree.h" #include "Constant.h" #include "MathConstant.h" #include "Error.h" #include #include #include #include Family::Family(Pedigree & pedigree, int _first, int _last, int _serial) : ped(pedigree) { serial = _serial; first = _first; last = _last; count = last - first + 1; path = new int [count]; famid = ped[first].famid; founders = mzTwins = 0; for (int i=first; i<=last; i++) if (ped[i].isFounder()) { ped[i].traverse = founders; path[founders++] = ped[i].serial; } else { ped[i].traverse = -1; if (ped[i].isMzTwin(ped[i])) for (int j = first; j < i; j++) if (ped[i].isMzTwin(ped[j])) { mzTwins++; break; } } nonFounders = count - founders; generations = nonFounders == 0 ? 
1 : 2; int next = founders; while (next < count) { bool check = false; // Create traversal where path ancestors precede their offspring for (int i=first; i<=last; i++) if (ped[i].traverse == -1) { int fatherSerial = ped[i].father->traverse; int motherSerial = ped[i].mother->traverse; if (fatherSerial >= 0 && motherSerial >= 0) { check = true; ped[i].traverse = next; path[next++] = i; if (fatherSerial >= founders || motherSerial >= founders) generations = 3; // If this individual is part of a set of MZ twins if (ped[i].zygosity & 1) for (int j = 0; j < ped[i].sibCount; j++) { Person & sib = *ped[i].sibs[j]; // Insert all co-twins at the same position in traversal // order if (sib.traverse == -1 && ped[i].zygosity == sib.zygosity) { sib.traverse = next; path[next++] = sib.serial; } } } } if (!check) ShowInvalidCycles(); } } Family::~Family() { delete [] path; } void Family::ShowInvalidCycles() { // Try and identify key individuals responsible for // pedigree mess-up ... when this function is called // pedigree has been traversed top-down and individuals // that are correctly specified have IDs of >= 0. 
// This routine traverses the pedigree bottom up to // identify a subset of individuals likely to be causing // the problem IntArray descendants(ped.count); descendants.Zero(); for (int i = first; i <= last; i++) if (ped[i].traverse == -1) { descendants[ped[i].father->serial]++; descendants[ped[i].mother->serial]++; } IntArray stack; for (int i = first; i <= last; i++) if (ped[i].traverse == -1 && descendants[i] == 0) { stack.Push(i); do { int j = stack.Pop(); if (ped[j].traverse != -1) continue; ped[j].traverse = 9999; if (--descendants[ped[j].father->serial] == 0) stack.Push(ped[j].father->serial); if (--descendants[ped[j].mother->serial] == 0) stack.Push(ped[j].mother->serial); } while (stack.Length()); } printf("The structure of family %s requires\n" "an individual to be his own ancestor.\n\n" "To identify the problem(s), examine the\n" "following key individuals:\n\n", (const char *) famid); for (int i = first; i <= last; i++) if (ped[i].traverse == -1) printf("Problem Person: %s\n", (const char *) ped[i].pid); error("Invalid pedigree structure."); } int Family::ConnectedGroups(IntArray * groupMembership) { IntArray groups(count); // Use the quick union algorithm to identify connected groups groups.SetSequence(0, 1); for (int i = count - 1; i >= founders; i--) { // Lookup parents int group0 = i; int group1 = ped[path[i]].father->traverse; int group2 = ped[path[i]].mother->traverse; // Identify their corresponding groupings while (groups[group0] != group0) group0 = groups[group0]; while (groups[group1] != group1) group1 = groups[group1]; while (groups[group2] != group2) group2 = groups[group2]; int group = group1 < group2 ? 
group1 : group2; if (group0 < group) group = group0; groups[group0] = groups[group1] = groups[group2] = group; } // Count groupings int groupCount = 0; for (int i = 0; i < founders; i++) if (groups[i] == i) groupCount++; if (groupMembership == NULL) return groupCount; // Flatten tree so all items point to root for (int i = 1; i < count; i++) groups[i] = groups[groups[i]]; // Update group membership info int group = 0; groupMembership->Dimension(count); for (int i = 0; i < count; i++) if (groups[i] == i) (*groupMembership)[i] = ++group; else (*groupMembership)[i] = (*groupMembership)[groups[i]]; #if 0 // This stretch of code outputs family structure and group membership // And should usually be commented out! for (int j = first; j <= last; j++) printf("%s %s %s %s %d %d\n", (const char *) famid, (const char *) ped[j].pid, (const char *) ped[j].fatid, (const char *) ped[j].motid, ped[j].sex, groups[ped[j].traverse]); #endif return groupCount; } /* int Family::ConnectedGroups(IntArray * groupMembership) { IntArray * stack = new IntArray[count]; IntArray groups(count); groups.Zero(); int group = 0; int seed = count - 1; // Search for connected sets of individuals until everyone is accounted for while (true) { while ((seed >= 0) && (groups[seed] != 0)) seed--; if (seed == -1) break; Mark(seed, ++group, stack, groups); for (int j = seed; j >= founders; j--) if (groups[j] == 0) { int fat_j = ped[path[j]].father->traverse; int mot_j = ped[path[j]].mother->traverse; if (groups[fat_j] == group || groups[mot_j] == group) Mark(j, group, stack, groups); else stack[mot_j].Push(j), stack[fat_j].Push(j); } for (int j = 0; j < count; j++) stack[j].Clear(); } if (groupMembership != NULL) (*groupMembership) = groups; // This stretch of code outputs family structure and group membership // And should usually be commented out! 
#if 0 for (int j = first; j <= last; j++) printf("%s %s %s %s %d %d\n", (const char *) famid, (const char *) ped[j].pid, (const char *) ped[j].fatid, (const char *) ped[j].motid, ped[j].sex, groups[ped[j].traverse]); #endif delete [] stack; return group; } void Family::Mark(int j, int group, IntArray * stack, IntArray & groups) { if (groups[j] == group) return; groups[j] = group; while (stack[j].Length()) Mark(stack[j].Pop(), group, stack, groups); if (j < founders) return; Mark(ped[path[j]].father->traverse, group, stack, groups); Mark(ped[path[j]].mother->traverse, group, stack, groups); } */ libStatGen-1.0.14/general/PedigreeFamily.h000066400000000000000000000037271254730101300203120ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __PEDFAMILY_H__ #define __PEDFAMILY_H__ #include "PedigreeAlleles.h" #include "PedigreePerson.h" #include "StringBasics.h" class Pedigree; class Family { public: Pedigree & ped; String famid; int serial; int first, last; // sentinel family members int count; // number of individuals in pedigree int founders; // number of founders in pedigree int nonFounders; // number of non-founders in pedigree int mzTwins; // number of MZ twins, excluding 1st twin in set int * path; // traverses the pedigree so that ancestors // preceed their descendants int generations; // Rough classification as: // 1 -- all individuals are unrelated // 2 -- two generations (inc. multiple couples) // 3 -- three or more generations bool isNuclear() { return (generations == 2) && (founders == 2); } Family(Pedigree & ped, int top, int bottom, int serial = 0); ~Family(); int ConnectedGroups(IntArray * groupMembership = NULL); private: void ShowInvalidCycles(); Family & operator = (Family & rhs); // void Mark(int who, int group, IntArray * stack, IntArray & group_id ); }; #endif libStatGen-1.0.14/general/PedigreeGlobals.cpp000066400000000000000000000635161254730101300210110ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "PedigreeGlobals.h" #include "Sort.h" #include "Error.h" #include #include #include int PedigreeGlobals::traitCount = 0; int PedigreeGlobals::affectionCount = 0; int PedigreeGlobals::covariateCount = 0; int PedigreeGlobals::markerCount = 0; int PedigreeGlobals::stringCount = 0; // If this value isn't set, all X chromosome data will be rejected bool PedigreeGlobals::chromosomeX = false; bool PedigreeGlobals::sexSpecificMap = false; StringArray PedigreeGlobals::traitNames; StringArray PedigreeGlobals::markerNames; StringArray PedigreeGlobals::covariateNames; StringArray PedigreeGlobals::affectionNames; StringArray PedigreeGlobals::stringNames; StringIntHash PedigreeGlobals::markerLookup; StringIntHash PedigreeGlobals::traitLookup; StringIntHash PedigreeGlobals::affectionLookup; StringIntHash PedigreeGlobals::covariateLookup; StringIntHash PedigreeGlobals::stringLookup; int PedigreeGlobals::markerInfoCount = 0; int PedigreeGlobals::markerInfoSize = 0; MarkerInfo ** PedigreeGlobals::markerInfo = NULL; MarkerInfo ** PedigreeGlobals::markerInfoByInteger = NULL; StringHash PedigreeGlobals::markerInfoByName; int MarkerInfo::count = 0; int MarkerInfo::ComparePosition(MarkerInfo ** left, MarkerInfo ** right) { if ((*left)->chromosome != (*right)->chromosome) return (*left)->chromosome - (*right)->chromosome; double difference = (*left)->position - (*right)->position; if (difference > 0.0) return 1; else if (difference == 0.0) return (*left)->serial - (*right)->serial; else return -1; } String MarkerInfo::GetAlleleLabel(int allele) { if (alleleLabels.Length() > allele && alleleLabels[allele].Length()) return alleleLabels[allele]; else if (alleleLabels.Length() <= allele) alleleLabels.Dimension(allele + 1); return alleleLabels[allele] = allele; } bool MarkerInfo::AdjustFrequencies() { if (freq.Length() <= 1) { freq.Clear(); return false; } if (freq.Min() < 0.0) error("Locus %s has negative allele frequencies\n", (const char *) name); double sum = freq.Sum(); if 
(sum <= 0.0) error("Locus %s frequencies sum to %f, which doesn't make sense\n", (const char *) name, sum); if (sum != 1.0) freq *= 1.0 / sum; if (fabs(sum - 1.0) > 1.2e-5) { printf("Locus %s frequencies sum to %f, adjusted to 1.0\n", (const char *) name, sum); return true; } return false; } void MarkerInfo::IndexAlleles() { if (alleleLabels.Length() >= 255) error("Marker %s has more than 254 distinct alleles\n", (const char *) name); alleleNumbers.Clear(); for (int i = 1; i < alleleLabels.Length(); i++) alleleNumbers.SetInteger(alleleLabels[i], i); } int MarkerInfo::NewAllele(const String & label) { if (alleleLabels.Length() == 0) alleleLabels.Push(""); if (alleleLabels.Length() >= 255) error("Marker %s has more than 254 distinct alleles\n", (const char *) name); alleleNumbers.SetInteger(label, alleleLabels.Length()); alleleLabels.Push(label); return alleleLabels.Length() - 1; } int PedigreeGlobals::GetTraitID(const char * name) { int idx = traitLookup.Integer(name); if (idx != -1) return idx; traitNames.Add(name); traitLookup.SetInteger(name, traitCount); return traitCount++; } int PedigreeGlobals::GetAffectionID(const char * name) { int idx = affectionLookup.Integer(name); if (idx != -1) return idx; affectionNames.Add(name); affectionLookup.SetInteger(name, affectionCount); return affectionCount++; } int PedigreeGlobals::GetCovariateID(const char * name) { int idx = covariateLookup.Integer(name); if (idx != -1) return idx; covariateNames.Add(name); covariateLookup.SetInteger(name, covariateCount); return covariateCount++; } int PedigreeGlobals::GetStringID(const char * name) { int idx = stringLookup.Integer(name); if (idx != -1) return idx; stringNames.Add(name); stringLookup.SetInteger(name, stringCount); return stringCount++; } int PedigreeGlobals::GetMarkerID(const char * name) { int idx = markerLookup.Integer(name); if (idx != -1) return idx; markerNames.Add(name); markerLookup.SetInteger(name, markerCount); // Grow the marker info key ... 
if (markerCount == 0) { markerInfoByInteger = new MarkerInfo * [16]; for (int i = 0; i < 16; i++) markerInfoByInteger[i] = NULL; } else if ((markerCount & (markerCount - 1)) == 0 && markerCount > 15) { MarkerInfo ** newKey = new MarkerInfo * [markerCount * 2]; for (int i = 0; i < markerCount; i++) newKey[i] = markerInfoByInteger[i]; for (int i = markerCount; i < markerCount * 2; i++) newKey[i] = NULL; delete [] markerInfoByInteger; markerInfoByInteger = newKey; } return markerCount++; } MarkerInfo * PedigreeGlobals::GetMarkerInfo(String & name) { MarkerInfo * info = (MarkerInfo *) markerInfoByName.Object(name); if (info != NULL) return info; info = new MarkerInfo(name); markerInfoByName.Add(name, info); if (markerInfoCount >= markerInfoSize) GrowMarkerInfo(); markerInfo[markerInfoCount++] = info; int markerId = LookupMarker(name); if (markerId >= 0) markerInfoByInteger[markerId] = info; return info; } MarkerInfo * PedigreeGlobals::GetMarkerInfo(int markerId) { if (markerId >= markerCount) error("Attempted to retrieve MarkerInfo using out-of-bounds index\n"); if (markerInfoByInteger[markerId] != NULL) return markerInfoByInteger[markerId]; else return GetMarkerInfo(markerNames[markerId]); } void PedigreeGlobals::GrowMarkerInfo() { int newSize = markerInfoSize ? 
2 * markerInfoSize : 32; MarkerInfo ** newArray = new MarkerInfo * [newSize]; if (markerInfoSize) { memcpy(newArray, markerInfo, sizeof(MarkerInfo *) * markerInfoSize); delete [] markerInfo; } markerInfo = newArray; markerInfoSize = newSize; } void PedigreeGlobals::FlagMissingMarkers(IntArray & missingMarkers) { int skipped_markers = 0; if (missingMarkers.Length()) { StringArray names; printf("These markers couldn't be placed and won't be analysed:"); for (int i = 0; i < missingMarkers.Length(); i++) names.Push(GetMarkerInfo(missingMarkers[i])->name); names.Sort(); for (int i = 0, line = 80, lines = 0; i < missingMarkers.Length(); i++) { if (line + names[i].Length() + 1 > 79) printf("\n "), line = 3, lines++; if (lines < 5) { printf("%s ", (const char *) names[i]); line += names[i].Length() + 1; } else skipped_markers++; } if (skipped_markers) printf("as well as %d other unlisted markers...", skipped_markers); printf("\n\n"); } } void PedigreeGlobals::GetOrderedMarkers(IntArray & markers) { if (markers.Length() == 0) { markers.Dimension(markerCount); markers.SetSequence(0, 1); } MarkerInfo ** subset = new MarkerInfo * [markers.Length()]; int count = 0; IntArray missingMarkers; for (int i = 0; i < markers.Length(); i++) { MarkerInfo * info = GetMarkerInfo(markers[i]); if (info->chromosome != -1) subset[count++] = info; else missingMarkers.Push(i); } FlagMissingMarkers(missingMarkers); QuickSort(subset, count, sizeof(MarkerInfo *), COMPAREFUNC MarkerInfo::ComparePosition); markers.Clear(); for (int i = 0; i < count; i++) markers.Push(GetMarkerID(subset[i]->name)); } int PedigreeGlobals::SortMarkersInMapOrder(IntArray & markers, int chromosome) { if (markers.Length() == 0) { markers.Dimension(markerCount); markers.SetSequence(0, 1); } MarkerInfo ** subset = new MarkerInfo * [markers.Length()]; int count = 0; IntArray missingMarkers; for (int i = 0; i < markers.Length(); i++) { MarkerInfo * info = GetMarkerInfo(markers[i]); if (info->chromosome != -1) subset[count++] = 
info; else if (chromosome == -1) missingMarkers.Push(i); } if (chromosome == -1) FlagMissingMarkers(missingMarkers); QuickSort(subset, count, sizeof(MarkerInfo *), COMPAREFUNC MarkerInfo::ComparePosition); markers.Clear(); int current_chromosome = -1, next_chromosome = 0; for (int i = 0; i < count; i++) if (subset[i]->chromosome < chromosome) continue; else if (current_chromosome == -1 || subset[i]->chromosome == current_chromosome) { markers.Push(GetMarkerID(subset[i]->name)); current_chromosome = subset[i]->chromosome; } else if (!next_chromosome) { next_chromosome = subset[i]->chromosome; break; } delete [] subset; return next_chromosome; } void PedigreeGlobals::VerifySexSpecificOrder() { if (markerCount <= 1) return; MarkerInfo ** sortedMarkers = new MarkerInfo * [markerCount]; for (int i = 0; i < markerCount; i++) sortedMarkers[i] = GetMarkerInfo(i); QuickSort(sortedMarkers, markerCount, sizeof(MarkerInfo *), COMPAREFUNC MarkerInfo::ComparePosition); double prev_female = sortedMarkers[0]->positionFemale; double prev_male = sortedMarkers[0]->positionMale; double curr_female, curr_male; int prev_chromosome = sortedMarkers[0]->chromosome; int curr_chromosome; for (int i = 1; i < markerCount; i++) { curr_chromosome = sortedMarkers[i]->chromosome; curr_female = sortedMarkers[i]->positionFemale; curr_male = sortedMarkers[i]->positionMale; if (curr_chromosome == prev_chromosome && (curr_female < prev_female || curr_male < prev_male)) error("Sex-specific and sex-averaged maps are inconsistent.\n\n" "In the sex-averaged map, marker %s (%.2f cM) follows marker %s (%.2f cM).\n" "In the %smale map, marker %s (%.2f cM) PRECEDES marker %s (%.2f cM).\n", (const char *) sortedMarkers[i]->name, sortedMarkers[i]->position * 100, (const char *) sortedMarkers[i-1]->name, sortedMarkers[i-1]->position * 100, curr_female < prev_female ? "fe" : "", (const char *) sortedMarkers[i]->name, (curr_female < prev_female ? 
curr_female : curr_male) * 100, (const char *) sortedMarkers[i-1]->name, (curr_female < prev_female ? prev_female : prev_male) * 100); prev_chromosome = curr_chromosome; prev_female = curr_female; prev_male = curr_male; } delete [] sortedMarkers; } void PedigreeGlobals::LoadAlleleFrequencies(const char * filename, bool required) { // This function is often called with an empty string, and not // all implementations of the C library like that ... if (filename[0] == 0) { if (required) error("No name provided for required allele freuquency file\n"); else return; } // If we get here, the filename is not empty and things should // work as planned IFILE f = ifopen(filename, "rb"); if (f == NULL) { if (required) error("Failed to open required allele frequency file '%s'", (const char *) filename); else return; } LoadAlleleFrequencies(f); ifclose(f); } void PedigreeGlobals::LoadAlleleFrequencies(IFILE & input) { int done = 0; String buffer; StringArray tokens; MarkerInfo *info = NULL; bool need_blank_line = false; int allele_size, old_max, next_allele = 0; // Initialization avoids compiler warning while (!ifeof(input) && !done) { int i, j; buffer.ReadLine(input); tokens.Clear(); tokens.AddTokens(buffer, WHITESPACE); if (tokens.Length() < 1) continue; switch (toupper(tokens[0][0])) { case 'M' : if (tokens.Length() == 1) error("Unnamed marker in allele frequency file"); if (info != NULL) need_blank_line |= info->AdjustFrequencies(); info = GetMarkerInfo(tokens[1]); info->freq.Clear(); info->freq.Push(0.0); next_allele = 1; break; case 'F' : if (info != NULL) for (i = 1; i < tokens.Length(); i++) { buffer = next_allele++; int allele = LoadAllele(info, buffer); if (allele >= info->freq.Length()) { old_max = info->freq.Length(); info->freq.Dimension(allele + 1); for (j = old_max; j < allele; j++) info->freq[j] = 0.0; } info->freq[allele] = tokens[i].AsDouble(); } break; case 'A' : if (info == NULL) continue; if (tokens.Length() != 3) error("Error reading named allele frequencies 
for locus %s\n" "Lines with named alleles should have the format\n" " A allele_label allele_frequency\n\n" "But the following line was read:\n%s\n", (const char *) info->name, (const char *) buffer); allele_size = LoadAllele(info, tokens[1]); next_allele = atoi(tokens[1]) + 1; if (allele_size < 1) error("Error reading named allele frequencies for locus %s\n" "An invalid allele label was encountered\n", (const char *) info->name); if (allele_size >= info->freq.Length()) { old_max = info->freq.Length(); info->freq.Dimension(allele_size + 1); for (i = old_max; i < allele_size; i++) info->freq[i] = 0.0; } info->freq[allele_size] = tokens[2]; break; case 'E' : done = 1; break; default : error("Problem in allele frequency file.\n" "Lines in this file should be of two types:\n" " -- Marker name lines begin with an M\n" " -- Frequency lines begin with an F\n\n" "However the following line is different:\n%s\n", (const char *) buffer); } } if (info != NULL) need_blank_line |= info->AdjustFrequencies(); if (need_blank_line) printf("\n"); } void PedigreeGlobals::LoadMarkerMap(const char * filename, bool filter) { IFILE f = ifopen(filename, "rb"); if (f == NULL) return; LoadMarkerMap(f, filter); ifclose(f); } void PedigreeGlobals::LoadMarkerMap(IFILE & input, bool filter) { String buffer; StringArray tokens; bool first_pass = true; while (!ifeof(input)) { buffer.ReadLine(input); tokens.Clear(); tokens.AddTokens(buffer, WHITESPACE); if (tokens.Length() < 1) continue; if (first_pass) { sexSpecificMap = (tokens.Length() == 5); // if (sexSpecificMap) // printf("\n Found sex-specific map ...\n\n"); first_pass = false; } if (tokens.Length() != 3 && !sexSpecificMap) error("Error reading map file\n" "Each line in this file should include 3 fields:\n" "CHROMOSOME, MARKER_NAME, and POSITION\n" "However the following line has %d fields\n%s\n", tokens.Length(), (const char *) buffer); if (tokens.Length() != 5 && sexSpecificMap) error("Error reading map file\n" "Each line in this file 
should include 5 fields:\n\n" "CHROMOSOME, MARKER_NAME, SEX_AVERAGED_POS, FEMALE_POS AND MALE_POS\n\n" "However the following line has %d fields\n%s\n", tokens.Length(), (const char *) buffer); bool previous_state = String::caseSensitive; String::caseSensitive = false; if ((tokens[0] == "CHR" || tokens[0] == "CHROMOSOME") && (tokens[1] == "MARKER" || tokens[1] == "MARKER_NAME" || tokens[1] == "MRK") && (tokens[2] == "KOSAMBI" || tokens[2] == "POS" || tokens[2] == "POSITION" || tokens[2] == "SEX_AVERAGED_POS" || tokens[2] == "CM" || tokens[2] == "HALDANE")) continue; String::caseSensitive = previous_state; if (filter) if (LookupMarker(tokens[1]) < 0) continue; MarkerInfo * info = GetMarkerInfo(tokens[1]); int chr = (tokens[0][0] == 'x' || tokens[0][0] == 'X') ? 999 : (int) tokens[0]; info->chromosome = chr; info->position = (double) tokens[2] * 0.01; if (sexSpecificMap) { char * flag; double female = strtod(tokens[3], &flag); if (*flag) error("In the map file, the female cM position for marker\n" "%s is %s. This is not a valid number.", (const char *) tokens[1], (const char *) tokens[3]); double male = strtod(tokens[4], &flag); if (*flag) error("In the map file, the male cM position for marker\n" "%s is %s. 
This is not a valid number.", (const char *) tokens[1], (const char *) tokens[4]); info->positionFemale = (double) female * 0.01; info->positionMale = (double) male * 0.01; } else info->positionFemale = info->positionMale = info->position; } if (sexSpecificMap) VerifySexSpecificOrder(); } void PedigreeGlobals::LoadBasepairMap(const char * filename) { IFILE f = ifopen(filename, "rb"); if (f == NULL) error("The map file [%s] could not be opened\n\n" "Please check that the filename is correct and that the file is\n" "not being used by another program", filename); LoadBasepairMap(f); ifclose(f); } void PedigreeGlobals::LoadBasepairMap(IFILE & input) { String buffer; StringArray tokens; sexSpecificMap = false; while (!ifeof(input)) { buffer.ReadLine(input); tokens.Clear(); tokens.AddTokens(buffer, WHITESPACE); if (tokens.Length() < 1) continue; if (tokens.Length() != 3) error("Error reading map file\n" "Each line in this file should include 3 fields:\n" "CHROMOSOME, MARKER_NAME, and POSITION\n" "However the following line has %d fields\n%s\n", tokens.Length(), (const char *) buffer); bool previous_state = String::caseSensitive; String::caseSensitive = false; if ((tokens[0] == "CHR" || tokens[0] == "CHROMOSOME") && (tokens[1] == "MARKER" || tokens[1] == "MARKER_NAME" || tokens[1] == "MRK") && (tokens[2] == "BASEPAIR" || tokens[2] == "POS" || tokens[2] == "POSITION")) continue; String::caseSensitive = previous_state; MarkerInfo * info = GetMarkerInfo(tokens[1]); int chr = (tokens[0][0] == 'x' || tokens[0][0] == 'X') ? 
999 : (int) tokens[0]; info->chromosome = chr; info->position = (double) tokens[2]; } } int PedigreeGlobals::instanceCount = 0; PedigreeGlobals::~PedigreeGlobals() { if (--instanceCount == 0 && markerInfoSize) { for (int i = 0; i < markerInfoCount; i++) delete markerInfo[i]; delete [] markerInfo; delete [] markerInfoByInteger; } } void PedigreeGlobals::WriteMapFile(const char * filename) { if (!MarkerPositionsAvailable()) return; FILE * output = fopen(filename, "wt"); if (output == NULL) error("Creating map file \"%s\"", filename); WriteMapFile(output); fclose(output); } void PedigreeGlobals::WriteMapFile(FILE * output) { if (!sexSpecificMap) fprintf(output, "CHR MARKER POS\n"); else fprintf(output, "CHR MARKER POS POSF POSM\n"); for (int i = 0; i < markerInfoCount; i++) { if (markerInfo[i]->chromosome != -1) { if (!sexSpecificMap) fprintf(output, "%3d %-10s %g\n", markerInfo[i]->chromosome, (const char *) markerInfo[i]->name, markerInfo[i]->position * 100.0); else fprintf(output, "%3d %-10s %g %g %g\n", markerInfo[i]->chromosome, (const char *) markerInfo[i]->name, markerInfo[i]->position * 100.0, markerInfo[i]->positionFemale * 100.0, markerInfo[i]->positionMale * 100.0); } } } void PedigreeGlobals::WriteFreqFile(const char * filename, bool old_format) { FILE * output = fopen(filename, "wt"); if (output == NULL) error("Creating allele frequency file \"%s\"", filename); WriteFreqFile(output, old_format); fclose(output); } void PedigreeGlobals::WriteFreqFile(FILE * output, bool old_format) { for (int i = 0; i < markerInfoCount; i++) { MarkerInfo * info = markerInfo[i]; if (info->freq.Length() == 0) continue; fprintf(output, "M %s\n", (const char *) info->name); if (old_format && info->alleleLabels.Length() == 0) for (int j = 1; j < info->freq.Length(); j++) fprintf(output, "%s%.5f%s", j % 7 == 1 ? "F " : "", info->freq[j], j == info->freq.Length() - 1 ? "\n" : j % 7 == 0 ? 
"\n" : " "); else for (int j = 1; j < info->freq.Length(); j++) if (info->freq[j] > 1e-7) fprintf(output, "A %5s %.5f\n", (const char *) info->GetAlleleLabel(j), info->freq[j]); } } bool PedigreeGlobals::MarkerPositionsAvailable() { for (int i = 0; i < markerInfoCount; i++) if (markerInfo[i]->chromosome != -1) return true; return false; } bool PedigreeGlobals::AlleleFrequenciesAvailable() { for (int i = 0; i < markerInfoCount; i++) if (markerInfo[i]->freq.Length() > 1) return true; return false; } int PedigreeGlobals::LoadAllele(int marker, String & token) { return LoadAllele(GetMarkerInfo(marker), token); } int PedigreeGlobals::LoadAllele(MarkerInfo * info, String & token) { int allele = info->GetAlleleNumber(token); if (allele >= 0) return allele; static unsigned char lookup[128]; static bool init = false; if (!init) { init = true; for (int i = 0; i < 128; i++) lookup[i] = 0; for (int i = '1'; i <= '9'; i++) lookup[i] = 1; lookup[int('a')] = lookup[int('A')] = lookup[int('c')] = lookup[int('C')] = 2; lookup[int('g')] = lookup[int('G')] = lookup[int('t')] = lookup[int('T')] = 2; } int first = token[0]; bool goodstart = first > 0 && first < 128; if (token.Length() == 1 && goodstart && lookup[int(token[0])]) return info->NewAllele(token); if (!goodstart || lookup[int(token[0])] != 1) return 0; int integer = token.AsInteger(); token = integer; allele = info->GetAlleleNumber(token); if (allele > 0) return allele; if (integer <= 0) return 0; if (integer > 1000000) { static bool warn_user = true; if (warn_user) { printf("Some allele numbers for marker %s are > 1000000\n" "All allele numbers >1000000 will be treated as missing\n\n", (const char *) info->name); warn_user = false; } return 0; } return info->NewAllele(token); } std::ostream &operator << (std::ostream &stream, MarkerInfo &m) { stream << "MarkerInfo for marker " << m.name << std::endl; stream << " located on chromsome " << m.chromosome << ":" << (int64_t)(100 * m.position) << std::endl; stream << " allele 
// Per-marker record: map position, allele frequencies and allele labels.
class MarkerInfo
{
public:
    // Chromosome number; the constructor sets it to -1 and it stays -1
    // until a position is assigned
    int    chromosome;
    // Position along chromosome in morgans
    double position;
    // Sex-specific positions (same units as position)
    double positionMale;
    double positionFemale;

    // Allele frequencies
    // NOTE(review): loaders in PedigreeGlobals appear to keep index 0 as a
    // placeholder and store real alleles from index 1 -- confirm
    Vector      freq;
    // Marker name
    String      name;
    // Allele labels and the reverse lookup from label to allele number
    StringArray alleleLabels;
    StringIntHash alleleNumbers;

    // Creates an unplaced marker (chromosome -1, all positions 0.0) with
    // the given name and assigns it the next serial number
    MarkerInfo(String & string)
    {
        serial = count++;
        name = string;
        chromosome = -1;
        position = 0.0;
        positionMale = 0.0;
        positionFemale = 0.0;
    }

    bool AdjustFrequencies();

    // Comparison callback; takes pointers to MarkerInfo pointers
    static int ComparePosition(MarkerInfo ** left, MarkerInfo ** right);

    String GetAlleleLabel(int allele);

    // Convenience overload: wraps the character in a String and delegates
    int GetAlleleNumber(char label) const
    {
        String labelString;
        labelString = label;
        return(GetAlleleNumber(labelString));
    }

    // The label "0" always maps to allele 0; any other label is looked up
    // in alleleNumbers
    int GetAlleleNumber(const String & label) const
    {
        return label == "0" ? 0 : alleleNumbers.Integer(label);
    }

    // Convenience overload: wraps the character in a String and delegates
    int NewAllele(char label)
    {
        String labelString;
        labelString = label;
        return(NewAllele(labelString));
    }

    int NewAllele(const String & label);

    // Calling update serial for a series of markers ensures they are
    // clustered in a particular order
    void UpdateSerial()
    {
        serial = count++;
    }

    void IndexAlleles();

    // Number of labels minus one, or 0 when no labels are stored
    int CountAlleles()
    {
        return alleleLabels.Length() ? alleleLabels.Length() - 1 : 0;
    }

private:
    // How many marker info structures have we created?
    static int count;
    static String label;

    // When sorting markers, use serial_no to break ties, so
    // markers we saw first in the map file / datafile come
    // first
    int serial;
};
& name); static MarkerInfo * GetMarkerInfo(int marker); static int SortMarkersInMapOrder(IntArray & markers, int chromosome = -1); static void GetOrderedMarkers(IntArray & markers); static void FlagMissingMarkers(IntArray & missingMarkers); static bool MarkerPositionsAvailable(); static bool AlleleFrequenciesAvailable(); static void VerifySexSpecificOrder(); static void LoadAlleleFrequencies(const char * filename, bool required = false); static void LoadAlleleFrequencies(IFILE & file); static void LoadMarkerMap(const char * filename, bool filter = false); static void LoadMarkerMap(IFILE & file, bool filter = false); static void LoadBasepairMap(const char * filename); static void LoadBasepairMap(IFILE & file); static void WriteMapFile(const char * filename); static void WriteMapFile(FILE * file); static void WriteFreqFile(const char * filename, bool old_format = false); static void WriteFreqFile(FILE * file, bool old_format = false); static int LoadAllele(int marker, String & label); // Read an allele static int LoadAllele(MarkerInfo * info, String & label); PedigreeGlobals() { instanceCount++; } ~PedigreeGlobals(); private: static int instanceCount; }; #endif libStatGen-1.0.14/general/PedigreeLoader.cpp000066400000000000000000000552621254730101300206330ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "Pedigree.h" #include "FortranFormat.h" #include "Error.h" #include #include #include void Pedigree::Prepare(IFILE & input) { pd.Load(input); } void Pedigree::Load(IFILE & input) { if (pd.mendelFormat) { LoadMendel(input); return; } int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1; int textCols = pd.CountTextColumns() + 5; int oldCount = count; bool warn = true; int line = 0; String buffer; StringArray tokens; while (!ifeof(input)) { int field = 0; buffer.ReadLine(input); tokens.Clear(); tokens.AddTokens(buffer, WHITESPACE); if (tokens.Length() == 0) continue; if (tokens[0].SlowCompare("end") == 0) break; line++; if (tokens.Length() < textCols) { if (buffer.Length() > 79) { buffer.SetLength(75); buffer += " ..."; } String description; pd.ColumnSummary(description); error("Loading Pedigree...\n\n" "Expecting %d columns (%s),\n" "but read only %d columns in line %d.\n\n" "The problem line is transcribed below:\n%s\n", textCols, (const char *) description, tokens.Length(), line, (const char *) buffer); } if (tokens.Length() > textCols && warn && textCols > 5) { pd.ColumnSummary(buffer); printf("WARNING -- Trailing columns in pedigree file will be ignored\n" " Expecting %d data columns (%s)\n" " However line %d, for example, has %d data columns\n\n", textCols - 5, (const char *) buffer, line, tokens.Length() - 5); warn = false; } Person * p; // create a new person if necessary if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL) { if (count == size) Grow(); p = persons[count++] = new Person; } p->famid = tokens[field++]; // famid p->pid = tokens[field++]; // pid p->fatid = tokens[field++]; // fatid p->motid = tokens[field++]; // motid bool failure = false; p->sex = TranslateSexCode(tokens[field++], failure); if (failure) error("Can't interpret the sex of individual #%d\n" "Family: %s Individual: %s Sex Code: %s", count, (const char *) p->famid, (const char *) p->pid, (const char *) tokens[field-1]); if (sexAsCovariate) { 
if (p->sex) p->covariates[sexCovariate] = p->sex; else p->covariates[sexCovariate] = _NAN_; } for (int col = 0; col < pd.columnCount; col++) switch (pd.columns[col]) { case pcAffection : { int a = pd.columnHash[col]; int new_status; const char * affection = tokens[field++]; switch (toupper(affection[0])) { case '1' : case 'N' : case 'U' : new_status = 1; break; case '2' : case 'D' : case 'A' : case 'Y' : new_status = 2; break; default : new_status = atoi(affection); if (new_status < 0 || new_status > 2) error("Incorrect formating for affection status " "Col %d, Affection %s\n" "Family: %s Individual: %s Status: %s", col, (const char *) affectionNames[a], (const char *) p->famid, (const char *) p->pid, affection); } if (new_status != 0 && p->affections[a] != 0 && new_status != p->affections[a]) error("Conflict with previous affection status - " "Col %d, Affection %s\n" "Family: %s Individual: %s Old: %d New: %d", col, (const char *) affectionNames[a], (const char *) p->famid, (const char *) p->pid, p->affections[a], new_status); if (new_status) p->affections[a] = new_status; break; } case pcMarker : { int m = pd.columnHash[col]; Alleles new_genotype; new_genotype[0] = LoadAllele(m, tokens[field++]); new_genotype[1] = LoadAllele(m, tokens[field++]); if (p->markers[m].isKnown() && new_genotype.isKnown() && new_genotype != p->markers[m]) { MarkerInfo * info = GetMarkerInfo(m); error("Conflict with previous genotype - Col %d, Marker %s\n" "Family: %s Individual: %s Old: %s/%s New: %s/%s", col, (const char *) markerNames[m], (const char *) p->famid, (const char *) p->pid, (const char *) info->GetAlleleLabel(p->markers[m][0]), (const char *) info->GetAlleleLabel(p->markers[m][1]), (const char *) info->GetAlleleLabel(new_genotype[0]), (const char *) info->GetAlleleLabel(new_genotype[1])); } if (new_genotype.isKnown()) p->markers[m] = new_genotype; break; } case pcTrait : case pcUndocumentedTraitCovariate : { int t = pd.columnHash[col]; double new_pheno = _NAN_; if 
(pd.columns[col] == pcUndocumentedTraitCovariate) t = t / 32768; const char * value = tokens[field++]; char * flag = NULL; if (missing == (const char *) NULL || strcmp(value, missing) != 0) new_pheno = strtod(value, &flag); if (flag != NULL && *flag) new_pheno = _NAN_; if (p->traits[t] != _NAN_ && new_pheno != _NAN_ && new_pheno != p->traits[t]) error("Conflict with previous phenotype - Col %d, Trait %s\n" "Family: %s Individual: %s Old: %f New: %f", col, (const char *) traitNames[t], (const char *) p->famid, (const char *) p->pid, p->traits[t], new_pheno); if (new_pheno != _NAN_) p->traits[t] = new_pheno; if (pd.columns[col] == pcTrait) break; } case pcCovariate : { int c = pd.columnHash[col]; double new_covar = _NAN_; if (pd.columns[col] == pcUndocumentedTraitCovariate) { c = c % 32768; field--; } const char * value = tokens[field++]; char * flag = NULL; if (missing == (const char *) NULL || strcmp(value, missing) != 0) new_covar = strtod(value, &flag); if (flag != NULL && *flag) new_covar = _NAN_; if (p->covariates[c] != _NAN_ && new_covar != _NAN_ && new_covar != p->covariates[c]) error("Conflict with previous value - Col %d, Covariate %s\n" "Family: %s Individual: %s Old: %f New: %f", col, (const char *) covariateNames[c], (const char *) p->famid, (const char *) p->pid, p->covariates[c], new_covar); if (new_covar != _NAN_) p->covariates[c] = new_covar; break; } case pcString : { int c = pd.columnHash[col]; if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field]) error("Conflict with previous value - Col %d, String %s\n" "Family: %s Individual: %s Old: %s New: %s", col, (const char *) stringNames[c], (const char *) p->famid, (const char *) p->pid, (const char *) p->strings[c], (const char *) tokens[field]); p->strings[c] = tokens[field++]; break; } case pcSkip : field++; break; case pcZygosity : { int new_zygosity; const char * zygosity = tokens[field++]; switch (zygosity[0]) { case 'D' : case 'd' : new_zygosity = 2; break; case 'M' : case 'm' : 
new_zygosity = 1; break; default : new_zygosity = atoi(zygosity); } if (p->zygosity != 0 && new_zygosity != p->zygosity) error("Conflict with previous zygosity - " "Column %d in pedigree\n" "Family: %s Individual: %s Old: %d New: %d\n", col, (const char *) p->famid, (const char *) p->pid, p->zygosity, new_zygosity); p->zygosity = new_zygosity; break; } case pcEnd : break; default : error("Inconsistent Pedigree Description -- Internal Error"); } } Sort(); } void Pedigree::LoadMendel(IFILE & input) { // First, retrieve the two format statements from file String familyHeader; String individualRecord; familyHeader.ReadLine(input); individualRecord.ReadLine(input); // Then create two FORTRAN input streams... // One will be used for retrieving family labels and sizes, the other // will be used for individual information FortranFormat headers, records; headers.SetInputFile(input); headers.SetFormat(familyHeader); records.SetInputFile(input); records.SetFormat(individualRecord); // Storage for key pieces of information String famid; String phenotype; String affectionCode; String affectionStem; int familySize; String allele1, allele2; int sexCovariate = sexAsCovariate ? 
GetCovariateID("sex") : -1; while (!ifeof(input)) { if (count == size) Grow(); // Retrieve header for next family familySize = headers.GetNextInteger(); headers.GetNextField(famid); headers.Flush(); if (famid.IsEmpty()) { if (ifeof(input) && familySize == 0) break; else error("Blank family id encountered\n"); } // Retrieve each individual in the family for (int i = 0; i < familySize; i++) { Person * p = persons[count++] = new Person; // Retrieve basic pedigree structure p->famid = famid; records.GetNextField(p->pid); records.GetNextField(p->fatid); records.GetNextField(p->motid); if (p->pid.IsEmpty()) error("No unique identifier for individual #%d in family %s\n", i + 1, (const char *) famid); if (p->pid.Compare(".") == 0) error("Family %s has an individual named '.', but this code is\n" "reserved to indicate missing parents\n"); if (p->fatid.IsEmpty()) p->fatid = "."; if (p->motid.IsEmpty()) p->motid = "."; // Retrieve and decode sex code char sex = records.GetNextCharacter(); switch (sex) { case '0' : case 'x' : case 'X' : case '?' : case 0 : p->sex = 0; break; case '1' : case 'm' : case 'M' : p->sex = 1; break; case '2' : case 'f' : case 'F' : p->sex = 2; break; default : error("Can't interpret the sex of individual #%d\n" "Family: %s Individual: %s Sex Code: %s", count, (const char *) p->famid, (const char *) p->pid, sex); }; if (sexAsCovariate) { if (p->sex) p->covariates[sexCovariate] = p->sex; else p->covariates[sexCovariate] = _NAN_; } // Retrieve and decode zygosity char zygosity = records.GetNextCharacter(); // Mendel uses a unique character to indicate each MZ pair, // we use a unique odd number... if (zygosity) p->zygosity = (zygosity - ' ') * 2 - 1; affectionStem.Clear(); for (int col = 0; col < pd.columnCount; col++) switch (pd.columns[col]) { case pcAffection : { int a = pd.columnHash[col]; // We expand each Mendel non-codominant trait into multiple // affection status column... 
First, if this is not a // continuation of a previous expansion we first retrieve // and encode the affection status. if (affectionStem.Length() == 0 || affectionNames[a].CompareToStem(affectionStem) != 0) { affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1); records.GetNextField(phenotype); affectionCode = affectionStem + phenotype; } // Then encode each phenotype appropriately if (phenotype.IsEmpty()) p->affections[a] = 0; else p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1; break; } case pcMarker : { int m = pd.columnHash[col]; records.GetNextField(phenotype); if (phenotype.IsEmpty()) { p->markers[m].one = p->markers[m].two = 0; continue; } int separator = phenotype.FindChar('/'); if (separator == -1) separator = phenotype.FindChar('|'); if (separator == -1) error("At marker %s, person %s in family %s has genotype %s.\n" "This genotype is not in the 'al1/al2' format.\n", (const char *) markerNames[m], (const char *) p->pid, (const char *) p->famid, (const char *) phenotype); allele1.Copy(phenotype, 0, separator); allele1.Trim(); allele2.Copy(phenotype, separator + 1, 8); allele2.Trim(); MarkerInfo * info = GetMarkerInfo(m); int one = info->alleleNumbers.Integer(allele1); if (one < 0) { if (info->freq.Length() == 0) one = info->NewAllele(allele1); else error("At marker %s, person %s in family %s has genotype %s.\n" "However, '%s' is not a valid allele for this marker.\n", (const char *) markerNames[m], (const char *) p->pid, (const char *) p->famid, (const char *) phenotype, (const char *) allele1); } int two = info->alleleNumbers.Integer(allele2); if (two < 0) { if (info->freq.Length() == 0) two = info->NewAllele(allele2); else error("At marker %s, person %s in family %s has genotype %s.\n" "However, '%s' is not a valid allele for this marker.\n", (const char *) markerNames[m], (const char *) p->pid, (const char *) p->famid, (const char *) phenotype, (const char *) allele2); } p->markers[m].one = one; 
p->markers[m].two = two; break; } case pcEnd : break; case pcTrait : case pcCovariate : case pcSkip : case pcZygosity : default: error("Inconsistent Pedigree Description -- Internal Error"); } records.Flush(); } } Sort(); } void Pedigree::Prepare(const char * filename) { // Clear any previously loaded pedigree description if (multiPd != NULL) delete [] multiPd; multiFileCount = 1; // Enable multifile support StringArray filenames; filenames.AddColumns(filename, ','); if (filenames.Length() <= 1) pd.Load(filename); else { printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n"); multiPd = new PedigreeDescription[filenames.Length()]; for (int i = 0; i < filenames.Length(); i++) { printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]); multiPd[i].Load(filenames[i], false); } multiFileCount = filenames.Length(); } } void Pedigree::Load(const char * filename, bool allowFailures) { if (multiFileCount <= 1) { IFILE f = ifopen(filename, "rb"); if (f == NULL && allowFailures) return; if (f == NULL) error( "The pedigree file %s cannot be opened\n\n" "Common causes for this problem are:\n" " * You might not have used the correct options to specify input file names,\n" " please check the program documentation for information on how to do this\n\n" " * The file doesn't exist or the filename might have been misspelt\n\n" " * The file exists but it is being used by another program which you will need\n" " to close\n\n" " * The file is larger than 2GB and you haven't compiled this application with\n" " large file support.\n\n", filename); Load(f); ifclose(f); } else { StringArray filenames; filenames.AddColumns(filename, ','); if (filenames.Length() != multiFileCount) error("Different numbers of comma separated data and pedigree file names provided\n"); for (int i = 0; i < filenames.Length(); i++) { printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n", (const char *) multiPd[i].filename, (const 
char *) filenames[i]); pd = multiPd[i]; IFILE f = ifopen(filenames[i], "rb"); if (f == NULL) error("The pedigree file '%s' cannot be opened\n\n", (const char *) filenames[i]); Load(f); ifclose(f); } printf("\n"); } } int Pedigree::TranslateSexCode(const char * code, bool & failure) { failure = false; switch (code[0]) { case 'x' : case 'X' : case '?' : return 0; case '1' : case 'm' : case 'M' : return 1; case '2' : case 'f' : case 'F' : return 2; default : { int result = atoi(code); if (result != 0 && result != 1 && result != 2) { failure = true; result = 0; } return result; } }; } libStatGen-1.0.14/general/PedigreePerson.cpp000066400000000000000000000127471254730101300206740ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "PedigreePerson.h" #include "Constant.h" #include "StringArray.h" #include "Error.h" #include #include #include #include Person::Person() { zygosity = sex = 0; serial = traverse = -1; markers = new Alleles [markerCount]; traits = new double [traitCount]; covariates = new double [covariateCount]; affections = new char [affectionCount]; strings = new String [stringCount]; for (int i = 0; i < traitCount; i++) traits[i] = _NAN_; for (int i = 0; i < covariateCount; i++) covariates[i] = _NAN_; for (int i = 0; i < affectionCount; i++) affections[i] = 0; filter = false; father = mother = NULL; sibs = NULL; sibCount = 0; ngeno = 0; hasBothParents = hasAllTraits = hasAllAffections = hasAllCovariates = false; } Person::~Person() { delete [] markers; delete [] traits; delete [] affections; delete [] covariates; delete [] strings; if (sibCount) delete [] sibs; } void Person::Copy(Person & rhs) { CopyIDs(rhs); CopyPhenotypes(rhs); } void Person::CopyPhenotypes(Person & rhs) { for (int i = 0; i < Person::traitCount; i++) traits[i] = rhs.traits[i]; for (int i = 0; i < Person::affectionCount; i++) affections[i] = rhs.affections[i]; for (int i = 0; i < Person::covariateCount; i++) covariates[i] = rhs.covariates[i]; for (int i = 0; i < Person::markerCount; i++) markers[i] = rhs.markers[i]; ngeno = rhs.ngeno; } void Person::WipePhenotypes(bool remove_genotypes) { for (int i = 0; i < traitCount; i++) traits[i] = _NAN_; for (int i = 0; i < covariateCount; i++) covariates[i] = _NAN_; for (int i = 0; i < affectionCount; i++) affections[i] = 0; if (remove_genotypes) { for (int i = 0; i < markerCount; i++) markers[i][0] = markers[i][1] = 0; ngeno = 0; } } void Person::CopyIDs(Person & rhs) { famid = rhs.famid; pid = rhs.pid; fatid = rhs.fatid; motid = rhs.motid; sex = rhs.sex; zygosity = rhs.zygosity; } bool Person::CheckParents() { hasBothParents = father != NULL && mother != NULL; if (!hasBothParents) { if (father != NULL || mother != NULL) { printf("Parent named %s for Person 
%s in Family %s is missing\n", (father == NULL) ? (const char *) fatid : (const char *) motid, (const char *) pid, (const char *) famid); return false; } else return true; } if (father->sex == SEX_FEMALE || mother->sex == SEX_MALE) // If parents are switched around, we can fix it... { Person * swap = father; father = mother; mother = swap; String temp = fatid; fatid = motid; motid = temp; } if (father->sex == SEX_FEMALE || mother->sex == SEX_MALE) // If things still don't make sense then the problem is more serious ... { printf("Parental sex codes don't make sense for Person %s in Family %s\n", (const char *) pid, (const char *) famid); return false; } return true; } void Person::AssessStatus() { hasBothParents = father != NULL && mother != NULL; hasAllTraits = hasAllAffections = hasAllCovariates = true; ngeno = 0; for (int m = 0; m < markerCount; m++) if (isGenotyped(m)) ngeno++; for (int t = 0; t < traitCount; t++) if (!isPhenotyped(t)) { hasAllTraits = false; break; } for (int c = 0; c < covariateCount; c++) if (!isControlled(c)) { hasAllCovariates = false; break; } for (int a = 0; a < affectionCount; a++) if (!isDiagnosed(a)) { hasAllAffections = false; break; } } void Person::Order(Person * & p1, Person * & p2) { if (p1->traverse > p2->traverse) { Person * temp = p1; p1 = p2; p2 = temp; } } int Person::GenotypedMarkers() { int count = 0; for (int m = 0; m < Person::markerCount; m++) if (markers[m].isKnown()) count++; return count; } bool Person::haveData() { if (ngeno) return true; for (int i = 0; i < affectionCount; i++) if (affections[i] != 0) return true; for (int i = 0; i < traitCount; i++) if (traits[i] != _NAN_) return true; return false; } bool Person::isAncestor(Person * descendant) { if (traverse > descendant->traverse) return false; if (serial == descendant->serial) return true; if (descendant->isFounder()) return false; return (isAncestor(descendant->mother) || isAncestor(descendant->father)); } 
// One individual in a pedigree: identifiers, links to relatives, and the
// per-person genotype / phenotype arrays.
//
// NOTE(review): the class owns raw arrays that its destructor releases,
// but no copy constructor or assignment operator is declared here, so
// copying a Person by value looks unsafe -- confirm before doing so. Use
// Copy(), CopyIDs() or CopyPhenotypes() to transfer contents instead.
class Person : public PedigreeGlobals
{
public:
    // Family, person, mother and father identifiers
    String famid;
    String pid;
    String motid;
    String fatid;

    // Sex code; see SEX_MALE, SEX_FEMALE and SEX_UNKNOWN
    int sex;

    // Zygosity code; 0 means none, odd values are treated as MZ twins
    // by isMzTwin()
    int zygosity;

    // Ordering indices, both -1 until assigned
    int serial, traverse;

    // Per-person arrays, allocated by the constructor
    Alleles * markers;
    double * traits;
    char * affections;
    double * covariates;
    String * strings;

    // Links to relatives; NULL / empty until set up elsewhere
    Person * father;
    Person * mother;

    int sibCount;
    Person ** sibs;

    // Number of genotyped markers, refreshed by AssessStatus()
    int ngeno;

    bool filter;

    Person();
    ~Person();

    // Relationship tests. All of them rely on hasBothParents, so they
    // return false for individuals without both parents linked.

    // True when exactly one parent is shared with sib
    bool isHalfSib(Person & sib)
    {
        return hasBothParents &&
               ((sib.father == father) ^(sib.mother == mother));
    }

    // True when both parents are shared with sib
    bool isSib(Person & sib)
    {
        return hasBothParents &&
               (sib.father == father) && (sib.mother == mother);
    }

    // True for full sibs carrying the same non-zero zygosity code
    bool isTwin(Person & twin)
    {
        return (zygosity != 0) && (zygosity == twin.zygosity) && isSib(twin);
    }

    // True for full sibs carrying the same odd zygosity code
    bool isMzTwin(Person & mzTwin)
    {
        return (zygosity & 1) && (zygosity == mzTwin.zygosity) && isSib(mzTwin);
    }

    // Check that both parents or none are available
    // Verify that fathers are male and mothers are female
    bool CheckParents();

    // Assess status before using quick diagnostics functions
    void AssessStatus();

    // Quick diagnostics
    bool isFounder()
    {
        return !hasBothParents;
    }
    bool isSexed()
    {
        return sex != 0;
    }
    bool isGenotyped(int m)
    {
        return markers[m].isKnown();
    }
    bool isFullyGenotyped()
    {
        return ngeno == markerCount;
    }
    bool isControlled(int c)
    {
        return covariates[c] != _NAN_;
    }
    bool isFullyControlled()
    {
        return hasAllCovariates;
    }
    bool isPhenotyped(int t)
    {
        return traits[t] != _NAN_;
    }
    bool isFullyPhenotyped()
    {
        return hasAllTraits;
    }
    bool isDiagnosed(int a)
    {
        return affections[a] != 0;
    }
    bool isFullyDiagnosed()
    {
        return hasAllAffections;
    }

    // True when the person has any genotype, affection or trait data
    bool haveData();

    // True when this person is `descendant` or one of its ancestors
    bool isAncestor(Person * descendant);

    // Counts markers whose genotype is known (recomputed on each call)
    int GenotypedMarkers();

    // Swap p1 and p2 if needed so that p1 has the smaller traverse index
    static void Order(Person * & p1, Person * & p2);

    // Copy() = CopyIDs() + CopyPhenotypes()
    void Copy(Person & rhs);
    // Copies identifiers, sex and zygosity
    void CopyIDs(Person & rhs);
    // Copies traits, affections, covariates, markers and ngeno
    void CopyPhenotypes(Person & rhs);
    // Resets traits, covariates and affections to missing; genotypes too
    // when remove_genotypes is true
    void WipePhenotypes(bool remove_genotypes = true);

private:

    // Cached status flags, refreshed by AssessStatus()
    bool hasAllCovariates, hasAllTraits, hasAllAffections, hasBothParents;
};
// Remove uninformative individuals from the pedigree.
//
// quiet        when false, each removal is reported on stdout
// informative  optional array, indexed like `persons`, of per-person
//              flags; when NULL, Person::haveData() decides who is
//              informative
//
// Surviving individuals are moved into a new `persons` array, removed
// ones are deleted, and the pedigree is re-sorted at the end.
void Pedigree::Trim(bool quiet, int * informative)
{
    int newCount = 0;
    Person ** newPersons = new Person * [count];

    // This function applies the following filters to reduce complexity
    // of pedigree
    //
    // RULE 1: Remove all pedigrees with no genotype or phenotype data
    // RULE 2: Remove leaf individuals with no data
    // RULE 3: Remove founder couples with <2 offspring and no data

    bool showHeader = true;
    IntArray discardable, offspring, mates, haveData;

    for (int f = 0; f < familyCount; f++)
    {
        Family * fam = families[f];

        // Cache for storing indicators about whether each family member is
        // informative (indexed by each person's traverse value)
        haveData.Dimension(fam->count);

        // Check that some data is available in the family
        int hasData = false;

        for (int i = fam->first; i <= fam->last; i++)
            if (informative == NULL)
                hasData |= haveData[persons[i]->traverse] = persons[i]->haveData();
            else
                hasData |= haveData[persons[i]->traverse] = informative[i];

        // RULE 1: nobody in the family is informative, drop all of them
        if (!hasData)
        {
            if (!quiet)
            {
                ShowTrimHeader(showHeader);
                printf(" Removing family %s: No data\n", (const char *) fam->famid);
            }

            for (int i = fam->first; i <= fam->last; i++)
                delete persons[i];

            continue;
        }

        // Assume that we need everyone in the family
        discardable.Dimension(fam->count);
        discardable.Set(0);

        bool trimming = true;

        // Repeat both passes until a full round discards nobody
        while (trimming)
        {
            trimming = false;

            // Tally the number of offspring for each individual
            offspring.Dimension(fam->count);
            offspring.Zero();

            // Tally the number of mates for each individual
            // (-1: no mate seen yet, -2: more than one mating,
            //  otherwise the index of the single mate)
            mates.Dimension(fam->count);
            mates.Set(-1);

            // In the first round, we count the number of offspring
            // for each individual in the current trimmed version of the
            // pedigree
            // NOTE(review): walking from the end presumably visits children
            // before their parents, so offspring[i] is final when i is
            // reached -- confirm against the ordering built by Sort()
            for (int i = fam->count - 1; i >= fam->founders; i--)
            {
                if (discardable[i]) continue;

                Person & p = *(persons[fam->path[i]]);

                // Parents already discarded: p no longer counts as offspring
                if (discardable[p.father->traverse])
                    continue;

                // RULE 2: leaf individual without data
                if (offspring[i] == 0 && !haveData[p.traverse])
                {
                    trimming = true;
                    discardable[i] = true;
                    continue;
                }

                int father = p.father->traverse;
                int mother = p.mother->traverse;

                if (mates[father] == -1 && mates[mother] == -1)
                {
                    // First mating seen for both parents
                    mates[father] = mother,
                    mates[mother] = father;
                }
                else if (mates[father] != mother)
                {
                    // At least one parent has a second partner: flag the
                    // parents and their previously recorded mates
                    if (mates[father] >= 0)
                        mates[mates[father]] = -2;

                    if (mates[mother] >= 0)
                        mates[mates[mother]] = -2;

                    mates[mother] = -2;
                    mates[father] = -2;
                }

                offspring[father]++;
                offspring[mother]++;
            }

            // In the second pass, we remove individuals with no
            // data who are founders with a single offspring (and
            // no multiple matings) or who have no descendants
            for (int i = fam->count - 1; i >= 0; i--)
            {
                if (discardable[i]) continue;

                Person & p = *(persons[fam->path[i]]);

                // Applies to founders and to anyone whose parents were
                // discarded in an earlier round
                if (p.isFounder() || discardable[p.father->traverse])
                {
                    // Keep the individual when any of these holds:
                    // multiple matings, more than one offspring, a mate
                    // whose own parents are still in the pedigree, own
                    // data, or a mate with data
                    if (mates[i] == -2 ||
                            offspring[i] > 1 ||
                            (mates[i] >= fam->founders &&
                             !discardable[persons[fam->path[mates[i]]]->father->traverse]) ||
                            haveData[p.traverse] ||
                            (mates[i] != -1 && haveData[mates[i]]))
                        continue;

                    trimming = true;
                    discardable[i] = true;
                    continue;
                }
            }
        }

        // Delete discarded individuals and keep the others
        for (int i = fam->count - 1; i >= 0; i--)
            if (discardable[i])
            {
                if (!quiet)
                {
                    ShowTrimHeader(showHeader);
                    printf(" Removing person %s->%s: No data\n",
                           (const char *) fam->famid,
                           (const char *) persons[fam->path[i]]->pid);
                }
                delete persons[fam->path[i]];
            }
            else
                newPersons[newCount++] = persons[fam->path[i]];
    }

    // The header flag is cleared once something has been reported
    if (!showHeader)
        printf("\n");

    delete [] persons;

    persons = newPersons;
    count = newCount;
    Sort();
}
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Pedigree.h" #include "Error.h" #include bool Pedigree::TwinCheck() { bool fail = false; IntArray mzTwins; for (int f = 0; f < familyCount; f++) { mzTwins.Clear(); for (int i = families[f]->first, j; i <= families[f]->last; i++) // Is this person an identical twin? if (persons[i]->isMzTwin(*persons[i])) { // Have we got another identical sib yet? for (j = 0; j < mzTwins.Length(); j++) if (persons[i]->isMzTwin(*persons[mzTwins[j]])) break; // If not, add to list of twins if (j == mzTwins.Length()) { mzTwins.Push(i); continue; } // Check that their genotypes are compatible and // merge new twin's genotypes into original twin... Person * original = persons[mzTwins[j]]; Person * twin = persons[i]; for (int m = 0; m < Person::markerCount; m++) { if (!original->markers[m].isKnown()) original->markers[m] = twin->markers[m]; else if (twin->markers[m].isKnown() && twin->markers[m] != original->markers[m]) printf("MZ Twins %s and %s in family %s have " "different %s genotypes\n", (const char *) original->pid, (const char *) twin->pid, (const char *) original->famid, (const char *) Person::markerNames[m]), fail = true; if (twin->sex != original->sex) printf("MZ Twins %s and %s in family %s have " "different sexes\n", (const char *) original->pid, (const char *) twin->pid, (const char *) original->famid), fail = true; } } if (mzTwins.Length() == 0) continue; // In the second pass we copy merged twin genotypes // from original twin to other twins for (int i = families[f]->first, j; i <= families[f]->last; i++) if (persons[i]->isMzTwin(*persons[i])) { for (j = 0; j < mzTwins.Length(); j++) if 
(persons[i]->isMzTwin(*persons[mzTwins[j]])) break; if (mzTwins[j] == i) continue; Person * original = persons[mzTwins[j]]; Person * twin = persons[i]; for (int m = 0; m < Person::markerCount; m++) twin->markers[m] = original->markers[m]; } } return fail; } void Pedigree::MergeTwins() { if (!haveTwins) return; IntArray mzTwins, surplus; printf("Merging MZ twins into a single individual...\n"); for (int f = 0; f < familyCount; f++) { mzTwins.Clear(); for (int i = families[f]->first, j; i <= families[f]->last; i++) if (persons[i]->isMzTwin(*persons[i])) { // Have we got another identical sib yet? for (j = 0; j < mzTwins.Length(); j++) if (persons[i]->isMzTwin(*persons[mzTwins[j]])) break; // If not, add to list of twins if (j == mzTwins.Length()) { mzTwins.Push(i); continue; } // Append name to first twins name persons[mzTwins[j]]->pid += ((char) '$') + persons[i]->pid; // Set the first twin to affected if at least one of the cotwins is affected for (int j = 0; j < affectionCount; j++) if (persons[i]->affections[j] == 2) persons[mzTwins[j]]->affections[j] = 2; surplus.Push(i); } // More than one representative of each twin-pair... 
if (surplus.Length()) { // Reassign parent names for each offspring for (int i = families[f]->first, j; i < families[f]->last; i++) if (!persons[i]->isFounder()) { if (persons[i]->father->isMzTwin(*persons[i]->father)) { for (j = 0; j < mzTwins.Length(); j++) if (persons[i]->father->isMzTwin(*persons[mzTwins[j]])) break; persons[i]->fatid = persons[mzTwins[j]]->pid; } if (persons[i]->mother->isMzTwin(*persons[i]->mother)) { for (j = 0; j < mzTwins.Length(); j++) if (persons[i]->mother->isMzTwin(*persons[mzTwins[j]])) break; persons[i]->motid = persons[mzTwins[j]]->pid; } } // Delete surplus individuals while (surplus.Length()) { int serial = surplus.Pop(); delete persons[serial]; for (count--; serial < count; serial++) persons[serial] = persons[serial + 1]; } // Resort pedigree Sort(); } } } libStatGen-1.0.14/general/Performance.h000066400000000000000000000034571254730101300176650ustar00rootroot00000000000000/* * Copyright (c) 2009 Regents of the University of Michigan * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
///
/// Simple wall-clock stopwatch based on gettimeofday().
///
/// The clock is started on construction; call end() to record the stop
/// time and interval() to read the elapsed seconds.
///
class Timing
{
    timeval startInterval;
    timeval endInterval;
public:
    Timing()
    {
        start();
        // Give the stop time a defined value so that interval() returns
        // 0 rather than reading an uninitialized timeval when it is
        // called before end().
        endInterval = startInterval;
    }
    void start();
    void end();
    double interval();
};

/// Record the current time as the start of the interval
inline void Timing::start()
{
    gettimeofday(&startInterval, NULL);
}

/// Record the current time as the end of the interval
inline void Timing::end()
{
    gettimeofday(&endInterval, NULL);
}

///
/// Return time interval between start() and end()
/// @return elapsed time in seconds
///
inline double Timing::interval()
{
    return (endInterval.tv_sec + (endInterval.tv_usec/1000000.0)) -
           (startInterval.tv_sec + (startInterval.tv_usec/1000000.0));
}
*/ #include "PhoneHome.h" #include "knetfile.h" #include #include #include int PhoneHome::allThinning = 50; int PhoneHome::ourNumber = -1; bool PhoneHome::ourEnableCompletionStatus = false; std::string PhoneHome::ourBaseURL = "http://csgph.sph.umich.edu/ph/"; std::string PhoneHome::ourURL = ourBaseURL; char PhoneHome::ourPrefixChar = '?'; String PhoneHome::ourReturnString = ""; String PhoneHome::ourToolName = ""; void PhoneHome::enableCompletionStatus(const char* programName) { if(programName != NULL) { add("pgm", programName); } ourEnableCompletionStatus = true; } void PhoneHome::disableCompletionStatus() { ourEnableCompletionStatus = false; } bool PhoneHome::checkVersion(const char* programName, const char* version, const char* params) { enableCompletionStatus(); add("pgm", programName); add("vsn", version); add("args", params); connect(); // Look for this program in the returned string. int start = ourReturnString.Find(ourToolName+"\t"); if(start < 0) { // Parse the toolName, and check for the program name // just up to a ':' int colStart = ourToolName.FastFindChar(':'); if(colStart >= 0) { ourToolName.SetLength(colStart); start = ourReturnString.Find(ourToolName+"\t"); } } if(start < 0) { // This program name was not found in the version file, // so it is a program for which version is not tracked, // just return true. return(true); } // Found this program, so extract the version. 
start += ourToolName.Length(); while((start < ourReturnString.Length()) && isspace(ourReturnString[start])) { // Consume whitespace ++start; } // Start now contains the position of the start of the version String thisVersion = version; String latestVersion; int end = start; while((end < ourReturnString.Length()) && !isspace(ourReturnString[end])) { latestVersion += ourReturnString[end]; ++end; } // std::cerr << "latest version = " << latestVersion << "\nthis version = " << thisVersion.c_str() << "\n"; if(latestVersion.FastCompare(thisVersion) > 0) { std::cerr << "\n**************************************************************************************\n" << "A new version, " << latestVersion << ", of " << ourToolName << " is available (currently running " << thisVersion.c_str() << ")" << "\n**************************************************************************************\n\n"; return(false); } return(true); } void PhoneHome::completionStatus(const char* status, const char* programName) { if(programName != NULL) { add("pgm", programName); enableCompletionStatus(); } if(ourEnableCompletionStatus) { add("status", status); connect(); } } void PhoneHome::resetURL() { ourURL = ourBaseURL; ourPrefixChar = '?'; } void PhoneHome::add(const char* name, const char* val) { if((name != NULL) && (strlen(name) != 0) && (val != NULL) && (strlen(val) != 0)) { // Check if the value is already set. if(ourURL.find(name) != std::string::npos) { // value already set, so do not set it. return; } // A value was passed in, so add it to the URL. ourURL += ourPrefixChar; ourURL += name; ourURL += '='; // If it is a tool name, trim anything before the last '/' if(strstr(name, "pgm") != NULL) { // toolname, so trim the val. 
const char* toolVal = strrchr(val, '/'); if(toolVal != NULL) { toolVal++; } else { toolVal = val; } ourURL.append(toolVal); ourToolName = toolVal; } else { ourURL += val; } ourPrefixChar = '&'; } } bool PhoneHome::connect() { if(ourNumber == -1) { srand (time(NULL)); ourNumber = rand(); String numString; numString = ourNumber; String thinningString; thinningString = allThinning; add("uniqNum", numString); add("thinning", thinningString); } if((ourNumber % 100) >= allThinning) { // Skip phoneHome. return(true); } // std::cerr << "url = " << ourURL << std::endl; ourReturnString.Clear(); // return(true); #ifndef _NO_PHONEHOME knet_silent(1); knetFile *file = knet_open(ourURL.c_str(), "r"); if (file == 0) return(false); const int BUF_SIZE = 100; char buf[BUF_SIZE]; ssize_t readLen = BUF_SIZE-1; ssize_t numRead = readLen; while(numRead == readLen) { numRead = knet_read(file, buf, readLen); buf[numRead] = '\0'; ourReturnString += buf; } knet_close(file); knet_silent(0); // std::cerr << "PhoneHome URL = " << ourReturnString.c_str() << std::endl; #endif return(true); } libStatGen-1.0.14/general/PhoneHome.h000066400000000000000000000054341254730101300173030ustar00rootroot00000000000000/* * Copyright (C) 2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
// By default, CompletionStatus PhoneHome is disabled.
// To enable it, do any one of:
//   1) call "enableCompletionStatus"
//   2) call checkVersion
//   3) call completionStatus with the program name passed in
//
// All members are static: the class accumulates name=value pairs onto a
// single request URL and sends it with connect().
class PhoneHome
{
public:
    // Public variable that can be set to control the thinning of version
    // checks: a request is only sent when a per-process random number,
    // modulo 100, is below this value.
    static int allThinning;

    // Enable Completion Status PhoneHome, it is disabled by default.
    // It can also be enabled by:
    //   * calling checkVersion
    //   * calling completionStatus with the program name passed in
    // Program name must be specified in order to log completionStatus
    static void enableCompletionStatus(const char* programName = NULL);

    // Disable Completion Status PhoneHome.  (It is already disabled by default.)
    static void disableCompletionStatus();

    // Check the version, printing a message if a newer version is available.
    // Enables CompletionStatus PhoneHome
    // Returns false if there is a new version available, otherwise true.
    static bool checkVersion(const char* programName,
                             const char* version,
                             const char* params = NULL);

    // If completionStatus is enabled, send the completion status.
    // completionStatus is enabled if:
    //  1) enableCompletionStatus was called
    //  2) checkVersion was called
    //  3) programName is passed in
    // ProgramName is ignored if it has previously been set.
    static void completionStatus(const char* status,
                                 const char* programName = NULL);

    // Replace the request URL.
    static void setURL(const char* url);

    // Restore the request URL to the base URL, dropping every parameter
    // added so far.
    static void resetURL();

protected:
private:
    // Append name=val to the request URL. Ignored when either string is
    // NULL or empty, or when name already occurs in the URL.
    static void add(const char* name, const char* val);

    // Send the request and store the reply in ourReturnString.
    static bool connect();

    static bool ourEnableCompletionStatus;
    static std::string ourBaseURL;
    static std::string ourURL;
    // Separator placed before the next parameter: '?' first, then '&'
    static char ourPrefixChar;
    // Random number drawn on the first connect(); -1 until then
    static int ourNumber;
    // Program name with any leading path removed, set by add()
    static String ourToolName;
    static String ourReturnString;
};
// Sort the index entries so that the source values they refer to are in
// increasing order, as defined by IsBefore().
//
// This is a non-recursive quicksort: after each partitioning step the
// larger part is saved on a small explicit stack and the smaller part is
// handled next.
void QuickIndex::Sort()
{
    struct __QuickIndexStack
    {
        int left, right;
    };

    if (Length() <= 1)
        return;

    // Create a pseudo-stack to avoid recursion
    // (entries are stored from slot 1 upward; slot 0 is never used)
    __QuickIndexStack stack[32];

    int stackIdx = 0;

    // Size of minimum partition to median of three
    const int Threshold = 7;

    // current partitions
    int lsize, rsize;
    int l, mid, r;
    int scanl, scanr, pivot;

    l = 0;
    r = Length() - 1;

    while (1)
    {
        while (r > l)
        {
            if (r - l > Threshold)
                // QuickSort : median of three partitioning
            {
                mid = (r + l) / 2;

                // sort l, mid, and r
                if (IsBefore(mid, l))
                    Swap(mid, l);

                if (IsBefore(r, l))
                    Swap(r, l);

                if (IsBefore(r, mid))
                    Swap(r, mid);

                // set up for partitioning...
                // the median is parked at r - 1; positions l and r are
                // already on the correct side of it
                pivot = r - 1;

                Swap(mid, pivot);

                scanl = l + 1;
                scanr = r - 2;
            }
            else
            {
                // small range: use the last element as the pivot
                pivot = r;
                scanl = l;
                scanr = r - 1;
            }

            while (1)
            {
                // scan from left for element >= pivot
                while ((scanl < r) && IsBefore(scanl, pivot))
                    ++scanl;

                // scan from right for element <= pivot
                while ((scanr > l) && IsBefore(pivot, scanr))
                    --scanr;

                // if scans have met, we are done
                if (scanl >= scanr)
                    break;

                Swap(scanl, scanr);

                if (scanl < r)
                    ++scanl;

                if (scanr > l)
                    --scanr;
            }

            // Exchange final element
            Swap(pivot, scanl);

            // Place largest partition on stack
            lsize = scanl - l;
            rsize = r - scanl;

            if (lsize > rsize)
            {
                // save the left part, continue with the right part
                // unless it is empty
                ++ stackIdx;

                stack[stackIdx].left = l;
                stack[stackIdx].right = scanl - 1;

                if (rsize != 0)
                    l = scanl + 1;
                else
                    break;
            }
            else
            {
                // save the right part, continue with the left part
                // unless it is empty
                ++ stackIdx;

                stack[stackIdx].left = scanl + 1;
                stack[stackIdx].right = r;

                if (lsize != 0)
                    r = scanl - 1;
                else
                    break;
            }
        }

        // iterate with values from stack
        if (stackIdx)
        {
            l = stack[stackIdx].left;
            r = stack[stackIdx].right;

            --stackIdx;
        }
        else
            break;
    }
}
// An IntArray holding the positions of a source container's elements,
// ordered so that the referenced values are increasing.
//
// Index() stores a pointer to the container it is given rather than a
// copy, and sorts immediately.
// NOTE(review): IndexCounts() indexes a temporary array of counts, so the
// stored source pointer looks stale once it returns -- harmless as long
// as nothing compares entries again afterwards; confirm.
class QuickIndex : public IntArray
{
public:
    // Creates an empty index with no source attached
    QuickIndex();
    // Convenience constructors: build the index right away
    QuickIndex(const IntArray & source_data)
    {
        Index(source_data);
    }
    QuickIndex(const StringArray & source_data)
    {
        Index(source_data);
    }
    QuickIndex(const Vector & source_data)
    {
        Index(source_data);
    }

    // Build a sorted index over the elements of source_data
    void Index(const IntArray & source_data);
    void Index(const StringArray & source_data);
    void Index(const Vector & source_data);

    // Build a sorted index over the counts stored in source_data
    void IndexCounts(const StringIntMap & source_data);
    void IndexCounts(const StringIntHash & source_data);

private:
    // Container being indexed, and a tag telling which type it is
    const void * source;
    int datatype;

    // True when the value referenced by entry i sorts before the value
    // referenced by entry j
    bool IsBefore(int i, int j);
    void Sort();
};
*/ ////////////////////////////////////////////////////////////////////////////// // This file includes code derived from the original Mersenne Twister Code // by Makoto Matsumoto and Takuji Nishimura // and is subject to their original copyright notice copied below: ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// // COPYRIGHT NOTICE FOR MERSENNE TWISTER CODE // Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. The names of its contributors may not be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #include "Random.h" #include "MathConstant.h" #include "Error.h" #include //Constants used internally by Mersenne random number generator #define MERSENNE_N 624 #define MERSENNE_M 397 // constant vector a #define MATRIX_A 0x9908b0dfUL // most significant w-r bits #define UPPER_MASK 0x80000000UL // least significant r bits #define LOWER_MASK 0x7fffffffUL // Constants used internally by Park-Miller random generator #define IA 16807 #define IM 2147483647 #define AM (1.0 / IM) #define IQ 127773 #define IR 2836 #define NTAB 32 #define NDIV (1+(IM-1)/NTAB) #define RNMX (1.0-EPS) Random::Random(long s) { #ifndef __NO_MERSENNE mt = new unsigned long [MERSENNE_N]; mti = MERSENNE_N + 1; mersenneMult = 1.0/4294967296.0; #else shuffler = new long [NTAB]; #endif Reset(s); } Random::~Random() { #ifndef __NO_MERSENNE delete [] mt; #else delete [] shuffler; #endif } void Random::Reset(long s) { normSaved = 0; #ifndef __NO_MERSENNE InitMersenne(s); #else // 'Continuous' Random Generator if ((seed = s) < 1) seed = s == 0 ? 
1 : -s; // seed == 0 would be disastrous for (int j=NTAB+7; j>=0; j--) // Warm up and set shuffle table { long k = seed / IQ; seed = IA * (seed - k * IQ) - IR * k; if (seed < 0) seed += IM; if (j < NTAB) shuffler[j] = seed; } last=shuffler[0]; #endif } // initializes mt[MERSENNE_N] with a seed void Random::InitMersenne(unsigned long s) { mt[0]= s & 0xffffffffUL; for (mti = 1; mti < MERSENNE_N; mti++) { mt[mti] = (1812433253UL * (mt[mti-1] ^(mt[mti-1] >> 30)) + mti); /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ /* In the previous versions, MSBs of the seed affect */ /* only MSBs of the array mt[]. */ /* 2002/01/09 modified by Makoto Matsumoto */ mt[mti] &= 0xffffffffUL; } } int Random::Binary() { return Next() > 0.5 ? 1 : 0; } #ifndef __NO_MERSENNE double Random::Next() { unsigned long y; // mag01[x] = x * MATRIX_A for x=0,1 static unsigned long mag01[2]={0x0UL, MATRIX_A}; if (mti >= MERSENNE_N) { /* generate MERSENNE_N words at one time */ int kk; // If InitMersenne() has not been called, a default initial seed is used if (mti == MERSENNE_N+1) InitMersenne(5489UL); for (kk=0; kk < MERSENNE_N-MERSENNE_M; kk++) { y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK); mt[kk] = mt[kk+MERSENNE_M] ^(y >> 1) ^ mag01[y & 0x1UL]; } for (; kk < MERSENNE_N-1; kk++) { y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK); mt[kk] = mt[kk+(MERSENNE_M - MERSENNE_N)] ^(y >> 1) ^ mag01[y & 0x1UL]; } y = (mt[MERSENNE_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK); mt[MERSENNE_N-1] = mt[MERSENNE_M-1] ^(y >> 1) ^ mag01[y & 0x1UL]; mti = 0; } y = mt[mti++]; // Tempering y ^= (y >> 11); y ^= (y << 7) & 0x9d2c5680UL; y ^= (y << 15) & 0xefc60000UL; y ^= (y >> 18); return (mersenneMult *((double) y + 0.5)); } // Generates a random number on [0,0xffffffff]-interval unsigned long Random::NextInt() { unsigned long y; // mag01[x] = x * MATRIX_A for x=0,1 static unsigned long mag01[2]={0x0UL, MATRIX_A}; if (mti >= MERSENNE_N) { /* generate MERSENNE_N words at one time */ int kk; // If 
InitMersenne() has not been called, a default initial seed is used if (mti == MERSENNE_N + 1) InitMersenne(5489UL); for (kk= 0; kk < MERSENNE_N - MERSENNE_M; kk++) { y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK); mt[kk] = mt[kk+MERSENNE_M] ^(y >> 1) ^ mag01[y & 0x1UL]; } for (; kk< MERSENNE_N-1; kk++) { y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK); mt[kk] = mt[kk+(MERSENNE_M - MERSENNE_N)] ^(y >> 1) ^ mag01[y & 0x1UL]; } y = (mt[MERSENNE_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK); mt[MERSENNE_N-1] = mt[MERSENNE_M-1] ^(y >> 1) ^ mag01[y & 0x1UL]; mti = 0; } y = mt[mti++]; // Tempering y ^= (y >> 11); y ^= (y << 7) & 0x9d2c5680UL; y ^= (y << 15) & 0xefc60000UL; y ^= (y >> 18); return y; } #else double Random::Next() { // Compute seed = (IA * seed) % IM without overflows // by Schrage's method long k = seed / IQ; seed = IA * (seed - k * IQ) - IR * k; if (seed < 0) seed += IM; // Map to 0..NTAB-1 int j = last/NDIV; // Output value is shuffler[j], which is in turn replaced by seed last = shuffler[j]; shuffler[j] = seed; // Map to 0.0 .. 1.0 excluding endpoints double temp = AM * last; if (temp > RNMX) return RNMX; return temp; } unsigned long Random::NextInt() { // Compute seed = (IA * seed) % IM without overflows // by Schrage's method long k = seed / IQ; seed = IA * (seed - k * IQ) - IR * k; if (seed < 0) seed += IM; // Map to 0..NTAB-1 int j = last/NDIV; // Output value is shuffler[j], which is in turn replaced by seed last = shuffler[j]; shuffler[j] = seed; return last; } #endif double Random::Normal() { double v1, v2, fac, rsq; if (!normSaved) // Do we need new numbers? { do { v1 = 2.0 * Next() - 1.0; // Pick two coordinates from v2 = 2.0 * Next() - 1.0; // -1 to +1 and check if they rsq = v1*v1 + v2*v2; // are in unit circle... 
} while (rsq >= 1.0 || rsq == 0.0); fac = sqrt(-2.0 * log(rsq)/rsq); // Apply the Box-Muller normStore = v1 * fac; // transformation and save normSaved = 1; // one deviate for next time return v2 * fac; } else { normSaved = 0; return normStore; } } void Random::Choose(int * array, int n, int k) { int choices = 1, others = 0; if (k > n / 2) { choices = 0; others = 1; k = n - k; } for (int i = 0; i < n; i++) array[i] = others; while (k > 0) { int i = NextInt() % n; if (array[i] == choices) continue; array[i] = choices; k--; } } void Random::Choose(int * array, float * weights, int n, int k) { int choices = 1, others = 0; if (k > n / 2) { choices = 0; others = 1; k = n - k; } // First calculate cumulative sums of weights ... float * cumulative = new float [n + 1]; cumulative[0] = 0; for (int i = 1; i <= n; i++) cumulative[i] = cumulative[i - 1] + weights[i - 1]; float & sum = cumulative[n], reject = 0.0; for (int i = 0; i < n; i++) array[i] = others; while (k > 0) { float weight = Next() * sum; int hi = n, lo = 0, i = 0; while (hi >= lo) { i = (hi + lo) / 2; if (cumulative[i + 1] <= weight) lo = i + 1; else if (cumulative[i] >= weight) hi = i - 1; else break; } if (array[i] == choices) continue; array[i] = choices; reject += weights[i]; // After selecting a substantial number of elements, update the cumulative // distribution -- to ensure that at least half of our samples produce a hit if (reject > sum * 0.50) { cumulative[0] = 0; for (int i = 1; i <= n; i++) if (array[i] != choices) cumulative[i] = cumulative[i - 1] + weights[i - 1]; else cumulative[i] = cumulative[i - 1]; reject = 0.0; sum = cumulative[n]; } k--; } delete [] cumulative; } Random globalRandom; libStatGen-1.0.14/general/Random.h000066400000000000000000000106441254730101300166400ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as 
published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ ////////////////////////////////////////////////////////////////////////////// // This file includes code derived from the original Mersenne Twister Code // by Makoto Matsumoto and Takuji Nishimura // and is subject to their original copyright notice copied below: ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// // COPYRIGHT NOTICE FOR MERSENNE TWISTER CODE // // Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. The names of its contributors may not be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #ifndef __RANDOM_H__ #define __RANDOM_H__ // Define a quick and dirty generator #define RANDMUL 1664525L #define RANDADD 1013904223L #define RAND(seed) ((seed = seed * RANDMUL + RANDADD) & 0xFFFFFFFF) class Random // Implements the Mersenne Twister as default random number generator. // Compilation flag __NO_MERSENNE sets default generator to // a minimal Park-Miller with Bays-Durham shuffle and added safe guards. 
{ protected: // values for "minimal random values" long seed; long last; long * shuffler; // and for normal deviates int normSaved; double normStore; double mersenneMult; // Array for Mersenne state vector unsigned long * mt; // Used to signal that Mersenne state vector is not initialized int mti; public: Random(long s = 0x7654321); ~Random(); // Next bit in series of 0s and 1s int Binary(); // Next bit in series of 0s and 1s // Next value in series, between 0 and 1 double Next(); // Next integer unsigned long NextInt(); // Random number form N(0,1) double Normal(); void Reset(long s); void InitMersenne(unsigned long s); // Random number between 0 and 1 operator double() { return Next(); } // Random number between arbitrary bounds double Uniform(double lo = 0.0, double hi = 1.0) { return lo + (hi - lo) * Next(); } void Choose(int * array, int n, int k); void Choose(int * array, float * weights, int n, int k); }; extern Random globalRandom; #endif libStatGen-1.0.14/general/ReferenceSequence.cpp000066400000000000000000000111701254730101300213350ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "assert.h" #include "ctype.h" #include "stdio.h" #include "Error.h" #include "Generic.h" #include "ReferenceSequence.h" #include #include #include #include #include // // Given a buffer with a fasta format contents, count // the number of chromsomes in it and return that value. // bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount) { chromosomeCount = 0; baseCount = 0; bool atLineStart = true; // // loop over the fasta file, essentially matching for the // pattern '^>.*$' and counting them. // for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++) { switch (fastaData[fastaIndex]) { case '\n': case '\r': atLineStart = true; break; case '>': { if (!atLineStart) break; chromosomeCount++; // // eat the rest of the line // while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r') { fastaIndex++; } break; } default: baseCount++; atLineStart = false; break; } } return false; } #if 0 // turn this into a template on read/quality/etc... 
int GenomeSequence::debugPrintReadValidation( std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse ) { int validateSumQ = 0; int validateMismatchCount = 0; int rc = 0; std::string genomeData; for (uint32_t i=0; i= (read.size() - 24))) validateMismatchCount++; genomeData.push_back(tolower((*this)[readLocation + i])); } else { genomeData.push_back(toupper((*this)[readLocation + i])); } } assert(validateSumQ>=0); if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount) { printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n", genomeData.c_str(), read.c_str(), validateSumQ, sumQuality ); rc++; } else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount) { printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n", genomeData.c_str(), read.c_str(), validateMismatchCount, mismatchCount ); rc++; } else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount) { printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n", genomeData.c_str(), read.c_str(), validateSumQ, sumQuality, validateMismatchCount, mismatchCount ); rc++; } if (recurse && abs(validateMismatchCount - mismatchCount) > (int) read.size()/2) { printf("large mismatch difference, trying reverse strand: "); std::string reverseRead = read; std::string reverseQuality = quality; getReverseRead(reverseRead); reverse(reverseQuality.begin(), reverseQuality.end()); rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false); } return rc; } #endif libStatGen-1.0.14/general/ReferenceSequence.h000066400000000000000000000332031254730101300210030ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under 
the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __REFERENCESEQUENCE_H #define __REFERENCESEQUENCE_H #include "BaseAsciiMap.h" #include "Generic.h" #include "PackedVector.h" #include #include // // The namespace Sequence is for templated algorithms that are by and large // independent of the underlying storage mechanism for the bases. // // They are written in such a way as to assume that the array operator [] // will return an ASCII representation of a base space nucleotide. // // In theory, this set of templates will work with a variety of combinations // of means for representing bases - String, std::string, and others. // // The containers are expected to allow for an overidden [] operator, // and provide a size() method to return the number of bases in the // container. // // In containers where sequence data is placed, they must in addition // have a clear() method as well as a push_back() method as done // in std::string containers. 
// namespace Sequence { // // wordMatch(Sequnece, Index, Word) - compare a word to a sequence of bases // Sequence is a generic container with a large set of bases // Index is the starting point to start the comparison // Word is the small sequence of bases to match // template bool wordMatch(sequenceType &sequence, sequenceIndexType index, wordType &word) { if( (index + word.size()) >= sequence.size() ) return false; for(size_t i = 0; i < word.size(); i++) { if( sequence[index + i] != word[i]) return false; } return true; } // // printNearbyWords(output, sequence, index, word, deviation) searches // for 'deviation' bases on either side of the index into sequence // and prints all occurrences where word appears. // template void printNearbyWords(std::ostream &output, sequenceType &sequence, sequenceIndexType index, wordType &word, int deviation) { for (sequenceIndexType i = index - deviation; i < index + deviation; i++) { if (wordMatch(sequence, i, word)) { output << "word '" << word << "' found " << i - index << " away from position " << index << "." << std::endl; } } } // // getString(sequence, index, baseCount, word) - populate word with the 'baseCount' // bases that occur at the 'index' starting position in sequence. // template void getString(sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word) { word.clear(); for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++) { word.push_back(sequence[index + i]); } } // // getHighLightedString() is a debugging aid for printing "highlighted" // subsets of bases, where the highlighting is done via turning the // base into a lower case ASCII equivalent. 
// template void getHighLightedString( sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word, sequenceIndexType highLightStart, sequenceIndexType highLightEnd) { word.clear(); for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++) { char base = sequence[index+i]; if(in(index+i, highLightStart, highLightEnd)) base = tolower(base); word.push_back(base); } } // // printBaseContext() outputs a base at location 'index' along with 'baseCount' // bases on either side of that base (default 30). // template void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index, int baseCount = 30) { output << "index: " << index << std::endl; for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++) output << sequence[i]; output << std::endl; for (sequenceIndexType i=index-baseCount; i void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read) { result = ""; for (uint32_t i=0; i < read.size(); i++) { if (read[i] == sequence[location+i]) result.push_back(' '); else result.push_back('^'); } } template void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read) { result = ""; for (uint32_t i=0; i < read.size(); i++) { if (read[i] == sequence[location+i]) result.push_back(toupper(read[i])); else result.push_back(tolower(read[i])); } } /// Return the mismatch count, disregarding CIGAR strings /// /// \param read is the sequence we're counting mismatches in /// \param location is where in the genmoe we start comparing /// \param exclude is a wildcard character (e.g. '.' 
or 'N') /// /// \return number of bases that don't match the reference, except those that match exclude template int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read, char exclude='\0') { int mismatchCount = 0; for (uint32_t i=0; i int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities) { int sumQ = 0; for (uint32_t i=0; i sequenceIndexType simpleLocalAligner( sequenceType &sequence, sequenceIndexType index, readType &read, qualityType &quality, int windowSize) { int bestScore = 1000000; // either mismatch count or sumQ sequenceIndexType bestMatchLocation = -1; for (int i=-windowSize; i(*this))[baseIndex]]; } inline void set(PackedVectorIndex_t baseIndex, char value) { this->PackedVector4Bit_t::set(baseIndex, BaseAsciiMap::base2int[(uint32_t) value]); } inline void push_back(char value) { this->PackedVector4Bit_t::push_back(BaseAsciiMap::base2int[(uint32_t) value]); } }; std::ostream &operator << (std::ostream &stream, PackedSequenceData &v) { for(size_t i=0; i bool loadFastaFile(const char *filename, std::vector &sequenceData, std::vector &chromosomeNames) { InputFile inputStream(filename, "r", InputFile::DEFAULT); if(!inputStream.isOpen()) { std::cerr << "Failed to open file " << filename << "\n"; return true; } int whichChromosome = -1; // // chromosomeNames is cheap to clear, so do it here. // // NB: I explicitly choose not to clear the sequence data // container, this allows the caller to pre-allocate based // on their knowledge of the size of the expected genome. 
// chromosomeNames.clear(); char ch; while((ch = inputStream.ifgetc()) != EOF) { switch (ch) { case '\n': case '\r': break; case '>': { std::string chromosomeName = ""; // // pull out the chromosome new name // while (!isspace((ch = inputStream.ifgetc())) && ch != EOF) { chromosomeName += ch; // slow, but who cares } // // eat the rest of the line // do { ch = inputStream.ifgetc(); } while(ch != EOF && ch != '\n' && ch != '\r'); // // save the Chromosome name and index into our // header so we can use them later. // chromosomeNames.push_back(chromosomeName); whichChromosome++; sequenceData.resize(whichChromosome+1); break; } default: // we get here for sequence data. // // save the base value // Note: invalid characters come here as well, but we // let ::set deal with mapping them. sequenceData[whichChromosome].push_back(toupper(ch)); #if 0 if (isColorSpace()) { // // anything outside these values represents an invalid base // base codes: 0-> A, 1-> C, 2-> G, 3-> T // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red // const char fromBase2CS[] = { /* 0000 */ 0, // A->A /* 0001 */ 1, // A->C /* 0010 */ 2, // A->G /* 0011 */ 3, // A->T /* 0100 */ 1, // C->A /* 0101 */ 0, // C->C /* 0110 */ 3, // C->G /* 0111 */ 2, // C->T /* 1000 */ 2, // G->A /* 1001 */ 3, // G->C /* 1010 */ 0, // G->G /* 1011 */ 1, // G->T /* 1100 */ 3, // T->A /* 1101 */ 2, // T->C /* 1110 */ 1, // T->G /* 1111 */ 0, // T->T }; // // we are writing color space values on transitions, // so we don't write a colorspace value when we // get the first base value. // // On second and subsequent bases, write based on // the index table above // char thisBase = base2int[(int)(fasta[fastaIndex])]; if (lastBase>=0) { char color; if (lastBase>3 || thisBase>3) color=4; else color = fromBase2CS[(int)(lastBase<<2 | thisBase)]; // re-use the int to base, because ::set expects a base char (ATCG), not // a color code (0123). It should only matter on final output. 
set(header->elementCount++, int2base[(int) color]); } lastBase = thisBase; } else { set(header->elementCount++, toupper(fasta[fastaIndex])); } #endif break; } } return false; } #endif libStatGen-1.0.14/general/ReusableVector.h000066400000000000000000000076701254730101300203520ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #ifndef __REUSABLE_VECTOR_H__ #define __REUSABLE_VECTOR_H__ /// Create a vector of DATA_TYPE that reuses created objects to save on /// memory reallocations. DATA_TYPE must have a function called clear() /// that is used to reset it for reuse. template class ReusableVector { public: ReusableVector(): myCont(), myNextEmpty(0) {} virtual ~ReusableVector(); /// Clear the vector contents. void reset(); /// Clear the vector contents. void clear() {reset();} /// Get a reference to a new entry to be populated so the user can /// directly populate it rather than having to copy into it. DATA_TYPE& getNextEmpty(); /// Get a reference to the data at the specified index. /// Throws an exception if the index is out of range. DATA_TYPE& get(unsigned int index) const; /// Return the number of populated entries in the vector. // The next empty position is the same as the size. 
int size() const {return(myNextEmpty);} void rmLast(); protected: std::vector myCont; unsigned int myNextEmpty; private: ReusableVector& operator=(const ReusableVector& rv); ReusableVector(const ReusableVector& rv); }; ///////////////////////////////////////////////////////////// // ReusableVector template ReusableVector::~ReusableVector() { for(unsigned int i = 0; i < myCont.size(); i++) { // Delete all the entries. delete myCont[i]; myCont[i] = NULL; } myCont.clear(); myNextEmpty = 0; } template void ReusableVector::reset() { // Set the next empty element to be the first one on the list. // That means there are none used. myNextEmpty = 0; } template DATA_TYPE& ReusableVector::getNextEmpty() { if(myNextEmpty == myCont.size()) { // We are at the end of the available entries, so add a new one. myCont.resize(myCont.size() + 1); // Create a new entry. myCont[myNextEmpty] = new DATA_TYPE; } else { // myNextEmpty is an element, and not the end. // So, clear out the data. myCont[myNextEmpty]->clear(); } DATA_TYPE* returnVal = myCont[myNextEmpty]; // Increment next empty to the next element. ++myNextEmpty; // return the element to be used. return(*returnVal); } template DATA_TYPE& ReusableVector::get(unsigned int index) const { if((index < myNextEmpty) && (index >= 0)) { // index is a valid position, so return that data. if(myCont[index] == NULL) { throw(std::runtime_error("ReusableVector::get BUG, found a null pointer.")); } return(*myCont[index]); } // Not set in the vector, so throw an exception. 
throw(std::runtime_error("ReusableVector::get called with out of range index.")); // return(myCont[0]); } template void ReusableVector::rmLast() { if(myNextEmpty > 0) { --myNextEmpty; } } #endif libStatGen-1.0.14/general/STLUtilities.cpp000066400000000000000000000044501254730101300203070ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "STLUtilities.h" namespace STLUtilities { // // Split the string input into words delimited by the character // delimiter. For a given number of input delimiters, result.size() // will not change, regardless of the data in between the delimiters. // // Refactor this to pre-allocate the word that we place data into, // then we have minimal data copy. 
// int Tokenize(std::vector &result, const char *input, char delimiter) { if (*input=='\0') { result.clear(); result.resize(1); // one word, and it is empty return 0; } size_t wordCount = 1; // since input is non-empty, we know we will have at least // one word, so we allocate it here, and begin to fill it in if (result.size()push_back(*input); } input++; } if (wordCount < result.size()) result.resize(wordCount); // potentially truncate to wordCount elements return result.size(); } } // end of namespace STLUtilities libStatGen-1.0.14/general/STLUtilities.h000066400000000000000000000217451254730101300177620ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef _STLUTILITIES_H #define _STLUTILITIES_H #include #include #include #include #include #include #include #include #include #include /// /// This file is inspired by the poor quality of string support in /// STL for what should be trivial capabiltiies, for example setting /// or appending the ASCII representation of a floating point /// or integer number to a string. /// /// This file uses variadic templates to implement a type safe /// version (subset) of C-library printf. /// /// Therefore, -std=c++0x is a required option on g++ /// namespace STLUtilities { /// /// use std streams API to do float conversion to string, /// then append it. 
/// inline std::string &append(std::string &s, float f) { std::ostringstream buffer; buffer << f; s += buffer.str(); return s; } /// /// use std streams API to do double conversion to string, /// then append it. /// inline std::string &append(std::string &s, double f) { std::ostringstream buffer; buffer << f; s += buffer.str(); return s; } /// /// The rest we can handle readily ourselves. /// Unlike std::string operator +, this operator treats c as /// a character and appends the ASCII character c. /// inline std::string &append(std::string &s, char c) { s += c; return s; } /// /// Similar to signed char case, but this time for unsigned. /// inline std::string &append(std::string &s, unsigned char c) { s += c; return s; } /// /// Now append a full C style NUL terminated string to /// the std::string. /// inline std::string &append(std::string &s, const char *rhs) { s += rhs; return s; } /// /// Prevent the generic template from picking up std::string /// inline std::string &append(std::string &s, std::string &rhs) { s += rhs; return s; } /// /// iterate over the provided vector, appending all elements with /// an optional separator /// template std::string &append(std::string &s, std::vector v, std::string separator="") { for (typename T::iterator i=v.begin(); i!=v.end(); i++) { if (i!=v.begin()) s += separator; s << *i; } return s; } /// /// This template handles the rest of the cases for /// integral types. Not user friendly if you pass /// in a type T that is for example a std::vector. 
/// template std::string &append(std::string &s, T i) { char digits[20]; char *p = digits; bool negative = false; if (i<0) { negative = true; i = -i; } do { *p++ = '0' + i % 10; i = i/10; } while (i); if (negative) s += '-'; do { s += *--p; } while (p > digits); return s; } inline std::string &operator <<(std::string &s, char c) { return append(s, c); } inline std::string &operator <<(std::string &s, unsigned char c) { return append(s, c); } inline std::string &operator <<(std::string &s, uint64_t i) { return append(s, i); } inline std::string &operator <<(std::string &s, int64_t i) { return append(s, i); } template std::string &operator <<(std::string &s, T val) { return append(s, val); } template std::string &append(std::string &s, std::vector v, S delimeter, bool itemize = false) { bool showDelimeter = false; for (std::vector::iterator i=v.begin(); i!=v.end(); i++) { if (showDelimeter) s << delimeter; else showDelimeter = true; if (itemize) s << (i - v.begin()) << ": "; s << *i; } return s; } template std::string &append(std::string &s, std::vector v, S delimeter, bool itemize = false) { bool showDelimeter = false; for (typename std::vector::iterator i=v.begin(); i!=v.end(); i++) { if (showDelimeter) s << delimeter; else showDelimeter = true; if (itemize) s << (i - v.begin()) << ": "; s << *i; } return s; } // // Split the string input into words delimited by the character // delimiter. For a given number of input delimiters, result.size() // will not change, regardless of the data in between the delimiters. // // Refactor this to pre-allocate the word that we place data into, // then we have minimal data copy. // int Tokenize(std::vector &result, const char *input, char delimiter); // // Variadic templates necessary for reasonable printf implementation // are only supported as an experimental feature that in theory is // subject to changes in the future draft standard for C++. // // Only defined when the g++ option -std=c++0x is used. 
// // #if defined(__GXX_EXPERIMENTAL_CXX0X__) // // problems in compatibility with stdio printf/fprintf: // - variable field width (%*d) not implemented // - L 0 fills to the left of the number through precision width // (ie printf("%20.6d",12) yields 000012 in a 20 width field) // // What is different from C-style printf: // type safe // no truncation of type data // no vairable width fields // inline void fprintf(std::ostream &stream, const char* s) { while (*s) { if (*s == '%' && *++s != '%') throw std::runtime_error("invalid format string: missing arguments"); stream << *s++; } } template void fprintf(std::ostream &stream, const char* s, const T& value, const Args&... args) { while (*s) { if (*s == '%' && *++s != '%') { bool leftJustify = false; bool zeroPad = false; int fieldWidth = 0; int precision = 3; char fillChar = ' '; if (*s && *s == '-') { leftJustify = true; s++; } if (*s && *s == '0') { fillChar = '0'; zeroPad = true; s++; } while (*s && isdigit(*s)) { fieldWidth *= 10; fieldWidth += (*s - '0'); s++; } if (*s && *s == '.') { precision = 0; s++; while (*s && isdigit(*s)) { precision *= 10; precision += (*s - '0'); s++; } s++; } while (*s) { switch (*s) { case 's': s++; stream << std::setw(fieldWidth) << (leftJustify ? std::left : std::right) << value; break; case 'p': case 'x': case 'X': s++; stream << std::setw(fieldWidth) << std::setfill(fillChar) << (leftJustify ? std::left : std::right) << std::hex << value; break; case 'l': case 'L': s++; continue; case 'f': case 'd': case 'h': case 'j': case 't': case 'z': s++; stream << std::setw(fieldWidth) << std::setfill(fillChar) << (leftJustify ? std::left : std::right) << std::dec << value; break; default: throw std::runtime_error("Unrecognized printf conversion character"); break; } break; } fprintf(stream, s, args...); return; } stream << *s++; } throw std::runtime_error("extra arguments provided to printf"); } template void printf(const char* s, const T& value, const Args&... 
args) { fprintf(std::cout, s, value, args...); } template void sprintf(std::string &buffer, const char *fmt, const Args&... args) { std::ostringstream stream; fprintf((std::ostream &) stream, fmt, args...); // stream.str() returns a const std::string &, so we // can't do a std::swap() buffer = stream.str(); } #endif } // end namespace STLUtilities #endif libStatGen-1.0.14/general/SimpleStats.h000066400000000000000000000045051254730101300176670ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef _SIMPLESTATS_H_ #define _SIMPLESTATS_H_ #include // for sqrt #include // // see http://www.johndcook.com/standard_deviation.html // or Donald Knuth's Art of Computer Programming, Vol 2, page 232, 3rd edition // class RunningStat { public: RunningStat() : m_n(0), m_oldM(0), m_newM(0), m_oldS(0), m_newS(0) {} void Clear() { m_n = 0; } void Push(double x) { m_n++; // See Knuth TAOCP vol 2, 3rd edition, page 232 if (m_n == 1) { m_oldM = x; m_oldS = 0.0; m_newM = x; m_newS = 0.0; } else { m_newM = m_oldM + (x - m_oldM)/m_n; m_newS = m_oldS + (x - m_oldM)*(x - m_newM); // set up for next iteration m_oldM = m_newM; m_oldS = m_newS; } } int NumDataValues() const { return m_n; } double Mean() const { return (m_n > 0) ? m_newM : 0.0; } double Variance() const { return ((m_n > 1) ? 
m_newS/(m_n - 1) : 0.0); } double StandardDeviation() const { return sqrt(Variance()); } private: uint64_t m_n; double m_oldM, m_newM, m_oldS, m_newS; }; // // helpers for Tabulate template // inline bool operator == (RunningStat &r, int i) { return r.NumDataValues() == i; } inline std::ostream &operator << (std::ostream &stream, RunningStat &s) { stream << "N: " << s.NumDataValues() << " Mean: " << s.Mean() << " Standard Deviation: " << s.StandardDeviation(); return stream; } #endif libStatGen-1.0.14/general/SmithWaterman.cpp000066400000000000000000000214311254730101300205320ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include "SmithWaterman.h" // put TEST below here, so that makedepend will see the .h, so that we // can get a clean dependency for SmithWaterman.o, so that we can at least // compile the header when we change it. 
#if defined(TEST) #include #include "Generic.h" // g++ -g -o testSW -DTEST SmithWaterman.cpp // // Smith-Waterman - test code uses a 256x256 array of int16 // int swat( bool showAllCases, const char *A, const char *qualities, const char *B, int direction, const char *expectedCigarString, int expectedSumQ ) { int allowedInsertDelete = 1024; int errors = 0; // read length 256 // reference length 1024 SmithWaterman<256, 1024, uint16_t, const char *, const char *, const char *, uint32_t, uint32_t > sw(&A, &qualities, &B, strlen(A), strlen(B), allowedInsertDelete, direction); // // now we align the read: // sw.populateH(); sw.populateAlignment(); int sumQ = sw.getSumQ(); CigarRoller cigar; cigar.clear(); sw.rollCigar(cigar); const char *cigarStr = cigar.getString(); // // now we pretty print the results // bool badCigar = false, badQuality = false; if (strcmp(cigarStr, expectedCigarString)!=0) { badCigar = true; errors ++; } if (sumQ != expectedSumQ) { badQuality = true; errors ++; } if (showAllCases || errors>0) { cout << "=============" << endl; cout << " Read: " << A << endl; cout << " Reference: " << B << endl; cout << " Direction: " << direction << endl; cout << "Max Cell: " << sw.maxCostValue << " located at " << sw.maxCostPosition << endl; cout << "M: " << sw.m << " N: " << sw.n << endl; cout << "Cigar String: " << cigarStr ; if (badCigar) cout << " (EXPECTED: " << expectedCigarString << ")"; cout << endl; cout << " sumQ:" << sumQ; if (badQuality) cout << " (EXPECTED: " << expectedSumQ << ")"; cout << endl; if (strlen(B) < 100 || showAllCases) sw.printH(false); for (vector >::iterator i = sw.alignment.begin(); i != sw.alignment.end(); i++) cout << *i << endl; cout << "=============" << endl << endl; } delete cigarStr; return errors; } // test with Sequence 1 = ACACACTA // Sequence 2 = AGCACACA int main(int argc, const char **argv) { int errors = 0; bool showAllCasesFlag = false; int opt; while ((opt = getopt(argc, (char **) argv, "v")) != -1) { switch (opt) { 
case 'v': showAllCasesFlag = true; break; default: cerr << "usage: testSW [-v]" << std::endl; exit(1); } } // CIGAR explanation - for backward SW runs, the corresponding // CIGAR string is generated from the back of the string to the // front. Recall that the soft clipping is only done at the // "end" of the string, taking direction into account. // forwards - simple errors += swat(showAllCasesFlag, "1234", "\"#$-", "1235", 1, "3M1S", 0); // backwards - simple errors += swat(showAllCasesFlag, "1234", "\"#$-", "1235", -1, "4M", 12); // backwards - soft left clip errors += swat(showAllCasesFlag, "1234", "\"#$-", "0234", -1, "1S3M", 0); // delete in read (arg 1) - forward errors += swat(showAllCasesFlag, "123467890", "\"#$%^&*()-", "1234567890", +1, "4M1D5M", 50); // insert in read (arg 1) - forward errors += swat(showAllCasesFlag, "1234556789", "\"#$%^&*()-", "1234567890", +1, "5M1I4M", 50); // delete in read (arg 1) - backward errors += swat(showAllCasesFlag, "X123467890", "#\"#$%^&*()-", "1234567890", -1, "1S4M1D5M", 50); // insert in read (arg 1) - backward errors += swat(showAllCasesFlag, "1234556789", "\"#$%^&*()-", "0123456789", -1, "4M1I5M", 50); // insert in read (arg 1) - backward errors += swat(showAllCasesFlag, "X1223456789", "00000000000", "00123456789", -1, "1S1M1I8M", 50); // insert in read (arg 1) - backward errors += swat(showAllCasesFlag, "XY1223456789", "000000000000", "000123456789", -1, "2S1M1I8M", 50); // forward - soft right clip of 2 bases - sumQ should be 0 errors += swat(showAllCasesFlag, "123456700", "\"#$%^&*()-", "123456789", +1, "7M2S", 0); // insert in read (arg 1) - forward w/2 mismatches at end errors += swat(showAllCasesFlag, "1023456700", "\"#$%^&*()-", "123456789", +1, "1M1I6M2S", 50); // insert in read (arg 1) - forward w/2 mismatches at end errors += swat(showAllCasesFlag, "CTCCACCTCCCGGTT", "111111111111111", "TCCACCTCCCAGGTT", -1, "1S10M1D4M", 50); // errors += swat(showAllCasesFlag, "1234", "0000", "12345", +1, "4M", 0); // 
errors += swat(showAllCasesFlag, "1234X", "00000", "12345", +1, "4M1S", 0); // errors += swat(showAllCasesFlag, "4321", "0000", "7654321", -1, "4M", 0); // errors += swat(showAllCasesFlag, "X4321", "00000", "7654321", -1, "1S4M", 0); // errors += swat(showAllCasesFlag, "X432A10", "0000000", "76543210", -1, "1S3M1I2M", 50); // errors += swat(showAllCasesFlag, "1345", "0000", "12345", -1, "1M1D3M", 50); errors += swat(showAllCasesFlag, "45689", "00000", "1234567890", -1, "3M1D2M", 50); // errors += swat(showAllCasesFlag, "AATAATTTTTTATATACAGATCGCTGTAGAGTGTAGTTATAGTATGATTCCAACTTTTATTTCTTTCATGACTAATTATATGTATACATGTGCCATGTTGGTGTGCTG", "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "TCCAATGTAGGGCTGTTATAAACAGTGTTGATACATATGTTTTTGTATAAGTCTTTGTTGAATACATGCTTTCATTTTTGTAGGGTATATGTCCAGGAATTAAATTTTTGCATTATTGGGGAAGTTCAAACGTAGATCAGTAGATGTTCCCAAATGATTTTCAGGATATGTATCCATGTAAATTCCTACCAGCAATGCAGGAGAATTCCAATTGCCCATGTTCTAATCAGAATATTGTTATATCCTAAGACTAATTTTAAATATTCTGATGGGTGTAGAGTGGAGGCATAGTATGATTTCAACTTGTATTTCTTTCATGACTAATTATCTTCTATGTTAATTGTTATTTTGTATGTTTATTGCAAAGTGCCTATCCAGAATTTTTGTCTATAATTTTGTTGTGCTGTCTCTTGCTTTATGAATTTTATAGGATTCTTAATATTATAATTGAGTTATCTTTCTTTTTTATTATTATTATTATACTTTAAGTTTTAGGGTATATGTGCACAACGTGCAAGTTTGTCACATATGTATACATGTGCCATGTTGGTGTGCTGCACCCATTAACTCATCATTTAGCATTAGGTATATCTCCTAATGCTATCCCTTCCTCCTCCCCCCACCCCACAACAGTCCCCGGTGTGTGATGTTCCCCTGCCTTTGTCCTCTTTCTTATACTTGCATGAGCAATCTCCTCAAACTGATACTTGCCTTTTTTGTCCTTGGTGTGGTTTGGCTCTGTGTTCCCACCCAAATCTTCATAATACCCATGTGCCAAGGGTGGGACTGGGTGGAGGTAATTGGGTCATGGGGATGGTTTCCCTCATACTATTATGATAGTGAGTGTTTTCACGAGACCTGATGGTTTTATAACTGTGTGGCATTTCCCTTGCTTCCACTCACTCCATCCTGCCACCCTGTGAAGAAGGTGCCTGCTTCTCCTTTGGTTACTGCTATGATTGTAAGTTTCCTGAGGCCTCCCCAGCAACGCAAAACTGTGAATCAATTAAACCTTTTTCCTTTATAAATTACTAAGTCTTGGGTATTTCTTCATAGTGTTGTGAGCATAGACTAAAACAGTAAGTTGTTACCAGGAGTGGGGTACTGCTGTAAGATAACTGAGAATGTGAAAGTGACTTAGGAACTAGGTAATGAGCAGAGGTTGGAACAGTTTAAAAGGCTCAGAAGAAGACAGAAAGATGTGGGAAAGTTTGGA", -1, "77M200D31M", 50); errors 
+= swat(showAllCasesFlag, "TTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATACGTCATCGGTCAGAAATTGGG", "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "CCGAGATTGTGCCATTGCACTCCTGCCTGGGTAACAGAGTCAGACCCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAGATTAGGTTTTATAGATGGAAAATTCACAGCTCTCTCCAGATCAGAAATCTCCAAGAGTAAATTAGTGTCTTAAAGGGGTTGTAATAACTTTCCTATGTGACTAAGTGCATTATTAATCAATTTTTCTATGATCAAGTACTCCTTTACATACCTGCTAATACAATTTTTGATATGAAATCAGTCCTAGAGGGAATCAATGTAAGATACAGACTTGATGAGTGCTTGCAGTTTTTTATTGACAATCTGAAGAATGACTTGACTCTAAATTGCAGCTCAAGGCTTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATACAGGAATCAAATCTGTCTAGCCTCTCTTTTTGGCAAGGTTAAGAACAATTCCACTTCATCCTAATCCCAATGATTCCTGCCGACCCTCTTCCAAAAACTATTTAAAGACATGTTCTTCAAAGTTATATTTGTCTTTCCTTCAGGGAGAAAAAGAATACCAATCACTTATAATATGGAAACTAGCAGAAATGGGTCACATAAGTCATCTGTCAGAAATTGGGAAAATAGAGTAGGTCAGTCTTTCCAGTCATGGTACTTTTACCTTCAATCA", -1, "88M200D20M", 50); // prefix TTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATAC // suffix GTCATCTGTCAGAAATTGGGA cout << endl << "Total Errors found: " << errors << endl; } #endif libStatGen-1.0.14/general/SmithWaterman.h000066400000000000000000000540311254730101300202010ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #if !defined(_SMITH_WATERMAN_) #define _SMITH_WATERMAN_ #include // for inline use of strcat, etc #include // for INT_MAX #include // for uint32_t and friends // // This file implements a bi-directional, banded Smith Waterman matching // algorithm. // // The design is dictated by several observations: // // - building the full matrix H for the read and reference is slow, // so we perform it only for a band down the diagonal, thus speeding // up the algorithm at the cost of reduced detection of insert/deletes. // // - we must minimize data copying // // - we must have the ability to test the algorithm quickly and simply, // hence we implement the class as a template so that this file doesn't // have to depend on Karma's GenomeSequence object, which is a relatively // heavy weight object to do testing against. // // - because Karma uses index words to determine match candidate locations // across the genome, and because we use the banded Smith Waterman approach, // we must provide bi-directional Smith Waterman matching. See example below. // // To fully understand the examples below, make sure you understand // Phred quality scores, CIGAR strings, and pattern matching. // // Simple Functional Examples: // // Given a read, a read quality and a reference, we want to obtain some // measure of how well that read matches the reference at that given location. // So, we have: // Read: "ATCG" // Quality: "$$$$" (Phred scores - all are decimal 3) // Reference: "ATCG" // We expect: sumQ = 0, and Cigar "4M" // // Complex Functional Examples: // Read: "AATCG" // Quality: "$$$$$" (Phred scores - all are decimal 3) // Reference: "ATCG" // We expect: sumQ = 3, and Cigar "1M1I3M" // // Backwards matching: // It is harder for me to construct a clear example, so imagine a read with // cumulative inserts or deletes that sum up to a number larger than the // width of the band along the diagonal. 
If we perform SW in the 'wrong' // direction, we will lose the read entirely, whereas if we start from the // end that is known to be matching, we may obtain a good match for the bulk // of the read. // // For example, you could imagine a read where the first 10 bases had a mess // of inserts, but then was clean for the next 100 bases. You'd want it // to match as many of the good bases as practical, even if you knew you were // losing information at the end. // #include #include #include #include #include #include #include #include #include "CigarRoller.h" #include "Generic.h" using std::cout; using std::cin; using std::cerr; using std::setw; using std::endl; using std::pair; using std::vector; #if !defined(MAX) #define MAX(x,y) ((x)>(y) ? (x) : (y)) #endif #if !defined(MIN) #define MIN(x,y) ((x)<(y) ? (x) : (y)) #endif // // Implement the core of Smith Waterman as described in: // http://en.wikipedia.org/wiki/Smith_waterman // // The only variation from the basic SW algorithm is the // use of a banded approach - to limit the algorithm to // a band along the diagonal. This limits the maximum // additive number of indels, but allows an O(c*M) approach, // where c is the constant max number of indels allowed. // // This is implemented as function templates because for testing, it is easier // to use character arrays. In our mapper, we will be using a // combination of a String object for the read and the genome object // as the reference. Both can be indexed, and give a character (base) // value, but the code would be duplicated if we implement SW for // each type of argument. // // Htype -> the type of the array H cell (8 or 16 bit unsigned int) // Atype -> the read string type (must have Atype::operator [] defined) // template class SmithWaterman { public: // // XXX in theory, this weight should be sensitive // to the quality of the base, and should have // an appropriate cost for an indel, as well. 
// // I think we need to get rid of this, since it is // basically wrong for our needs. // struct weight { weight() { match=2; misMatch=-1; insert=-1; del=-1; }; int match; int misMatch; int insert; int del; }; HCellType H[maxReadLengthH][maxReferenceLengthH]; Atype *A; Btype *B; QualityType *qualities; int m,n; readIndexType MOffset; // constant offset to m (read) referenceIndexType NOffset; // constant offset to n (reference) weight w; int allowedInsertDelete; int direction; int gapOpenCount; int gapCloseCount; int gapExtendCount; vector > alignment; void clearAlignment() { alignment.clear(); } HCellType maxCostValue; // max Cost value in H pair maxCostPosition; // row/col of max cost value in H // // Clear the member variables only. // To clear H, call clearH(). // // In theory, clear() plus set() should be sufficient to // get a clean run, but I haven't tested this extensively. // void clear() { maxCostPosition.first = 0; maxCostPosition.second = 0; A = NULL; B = NULL; qualities = NULL; m = 0; n = 0; MOffset = 0; NOffset = 0; allowedInsertDelete = 0; direction = 0; gapOpenCount = 0; gapCloseCount = 0; gapExtendCount = 0; } // caller will be using set* methods to set everything up. SmithWaterman() { clear(); clearH(); } // construct with everything and the kitchen sink: SmithWaterman( Atype *A, QualityType *qualities, Btype *B, int m, int n, int allowedInsertDelete = INT_MAX, int direction = 1, readIndexType MOffset = 0, referenceIndexType NOffset = 0): A(A), qualities(qualities), B(B), m(m), n(n), allowedInsertDelete(allowedInsertDelete), direction(direction), MOffset(MOffset), NOffset(NOffset), maxCostValue((HCellType) 0) { } void setRead(Atype *A) { this->A = A; } void setReadQuality(QualityType *qualities) { this->qualities = qualities; } void setReference(Btype *B) { this->B = B; } // Caller may wish to index into the read to do the matching against // only part of the read. // NB: the quality length and offset are the same as the read. 
void setReadLength(int m) { this->m = m; } void setReadOffset(readIndexType MOffset) { this->MOffset = MOffset; } // The reference is typically long, and not necessarily a char *, // so we provide an offset here. If it were always a char *, // we'd just modify the caller to point directly at the reference // location. void setReferenceLength(int n) { this->n = n; } void setReferenceOffset(referenceIndexType NOffset) { this->NOffset = NOffset; } // // Configuration: how wide is the band on the diagonal? // We should keep this small -- 1, 2, 3 or similar. If // the value is default (INT_MAX), then the full matrix // will be built, which is fine, but quite slow. // // If this paramater is made smaller than when a previous // call to populateH was made, clearH will also need to be called. // void setAllowedInsertDelete(int allowedInsertDelete = INT_MAX) { this->allowedInsertDelete = allowedInsertDelete; } // // Configuration: which end do we begin performing SW matching // from? We need this because of index 'anchors' in the karma // matcher. void setDirection(int direction) { this->direction = direction; } void clearH() { memset(H, 0, sizeof(H)); } void populateH() { maxCostValue = 0; for (int i=1; i<=m ; i++) { // implement a banded Smith-Waterman approach: int low = MAX(1, i - allowedInsertDelete); int high = MIN(n, i + allowedInsertDelete); for (int j=low; j<=high ; j++) { HCellType c; c = 0; if (direction>0) c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + i-1]==(*B)[NOffset + j-1]) ? w.match : w.misMatch)); else c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + m-i+0]==(*B)[NOffset + n-j+0]) ? w.match : w.misMatch)); c = MAX(c, H[i-1][j] + w.del); c = MAX(c, H[i][j-1] + w.insert); H[i][j] = c; if (c>maxCostValue) { maxCostValue = c; maxCostPosition.first = i; maxCostPosition.second = j; } } } } // // Given the matrix H as filled in by above routine, print it out. 
// void printH(bool prettyPrint = true) { // print the scoring matrix: for (int i=-1; i<=m ; i++) { for (int j=-1; j<=n ; j++) { if (prettyPrint) cout << setw(3); if (i==-1 && j==-1) { if (prettyPrint) cout << " "; else cout << "\t"; } else if (j==-1) { if (!prettyPrint) cout << "\t"; if (i==0) cout << "-"; else cout << (*A)[MOffset + direction>0 ? i-1 : m - i]; } else if (i==-1) { if (!prettyPrint) cout << "\t"; if (j==0) cout << "-"; else cout << (*B)[NOffset + direction>0 ? j-1 : n - j]; } else { if (!prettyPrint) cout << "\t"; cout << H[i][j]; } } cout << endl; } } void debugPrint(bool doPrintH = true) { if (doPrintH) printH(); cout << "maxCostPosition = " << maxCostPosition << std::endl; if (alignment.empty()) cout << "alignment vector is empty.\n"; else { cout << "alignment vector:\n"; for (vector >::iterator i=alignment.begin(); i < alignment.end(); i++) { cout << (i - alignment.begin()) << ": " << *i << "\n"; } } cout << std::endl; } // // Given the Matrix H as filled in by populateH, fill in the // alignment vector with the indeces of the optimal match. // void populateAlignment() { alignment.clear(); int i = m, j = n; i = maxCostPosition.first; j = maxCostPosition.second; // // Stop when we either reach zero cost cell or // when we reach the upper left corner of H. // A zero cost cell to the lower right means we // are soft clipping that end. 
// while (H[i][j] > 0 || (i>0 && j>0)) { // #define DEBUG_ALIGNMENT_VECTOR #if defined(DEBUG_ALIGNMENT_VECTOR) cout << "alignment.push_back(" << i << ", " << j << ")" << endl; #endif alignment.push_back(pair(i,j)); if (H[i-1][j-1]>=H[i-1][j] && H[i-1][j-1]>=H[i][j-1]) { // diagonal upper left cell is biggest i--; j--; } else if (H[i-1][j] < H[i][j-1]) { // upper cell is biggest j--; } else { // left cell is biggest i--; } } alignment.push_back(pair(i,j)); #if defined(DEBUG_ALIGNMENT_VECTOR) cout << "alignment.push_back(" << i << ", " << j << ")" << endl; cout << "alignment.size(): " << alignment.size() << endl; #endif } // // Compute the sumQ for a read that has been mapped using populateH(). // // In the simplest case, the read lies on the diagonal of the // matrix H, which means it has only matches and mismatches: // no inserts or deletes. // // However, in general, it is possible to have 0 or more insert, // delete, mismatch and soft clipped bases in the read, so we // need to accomodate all of those variations. // // XXX finish this. // int getSumQ() { if (direction>0) return getSumQForward(); else return getSumQBackward(); } int getSumQForward() { int sumQ = 0; vector >::reverse_iterator i; for (i=alignment.rbegin(); i < alignment.rend() - 1; i++) { // #define DEBUG_GETSUMQ #if defined(DEBUG_GETSUMQ) cout << *i << ": "; #endif if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1)) { // match/mismatch #if defined(DEBUG_GETSUMQ) cout << "Match/Mismatch"; #endif if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second]) sumQ += (*qualities)[MOffset + (*i).first] - '!'; } else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second)) { // insert? #if defined(DEBUG_GETSUMQ) cout << "Insert"; #endif sumQ += 50; } else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1)) { // delete? 
#if defined(DEBUG_GETSUMQ) cout << "Delete"; #endif sumQ += 50; } } #if defined(DEBUG_GETSUMQ) cout << endl; #endif return sumQ; } int getSumQBackward() { int sumQ = 0; vector >::iterator i; for (i=alignment.begin(); i < alignment.end() - 1; i++) { #if defined(DEBUG_GETSUMQ) cout << *i << ": "; #endif if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1)) { // match/mismatch #if defined(DEBUG_GETSUMQ) cout << "Match/Mismatch"; #endif if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second]) sumQ += (*qualities)[MOffset + m - (*i).first] - '!'; } else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second)) { // insert? #if defined(DEBUG_GETSUMQ) cout << "Insert?"; #endif sumQ += 50; } else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1)) { // delete? #if defined(DEBUG_GETSUMQ) cout << "Delete?"; #endif sumQ += 50; } } #if defined(DEBUG_GETSUMQ) cout << endl; #endif return sumQ; } #if 0 int getSumQ() { vector >::reverse_iterator i; int sumQ = 0; for (i=alignment.rbegin(); i < alignment.rend() - 1; i++) { #if defined(DEBUG_ALIGNMENT_VECTOR) cout << "i: " << i - alignment.rbegin() << *i << endl; #endif // XXX NOT THIS SIMPLE - need to account for indels if (direction>0) { if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second]) sumQ += (*qualities)[MOffset + (*i).first] - '!'; } else { // m and n are sizes, first and second are 1 based offsets if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second]) sumQ += (*qualities)[MOffset + m - (*i).first] - '!'; } } return sumQ; } #endif // // Append cigar operations to an existing cigar list. // // XXX we no longer need the CigarRoller += methods. // // In this case, the Smith Waterman array H was created from // the read and reference in the forward direction. 
// void rollCigarForward(CigarRoller &cigar) { vector >::reverse_iterator i; for (i=alignment.rbegin(); i < alignment.rend() - 1; i++) { // #define DEBUG_CIGAR #if defined(DEBUG_CIGAR) cout << *i << ": "; #endif if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1)) { // match/mismatch #if defined(DEBUG_CIGAR) cout << "Match/Mismatch"; #endif cigar.Add(CigarRoller::match, 1); } else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second)) { // insert? #if defined(DEBUG_CIGAR) cout << "Insert"; #endif cigar.Add(CigarRoller::insert, 1); } else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1)) { // delete? #if defined(DEBUG_CIGAR) cout << "Delete"; #endif cigar.Add(CigarRoller::del, 1); } } // if there is soft clipping, allow for it (::Add will // ignore if the count is 0): cigar.Add(CigarRoller::softClip, getSoftClipCount()); #if defined(DEBUG_CIGAR) cout << endl; #endif } // // Append cigar operations to an existing cigar list. // // XXX we no longer need the CigarRoller += methods. // // In this case, the Smith Waterman array H was created from // the read and reference in the reverse direction. // void rollCigarBackward(CigarRoller &cigar) { vector >::iterator i; // if there is soft clipping, allow for it (::Add will // ignore if the count is 0): cigar.Add(CigarRoller::softClip, getSoftClipCount()); i = alignment.begin(); for (i=alignment.begin(); i < alignment.end() - 1; i++) { #if defined(DEBUG_CIGAR) cout << *i << ": "; #endif if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1)) { // match/mismatch #if defined(DEBUG_CIGAR) cout << "Match/Mismatch"; #endif cigar.Add(CigarRoller::match, 1); } else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second)) { // insert? #if defined(DEBUG_CIGAR) cout << "Insert?"; #endif cigar.Add(CigarRoller::insert, 1); } else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1)) { // delete? 
#if defined(DEBUG_CIGAR) cout << "Delete?"; #endif cigar.Add(CigarRoller::del, 1); } } #if defined(DEBUG_CIGAR) cout << endl; #endif } // // Given the direction, and the alignment vector, obtain // the soft clip (the mismatches at the end of the string which // can in Smith Waterman matching be considered as a separate case). // // NB: be careful that the backward case is correct - it passes // all of two built in tests, but it may not be generally correct. // int getSoftClipCount() { if (direction>0) { // invariant: assert(maxCostPosition == alignment.front()); return m - maxCostPosition.first; } else { // return alignment.back().first; // nope, this always returns 0 // XXX BE CAREFUL... not sure this is right, either. // return n - maxCostPosition.second; return m - maxCostPosition.first; } } void rollCigar(CigarRoller &cigar) { if (direction>0) rollCigarForward(cigar); else rollCigarBackward(cigar); } // // all in one local alignment: // // Steps: // 1 - do internal setup // 2 - populate H // 3 - create alignment vector (this chooses the best path) // 4 - compute sumQ // 5 - compute the cigar string // 6 - compute and update the softclip for the read // bool localAlignment( uint32_t bandSize, Atype &read, readIndexType readLength, QualityType &quality, Btype &reference, referenceIndexType referenceLength, referenceIndexType referenceOffset, CigarRoller &cigarRoller, uint32_t &softClipCount, referenceIndexType &cigarStartingPoint, int &sumQ ) { clear(); cigarRoller.clear(); setDirection(+1); setAllowedInsertDelete(bandSize); setRead(&read); setReadOffset(0); setReadLength(readLength); setReadQuality(&quality); setReference(&reference); setReferenceOffset(referenceOffset); setReferenceLength(referenceLength); populateH(); softClipCount = getSoftClipCount(); populateAlignment(); rollCigar(cigarRoller); sumQ = getSumQ(); return false; }; }; #endif libStatGen-1.0.14/general/Sort.cpp000066400000000000000000000221671254730101300167050ustar00rootroot00000000000000/* * 
// Sort nelem elements of width bytes each, starting at base, in place.
//
//   base  - first byte of the array to sort
//   nelem - number of elements (0 or 1 returns immediately)
//   width - size of one element in bytes
//   cmp   - comparator; element x is placed before y when
//           cmp(x, y) < 0 (see the IsBefore macro)
//
// The routine is an iterative quicksort.  Pending partitions are kept on
// a fixed array of 128 entries instead of the call stack, and error() is
// called if that array would be exceeded.  Elements are moved with
// memcpy through a scratch buffer of width bytes (see the Exchange macro).
//
// NOTE(review): Exchange(x, x) copies an element onto itself with memcpy;
// confirm that is acceptable on the supported platforms.
void QuickSort(void *base, size_t nelem, size_t width,
               int (*cmp)(const void *, const void *))
{
    struct __QuickSortStack
    {
        size_t left, right;
    };

    if (nelem <= 1)
        return;

    // Create a pseudo-stack to avoid recursion

    char * base_char = (char *) base;
    const size_t stackSize = 128;

    __QuickSortStack * stack = new __QuickSortStack[stackSize];
    char * tmp = new char [width];

    // NOTE(review): a plain new expression normally reports failure by
    // throwing, so this check presumably never fires - confirm.
    if ((stack == NULL) || (tmp == NULL))
        error("Out of memory in QuickSort routine");

    // index of the most recently pushed entry; slot 0 is never written
    // because the index is incremented before every store
    size_t stackIdx = 0;

    // Size of minimum partition to median of three
    const size_t Threshold = 7;

    // current partitions
    size_t lsize, rsize;
    size_t l, mid, r;
    size_t scanl, scanr, pivot;

    l = 0;
    r = nelem - 1;

    while (TRUE)
    {
        // partition [l, r] until at most one element is left in it
        while (r > l)
        {
            if (r - l > Threshold)
                // QuickSort : median of three partitioning
            {
                mid = (r + l) / 2;

                // sort l, mid, and r
                if (IsBefore(mid, l))
                    Exchange(mid, l);

                if (IsBefore(r, l))
                    Exchange(r, l);

                if (IsBefore(r, mid))
                    Exchange(r, mid);

                // set up for partitioning...
                // the median is parked at r - 1; l and r are already on
                // the correct side of it, so scanning starts inside them
                pivot = r - 1;

                Exchange(mid, pivot);

                scanl = l + 1;
                scanr = r - 2;
            }
            else
            {
                // set up random partition -- faster
                // (the last element of the range is used as the pivot)
                pivot = r;
                scanl = l;
                scanr = r - 1;
            }

            while (TRUE)
            {
                // scan from left for element >= pivot
                while ((scanl < r) && IsBefore(scanl, pivot))
                    ++scanl;

                // scan from right for element <= pivot
                while ((scanr > l) && IsBefore(pivot, scanr))
                    --scanr;

                // if scans have met, we are done
                if (scanl >= scanr)
                    break;

                Exchange(scanl, scanr);

                // step past the pair just exchanged, staying inside [l, r]
                if (scanl < r)
                    ++scanl;

                if (scanr > l)
                    --scanr;
            }

            // Exchange final element
            Exchange(pivot, scanl);

            // Place largest partition on stack
            lsize = scanl - l;
            rsize = r - scanl;

            if (lsize > rsize)
            {
                // if size is one we are done
                ++ stackIdx;

                if (stackIdx == stackSize)
                    error("Out of Stack in QuickSort routine");

                stack[stackIdx].left = l;
                stack[stackIdx].right = scanl - 1;

                // continue with the smaller (right) side, if any
                if (rsize != 0)
                    l = scanl + 1;
                else
                    break;
            }
            else
            {
                // if size is one we are done
                ++ stackIdx;

                if (stackIdx == stackSize)
                    error("Out of Stack in QuickSort routine");

                stack[stackIdx].left = scanl + 1;
                stack[stackIdx].right = r;

                // continue with the smaller (left) side, if any
                if (lsize != 0)
                    r = scanl - 1;
                else
                    break;
            }
        }

        // iterate with values from stack
        if (stackIdx)
        {
            l = stack[stackIdx].left;
            r = stack[stackIdx].right;

            --stackIdx;
        }
        else
            break;
    }

    delete [] stack;
    delete [] tmp;
}
// Binary search for key in a sorted array.
//
//   key   - pointer to the value to look for; passed as the first
//           argument of cmp
//   base  - first byte of the array, sorted consistently with cmp
//   nelem - number of elements in the array
//   width - size of one element in bytes
//   cmp   - comparator returning <0, 0 or >0 when key orders before,
//           equal to or after the probed element
//
// Returns a pointer to a matching element, or NULL when nelem is 0 or no
// element compares equal.
//
// Bounds are kept as size_t in a half-open range [lo, hi) so that arrays
// larger than INT_MAX elements are handled and the midpoint computation
// cannot overflow.  The probe positions are the same ones the previous
// inclusive int version visited, so the element returned among equal
// keys is unchanged.
void * BinarySearch(const void *key, const void *base,
                    size_t nelem, size_t width,
                    int (*cmp)(const void *, const void *))
{
    const char * first = (const char *) base;

    size_t lo = 0;
    size_t hi = nelem;

    while (lo < hi)
    {
        // midpoint of the inclusive range [lo, hi - 1], rounded down
        size_t probe = lo + (hi - lo - 1) / 2;
        const char * candidate = first + probe * width;

        int test = cmp(key, candidate);

        if (test == 0)
            return (void *) candidate;

        if (test < 0)
            hi = probe;         // key sorts before the probe
        else
            lo = probe + 1;     // key sorts after the probe
    }

    return NULL;
}
// Tells the caller whether file processing may carry on after a method
// reported the given status.
// Continuable: SUCCESS, FAIL_PARSE, INVALID_SORT, INVALID.
// Everything else (UNKNOWN, NO_MORE_RECS, FAIL_IO, FAIL_ORDER, FAIL_MEM)
// is not.
bool StatGenStatus::isContinuableStatus(StatGenStatus::Status status)
{
    switch(status)
    {
        case StatGenStatus::SUCCESS:
        case StatGenStatus::FAIL_PARSE:
        case StatGenStatus::INVALID_SORT:
        case StatGenStatus::INVALID:
            // The status is such that file processing can continue.
            return(true);

        default:
            return(false);
    }
}
// Append "<status name>: <newMessage>" to the stored message.
// When the current status is SUCCESS it is replaced by newStatus;
// otherwise the status is kept and the new text goes on its own line.
// Any newStatus other than SUCCESS is also passed to handleError().
void StatGenStatus::addError(Status newStatus, const char* newMessage)
{
    const bool wasSuccess = (myType == StatGenStatus::SUCCESS);
    if(wasSuccess)
    {
        myType = newStatus;
    }
    else
    {
        myMessage += "\n";
    }

    myMessage.append(getStatusString(newStatus));
    myMessage.append(": ");
    myMessage.append(newMessage);

    if(newStatus == SUCCESS)
    {
        return;
    }
    handleError(newStatus, newMessage);
}
bool StatGenStatus::operator == (const StatGenStatus::Status& compStatus) const { return(compStatus == myType); } void StatGenStatus::handleError(Status newStatus, const char* newMessage) { // If the status is not success and not NO_MORE_RECS, handle // the error (SUCCESS & NO_MORE_RECS are not real errors.) if((newStatus != SUCCESS) && (newStatus != NO_MORE_RECS)) { std::string message = getStatusString(newStatus); message += ": "; message += newMessage; ErrorHandler::handleError(message.c_str(), myHandlingType); } } libStatGen-1.0.14/general/StatGenStatus.h000066400000000000000000000103351254730101300201660ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __STATGEN_STATUS_H__ #define __STATGEN_STATUS_H__ #include #include "ErrorHandler.h" /// This class is used to track the status results of some methods in the BAM /// classes. It contains a status enum that describing the status. class StatGenStatus { public: /// Return value enum for StatGenFile methods. enum Status { SUCCESS = 0, ///< method completed successfully. UNKNOWN, ///< unknown result (default value should never be used) /// NO_MORE_RECS: failed to read a record since there are no more to /// read either in the file or section if section based reading. NO_MORE_RECS, FAIL_IO, ///< method failed due to an I/O issue. 
/// FAIL_ORDER: method failed because it was called out of order, /// like trying to read a file without opening it for read or trying /// to read a record before the header. FAIL_ORDER, FAIL_PARSE, ///< failed to parse a record/header - invalid format. INVALID_SORT, ///< record is invalid due to it not being sorted. INVALID, ///< invalid other than for sorting. FAIL_MEM ///< fail a memory allocation. }; /// Return a string representation of the passed in status enum. static const char* getStatusString(StatGenStatus::Status statusEnum); /// Returns whether or not it is "safe" to keep processing the file /// after the specified status return. static bool isContinuableStatus(StatGenStatus::Status status); /// Constructor that takes in the handling type, defaulting it to exception. StatGenStatus(ErrorHandler::HandlingType handleType = ErrorHandler::EXCEPTION); /// Destructor ~StatGenStatus(); /// Reset this status to a default state. void reset(); /// Set how to handle the errors when they are set. void setHandlingType(ErrorHandler::HandlingType handleType); /// Set the status with the specified status enum and message. void setStatus(Status newStatus, const char* newMessage); /// Add the specified error message to the status message, setting /// the status to newStatus if the current status is SUCCESS. void addError(Status newStatus, const char* newMessage); /// Add the specified status to the status message, setting /// the status to newStatus if the current status is SUCCESS. void addError(StatGenStatus newStatus); /// Return the enum for this status object. Status getStatus() const; /// Return the status message for this object. const char* getStatusMessage() const; /// Overload operator = to set the StatGen status type to the /// passed in status and to clear the message string. StatGenStatus & operator = (Status newStatus); /// Overload operator = to copy the specified status object to this one. 
StatGenStatus & operator = (StatGenStatus newStatus); /// Overload operator != to determine if the passed in type is not equal /// to this status's type. bool operator != (const StatGenStatus::Status& compStatus) const; /// Overload operator == to determine if the passed in type is equal /// to this status's type. bool operator == (const StatGenStatus::Status& compStatus) const; private: // Handle an error based on the error handling type. void handleError(Status newType, const char* newMessage); static const char* enumStatusString[]; Status myType; std::string myMessage; ErrorHandler::HandlingType myHandlingType; }; #endif libStatGen-1.0.14/general/String.cpp000066400000000000000000000030671254730101300172220ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "String.h" #include #ifdef OBSOLETE std::vector *csg::string::split(char splitChar) { std::vector *result = new std::vector; csg::string word; for (size_t i = 0; ipush_back(word); word.clear(); } else word.push_back((*this)[i]); } if (word.size()>0) result->push_back(word); return result; } #if defined(TEST) int main(int argc, const char **argv) { csg::string string("abcdef:abcdefghijk"); std::vector *result = string.split(':'); for (int i=0; isize(); i++) { std::cout << i << "\t" << (*result)[i] << std::endl; } delete result; // suck } #endif #endif libStatGen-1.0.14/general/StringAlias.cpp000077500000000000000000000041151254730101300201720ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "StringAlias.h" #include "InputFile.h" void StringAlias::SetAlias(String & string, String & alias) { int index = lookup.Integer(string); if (index < 0) { aliases.Push(alias); lookup.SetInteger(string, aliases.Length() - 1); } else aliases[index] = alias; } const String & StringAlias::GetAlias(const String & string) const { int index = lookup.Integer(string); if (index < 0) return string; else return aliases[index]; } int StringAlias::GetAliases(StringArray & list) const { if(lookup.Entries() == 0) { return 0; } int edits = 0; for(int i = 0; i < list.Length(); i++) { int index = lookup.Integer(list[i]); if(index >= 0) { list[i] = aliases[index]; edits++; } } return edits; } bool StringAlias::ReadFromFile(const char * filename) { IFILE input = ifopen(filename, "rt"); if (input == NULL) return false; ReadFromFile(input); ifclose(input); return true; } bool StringAlias::ReadFromFile(IFILE & input) { StringArray lines, tokens; lines.Read(input); for (int j = 0; j < lines.Length(); j++) { tokens.ReplaceTokens(lines[j]); if (tokens.Length() != 2) continue; SetAlias(tokens[0], tokens[1]); } return true; } libStatGen-1.0.14/general/StringAlias.h000077500000000000000000000023711254730101300176410ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
// Maps strings to replacement strings ("aliases").
// The lookup table stores, for each original string, the index of its
// alias in the aliases array.
class StringAlias
{
public:
    StringAlias() {}
    virtual ~StringAlias() {}

    // Register alias for string; an existing alias for the same string
    // is overwritten.
    void SetAlias(String & string, String & alias);

    // Return the alias registered for string, or string itself when
    // none is registered.
    const String & GetAlias(const String & string) const;

    // Replace each entry of list that has an alias; returns the number
    // of entries replaced.
    int  GetAliases(StringArray & list) const;

    // Load aliases from the named file.  Returns false when the file
    // cannot be opened, true otherwise.
    bool ReadFromFile(const char * filename);

    // Load aliases from an open file: every line made of exactly two
    // tokens registers the second token as alias of the first; other
    // lines are skipped.  Always returns true.
    bool ReadFromFile(IFILE & input);

private:
    StringIntHash  lookup;   // original string -> index into aliases
    StringArray    aliases;  // alias text, addressed through lookup
};
*/ #include "StringArray.h" #include "InputFile.h" #include "Sort.h" #include "Error.h" #include int StringArray::alloc = 32; bool StringArray::lazyMemoryManagement = false; StringArray::StringArray(int startsize) { count = startsize; size = (startsize + alloc) / alloc * alloc; strings = new String * [size]; for (int i = 0; i < count; i++) strings[i] = new String; for (int i = count; i < size; i++) strings[i] = NULL; }; StringArray::StringArray(StringArray & rhs) { count = rhs.count; size = (rhs.count + alloc) / alloc * alloc; strings = new String * [size]; for (int i = 0; i < count; i++) strings[i] = new String(rhs[i]);; for (int i = count; i < size; i++) strings[i] = NULL; } StringArray::~StringArray() { for (int i = 0; i < size; i++) if (strings[i] != NULL) delete strings[i]; else break; delete [] strings; } int StringArray::CharLength() { int charlen = 0; for (int i = 0; i < count; i++) charlen += strings[i]->Length(); return charlen; } void StringArray::Read(const char * filename) { IFILE f = ifopen(filename, "rb"); if (f == NULL) return; Read(f); ifclose(f); } void StringArray::Write(const char * filename) { FILE * f = fopen(filename, "wt"); if (f == NULL) return; Write(f); fclose(f); } void StringArray::WriteLine(const char * filename) { FILE * f = fopen(filename, "wt"); if (f == NULL) return; WriteLine(f); fclose(f); } void StringArray::Read(FILE * f) { while (!feof(f)) { Grow(count + 1); if (strings[count] == NULL) strings[count] = new String; strings[count]->ReadLine(f); count++; } } void StringArray::Write(FILE * f) { for (int i = 0; i < count; i++) strings[i]->WriteLine(f); } void StringArray::WriteLine(FILE * f) { for (int i = 0; i < count; i++) fprintf(f, "%s%c", (const char *)(*strings[i]), i == count-1 ? 
'\n' : '\t'); } void StringArray::Read(IFILE & f) { while (!ifeof(f)) { Grow(count + 1); if (strings[count] == NULL) strings[count] = new String; strings[count]->ReadLine(f); if (ifeof(f) && strings[count]->Length()==0) { return; } count++; } } void StringArray::Grow(int newsize) { if (newsize >= size) { int oldsize = size; if ((newsize >> 1) >= size) size = (newsize + alloc) / alloc * alloc; else { size = alloc; while (size <= newsize) size *= 2; } String ** tmp = new String * [size]; for (int i = 0; i < oldsize; i++) tmp[i] = strings[i]; for (int i = oldsize; i < size; i++) tmp[i] = NULL; delete [] strings; strings = tmp; } } void StringArray::Clear() { if (!lazyMemoryManagement) { for (int i = 0; i < size; i++) { if (strings[i] != NULL) { delete strings[i]; strings[i] = NULL; } else { break; } } } count = 0; } int StringArray::AddColumns(const String & s, char ch) { if (s.Length() > 0) for (int pos = 0; pos <= s.Length(); pos++) { int oldpos = pos; pos = s.FindChar(ch, pos); if (pos == -1) pos = s.Length(); Grow(count + 1); if (strings[count] == NULL) { strings[count] = new String(pos - oldpos); } strings[count]->SetLength(pos - oldpos); memcpy((char *) *strings[count++], ((const char *) s) + oldpos, pos - oldpos); } return count; } int StringArray::AddColumns(const String & s, char ch, int maxColumns) { maxColumns += count; if (s.Length() > 0) for (int pos = 0; pos <= s.Length() && maxColumns != count; pos++) { int oldpos = pos; pos = s.FindChar(ch, pos); if (pos == -1) pos = s.Length(); Grow(count + 1); if (strings[count] == NULL) strings[count] = new String(pos - oldpos); strings[count]->SetLength(pos - oldpos); memcpy((char *) *strings[count++], ((const char *) s) + oldpos, pos - oldpos); }; return count; } int StringArray::AddTokens(const String & s, char ch) { for (int pos = 0; pos < s.Length(); pos++) { while (pos < s.Length() && s[pos] == ch) pos++; int oldpos = pos; while (pos < s.Length() && s[pos] != ch) pos++; if (oldpos < s.Length()) { Grow(count + 
1); if (strings[count] == NULL) { strings[count] = new String(pos - oldpos); } strings[count]->SetLength(pos - oldpos); memcpy((char *) *strings[count++], (const char *) s + oldpos, pos - oldpos); } } return count; } int StringArray::AddTokens(const String & s, const String & separators) { for (int pos = 0; pos < s.Length(); pos++) { while (pos < s.Length() && separators.FindChar(s[pos]) != -1) pos++; int oldpos = pos; while (pos < s.Length() && separators.FindChar(s[pos]) == -1) pos++; if (oldpos < s.Length()) { Grow(count + 1); if (strings[count] == NULL) strings[count] = new String(pos - oldpos); strings[count]->SetLength(pos - oldpos); memcpy((char *) *strings[count++], ((const char *) s) + oldpos, pos - oldpos); } } return count; } int StringArray::Dimension(int newcount) { if (newcount > count) { Grow(newcount); for (int i = count; i < newcount; i++) { if (strings[i] == NULL) strings[i] = new String; else strings[i]->Clear(); } count = newcount; } else if (newcount < count) { if (!lazyMemoryManagement) { for (int i = newcount; i < size; i++) { if (strings[i] != NULL) { delete strings[i]; strings[i] = NULL; } else { break; } } } count = newcount; } return count; } int StringArray::Find(const String & s) const { for (int i = 0; i < count; i++) if (*(strings[i]) == s) return i; return -1; } int StringArray::FastFind(const String & s) const { for (int i = 0; i < count; i++) if (strings[i]->FastCompare(s) == 0) return i; return -1; } int StringArray::SlowFind(const String & s) const { for (int i = 0; i < count; i++) if (strings[i]->SlowCompare(s) == 0) return i; return -1; } int StringArray::Add(const String & s) { Grow(count + 1); if (strings[count] == NULL) { strings[count] = new String(s); } else { *strings[count] = s; } return ++count; } void StringArray::InsertAt(int position, const String & s) { Grow(count + 1); String * newString = strings[count]; if (newString == NULL) newString = new String(s); else *newString = s; for (int i = count; i > position; i--) 
// Remove the last element of the array and return a copy of it.
// Popping from an empty array is reported through error(), with the
// same message Last() uses, instead of reading strings[-1].
String StringArray::Pop()
{
    if (!count)
        error("StringArray: Null String Access");

    String result = *(strings[count - 1]);

    // shrink by one; Dimension() releases or keeps the slot depending on
    // lazyMemoryManagement
    Dimension(count - 1);

    return result;
}
// Exchange the complete contents (pointer table, capacity and element
// count) of this array with those of s.  No strings are copied.
void StringArray::Swap(StringArray & s)
{
    String ** const ownStrings = strings;
    const int ownSize = size;
    const int ownCount = count;

    strings = s.strings;
    size = s.size;
    count = s.count;

    s.strings = ownStrings;
    s.size = ownSize;
    s.count = ownCount;
}
// Growable array of String objects, stored as a table of String
// pointers.
class StringArray
{
protected:
    String ** strings;   // pointer table; unused slots are NULL or hold
                         // strings kept for reuse
    int size, count;     // size: capacity of the table, count: elements in use

public:
    static int alloc;                  // allocation granularity of the table
    static bool lazyMemoryManagement;  // when true, Clear() and shrinking
                                       // keep the String objects allocated

    // Creates an array holding startsize empty strings
    StringArray(int startsize = 0);
    // Creates a copy of original
    StringArray(StringArray & original);
    virtual ~StringArray();

    // Each line in a file is parsed into a separate array element
    //
    void Read(FILE * f);
    void Write(FILE * f);
    void WriteLine(FILE * f);
    void Read(const char * filename);
    void Write(const char * filename);
    void WriteLine(const char * filename);
    void Read(IFILE & f);

    // Write all strings to the screen
    // (Print: one string per line, PrintLine: tab separated on one line)
    void Print();
    void PrintLine();

    // Write all strings to a file
    void Print(FILE * f);
    void PrintLine(FILE * f);

    // Ensures the pointer table can hold index newsize
    void Grow(int newsize);
    // Empties the array
    void Clear();

    // Number of elements in use
    int Length() const
    {
        return count;
    }
    // Sets the number of elements; added elements are empty strings.
    // Returns the new length
    int Dimension(int newcount);
    // Sum of the lengths of all strings
    int CharLength();

    // Element access; the index is not range checked
    String & operator [](int i)
    {
        return *(strings[i]);
    }
    const String & operator [](int i) const
    {
        return *(strings[i]);
    }

    // These functions divide a string into tokens and append these to the
    // array. Return value is the new array length
    //
    int AddColumns(const String & s, char ch = '\t');
    int AddColumns(const String & s, char ch, int maxColumns);
    int AddTokens(const String & s, char ch);
    int AddTokens(const String & s, const String & separators = " \t\r\n");

    // Same as the Add... functions, but the array is cleared first
    int ReplaceColumns(const String & s, char ch = '\t')
    {
        Clear();
        return AddColumns(s, ch);
    }
    int ReplaceTokens(const String & s, const String & separators = " \t\r\n")
    {
        Clear();
        return AddTokens(s, separators);
    }

    // These functions add, insert or remove a single array element
    //
    int  Add(const String & s);
    void InsertAt(int position, const String & s);
    void Delete(int position);

    // These functions manipulate a string as a stack
    //
    String & Last() const;
    int      Push(const String & s)
    {
        return Add(s);
    }
    String   Pop();

    // Linear search (N/2 comparisons on average) for a single element
    // If searching is required, StringMaps are a better option
    // Each returns the index of the first match, or -1
    //
    int Find(const String & s) const;
    int FastFind(const String & s) const;
    int SlowFind(const String & s) const;

    // Alphabetically orders strings
    //
    void Sort();

    // Trims strings to remove whitespace
    void Trim();

    StringArray & operator = (const StringArray & rhs);

    // Arrays are equal when they have the same length and equal strings
    // at every position
    bool operator == (const StringArray & rhs) const;
    bool operator != (const StringArray & rhs) const
    {
        return !(*this == rhs);
    }

    // Exchanges the contents of the two arrays
    void Swap(StringArray & s);

private:
    // Comparator handed to QuickSort by Sort(); a and b point to
    // String pointers
    static int ComparisonForSort(const void * a, const void * b);
};
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "StringBasics.h" #include "Error.h" #include "Constant.h" #include "MathConstant.h" #include #include #include #include #define SWP(A,B) {int tmp=a; a=b; b=tmp;} #ifdef _MSC_VER #ifndef snprintf #define vsnprintf _vsnprintf #define snprintf _snprintf #endif #endif // If natural ordering is defined, comparisons will // order strings including numbers correctly // (eg, ... "8", "9", "10" ...) rather than using // ASCII ordering (... "10", "8", "9", ...) #define NATURAL_ORDERING 1 int String::alloc = 8; bool String::caseSensitive = true; void String::NewString(int startsize) { len = 0; size = (startsize + alloc) / alloc * alloc; buffer = new char [size]; buffer[0] = 0; } String::String(const char * s) { int clen = s == NULL ? 
0 : strlen(s); NewString(clen); if (clen) { len = clen; memcpy(buffer, s, len + 1); } } String::String(char ch, int count) { NewString(count); memset(buffer, ch, count); buffer[count] = 0; len = count; } String::String(const String & s) { len = s.len; size = (s.len + alloc) / alloc * alloc;; buffer = new char [size]; memcpy(buffer, s.buffer, len + 1); } void String::Grow(int newSize) { if (newSize >= size) { if ((newSize >> 1) >= size) size = (newSize + alloc) / alloc * alloc; else { size = alloc; while (size <= newSize) size *= 2; } char * tmp = new char [size]; // // len + 1 due to terminating NUL which is not counted in len // memcpy(tmp, buffer, len + 1); memcpy(tmp, buffer, len); tmp[len] = '\0'; delete [] buffer; buffer = tmp; } } void String::Swap(String & s) { char * temp = s.buffer; s.buffer = buffer; buffer = temp; int swap = s.size; s.size = size; size = swap; swap = s.len; s.len = len; len = swap; } String & String::Copy(const String & s) { Grow(s.len); len = s.len; memcpy(buffer, s.buffer, len + 1); return *this; } String & String::Copy(const String & s, int start, int n) { if (s.len <= start) return Clear(); if (s.len < start + n) n = s.len - start; Grow(n); memcpy(buffer, s.buffer + start, n); buffer[len = n] = 0; return *this; } String & String::Copy(const char * s) { if (s == NULL) { len = 0; buffer[0] = 0; } else { int clen = strlen(s); Grow(clen); len = clen; memcpy(buffer, s, len + 1); } return *this; } String & String::ToUpper() { for (int i = 0; i < len; i++) buffer[i] = (char) toupper(buffer[i]); return *this; } String & String::ToLower() { for (int i = 0; i < len; i++) buffer[i] = (char) tolower(buffer[i]); return *this; } String String::AsUpper() { String temp; temp = *this; return temp.ToUpper(); } String String::AsLower() { String temp; temp = *this; return temp.ToLower(); } String String::Capitalize() { String temp; temp = *this; temp.buffer[0] = (char) toupper(temp.buffer[0]); return temp; } String & String::operator = (const String & 
rhs)
{
    Copy(rhs);
    return *this;
}

String & String::operator = (const char * rhs)
{
    Copy(rhs);
    return * this;
}

// Append rhs. Safe for self-append (s += s).
String & String::operator += (const String & rhs)
{
    // Read the length first: when rhs is *this, len changes below.
    const int added = rhs.len;

    Grow(len + added);
    // rhs.buffer is read after Grow() so that a self-append uses the
    // reallocated buffer; source [0, added) and target [len, len + added)
    // never overlap, and the terminator is written explicitly.
    memcpy(buffer + len, rhs.buffer, added);
    len += added;
    buffer[len] = 0;
    return *this;
}

// Append a C string; a NULL pointer appends nothing.
String & String::operator += (const char * rhs)
{
    if (rhs != NULL)
    {
        int clen = strlen(rhs);
        Grow(len + clen);
        memcpy(buffer + len, rhs, clen + 1);
        len += clen;
    }
    return *this;
}

// Concatenation into a new string sized for both operands.
String String::operator + (const String & rhs) const
{
    String result(len + rhs.len);
    memcpy(result.buffer, buffer, len);
    memcpy(result.buffer + len, rhs.buffer, rhs.len + 1);
    result.len = len + rhs.len;
    return result;
}

String String::operator + (const char * rhs) const
{
    if (rhs != NULL)
    {
        int clen = strlen(rhs);
        String result(len + clen);
        memcpy(result.buffer, buffer, len);
        memcpy(result.buffer + len, rhs, clen + 1);
        result.len = len + clen;
        return result;
    }
    return *this;
}

// Assign a single character; assigning NUL clears the string.
String & String::operator = (char ch)
{
    if (ch)
    {
        Grow(1);
        buffer[0] = ch;
        buffer[1] = 0;
        len = 1;
    }
    else
        len  = buffer[0] = 0;
    return *this;
}

// Append a single character; a NUL character is ignored.
String & String::operator += (char ch)
{
    if (ch)
    {
        Grow(len + 1);
        buffer[len] = ch;
        buffer[++len] = 0;
    }
    return *this;
}

String String::operator + (char ch) const
{
    String result(*this);
    result += ch;
    return result;
}

// Assign the decimal representation of a signed integer.
String & String::operator = (int rhs)
{
    Clear();

    if (rhs < 0)
    {
        Add('-');
        // Negate in unsigned arithmetic: -rhs overflows for INT_MIN.
        *this += 0u - (unsigned int) rhs;
    }
    else
        *this = (unsigned int) rhs;
    return *this;
}

// Assign the decimal representation of an unsigned integer.
String & String::operator = (unsigned int rhs)
{
    Clear();

    // base ends up as the smallest power of ten above rhs; it is 64 bit so
    // that it cannot overflow for any 32 bit rhs.
    unsigned long long base = 10;
    int digits = 1;

    while (rhs >= base)
    {
        base *= 10;
        digits++;
    }

    Grow(digits);

    // Emit the digits from most to least significant.
    while (base /= 10)
    {
        char ch = char(rhs / base);
        rhs = rhs - ch * base;
        buffer[len++] = char(ch + '0');
    }
    buffer[len] = 0;
    return *this;
}

String String::operator + (int rhs) const
{
    String result(*this);
    result += rhs;
    return result;
}

String String::operator + (unsigned int rhs) const
{
    String result(*this);
    result += rhs;
    return result;
}

String & String::operator += (int rhs)
{
    String
temp; temp = rhs; return *this += temp; } String & String::operator += (unsigned int rhs) { String temp; temp = rhs; return *this += temp; } String & String::operator *= (unsigned int rhs) { if (rhs == 0) Clear(); else { String original(*this); Grow(len * rhs); for (unsigned int i = 1; i < rhs; i++) *this += original; } return *this; } String & String::operator = (double rhs) { LockBuffer(32); sprintf(buffer, "%.3f", rhs); UnlockBuffer(); return *this; } String String::operator + (double rhs) const { String result(*this); result += rhs; return result; } String & String::operator += (double rhs) { String temp; temp = rhs; return *this += temp; } void String::appendFullFloat(float rhs) { std::ostringstream os; os << rhs; *this += os.str().c_str(); } char * String::LockBuffer(int min) { if (min > 0) Grow(min); return buffer; } String & String::UnlockBuffer() { for (len = 0; len < size; len++) if (buffer[len] == 0) return *this; error("BasicString - direct access overflowed buffer"); return *this; } int String::Compare(const String & s) const { if (caseSensitive) return String::FastCompare(s); else return String::SlowCompare(s); } int String::Compare(const char * s) const { return caseSensitive ? 
FastCompare(s) : SlowCompare(s); } int String::FastCompare(const String & s) const { for (int i = 0; i <= len; i++) if (buffer[i] - s.buffer[i]) { #ifdef NATURAL_ORDERING int d = i; while (isdigit(buffer[d]) && isdigit(s.buffer[d])) d++; if (isdigit(buffer[d])) return 1; if (isdigit(s.buffer[d])) return -1; #endif return buffer[i] - s.buffer[i]; } return 0; } int String::FastCompare(const char * s) const { if (s == NULL) return -len; for (int i = 0; i <= len; i++) if (buffer[i] - s[i]) { #ifdef NATURAL_ORDERING int d = i; while (isdigit(buffer[d]) && isdigit(s[d])) d++; if (isdigit(buffer[d])) return 1; if (isdigit(s[d])) return -1; #endif return buffer[i] - s[i]; } return 0; } int String::SlowCompare(const String & s) const { for (int i = 0; i <= len; i++) if (toupper(buffer[i]) - toupper(s.buffer[i])) { #ifdef NATURAL_ORDERING int d = i; while (isdigit(buffer[d]) && isdigit(s[d])) d++; if (isdigit(buffer[d])) return 1; if (isdigit(s.buffer[d])) return -1; #endif return toupper(buffer[i]) - toupper(s.buffer[i]); } return 0; } int String::SlowCompare(const char * s) const { if (s == NULL) return -len; for (int i = 0; i <= len; i++) if (toupper(buffer[i]) - toupper(s[i])) { #ifdef NATURAL_ORDERING int d = i; while (isdigit(buffer[d]) && isdigit(s[d])) d++; if (isdigit(buffer[d])) return 1; if (isdigit(s[d])) return -1; #endif return toupper(buffer[i]) - toupper(s[i]); } return 0; } int String::ReadLine(FILE * f) { len = 0; buffer[len] = 0; if (f == NULL) return -1; int clen = 0; char check[2] = {0, 0}; int step = 128; String format("%128[^\n\r]%1[\n\r]"); int returnValue = 1; int io = 0; while (check[0] != '\n' && check[0] != '\r') { if (clen) { step *= 2; format.printf("%%%d%s", step, "[^\n\r]%1[\n\r]"); } clen += step; io = fscanf(f, format, LockBuffer(clen) + len, check); UnlockBuffer(); // Avoid getting stuck on zero length lines (system specific!) 
if (io == 0 && check[0] != '\n' && check[0] != '\r')
            // Nothing was converted and no terminator has been seen yet:
            // try to read a single CR / LF character on its own
            io = fscanf(f, "%1[\n\r]", check);

        if (io == 0 || io == EOF)
        {
            // Set return value to indicate error/EOF
            returnValue = -1;
            break;
        }
    }

    // Consume the second character of a two character line ending
    // (LF followed by CR, or CR followed by LF), if present
    if (check[0] == '\n') io = fscanf(f, "%*1[\r]");
    if (check[0] == '\r') io = fscanf(f, "%*1[\n]");

    return returnValue;
}

// Read one WHITESPACE delimited token from f into this string.
// The string is emptied first; a NULL file leaves it empty.
String & String::Read(FILE * f)
{
    len = 0;
    buffer[len] = 0;

    if (f == NULL) return *this;

    int  clen = 0;
    // check receives the delimiter that ended the token; it starts with a
    // non-whitespace placeholder so that the loop is entered
    char check[2] = {'G', 0};

    while (strchr(WHITESPACE, check[0]) == NULL)
    {
        // Make room for another READBUF characters and scan at most that
        // many non-whitespace characters, followed by one delimiter
        clen += READBUF;
        int io = fscanf(f, " %" READBUFSTR "[^" WHITESPACE "]"
                        "%1[" WHITESPACE "]", LockBuffer(clen) + len, check);
        if (io == 0 || io == EOF) break;
        UnlockBuffer();
    }

    return *this;
}

// Read one token from standard input
String & String::Read()
{
    return Read(stdin);
}

// Read one WHITESPACE delimited token from an IFILE, one character at a
// time. Leading whitespace is skipped; the first whitespace character
// after the token ends it.
String & String::Read(IFILE & f)
{
    len = 0;
    buffer[len] = 0;

    if (f == NULL) return *this;

    bool leading = true;

    while (true)
    {
        int ch = ifgetc(f);

        // -1 is treated as end of input
        if (ch == -1) break;

        if (strchr(WHITESPACE, ch) != NULL)
        {
            if (leading)
            {
                continue;
            }
            else
            {
                break;
            }
        }

        // Grow when only the slot for the terminator is left
        if (len + 1 == size)
            Grow(len + 1);

        buffer[len++] = (char) ch;
        buffer[len] = 0;

        leading = false;
    }

    return *this;
}

// Read one line from standard input.
// Returns 1 on success and -1 when EOF is hit before any character was
// stored.
int String::ReadLine()
{
    // Remembers the last character seen across calls (10 = LF, 13 = CR) so
    // that the second half of a CR LF / LF CR pair is skipped instead of
    // being reported as an extra empty line
    static int last = 0;
    int ch;

    len = 0;
    buffer[len] = 0;

    while (true)
    {
        ch = getchar();

        if (ch == EOF)
        {
            break;
        }

        if (ch == 10)
        {
            if (last == 13)
            {
                last = 0;
                continue;
            }
            else
            {
                last = 10;
                break;
            }
        }

        if (ch == 13)
        {
            if (last == 10)
            {
                last = 0;
                continue;
            }
            else
            {
                last = 13;
                break;
            }
        }

        // Grow when only the slot for the terminator is left
        if (len + 1 == size)
        {
            Grow(len + 1);
        }

        last = ch;
        buffer[len++] = (char) last;
        buffer[len] = 0;
    }

    if ((ch == EOF) && (len == 0))
    {
        // Indicate error/EOF if nothing was read.
        return -1;
    }

    // Return success.
    return 1;
}

// Read line using getc.
#if defined(_WIN32)
// Windows build: accepts LF, CR and two character CR LF / LF CR endings.
// Returns 1 on success and -1 when EOF is hit before any character was
// stored.
int String::ReadLine(IFILE & f)
{
    // Remembers the last character seen across calls (10 = LF, 13 = CR) so
    // that the second half of a two character line ending is skipped.
    // NOTE(review): the state is shared by all files read through this
    // function because it is a function-level static.
    static int last = 0;
    int ch;

    len = 0;
    buffer[len] = 0;

    while (true)
    {
        ch = f->ifgetc();

        if (ch == EOF)
        {
            break;
        }

        if (ch == 10)
        {
            if (last == 13)
            {
                last = 0;
                continue;
            }
            else
            {
                last = 10;
                break;
            }
        }

        if (ch == 13)
        {
            if (last == 10)
            {
                last = 0;
                continue;
            }
            else
            {
                last = 13;
                break;
            }
        }

        // Grow when only the slot for the terminator is left
        if (len + 1 == size)
        {
            Grow(len + 1);
        }

        last = ch;
        buffer[len++] = (char) last;
        buffer[len] = 0;
    }

    if ((ch == EOF) && (len == 0))
    {
        // Indicate error/EOF if nothing was read.
        return -1;
    }
    return 1;
}

#else
// Other platforms: a line ends at '\n' or at EOF; characters are written
// through a moving pointer and the buffer is grown when it fills up.
// Returns 1 on success and -1 when EOF is hit before any character was
// stored.
int String::ReadLine(IFILE & f)
{
    int ch;
    char *ptr = buffer;
    char *endBuffer = buffer + size;
    len = 0;

    while ( ((ch = f->ifgetc()) != EOF) && (ch != '\n'))
    {
        if (ptr >= endBuffer - 1)
        {
            // resize: 1 byte for the next character, 1 byte
            //  for the NUL at the end.
            Grow(len + 2);
            // Grow() may have reallocated: recompute both pointers
            endBuffer = buffer + size;
            ptr = buffer + len;
        }

        *ptr++ = ch;
        len++;
    }

    // NB: assumes that buffer is always allocated.
    buffer[len] = 0;

    if ((ch == EOF) && (len == 0))
    {
        // Indicate error/EOF if nothing was read.
return -1; } return 1; } #endif void String::Write(FILE * f) { fprintf(f, "%s", buffer); } void String::Write() { Write(stdout); } void String::WriteLine() { WriteLine(stdout); } void String::WriteLine(FILE * f) { if (f == NULL) return; fprintf(f, "%s\n", buffer); } std::ostream& operator << (std::ostream& os, const String& s) { return os << s.c_str(); } String String::Left(int n) const { if (n < 0) n = 0; if (len < n) n = len; String result(n); memcpy(result.buffer, buffer, n); result.buffer[result.len = n] = 0; return result; } String String::Right(int n) const { if (n < 0) n = 0; if (len < n) n = len; String result(n); memcpy(result.buffer, buffer + len - n, n); result.buffer[result.len = n] = 0; return result; } String String::SubStr(int start, int n) const { if (start < 0) { n += start; start = 0; }; n = min(len - start, n); n = max(n, 0); String result(n); if (start > len) return result; memcpy(result.buffer, buffer + start, n); result.buffer[result.len = n] = 0; return result; } String String::SubStr(int start) const { return SubStr(start, len - start); } String String::Mid(int start, int end) const { return SubStr(start, end - start + 1); } int String::FindChar(char ch, int start) const { return caseSensitive ? FastFindChar(ch, start) : SlowFindChar(ch, start); } int String::FastFindChar(char ch, int start) const { for (; start < len; start++) if (buffer[start] == ch) return start; return -1; } int String::SlowFindChar(char ch, int start) const { ch = (char) toupper(ch); for (; start < len; start++) if (toupper(buffer[start]) == ch) return start; return -1; } int String::FindLastChar(char ch) const { return caseSensitive ? 
FastFindLastChar(ch) : SlowFindLastChar(ch); } int String::FastFindLastChar(char ch) const { for (int start = len-1; start >= 0; start--) if (buffer[start] == ch) return start; return -1; } int String::SlowFindLastChar(char ch) const { ch = (char) toupper(ch); for (int start = len-1 ; start >= 0; start--) if (toupper(buffer[start]) == ch) return start; return -1; } int String::Find(const String & pattern, int start) const { return caseSensitive ? FastFind(pattern, start) : SlowFind(pattern, start); } // TODO -- We should have a better string search algorithm int String::FastFind(const String & pattern, int start) const { for (int i ; start <= len - pattern.Length(); start++) if (buffer[start] == pattern[0]) { for (i = 1; i < pattern.Length(); i++) if (pattern[i] != buffer[start + i]) break; if (i == pattern.Length()) return start; } return -1; } int String::SlowFind(const String & pattern, int start) const { int firstchar = toupper(pattern[0]); for (int i ; start <= len - pattern.Length(); start++) if (toupper(buffer[start]) == firstchar) { for (i = 1; i < pattern.Length(); i++) if (toupper(pattern[i]) != toupper(buffer[start + i])) break; if (i == pattern.Length()) return start; } return -1; } int String::SetLength(int newlen) { if (newlen > len) { Grow(newlen); memset(buffer + len, ' ', newlen - len); } buffer[newlen] = 0; return len = newlen; } String & String::Filter(const String & s) { int to = 0; for (int from = 0; from < len; from++) if (s.FindChar(buffer[from]) != -1) buffer[to++] = buffer[from]; buffer[len = to] = 0; return *this; } String & String::Filter(const char * s) { String filter(s); return Filter(filter); } String & String::ExcludeCharacters(const String & s) { int to = 0; for (int from = 0; from < len; from++) if (s.FindChar(buffer[from]) == -1) buffer[to++] = buffer[from]; buffer[len = to] = 0; return *this; } String & String::ExcludeCharacters(const char * s) { String excluded(s); return ExcludeCharacters(excluded); } String operator + (const 
char * lhs, const String & rhs) { String result(lhs); result += rhs; return result; } String operator + (char lhs, const String & rhs) { String result(lhs); result += rhs; return result; } String operator + (int lhs, const String & rhs) { String result; result = lhs; result += rhs; return result; } String operator + (unsigned int lhs, const String & rhs) { String result; result = lhs; result += rhs; return result; } long String::AsInteger() const { long returnValue = 0; if(!AsInteger(returnValue)) { // This is not an integer, but nothing to do but return a value. } return(returnValue); } // Check that the string is an integer when converting it. // If the entire string is an integer, return true, if not, return false. bool String::AsInteger(long& intValue) const { long integer = 0; int base = 10; int pos = 0; int sign = 1; bool isInt = true; // If this is no value for this integer, return false. if (pos == len) { return(false); } if (buffer[pos] == '-') { sign = -1, pos++; } if ((len > pos + 2) && (buffer[pos] == '0') && ((buffer[pos+1] == 'x') || (buffer[pos+1] == 'X'))) { base = 16, pos += 2; } // If this is no value for this integer, return false. if (pos == len) { return(false); } for (; pos < len; pos++) { char digit = (char) toupper(buffer[pos]); if (digit >= '0' && digit <= '9') { integer = integer * base + digit - '0'; } else if (digit >= 'A' && digit <= 'F' && base == 16) { integer = integer * base + digit - 'A' + 10; } else { isInt = false; break; } } intValue = sign*integer; return(isInt); } // Check that the string is an integer when converting it. // If the entire string is an integer, return true, if not, return false. bool String::AsInteger(int& intValue) const { int integer = 0; int base = 10; int pos = 0; int sign = 1; bool isInt = true; // If this is no value for this integer, return false. 
if (pos == len) { return(false); } if (buffer[pos] == '-') { sign = -1, pos++; } if ((len > pos + 2) && (buffer[pos] == '0') && ((buffer[pos+1] == 'x') || (buffer[pos+1] == 'X'))) { base = 16, pos += 2; } // If this is no value for this integer, return false. if (pos == len) { return(false); } for (; pos < len; pos++) { char digit = (char) toupper(buffer[pos]); if (digit >= '0' && digit <= '9') { integer = integer * base + digit - '0'; } else if (digit >= 'A' && digit <= 'F' && base == 16) { integer = integer * base + digit - 'A' + 10; } else { isInt = false; break; } } intValue = sign*integer; return(isInt); } String & String::Invert() { for (int i = 0, j = len - 1; i < j; i++, j--) { char tmp = buffer[i]; buffer[i] = buffer[j]; buffer[j] = tmp; } return *this; } String String::RightToLeft() { String result(*this); result.Invert(); return result; } String & String::Invert(const String & s) { Copy(s); return Invert(); } int String::CompareToStem(const String & stem) const { if (caseSensitive) return String::FastCompareToStem(stem); else return String::SlowCompareToStem(stem); } int String::FastCompareToStem(const String & stem) const { for (int i = 0; i < stem.len; i++) if (buffer[i] - stem.buffer[i]) { #ifdef NATURAL_ORDERING int d = i; while (isdigit(buffer[d]) && isdigit(stem.buffer[d]) && d < stem.len) d++; if (isdigit(buffer[d]) && d < stem.len) return 1; if (isdigit(stem.buffer[d])) return -1; #endif return buffer[i] - stem.buffer[i]; } return 0; } int String::SlowCompareToStem(const String & stem) const { for (int i = 0; i < stem.len; i++) if (toupper(buffer[i]) - toupper(stem.buffer[i])) { #ifdef NATURAL_ORDERING int d = i; while (isdigit(buffer[d]) && isdigit(stem.buffer[d]) && d < stem.len) d++; if (isdigit(buffer[d]) && d < stem.len) return 1; if (isdigit(stem.buffer[d])) return -1; #endif return toupper(buffer[i]) - toupper(stem.buffer[i]); } return 0; } int String::CompareToStem(const char * stem) const { if (caseSensitive) return 
String::FastCompareToStem(stem); else return String::SlowCompareToStem(stem); } int String::FastCompareToStem(const char * stem) const { for (int i = 0; stem[i] != 0; i++) if (buffer[i] - stem[i]) return buffer[i] - stem[i]; return 0; } int String::SlowCompareToStem(const char * stem) const { for (int i = 0; stem[i] != 0; i++) if (toupper(buffer[i]) - toupper(stem[i])) return toupper(buffer[i]) - toupper(stem[i]); return 0; } int String::MatchesBeginningOf(const String & stem) const { if (caseSensitive) return String::FastMatchesBeginningOf(stem); else return String::SlowMatchesBeginningOf(stem); } int String::FastMatchesBeginningOf(const String & stem) const { for (int i = 0; i < len; i++) if (buffer[i] - stem.buffer[i]) return buffer[i] - stem.buffer[i]; return 0; } int String::SlowMatchesBeginningOf(const String & stem) const { for (int i = 0; i < len; i++) if (toupper(buffer[i]) - toupper(stem.buffer[i])) return toupper(buffer[i]) - toupper(stem.buffer[i]); return 0; } int String::MatchesBeginningOf(const char * stem) const { if (caseSensitive) return String::FastMatchesBeginningOf(stem); else return String::SlowMatchesBeginningOf(stem); } int String::FastMatchesBeginningOf(const char * stem) const { for (int i = 0; i < len; i++) if (buffer[i] - stem[i]) return buffer[i] - stem[i]; return 0; } int String::SlowMatchesBeginningOf(const char * stem) const { for (int i = 0; i < len; i++) if (toupper(buffer[i]) - toupper(stem[i])) return toupper(buffer[i]) - toupper(stem[i]); return 0; } String & String::Trim(char character) { int first = 0; while (buffer[first] && buffer[first] == character) first++; int last = len - 1; while (last >= 0 && buffer[last] == character) last--; int out = 0; while (first <= last) buffer[out++] = buffer[first++]; buffer[len = out] = 0; return *this; } String & String::Trim() { int first = 0; while (buffer[first] && isspace(buffer[first])) first++; int last = len - 1; while (last >= 0 && isspace(buffer[last])) last--; int out = 0; while 
(first <= last) buffer[out++] = buffer[first++]; buffer[len = out] = 0; return *this; } vector *String::Split(char splitChar) { vector *result = new vector; String word; for (int i = 0; ipush_back(word); word.Clear(); } else word.Add((*this)[i]); } if (word.Length()>0) result->push_back(word); return result; } #define VSNPRINTF_NOT_CHECKED 0 #define VSNPRINTF_IS_OK 1 #define VSNPRINTF_NOT_OK 2 int String::vsnprintfChecked = 0; int String::printf(const char * format, ...) { va_list ap; va_start(ap, format); vprintf(format, ap); va_end(ap); return len; } int String::catprintf(const char * format, ...) { va_list ap; va_start(ap, format); vcatprintf(format, ap); va_end(ap); return len; } int String::vprintf(const char * format, va_list ap) { check_vsnprintf(); while (true) { int bytes_needed; #ifdef va_copy va_list arguments; va_copy(arguments, ap); #else va_list & arguments = ap; #endif if (vsnprintfChecked == VSNPRINTF_IS_OK) bytes_needed = vsnprintf(buffer, size, format, arguments); else bytes_needed = my_vsnprintf(buffer, size, format, arguments); #ifdef va_copy va_end(arguments); #endif if (bytes_needed >= size) Grow(bytes_needed); else if (bytes_needed == -1) Grow(size * 2); else { return len = bytes_needed; } } } void String::check_vsnprintf() { if (vsnprintfChecked == VSNPRINTF_NOT_CHECKED) { char temp[100]; memset(temp, 0, 100); int check = snprintf(temp, 5, "%5s", "VSNPRINTF"); if (temp[6] != 0 || temp[7] != 0 || (check != 9 && check != -1)) /* error("This program requires a working version of vsnprintf\n" "However, vsnprintf in the current library seems buggy\n\n" "Recompiling this program with the -D__REPLACE_SNPRINTF__ flag\n" "may solve this problem.\n\n"); */ vsnprintfChecked = VSNPRINTF_NOT_OK; else vsnprintfChecked = VSNPRINTF_IS_OK; } } int String::vcatprintf(const char * format, va_list ap) { check_vsnprintf(); if (len == size) Grow(size * 2); while (true) { int bytes_needed; #ifdef va_copy va_list arguments; va_copy(arguments, ap); #else va_list & 
arguments = ap; #endif if (vsnprintfChecked == VSNPRINTF_IS_OK) bytes_needed = len + vsnprintf(buffer + len, size - len, format, arguments); else bytes_needed = len + my_vsnprintf(buffer + len, size - len, format, arguments); #ifdef va_copy va_end(arguments); #endif if (bytes_needed >= size) Grow(bytes_needed); else if (bytes_needed < len) Grow(size * 2); else { return len = bytes_needed; } } } FILE * String::my_vsnprintf_file = NULL; int String::my_vsnprintf(char * buffer, int bufsize, const char * format, va_list args) { if (my_vsnprintf_file == NULL) { my_vsnprintf_file = tmpfile(); atexit(my_vsnprintf_close_file); } rewind(my_vsnprintf_file); int len = vfprintf(my_vsnprintf_file, format, args); rewind(my_vsnprintf_file); if (len < bufsize) buffer[bufsize = len] = 0; int numRead = fread(buffer, 1, bufsize, my_vsnprintf_file); if(numRead != bufsize) { std::cerr << "Warning, StringBasics failed reading stream in my_vsnprintf\n"; } return len; } int String::my_snprintf(char * buffer, int bufsize, const char * format, ...) 
{ va_list ap; va_start(ap, format); int bytes = my_vsnprintf(buffer, bufsize, format, ap); va_end(ap); return bytes; } void String::my_vsnprintf_close_file() { fclose(my_vsnprintf_file); } bool String::IsNumber() { int pos = 0; bool digits = false; // Skip leading sign if (buffer[pos] == '-' || buffer[pos] == '+') pos++; // Check integer portion while (buffer[pos] >= '0' && buffer[pos] <= '9') pos++, digits = true; // Skip decimal point if (buffer[pos] == '.') { pos++; // Check fractional portion while (buffer[pos] >= '0' && buffer[pos] <= '9') pos++, digits = true; } if (!digits) return false; // Check exponent if (buffer[pos] == 'E' || buffer[pos] == 'e') { pos++; // Skip leading sign if (buffer[pos] == '-' || buffer[pos] == '+') pos++; digits = false; // Check exponent digits while (buffer[pos] >= '0' && buffer[pos] <= '9') pos++, digits = true; } return (pos == len) && digits; } void String::Fill(char ch, int length) { if (length >= 0) SetLength(length); for (int i = 0; i < len; i++) buffer[i] = ch; } String & String::Reverse() { for (int i = 0, j = len - 1; i < j; i++, j--) { int tmp = buffer[i]; buffer[i] = buffer[j]; buffer[j] = tmp; } return *this; } // String::LeftClip() trims the string so only characters after clipPoint remain String & String::LeftClip(int clipAmount) { if (clipAmount == 0) return *this; if (clipAmount > Length()) { len = 0; return *this; } // Use memory move, because the two blocks can overlap memmove(buffer, buffer + clipAmount, len - clipAmount); buffer[len -= clipAmount] = 0; return *this; } String & String::RightClip(int clipAmount) { if (clipAmount == 0) return *this; if (clipAmount > Length()) { len = 0; return *this; } len -= clipAmount; buffer[len] = 0; return *this; } // Implementation of long double convertors is in flux across different platforms #ifdef __GNUC__ String::operator long double() const { return strtold(buffer, NULL); } #else #ifdef __BORLANDC__ String::operator long double() const { return _strtold(buffer, NULL); 
} #else String::operator long double() const { return atof(buffer); } #endif #endif libStatGen-1.0.14/general/StringBasics.h000066400000000000000000000251211254730101300200070ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __BASICSTRING_H__ #define __BASICSTRING_H__ #include #include #include #include #include using std::vector; #define READBUF 128 #define READBUFSTR "128" #ifdef __PREFIX_STRING__ #define String BasicString #endif #include "InputFile.h" class String { private: void NewString(int startsize); protected: char * buffer; int len, size; public: static int alloc; static bool caseSensitive; explicit String(int startsize = 0) { NewString(startsize); } String(const char * s); String(const String & s); explicit String(char ch, int count = 1); ~String() { if(buffer != NULL) delete [] buffer; } String & Clear() { len = buffer[0] = 0; return *this; } String & Copy(const String & s); String & Copy(const String & s, int start, int count); String & Copy(const char * s); bool IsEmpty() const { return len == 0; } String & ToUpper(); String & ToLower(); String AsUpper(); String AsLower(); String Capitalize(); String & Reverse(); String & LeftClip(int clipAmount); String & RightClip(int clipAmount); String & operator = (char ch); String operator + (char ch) const; String & operator += (char ch); String & operator = (const 
String & rhs); String operator + (const String & rhs) const; String & operator += (const String & rhs); String & operator = (const char * rhs); String operator + (const char * rhs) const; String & operator += (const char * rhs); String & operator = (int rhs); String operator + (int rhs) const; String & operator += (int rhs); String & operator = (double rhs); String operator + (double rhs) const; String & operator += (double rhs); void appendFullFloat(float rhs); String & operator = (unsigned int rhs); String operator + (unsigned int rhs) const; String & operator += (unsigned int rhs); String operator *(unsigned int rhs) const; String & operator *= (unsigned int rhs); int Compare(const String & rhs) const; int FastCompare(const String & rhs) const; int SlowCompare(const String & rhs) const; int Compare(const char * rhs) const; int FastCompare(const char * rhs) const; int SlowCompare(const char * rhs) const; int CompareToStem(const String & stem) const; int FastCompareToStem(const String & stem) const; int SlowCompareToStem(const String & stem) const; int CompareToStem(const char * stem) const; int FastCompareToStem(const char * stem) const; int SlowCompareToStem(const char * stem) const; int MatchesBeginningOf(const String & stem) const; int FastMatchesBeginningOf(const String & stem) const; int SlowMatchesBeginningOf(const String & stem) const; int MatchesBeginningOf(const char * stem) const; int FastMatchesBeginningOf(const char * stem) const; int SlowMatchesBeginningOf(const char * stem) const; int operator == (const String & rhs) const { return Compare(rhs) == 0; } int operator != (const String & rhs) const { return Compare(rhs) != 0; } int operator < (const String & rhs) const { return Compare(rhs) < 0; } int operator > (const String & rhs) const { return Compare(rhs) > 0; } int operator >= (const String & rhs) const { return Compare(rhs) >= 0; } int operator <= (const String & rhs) const { return Compare(rhs) <= 0; } int operator == (const char * rhs) const { 
return Compare(rhs) == 0; } int operator != (const char * rhs) const { return Compare(rhs) != 0; } int operator < (const char * rhs) const { return Compare(rhs) < 0; } int operator > (const char * rhs) const { return Compare(rhs) > 0; } int operator <= (const char * rhs) const { return Compare(rhs) <= 0; } int operator >= (const char * rhs) const { return Compare(rhs) >= 0; } operator const char *() const { return buffer; } const char *c_str() const { return (const char *) buffer; } operator char *() { return buffer; } operator int () const { return atoi(buffer); } operator double() const { return atof(buffer); } operator long double() const; char operator [](int i) const { return buffer[i]; } char & operator [](int i) { return buffer[i]; } char & Last() { return buffer[len - 1]; } char & First() { return buffer[0]; } void Grow(int newSize); void Swap(String & s); char * LockBuffer(int size = -1); String & UnlockBuffer(); String & Read(); // Return the status. A negative number indicates an error/EOF. int ReadLine(); void WriteLine(); void Write(); String & Read(FILE * f); // Return the status. A negative number indicates an error/EOF. int ReadLine(FILE * f); void WriteLine(FILE * f); void Write(FILE * f); String & Read(IFILE & f); // Read a line using getc // Return the status. A negative number indicates an error/EOF. int ReadLine(IFILE & f); String Left(int count) const; String Right(int count) const; String Mid(int start, int end) const; String SubStr(int start, int count) const; String SubStr(int start) const; int FindChar(char ch, int start = 0) const; int FastFindChar(char ch, int start = 0) const; int SlowFindChar(char ch, int start = 0) const; int FindLastChar(char ch) const; int FastFindLastChar(char ch) const; int SlowFindLastChar(char ch) const; // Since there is no longer implicit conversion // from char to String, declare this method that // takes a character rather than a String reference. 
// Character overload of Find: simply forwards to FindChar(ch, start).
    int Find(char ch, int start = 0) const
    {
        return(FindChar(ch, start));
    }

    // Substring search starting at position `start` (defined elsewhere).
    int Find(const String & str, int start = 0) const;
    int FastFind(const String & str, int start = 0) const;
    int SlowFind(const String & str, int start = 0) const;

    // Character-set filters; both forms return a reference to a String
    // (implementation not visible here).
    String & Filter(const String & s);
    String & Filter(const char * s);

    String & ExcludeCharacters(const String & s);
    String & ExcludeCharacters(const char * s);

    // Accessors for the `len` and `size` members.
    int Length() const
    {
        return len;
    }
    int BufferSize() const
    {
        return size;
    }

    int SetLength(int newlen);
    // Dimension() is an alias for SetLength().
    int Dimension(int newlen)
    {
        return SetLength(newlen);
    }

    // Add() is a named alias for operator +=.
    String & Add(const String & s)
    {
        return *this += s;
    }
    String & Add(char ch)
    {
        return *this += ch;
    }

    String RightToLeft();
    String & Invert();
    String & Invert(const String & s);

    String & Trim();
    String & Trim(char character);
    // NOTE(review): `vector` appears here without a template argument;
    // presumably the element type was lost when this text was extracted --
    // confirm against the upstream header before compiling.
    vector *Split(char splitChar);

    long AsInteger() const;
    bool AsInteger(long& intValue) const;
    bool AsInteger(int& intValue) const;
    // The two conversions below go through the class's own
    // `operator double` / `operator long double`.
    double AsDouble() const
    {
        return (double) *this;
    }
    long double AsLongDouble() const
    {
        return (long double) *this;
    }

    // printf-style formatting members (defined elsewhere).
    int printf(const char * format, ...);
    int vprintf(const char * format, va_list arglist);

    int catprintf(const char * format, ...);
    int vcatprintf(const char * format, va_list arglist);

    // Replacement vsnprintf and snprint functions for
    // problematic architectures...
// Static snprintf/vsnprintf stand-ins and their support hooks (defined elsewhere).
    static int  my_snprintf(char * buffer, int bufsize, const char * format, ...);
    static int  my_vsnprintf(char * buffer, int bufsize, const char * format, va_list args);
    static void my_vsnprintf_close_file();
    static void check_vsnprintf();

    // Check string contents
    bool IsNumber();

    // Explicit conversions
    // Both return the internal `buffer` pointer reinterpreted as
    // unsigned / signed char.
    const unsigned char * uchar() const
    {
        return (unsigned char *) buffer;
    }
    const signed char * schar() const
    {
        return (signed char *) buffer;
    }

    static FILE * my_vsnprintf_file;

    // Utility functions
    void Fill(char ch, int length = -1);

private:

    static int vsnprintfChecked;
};

// Free-function forms of the String comparison members.
// When the C string is on the left, the call is made on the String operand
// and the result is negated so the sign still describes (s1 relative to s2).
inline int Compare(const String & s1, const String & s2)
{
    return s1.Compare(s2);
}

inline int Compare(const String & s1, const char * s2)
{
    return s1.Compare(s2);
}

inline int Compare(const char * s1, const String & s2)
{
    return -s2.Compare(s1);
}

inline int FastCompare(const String & s1, const String & s2)
{
    return s1.FastCompare(s2);
}

inline int FastCompare(const String & s1, const char * s2)
{
    return s1.FastCompare(s2);
}

inline int FastCompare(const char * s1, const String & s2)
{
    return -s2.FastCompare(s1);
}

inline int SlowCompare(const String & s1, const String & s2)
{
    return s1.SlowCompare(s2);
}

inline int SlowCompare(const String & s1, const char * s2)
{
    return s1.SlowCompare(s2);
}

inline int SlowCompare(const char * s1, const String & s2)
{
    return -s2.SlowCompare(s1);
}

// Concatenation with a non-String left operand (defined elsewhere).
String operator + (char lhs, const String & rhs);
String operator + (const char * lhs, const String & rhs);
String operator + (int lhs, const String & rhs);
String operator + (unsigned int lhs, const String & rhs);

// Stream insertion for std::ostream (defined elsewhere).
std::ostream& operator << (std::ostream& os, const String& s);

/// Write to a file using streaming.
/// \param stream file to write to - IFILE is a pointer to an InputFile object
/// \param str string containing what should be written to the file.
inline InputFile& operator << (InputFile& stream, const String& str) { unsigned int numExpected = str.Length(); unsigned int numWritten = stream.ifwrite(str.c_str(), numExpected); if(numExpected != numWritten) { std::cerr << "Failed to stream to IFILE, expected " << numExpected << " but only wrote " << numWritten << std::endl; } return(stream); } #endif libStatGen-1.0.14/general/StringHash.cpp000066400000000000000000000376441254730101300200360ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "StringHash.h" #include "InputFile.h" #include "Error.h" StringHash::StringHash(int startsize) : StringHashBase() { count = 0; size = startsize; mask = startsize - 1; // In this implementation, the size of hash tables must be a power of two if (startsize & mask) error("StringHash: Hash table size must be a power of two.\n"); strings = new String * [size]; objects = new void * [size]; keys = new unsigned int [size]; for (unsigned int i = 0; i < size; i++) { strings[i] = NULL; objects[i] = NULL; } }; StringHash::~StringHash() { for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) delete strings[i]; if(strings) delete [] strings; if(objects) delete [] objects; if(keys) delete [] keys; } void StringHash::Clear() { for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) { delete strings[i]; strings[i] = NULL; } count = 0; if (size > 256) SetSize(256); } void StringHash::SetSize(int newsize) { int newmask = newsize - 1; String ** newstrings = new String * [newsize]; void ** newobjects = new void * [newsize]; unsigned int * newkeys = new unsigned int [newsize]; for (int i = 0; i < newsize; i++) { newstrings[i] = NULL; newobjects[i] = NULL; } if (count) for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) { unsigned int key = keys[i]; unsigned int h = key & newmask; while (newstrings[h] != NULL && (newkeys[h] != key || (!stringsEqual(*(newstrings[h]), *(strings[i]))))) h = (h + 1) & newmask; newkeys[h] = key; newstrings[h] = strings[i]; newobjects[h] = objects[i]; } if(strings) delete [] strings; if(objects) delete [] objects; if(keys) delete [] keys; strings = newstrings; objects = newobjects; keys = newkeys; size = newsize; mask = newmask; } int StringHash::Add(const String & string, void * object) { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) Insert(h, key, string); objects[h] = object; if (count * 2 > size) { Grow(); return Iterate(key, string); } return h; } int 
StringHash::Find(const String & string, void *(*create_object)()) { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL && create_object == NULL) return -1; if (strings[h] == NULL && create_object != NULL) { Insert(h, key, string); objects[h] = create_object(); if (count * 2 > size) { Grow(); return Iterate(key, string); } } return h; } int StringHash::Find(const String & string) const { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) return -1; return h; } void * StringHash::CreateHash() { return (void *) new StringHash(); } void StringHash::Delete(unsigned int index) { if (index >= size || strings[index] == NULL) return; delete strings[index]; strings[index] = NULL; count--; if (count * 8 < size && size > 32) Shrink(); else { // rehash the next strings until we find empty slot index = (index + 1) & mask; while (strings[index] != NULL) { if ((keys[index] & mask) != index) { unsigned int h = Iterate(keys[index], *strings[index]); if (h != (unsigned int) index) { keys[h] = keys[index]; strings[h] = strings[index]; objects[h] = objects[index]; strings[index] = NULL; objects[index] = NULL; } } index = (index + 1) & mask; } } } void StringHash::ReadLinesFromFile(const char * filename) { IFILE f = ifopen(filename, "rb"); if (f == NULL) return; ReadLinesFromFile(f); ifclose(f); } void StringHash::ReadLinesFromFile(FILE * f) { String buffer; while (!feof(f)) { buffer.ReadLine(f); Add(buffer.Trim()); } } void StringHash::ReadLinesFromFile(IFILE & f) { String buffer; while (!ifeof(f)) { buffer.ReadLine(f); Add(buffer.Trim()); } } // StringIntHash implementation StringIntHash::StringIntHash(int startsize) : StringHashBase() { count = 0; size = startsize; mask = startsize - 1; // In this implementation, the size of hash tables must be a power of two if (startsize & mask) error("StringIntHash: Hash table size must be a power of two.\n"); strings = new String * [size]; integers = new 
int [size]; keys = new unsigned int [size]; for (unsigned int i = 0; i < size; i++) strings[i] = NULL; }; StringIntHash::~StringIntHash() { for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) delete strings[i]; if(strings) delete [] strings; if(integers) delete [] integers; if(keys) delete [] keys; } void StringIntHash::SetSize(int newsize) { int newmask = newsize - 1; String ** newstrings = new String * [newsize]; int * newintegers = new int [newsize]; unsigned int * newkeys = new unsigned int [newsize]; for (int i = 0; i < newsize; i++) newstrings[i] = NULL; for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) { unsigned int key = keys[i]; unsigned int h = key & newmask; while (newstrings[h] != NULL && (newkeys[h] != key || (!stringsEqual(*(newstrings[h]), *(strings[i]))))) h = (h + 1) & newmask; newkeys[h] = key; newstrings[h] = strings[i]; newintegers[h] = integers[i]; } if(strings) delete [] strings; if(integers) delete [] integers; if(keys) delete [] keys; strings = newstrings; integers = newintegers; keys = newkeys; size = newsize; mask = newmask; } void StringIntHash::Clear() { for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) { delete strings[i]; strings[i] = NULL; } count = 0; if (size > 256) SetSize(256); } int StringIntHash::Add(const String & string, int value) { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) Insert(h, key, string); integers[h] = value; if (count * 2 > size) { Grow(); return Iterate(key, string); } return h; } int StringIntHash::Find(const String & string, int defaultValue) { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) { Insert(h, key, string); integers[h] = defaultValue; if (count * 2 > size) { Grow(); return Iterate(key, string); } } return h; } int StringIntHash::Find(const String & string) const { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == 
NULL) return -1; return h; } void StringIntHash::Delete(unsigned int index) { if (index >= size || strings[index] == NULL) return; delete strings[index]; strings[index] = NULL; count--; if (count * 8 < size && size > 32) Shrink(); else { // rehash the next strings until we find empty slot index = (index + 1) & mask; while (strings[index] != NULL) { if ((keys[index] & mask) != index) { unsigned int h = Iterate(keys[index], *strings[index]); if (h != (unsigned int) index) { keys[h] = keys[index]; strings[h] = strings[index]; integers[h] = integers[index]; strings[index] = NULL; } } index = (index + 1) & mask; } } } // StringDoubleHash implementation StringDoubleHash::StringDoubleHash(int startsize) : StringHashBase() { count = 0; size = startsize; mask = startsize - 1; // In this implementation, the size of hash tables must be a power of two if (startsize & mask) error("StringDoubleHash: Hash table size must be a power of two.\n"); strings = new String * [size]; doubles = new double [size]; keys = new unsigned int [size]; for (unsigned int i = 0; i < size; i++) strings[i] = NULL; }; StringDoubleHash::~StringDoubleHash() { for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) delete strings[i]; if(strings) delete [] strings; if(doubles) delete [] doubles; if(keys) delete [] keys; } void StringDoubleHash::SetSize(int newsize) { int newmask = newsize - 1; String ** newstrings = new String * [newsize]; double * newdoubles = new double [newsize]; unsigned int * newkeys = new unsigned int [newsize]; for (int i = 0; i < newsize; i++) newstrings[i] = NULL; for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) { unsigned int key = keys[i]; unsigned int h = key & newmask; while (newstrings[h] != NULL && (newkeys[h] != key || (!stringsEqual(*(newstrings[h]), *(strings[i]))))) h = (h + 1) & newmask; newkeys[h] = key; newstrings[h] = strings[i]; newdoubles[h] = doubles[i]; } if(strings) delete [] strings; if(doubles) delete [] doubles; if(keys) delete [] keys; 
strings = newstrings; doubles = newdoubles; keys = newkeys; size = newsize; mask = newmask; } int StringDoubleHash::Add(const String & string, double value) { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) Insert(h, key, string); doubles[h] = value; if (count * 2 > size) { Grow(); return Iterate(key, string); } return h; } int StringDoubleHash::Find(const String & string, double defaultValue) { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) { Insert(h, key, string); doubles[h] = defaultValue; if (count * 2 > size) { Grow(); return Iterate(key, string); } } return h; } int StringDoubleHash::Find(const String & string) const { unsigned int key = getKey(string); unsigned int h = Iterate(key, string); if (strings[h] == NULL) return -1; return h; } void StringDoubleHash::Delete(unsigned int index) { if (index >= size || strings[index] == NULL) return; delete strings[index]; strings[index] = NULL; count--; if (count * 8 < size && size > 32) Shrink(); else { // rehash the next strings until we find empty slot index = (index + 1) & mask; while (strings[index] != NULL) { if ((keys[index] & mask) != index) { unsigned int h = Iterate(keys[index], *strings[index]); if (h != (unsigned int) index) { keys[h] = keys[index]; strings[h] = strings[index]; doubles[h] = doubles[index]; strings[index] = NULL; } } index = (index + 1) & mask; } } } void StringHash::Print() { Print(stdout); } void StringHash::Print(const char * filename) { FILE * output = fopen(filename, "wt"); if (output == NULL) return; Print(output); fclose(output); } void StringHash::Print(FILE * output) { for (unsigned int i = 0; i < size; i++) if (SlotInUse(i)) strings[i]->WriteLine(output); } String StringHash::StringList(char separator) { String list; for (unsigned int i = 0; i < size; i++) if (SlotInUse(i)) list += *strings[i] + separator; list.SetLength(list.Length() - 1); return list; } int 
StringIntHash::GetCount(const String & key) const { int index = Find(key); return index == -1 ? 0 : integers[index]; } int StringIntHash::IncrementCount(const String & key) { int index = Find(key); if (index != -1) return ++(integers[index]); SetInteger(key, 1); return 1; } int StringIntHash::IncrementCount(const String & key, int amount) { int index = Find(key); if (index != -1) return (integers[index] += amount); SetInteger(key, amount); return amount; } int StringIntHash::DecrementCount(const String & key) { int index = Find(key); if (index != -1) return --(integers[index]); SetInteger(key, -1); return -1; } void StringDoubleHash::Clear() { for (unsigned int i = 0; i < size; i++) if (strings[i] != NULL) { delete strings[i]; strings[i] = NULL; } count = 0; if (size > 256) SetSize(256); } StringHash & StringHash::operator = (const StringHash & rhs) { Clear(); for (int i = 0; i < rhs.Capacity(); i++) if (rhs.SlotInUse(i)) Add(*(rhs.strings[i]), rhs.objects[i]); return *this; } StringIntHash & StringIntHash::operator = (const StringIntHash & rhs) { Clear(); for (int i = 0; i < rhs.Capacity(); i++) if (rhs.SlotInUse(i)) Add(*(rhs.strings[i]), rhs.integers[i]); return *this; } bool StringIntHash::operator == (const StringIntHash & rhs) const { if (Capacity() != rhs.Capacity()) return false; if (Entries() != rhs.Entries()) return false; for (int i = 0; i < rhs.Capacity(); i++) { if(rhs.SlotInUse(i) != SlotInUse(i)) { return(false); } if (rhs.SlotInUse(i)) { if(*(strings[i]) != *(rhs.strings[i])) { return(false); } if(rhs.integers[i] != integers[i]) { return(false); } } } return(true); } StringDoubleHash & StringDoubleHash::operator = (const StringDoubleHash & rhs) { Clear(); for (int i = 0; i < rhs.Capacity(); i++) if (rhs.SlotInUse(i)) Add(*(rhs.strings[i]), rhs.doubles[i]); return *this; } void StringHash::Swap(StringHash & s) { String ** tstrings = s.strings; s.strings = strings; strings = tstrings; void ** tobjects = s.objects; s.objects = objects; objects = 
tobjects; unsigned int * tkeys = s.keys; s.keys = keys; keys = tkeys; unsigned int temp = s.count; s.count = count; count = temp; temp = s.size; s.size = size; size = temp; temp = s.mask; s.mask = mask; mask = temp; } libStatGen-1.0.14/general/StringHash.h000066400000000000000000000214201254730101300174640ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __STRINGHASH_H__ #define __STRINGHASH_H__ #include "StringBasics.h" #include "Constant.h" #include "Hash.h" class StringHashBase { public: inline void setCaseSensitive(bool caseSensitive) {myCaseSensitive = caseSensitive;} StringHashBase() : myCaseSensitive(false) {} virtual ~StringHashBase() {} // Make pure virtual virtual void SetSize(int newsize) = 0; protected: inline bool stringsEqual(const String& string1, const String& string2) const { if(myCaseSensitive) { // Case sensitive is faster. return(string1.FastCompare(string2) == 0); } // Case insensitive - slow compare - convert to same case. return(string1.SlowCompare(string2) == 0); } inline unsigned int getKey(const String& string) const { if(myCaseSensitive) { return(hash(string.uchar(), string.Length(), 0)); } // Case insensitive. 
return(hash_no_case(string.uchar(), string.Length(), 0)); } bool myCaseSensitive; }; class StringHash : public StringHashBase { protected: String ** strings; void ** objects; unsigned int * keys; unsigned int count, size; unsigned int mask; public: StringHash(int startsize = 32); virtual ~StringHash(); void Grow() { SetSize(size * 2); } void Shrink() { SetSize(size / 2); } void SetSize(int newsize); void Clear(); int Capacity() const { return size; } int Entries() const { return count; } void * Object(int i) const { return objects[i]; } void * Object(const String & key) const { int index = Find(key); return index >= 0 ? objects[index] : NULL; } void * Object(const String & key, void *(*create_object)()) { int index = Find(key, create_object); return objects[index]; } void SetObject(int i, void * object) { objects[i] = object; } void SetObject(const String & key, void * object) { Add(key, object); } int Add(const String & s, void * object = NULL); int Find(const String & s, void *(*create_object)() = NULL); int Find(const String & s) const; StringHash & operator = (const StringHash & rhs); const String & operator [](int i) const { return *(strings[i]); } String & operator [](int i) { return *(strings[i]); } // String & String(int i) { return *(strings[i]); } static void * CreateHash(); void Delete(unsigned int index); void Delete(const String & key) { Delete(Find(key)); } bool SlotInUse(int index) const { return strings[index] != NULL; } void Print(); void Print(FILE * file); void Print(const char * filename); String StringList(char separator = ','); // Initialize hash with the contents of a file void ReadLinesFromFile(FILE * file); void ReadLinesFromFile(const char * filename); void ReadLinesFromFile(IFILE & file); void Swap(StringHash & s); private: unsigned int Iterate(unsigned int key, const String & string) const { unsigned int h = key & mask; while (strings[h] != NULL && (keys[h] != key || (!stringsEqual(*(strings[h]), string)))) h = (h + 1) & mask; return h; 
} void Insert(unsigned int where, unsigned int key, const String & string) { strings[where] = new String; *(strings[where]) = string; keys[where] = key; count++; } }; class StringIntHash : public StringHashBase { protected: String ** strings; int * integers; unsigned int * keys; unsigned int count, size; unsigned int mask; public: StringIntHash(int startsize = 32); virtual ~StringIntHash(); void Grow() { SetSize(size * 2); } void Shrink() { SetSize(size / 2); } void SetSize(int newsize); void Clear(); int Capacity() const { return size; } int Entries() const { return count; } int Integer(int i) const { return integers[i]; } int Integer(const String & key) const { int index = Find(key); return index >= 0 ? integers[index] : -1; } void SetInteger(int i, int value) { integers[i] = value; } void SetInteger(const String & key, int value) { Add(key, value); } int IncrementCount(const String & key); int IncrementCount(const String & key, int amount); int DecrementCount(const String & key); int GetCount(const String & key) const; int GetCount(int index) const { return integers[index]; } int Add(const String & s, int integer); int Find(const String & s, int defaultValue); int Find(const String & s) const; StringIntHash & operator = (const StringIntHash & rhs); bool operator == (const StringIntHash & rhs) const; const String & operator [](int i) const { return *(strings[i]); } String & operator [](int i) { return *(strings[i]); } // String & String(int i) { return *(strings[i]); } void Delete(unsigned int index); void Delete(const String & key) { Delete(Find(key)); } bool SlotInUse(int index) const { return strings[index] != NULL; } private: unsigned int Iterate(unsigned int key, const String & string) const { unsigned int h = key & mask; while (strings[h] != NULL && (keys[h] != key || (!stringsEqual(*(strings[h]), string)))) h = (h + 1) & mask; return h; } void Insert(unsigned int where, unsigned int key, const String & string) { strings[where] = new String; 
*(strings[where]) = string; keys[where] = key; count++; } }; class StringDoubleHash : public StringHashBase { protected: String ** strings; double * doubles; unsigned int * keys; unsigned int count, size; unsigned int mask; public: StringDoubleHash(int startsize = 32); virtual ~StringDoubleHash(); void Grow() { SetSize(size * 2); } void Shrink() { SetSize(size / 2); } void SetSize(int newsize); void Clear(); int Capacity() const { return size; } int Entries() const { return count; } double Double(int i) const { return doubles[i]; } double Double(const String & key) const { int index = Find(key); return index >= 0 ? doubles[index] : _NAN_; } void SetDouble(int i, double value) { doubles[i] = value; } void SetDouble(const String & key, double value) { Add(key, value); } int Add(const String & s, double value); int Find(const String & s, double defaultValue); int Find(const String & s) const; StringDoubleHash & operator = (const StringDoubleHash & rhs); const String & operator [](int i) const { return *(strings[i]); } String & operator [](int i) { return *(strings[i]); } // String & String(int i) { return *(strings[i]); } void Delete(unsigned int index); void Delete(const String & key) { Delete(Find(key)); } bool SlotInUse(int index) const { return strings[index] != NULL; } private: unsigned int Iterate(unsigned int key, const String & string) const { unsigned int h = key & mask; while (strings[h] != NULL && (keys[h] != key || (!stringsEqual(*(strings[h]), string)))) h = (h + 1) & mask; return h; } void Insert(unsigned int where, unsigned int key, const String & string) { strings[where] = new String; *(strings[where]) = string; keys[where] = key; count++; } }; #endif libStatGen-1.0.14/general/StringMap.cpp000066400000000000000000000262421254730101300176600ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public 
License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "StringMap.h" int StringMap::alloc = 8; StringMap::StringMap(int startsize) { count = 0; size = (startsize + alloc) / alloc * alloc; strings = new ::String * [size]; objects = new void * [size]; }; StringMap::~StringMap() { for (int i = 0; i < count; i++) delete strings[i]; delete [] strings; delete [] objects; } void StringMap::Grow(int newsize) { if (newsize >= size) { if ((newsize >> 1) >= size) size = (newsize + alloc) / alloc * alloc; else { size = alloc; while (size <= newsize) size *= 2; } size = (newsize + alloc) / alloc * alloc; ::String ** newStrings = new ::String * [size]; void ** newObjects = new void * [size]; for (int i = 0; i < count; i++) { newStrings[i] = strings[i]; newObjects[i] = objects[i]; } delete [] strings; delete [] objects; strings = newStrings; objects = newObjects; } } int StringMap::Add(const ::String & key, void * object) { if (count == 0) { Grow(1); strings[0] = new ::String(key); objects[0] = object; return count++; } int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = key.SlowCompare(*(strings[probe])); if (test == 0) { objects[probe] = object; return probe; } if (test < 0) right = probe - 1; else left = probe + 1; } int insertAt = left; int test = key.SlowCompare(*(strings[insertAt])); if (test == 0) { objects[insertAt] = object; return insertAt; } if (test > 0) insertAt++; Grow(count + 1); if (insertAt < count) { for (int i = count; i > insertAt; i--) { strings[i] = strings[i - 1]; objects[i] 
= objects[i - 1]; } } strings[insertAt] = new ::String(key); objects[insertAt] = object; count++; return insertAt; } int StringMap::Find(const ::String & s, void *(*create_object)()) { if (!count) return create_object == NULL ? -1 : Add(s, create_object()); int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = s.SlowCompare(*(strings[probe])); if (test == 0) return probe; if (test < 0) right = probe - 1; else left = probe + 1; } int position = left; int test = s.SlowCompare(*(strings[left])); if (test == 0) return position; if (create_object == NULL) return -1; if (test > 0) position++; Grow(count + 1); if (position < count) { for (int i = count; i > position; i--) { strings[i] = strings[i - 1]; objects[i] = objects[i - 1]; } } strings[position] = new ::String(s); objects[position] = create_object(); count++; return position; } int StringMap::Find(const ::String & s) const { if (!count) return -1; int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = s.SlowCompare(*(strings[probe])); if (test == 0) return probe; if (test < 0) right = probe - 1; else left = probe + 1; } int position = left; int test = s.SlowCompare(*(strings[left])); if (test == 0) return position; return -1; } int StringMap::FindStem(const ::String & stem) const { if (!count) return -1; int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = strings[probe]->SlowCompareToStem(stem); if (test == 0) { if ((left < probe && strings[probe-1]->SlowCompareToStem(stem) == 0) || (right > probe && strings[probe+1]->SlowCompareToStem(stem) == 0)) return -2; return probe; } if (test > 0) right = probe - 1; else left = probe + 1; } if (strings[left]->SlowCompareToStem(stem) == 0) return left; return -1; } int StringMap::FindFirstStem(const ::String & stem) const { if (!count) return -1; int left = 0; int right = count - 1; while (right > left) { int probe = (left + 
right) / 2; int test = strings[probe]->SlowCompareToStem(stem); if (test == 0) { while (left < probe && strings[probe-1]->SlowCompareToStem(stem) == 0) probe--; return probe; } if (test > 0) right = probe - 1; else left = probe + 1; } if (strings[left]->SlowCompareToStem(stem) == 0) return left; return -1; } void * StringMap::CreateMap() { return (void *) new StringMap(); } void StringMap::Clear() { for (int i = 0; i < count; i++) delete strings[i]; count = 0; } void StringMap::Delete(int index) { count--; delete strings[index]; for (int i = index; i < count; i++) { strings[i] = strings[i+1]; objects[i] = objects[i+1]; } } // StringIntMap class // int StringIntMap::alloc = 8; StringIntMap::StringIntMap(int startsize) { count = 0; size = (startsize + alloc) / alloc * alloc; strings = new ::String * [size]; integers = new int[size]; }; StringIntMap::~StringIntMap() { for (int i = 0; i < count; i++) delete strings[i]; delete [] strings; delete [] integers; } void StringIntMap::Grow(int newsize) { if (newsize >= size) { if ((newsize >> 1) >= size) size = (newsize + alloc) / alloc * alloc; else { size = alloc; while (size <= newsize) size *= 2; } ::String ** newStrings = new ::String * [size]; int * newIntegers = new int [size]; for (int i = 0; i < count; i++) { newStrings[i] = strings[i]; newIntegers[i] = integers[i]; } delete [] strings; delete [] integers; strings = newStrings; integers = newIntegers; } } int StringIntMap::Add(const ::String & key, int integer) { if (count == 0) { Grow(1); strings[0] = new ::String(key); integers[0] = integer; return count++; } int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = key.SlowCompare(*(strings[probe])); if (test == 0) { integers[probe] = integer; return probe; } if (test < 0) right = probe - 1; else left = probe + 1; } int insertAt = left; int test = key.SlowCompare(*(strings[insertAt])); if (test == 0) { integers[insertAt] = integer; return insertAt; } if (test > 0) 
insertAt++; Grow(count + 1); if (insertAt < count) { for (int i = count; i > insertAt; i--) { strings[i] = strings[i - 1]; integers[i] = integers[i - 1]; } } strings[insertAt] = new ::String(key); integers[insertAt] = integer; count++; return insertAt; } int StringIntMap::Find(const ::String & s, int defaultValue) { if (!count) return Add(s, defaultValue); int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = s.SlowCompare(*(strings[probe])); if (test == 0) return probe; if (test < 0) right = probe - 1; else left = probe + 1; } int position = left; int test = s.SlowCompare(*(strings[left])); if (test == 0) return position; if (test > 0) position++; Grow(count + 1); if (position < count) { for (int i = count; i > position; i--) { strings[i] = strings[i - 1]; integers[i] = integers[i - 1]; } } strings[position] = new ::String(s); integers[position] = defaultValue; count++; return position; } int StringIntMap::Find(const ::String & s) const { if (!count) return -1; int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = s.SlowCompare(*(strings[probe])); if (test == 0) return probe; if (test < 0) right = probe - 1; else left = probe + 1; } int position = left; int test = s.SlowCompare(*(strings[left])); if (test == 0) return position; return -1; } int StringIntMap::FindStem(const ::String & stem) const { if (!count) return -1; int left = 0; int right = count - 1; while (right > left) { int probe = (left + right) / 2; int test = strings[probe]->SlowCompareToStem(stem); if (test == 0) { if ((left < probe && strings[probe-1]->SlowCompareToStem(stem) == 0) || (right > probe && strings[probe+1]->SlowCompareToStem(stem) == 0)) return -2; return probe; } if (test > 0) right = probe - 1; else left = probe + 1; } if (strings[left]->SlowCompareToStem(stem) == 0) return left; return -1; } void StringIntMap::Clear() { for (int i = 0; i < count; i++) delete strings[i]; count = 0; } int 
StringIntMap::GetCount(const ::String & key) const { int index = Find(key); return index == -1 ? 0 : integers[index]; } int StringIntMap::IncrementCount(const ::String & key) { int index = Find(key); if (index != -1) return ++(integers[index]); SetInteger(key, 1); return 1; } int StringIntMap::DecrementCount(const ::String & key) { int index = Find(key); if (index != -1) return --(integers[index]); SetInteger(key, -1); return -1; } void StringIntMap::Delete(int index) { count--; delete strings[index]; for (int i = index; i < count; i++) { strings[i] = strings[i+1]; integers[i] = integers[i+1]; } } libStatGen-1.0.14/general/StringMap.h000066400000000000000000000072071254730101300173250ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __STRINGMAP_H__ #define __STRINGMAP_H__ #include "StringBasics.h" class StringMap { protected: ::String ** strings; void ** objects; int count, size; public: static int alloc; StringMap(int startsize = 0); virtual ~StringMap(); void Grow(int newsize); void Clear(); int Length() const { return count; } void * Object(int i) const { return objects[i]; } void * Object(const ::String & key) const { int index = Find(key); return (index >= 0) ? 
objects[index] : NULL; } void * Object(const ::String & key, void *(*create_object)()) { return objects[Find(key, create_object)]; } void SetObject(int i, void * object) { objects[i] = object; } void SetObject(const ::String & key, void * object) { Add(key, object); } int Add(const ::String & s, void * object = NULL); int Find(const ::String & s, void *(*create_object)() = NULL); int Find(const ::String & s) const; int FindStem(const ::String & stem) const; int FindFirstStem(const ::String & stem) const; StringMap & operator = (const StringMap & rhs); const ::String & operator [](int i) const { return *(strings[i]); } ::String & operator [](int i) { return *(strings[i]); } ::String & String(int i) { return *(strings[i]); } static void * CreateMap(); void Delete(int index); }; class StringIntMap { protected: ::String ** strings; int * integers; int count, size; public: static int alloc; StringIntMap(int startsize = 0); virtual ~StringIntMap(); void Grow(int newsize); void Clear(); int Length() const { return count; } int Integer(int i) const { return integers[i]; } int Integer(const ::String & key) const { int index = Find(key); return (index >= 0) ? 
(int) integers[index] : -1; } void SetInteger(int i, int value) { integers[i] = value; } void SetInteger(const ::String & key, int value) { Add(key, value); } int Add(const ::String & s, int i); int Find(const ::String & s, int defaultValue); int Find(const ::String & s) const; int FindStem(const ::String & stem) const; StringIntMap & operator = (const StringIntMap & rhs); const ::String & operator [](int i) const { return *(strings[i]); } ::String & operator [](int i) { return *(strings[i]); } ::String & String(int i) { return *(strings[i]); } static void * CreateMap(); int IncrementCount(const ::String & key); int DecrementCount(const ::String & key); int GetCount(const ::String & key) const; int GetCount(int index) const { return integers[index]; } void Delete(int index); }; #endif libStatGen-1.0.14/general/Tabix.cpp000066400000000000000000000157511254730101300170260ustar00rootroot00000000000000/* * Copyright (C) 2012-2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Tabix.h" #include #include "StringBasics.h" Tabix::Tabix() : IndexBase(), myChromNamesBuffer(NULL) { } Tabix::~Tabix() { if(myChromNamesBuffer != NULL) { delete[] myChromNamesBuffer; myChromNamesBuffer = NULL; } } // Reset the member data for a new index file. 
void Tabix::resetIndex() { IndexBase::resetIndex(); if(myChromNamesBuffer != NULL) { delete[] myChromNamesBuffer; myChromNamesBuffer = NULL; } myChromNamesVector.clear(); } // Read & parse the specified index file. StatGenStatus::Status Tabix::readIndex(const char* filename) { // Reset the index from anything that may previously be set. resetIndex(); IFILE indexFile = ifopen(filename, "rb"); // Failed to open the index file. if(indexFile == NULL) { return(StatGenStatus::FAIL_IO); } // read the tabix index structure. // Read the magic string. char magic[4]; if(ifread(indexFile, magic, 4) != 4) { // Failed to read the magic return(StatGenStatus::FAIL_IO); } // If this is not an index file, set num references to 0. if (magic[0] != 'T' || magic[1] != 'B' || magic[2] != 'I' || magic[3] != 1) { // Not a Tabix Index file. return(StatGenStatus::FAIL_PARSE); } // It is a tabix index file. // Read the number of reference sequences. if(ifread(indexFile, &n_ref, 4) != 4) { // Failed to read. return(StatGenStatus::FAIL_IO); } // Size the references. myRefs.resize(n_ref); // Read the Format configuration. if(ifread(indexFile, &myFormat, sizeof(myFormat)) != sizeof(myFormat)) { // Failed to read. return(StatGenStatus::FAIL_IO); } // Read the length of the chromosome names. uint32_t l_nm; if(ifread(indexFile, &l_nm, sizeof(l_nm)) != sizeof(l_nm)) { // Failed to read. return(StatGenStatus::FAIL_IO); } // Read the chromosome names. myChromNamesBuffer = new char[l_nm]; if(ifread(indexFile, myChromNamesBuffer, l_nm) != l_nm) { return(StatGenStatus::FAIL_IO); } myChromNamesVector.resize(n_ref); // Parse out the chromosome names. bool prevNull = true; int chromIndex = 0; for(uint32_t i = 0; i < l_nm; i++) { if(chromIndex >= n_ref) { // already set the pointer for the last chromosome name, // so stop looping. 
break; } if(prevNull == true) { myChromNamesVector[chromIndex++] = myChromNamesBuffer + i; prevNull = false; } if(myChromNamesBuffer[i] == '\0') { prevNull = true; } } for(int refIndex = 0; refIndex < n_ref; refIndex++) { // Read each reference. Reference* ref = &(myRefs[refIndex]); // Read the number of bins. if(ifread(indexFile, &(ref->n_bin), 4) != 4) { // Failed to read the number of bins. // Return failure. return(StatGenStatus::FAIL_PARSE); } // Resize the bins. ref->bins.resize(ref->n_bin + 1); // Read each bin. for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) { uint32_t binNumber; // Read in the bin number. if(ifread(indexFile, &(binNumber), 4) != 4) { // Failed to read the bin number. // Return failure. return(StatGenStatus::FAIL_IO); } // Add the bin to the reference and get the // pointer back so the values can be set in it. Bin* binPtr = &(ref->bins[binIndex]); binPtr->bin = binNumber; // Read in the number of chunks. if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) { // Failed to read number of chunks. // Return failure. return(StatGenStatus::FAIL_IO); } // Read in the chunks. // Allocate space for the chunks. uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) { // Failed to read the chunks. // Return failure. return(StatGenStatus::FAIL_IO); } } // Read the number of intervals. if(ifread(indexFile, &(ref->n_intv), 4) != 4) { // Failed to read, set to 0. ref->n_intv = 0; // Return failure. return(StatGenStatus::FAIL_IO); } // Allocate space for the intervals and read them. uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); ref->ioffsets = (uint64_t*)malloc(linearIndexSize); if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) { // Failed to read the linear index. // Return failure. return(StatGenStatus::FAIL_IO); } } // Successfully read teh bam index file. 
return(StatGenStatus::SUCCESS); } bool Tabix::getStartPos(const char* refName, int32_t start, uint64_t& fileStartPos) const { // Look for the reference name in the list. int refID = 0; for(refID = 0; refID < n_ref; refID++) { if(strcmp(refName, myChromNamesVector[refID]) == 0) { // found the reference break; } } if(refID >= n_ref) { // Didn't find the refName, so return false. return(false); } // Look up in the linear index. if(start < 0) { // Negative index, so start at 0. start = 0; } return(getMinOffsetFromLinearIndex(refID, start, fileStartPos)); } const char* Tabix::getRefName(unsigned int indexNum) const { if(indexNum >= myChromNamesVector.size()) { String message = "ERROR: Out of range on Tabix::getRefName("; message += indexNum; message += ")"; throw(std::runtime_error(message.c_str())); return(NULL); } return(myChromNamesVector[indexNum]); } libStatGen-1.0.14/general/Tabix.h000066400000000000000000000046161254730101300164710ustar00rootroot00000000000000/* * Copyright (C) 2012-2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __TABIX_H__ #define __TABIX_H__ #include #include #include #include #include "IndexBase.h" #include "InputFile.h" #include "StatGenStatus.h" class Tabix : public IndexBase { public: enum Format { FORMAT_GENERIC = 0, FORMAT_SAM = 1, FORMAT_VCF = 2 }; Tabix(); virtual ~Tabix(); /// Reset the member data for a new index file. 
void resetIndex(); // Read & parse the specified index file. /// \param filename the bam index file to be read. /// \return the status of the read. StatGenStatus::Status readIndex(const char* filename); /// Get the starting file offset to look for the specified start position. /// For an entire reference ID, set start to -1. /// To start at the beginning of the region, set start to 0/-1. bool getStartPos(const char* refName, int32_t start, uint64_t& fileStartPos) const; /// Return the reference name at the specified index or /// throws an exception if out of range. const char* getRefName(unsigned int indexNum) const; // Get the format of this tabix file. inline int32_t getFormat() const { return myFormat.format; } private: struct TabixFormat { int32_t format; int32_t col_seq; int32_t col_beg; int32_t col_end; int32_t meta; // character that starts header lines int32_t skip; // Number of lines to skip from putting into the index. }; TabixFormat myFormat; char* myChromNamesBuffer; // vector pointing to the chromosome names. std::vector myChromNamesVector; }; #endif libStatGen-1.0.14/general/TrimSequence.h000066400000000000000000000116251254730101300200240ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef _TRIMSEQUENCE_H #define _TRIMSEQUENCE_H #include #include #include #ifndef __WIN32__ #include #endif /// /// TrimSequence is a templated function to find bases /// which are below a certain moving mean threshold, /// and can be applied to either end of the sequence string. /// /// @param sequence is the input sequence /// @param meanValue is the value below which we wish to trim. /// @return the iterator of the location at which untrimmed values begin /// /// Details: /// /// trimFromLeft is a bool indicating which direction we wish /// to trim. true -> left to right, false is right to left. /// /// The code is convoluted enough here, so for implementation /// and testing sanity, the following definitions are made: /// /// When trimFromLeft is true: /// result == sequence.begin() implies no trimming /// result == sequence.end() implies all values are trimmed /// /// When trimFromLeft is false: /// result == sequence.begin() implies all values are trimmed /// result == sequence.end() no values are trimmed /// /// result will always be in the range [sequence.begin() , sequence.end()) /// (begin is inclusive, end is exclusive). /// /// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples /// /// THIS CODE IS EXCEPTIONALLY FRAGILE. DO NOT ATTEMPT TO FIX OR /// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY /// SHOW WHY OR WHY NOT SOMETHING WORKS. /// template typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft) { const int howManyValues = 4; // this is used in signed arithmetic below int windowThreshold = howManyValues * meanValue; int64_t sumOfWindow = 0; typename sequenceType::iterator it; // // Sanity check to weed out what otherwise would be // a large number of boundary checks below. If the input // is too small, just punt it back to the caller. Technically, // we can still trim, but we'd just do the simple iteration // loop. 
Save that for when we care. // if (sequence.size() < (size_t) howManyValues) return trimFromLeft? sequence.begin() : sequence.end(); typename sequenceType::iterator sequenceBegin; typename sequenceType::iterator sequenceEnd; // The algorithm should be clear and efficient // so it does not bother to write codes for two directions. // It that way, we avoid thinking trimming from left and right interchangably. if (trimFromLeft) { // sequenceBegin is inclusive, sequenceEnd is exclusive, sequenceBegin = sequence.begin(); sequenceEnd = sequence.end(); for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++) sumOfWindow += *it; for (; it < sequenceEnd; it ++) { if (sumOfWindow > windowThreshold) break; sumOfWindow += *it; sumOfWindow -= *(it - howManyValues); } // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively // the range is also [sequence.begin() + howManyValues, sequence.end()] while (*(it-1) >= meanValue && (it-1) >= sequenceBegin) it--; } else { sequenceBegin = sequence.end() - 1; sequenceEnd = sequence.begin() - 1; for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--) sumOfWindow += *it; for (; it > sequenceEnd; it--) { if (sumOfWindow > windowThreshold) break; sumOfWindow += *it; sumOfWindow -= *(it + howManyValues); } // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues] while (*(it+1) >= meanValue && (it+1) <= sequenceBegin) it ++; // note, the return value should in the range [sequence.begin(), sequence.end()] it += 1; } // 'it' may be sequence.end() in some cases assert(it >= sequence.begin() && it <= sequence.end()); return it; } #endif libStatGen-1.0.14/general/UncompressedFileType.cpp000066400000000000000000000036541254730101300220670ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or 
modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "UncompressedFileType.h" #include UncompressedFileType::UncompressedFileType(const char * filename, const char * mode) : filePtr(NULL), kfilePtr(NULL), keof(false) { // Check if opening for read. if((mode[0] == 'r') || (mode[0] == 'R')) { if(strcmp(filename, "-") == 0) { // read from stdin filePtr = stdin; } else if((strstr(filename, "ftp://") == filename) || (strstr(filename, "http://") == filename)) { // Reading http/ftp, so open the file using knetfile. kfilePtr = knet_open(filename, mode); } else { // Open the file. filePtr = fopen(filename, mode); } } else { // Not for read. // If the file is for write and is '-', then write to stdout. if(((mode[0] == 'w') || (mode[0] == 'W')) && (strcmp(filename, "-") == 0)) { // Write to stdout. filePtr = stdout; } else { // Open the file. filePtr = fopen(filename, mode); } } }; libStatGen-1.0.14/general/UncompressedFileType.h000066400000000000000000000130061254730101300215240ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __UNCOMPRESSEDFILETYPE_H__ #define __UNCOMPRESSEDFILETYPE_H__ #include #include #include "FileType.h" #include "knetfile.h" class UncompressedFileType : public FileType { public: UncompressedFileType() { filePtr = NULL; kfilePtr = NULL; keof = false; } virtual ~UncompressedFileType() { if((filePtr != NULL) || (kfilePtr != NULL)) { close(); } } UncompressedFileType(const char * filename, const char * mode); bool operator == (void * rhs) { // No two file pointers are the same, so if rhs is not NULL, then // the two pointers are different (false). if (rhs != NULL) return false; // rhs is NULL. They are the same if both filePtr & kfilePtr are NULL. return((filePtr == rhs) && (kfilePtr == rhs)); } bool operator != (void * rhs) { // No two file pointers are the same, so if rhs is not NULL, then // the two pointers are different (true). if (rhs != NULL) return true; // rhs is NULL. They are the different if either filePtr or kfilePtr // are not NULL. return((filePtr != rhs) || (kfilePtr != rhs)); } // Close the file. inline int close() { if(filePtr != NULL) { if((filePtr != stdout) && (filePtr != stdin)) { int result = fclose(filePtr); filePtr = NULL; return result; } filePtr = NULL; } else if(kfilePtr != NULL) { int result = knet_close(kfilePtr); kfilePtr = NULL; return result; } return 0; } // Reset to the beginning of the file. inline void rewind() { // Just call rewind to move to the beginning of the file. if(filePtr != NULL) { ::rewind(filePtr); } else if (kfilePtr != NULL) { knet_seek(kfilePtr, 0, SEEK_SET); } } // Check to see if we have reached the EOF. inline int eof() { // check the file for eof. 
if(kfilePtr != NULL) { return(keof); } else { return feof(filePtr); } } // Check to see if the file is open. virtual inline bool isOpen() { if((filePtr != NULL) || (kfilePtr != NULL)) { // filePtr is not null, so the file is open. return(true); } return(false); } // Write to the file inline unsigned int write(const void * buffer, unsigned int size) { // knetfile is never used for writing. return fwrite(buffer, 1, size, filePtr); } // Read into a buffer from the file. Since the buffer is passed in and // this would bypass the fileBuffer used by this class, this method must // be protected. inline int read(void * buffer, unsigned int size) { if(kfilePtr != NULL) { int bytesRead = knet_read(kfilePtr, buffer, size); if((bytesRead == 0) && (size != 0)) { keof = true; } else if((bytesRead != (int)size) && (bytesRead >= 0)) { // Less then the requested size was read and an error // was not returned (bgzf_read returns -1 on error). keof = true; } else { keof = false; } return(bytesRead); } return fread(buffer, 1, size, filePtr); } // Get current position in the file. // -1 return value indicates an error. virtual inline int64_t tell() { if(kfilePtr != NULL) { return knet_tell(kfilePtr); } return ftell(filePtr); } // Seek to the specified offset from the origin. // origin can be any of the following: // Note: not all are valid for all filetypes. // SEEK_SET - Beginning of file // SEEK_CUR - Current position of the file pointer // SEEK_END - End of file // Returns true on successful seek and false on a failed seek. virtual inline bool seek(int64_t offset, int origin) { int returnVal = 0; if(kfilePtr != NULL) { returnVal = knet_seek(kfilePtr, offset, origin); keof = false; } else { returnVal = fseek(filePtr, offset, origin); } // Check for success - 0 return value. if (returnVal == 0) { return true; } // Successful. return false; } protected: // A FILE Pointer is used. 
FILE* filePtr; knetFile *kfilePtr; bool keof; }; #endif libStatGen-1.0.14/general/UnitTest.h000066400000000000000000000026021254730101300171720ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __UNITTEST_H #define __UNITTEST_H #include #include

// Minimal bookkeeping for a named group of tests: a title plus counters for
// how many tests were run and how many of them failed.
class UnitTest
{
protected:
    std::string m_title;
    int m_failures;
    int m_testNum;

public:
    UnitTest(const char *title) : m_title(title), m_failures(0), m_testNum(0) {}

    // Declared here; no definition is visible in this header.
    void test();

    // Number of tests run minus the number that failed.
    int getPassCount() const {return m_testNum - m_failures;}
    int getFailureCount() const {return m_failures;}
    const std::string getTitle() const {return m_title;}
};

// Print "<title> PASS: <n> FAIL: <m>" followed by a newline.
// Must be inline: this is a function definition in a header, and without
// inline every translation unit including it emits its own copy, which
// fails at link time with a multiple-definition error.
inline std::ostream &operator << (std::ostream &stream, UnitTest &test)
{
    stream << test.getTitle() << " PASS: " << test.getPassCount() <<
    " FAIL: " << test.getFailureCount() << std::endl;
    return stream;
}

#endif
libStatGen-1.0.14/general/glfHandler.cpp000066400000000000000000000220761254730101300200230ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "glfHandler.h" #include "BaseQualityHelper.h" char glfHandler::translateBase[16] = {0, 1, 2, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0}; char glfHandler::backTranslateBase[5] = { 15, 1, 2, 4, 8 }; unsigned char glfHandler::nullLogLikelihoods[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; double glfHandler::nullLikelihoods[10] = {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}; glfHandler::glfHandler() { isStub = true; sections = 0; currentSection = 0; maxPosition = position = endOfSection = 0; } glfHandler::~glfHandler() { // Not safe to close the file here in case a copy of the file was generated // if (isOpen()) // Close(); } bool glfHandler::Open(const String & filename) { isStub = false; handle = ifopen(filename, "rb"); if (handle == NULL) { isStub = true; return false; } if (!ReadHeader()) ifclose(handle); endOfSection = true; return handle != NULL; } void glfHandler::OpenStub() { isStub = true; handle = NULL; endOfSection = true; data.recordType = 0; maxPosition = 1999999999; position = maxPosition + 1; } bool glfHandler::Create(const String & filename) { isStub = false; // glf is in BGZF format. 
handle = ifopen(filename, "wb", InputFile::BGZF); if (handle == NULL) { isStub = true; return false; } WriteHeader(); return handle != NULL; } bool glfHandler::isOpen() { return handle != NULL; } bool glfHandler::ReadHeader() { if (isStub) return true; if (handle == NULL) return false; char magicNumber[4]; if (ifread(handle, magicNumber, 4) != 4) { errorMsg = "unexpected end of file"; return false; } if (magicNumber[0] != 'G' || magicNumber[1] != 'L' || magicNumber[2] != 'F') { errorMsg = "invalid format"; return false; } if (magicNumber[3] != 3) { errorMsg = "unsupported version"; return false; } unsigned int headerLength = 0; if (ifread(handle, &headerLength, 4) != 4) { errorMsg = "unexpected end of file"; return false; } if (headerLength > 1024 * 1024) { errorMsg = "header too large -- bailing"; return false; } header.SetLength(headerLength + 1); header[headerLength] = 0; if (headerLength && ifread(handle, header.LockBuffer(headerLength + 1), headerLength) != headerLength) { errorMsg = "unexpected end of file"; return false; } return true; } void glfHandler::Close() { if (isOpen()) ifclose(handle); } void glfHandler::Rewind() { if (isOpen()) { ifrewind(handle); if (!ReadHeader()) ifclose(handle); endOfSection = true; } } bool glfHandler::NextSection() { if (isStub) { endOfSection = true; data.recordType = 0; maxPosition = 1999999999; position = maxPosition + 1; return true; } while (!endOfSection && !ifeof(handle)) NextEntry(); endOfSection = false; int labelLength = 0; currentSection++; position = 0; if (ifread(handle, &labelLength, sizeof(int)) == sizeof(int)) { ifread(handle, label.LockBuffer(labelLength+1), labelLength * sizeof(char)); label.UnlockBuffer(); maxPosition = 0; ifread(handle, &maxPosition, sizeof(int)); return ((maxPosition > 0) && !ifeof(handle)); } return false; } bool glfHandler::NextBaseEntry() { bool result = true; do { result = NextEntry(); } while (result && data.recordType == 2); return result; } bool glfHandler::NextEntry() { if 
(isStub) return false; // Read record type if (endOfSection || (ifread(handle, &data, 1) != 1)) { endOfSection = true; data.recordType = 0; position = maxPosition + 1; return false; } // printf("%d/%d\n", data.recordType, data.refBase); if (position > maxPosition) return true; switch (data.recordType) { case 0 : endOfSection = true; position = maxPosition + 1; return true; case 1 : if (ifread(handle,((char *) &data) + 1, sizeof(data) - 1) == sizeof(data) - 1) { data.refBase = translateBase[data.refBase]; for (int i = 0; i < 10; i++) likelihoods[i] = bQualityConvertor.toDouble(data.lk[i]); position = position + data.offset; return true; } // Premature end of file data.recordType = 0; position = maxPosition + 1; return false; case 2 : while (ifread(handle, ((char *) &data) + 1, sizeof(data) - 4) == sizeof(data) - 4) { data.refBase = translateBase[data.refBase]; for (int i = 0; i < 3; i++) likelihoods[i] = bQualityConvertor.toDouble(data.indel.lk[i]); position = position + data.offset; indelSequence[0].SetLength(abs(data.indel.length[0]) + 1); indelSequence[0][abs(data.indel.length[0])] = 0; if (ifread(handle, indelSequence[0].LockBuffer(), abs(data.indel.length[0])) != (unsigned int) abs(data.indel.length[0])) break; indelSequence[1].SetLength(abs(data.indel.length[1]) + 1); indelSequence[1][abs(data.indel.length[1])] = 0; if (ifread(handle, indelSequence[1].LockBuffer(), abs(data.indel.length[1])) != (unsigned int) abs(data.indel.length[1])) break; return true; } // Premature end of file data.recordType = 0; position = maxPosition + 1; return false; } return false; } glfEntry & glfEntry::operator = (glfEntry & rhs) { refBase = rhs.refBase; recordType = rhs.recordType; offset = rhs.offset; mapQuality = rhs.mapQuality; for (int i = 0; i < 10; i++) lk[i] = rhs.lk[i]; minLLK = rhs.minLLK; depth = rhs.depth; return * this; } const double * glfHandler::GetLikelihoods(int pos) { if (pos == position) return likelihoods; return nullLikelihoods; } const unsigned char * 
glfHandler::GetLogLikelihoods(int pos) { if (pos == position) return data.lk; return nullLogLikelihoods; } char glfHandler::GetReference(int pos, char defaultBase) { if (pos == position) return data.refBase; return defaultBase; } int glfHandler::GetDepth(int pos) { if (pos == position) return data.depth; return 0; } int glfHandler::GetMapQuality(int pos) { if (pos == position) return data.mapQuality; return 0; } void glfHandler::WriteHeader(const String & headerText) { char magicNumber[4] = {'G', 'L', 'F', 3}; ifwrite(handle, magicNumber, 4); unsigned int headerLength = headerText.Length(); ifwrite(handle, &headerLength, 4); ifwrite(handle, (void *)(const char *) headerText, headerLength); } void glfHandler::BeginSection(const String & sectionLabel, int sectionLength) { int labelLength = sectionLabel.Length() + 1; ifwrite(handle, &labelLength, sizeof(int)); ifwrite(handle, (void *)(const char *) sectionLabel, labelLength); ifwrite(handle, §ionLength, sizeof(int)); label = sectionLabel; maxPosition = sectionLength; } void glfHandler::EndSection() { char marker = 0; ifwrite(handle, &marker, sizeof(char)); } void glfHandler::WriteEntry(int outputPosition) { data.offset = outputPosition - position; position = outputPosition; switch (data.recordType) { case 0 : EndSection(); return; case 1 : data.refBase = backTranslateBase[data.refBase]; ifwrite(handle, &data, sizeof(data)); data.refBase = translateBase[data.refBase]; return; case 2 : data.refBase = backTranslateBase[data.refBase]; ifwrite(handle, &data, sizeof(data) - 3); data.refBase = translateBase[data.refBase]; ifwrite(handle, (void *)(const char *) indelSequence[0], abs(data.indel.length[0])); ifwrite(handle, (void *)(const char *) indelSequence[1], abs(data.indel.length[1])); return; } } libStatGen-1.0.14/general/glfHandler.h000066400000000000000000000074331254730101300174700ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can 
redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GLF_HANDLER_H__ #define __GLF_HANDLER_H__ #include "InputFile.h" #include "StringBasics.h" #if defined(__APPLE__) // #pragma warn "Caution, glfHandler.h is non-portable" #else #pragma pack(push) #pragma pack(1) #endif struct glfIndel { // Likelihood for the 1/1, 2/2 and 1/2 unsigned char lk[3]; // Allele lengths short length[2]; unsigned char padding[3]; }; struct glfEntry { /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ unsigned char refBase:4, recordType:4; /** offset of this record from the previous one, in bases */ unsigned int offset; /** log10 minimum likelihood * 10 and the number of mapped reads */ unsigned depth:24, minLLK:8; /** root mean squared maximum mapping quality for overlapping reads */ unsigned char mapQuality; union { /** log10 likelihood ratio * 10 for genotypes AA, AC, AG, AT, CC, CG, CT, GG, GT, TT */ unsigned char lk[10]; glfIndel indel; }; glfEntry & operator = (glfEntry & rhs); }; #if defined(__APPLE__) // #pragma warn "Caution, glfHandler.h is non-portable" #else #pragma pack(pop) #endif class glfHandler { public: // Global information about the current GLF file bool isStub; IFILE handle; String header; // Information about the current section String label; int sections; int currentSection; int maxPosition; // Information on whether the end of the current section has been reached bool endOfSection; // Currently active GLF record glfEntry data; int position; 
double likelihoods[10]; String indelSequence[2]; // Error message in case previous command fails const char * errorMsg; glfHandler(); ~glfHandler(); bool Open(const String & filename); void OpenStub(); bool Create(const String & filename); bool isOpen(); void Close(); void Rewind(); bool NextSection(); bool NextEntry(); bool NextBaseEntry(); void BeginSection(const String & sectionLabel, int sectionLength); void EndSection(); void WriteEntry(int outputPosition); char GetReference(int position, char defaultBase); int GetDepth(int position); const double * GetLikelihoods(int position); const unsigned char * GetLogLikelihoods(int position); int GetMapQuality(int position); static const double * GetDefaultLikelihoods() { return nullLikelihoods; } static const unsigned char * GetDefaultLogLikelihoods() { return nullLogLikelihoods; } static int GenotypeIndex(int base1, int base2) { return base1 < base2 ? (base1 - 1) *(10 - base1) / 2 + (base2 - base1) : (base2 - 1) *(10 - base2) / 2 + (base1 - base2); } private: static char translateBase[16]; static char backTranslateBase[5]; static double nullLikelihoods[10]; static unsigned char nullLogLikelihoods[10]; bool ReadHeader(); void WriteHeader(const String & headerText = ""); }; #endif libStatGen-1.0.14/general/obj/000077500000000000000000000000001254730101300160145ustar00rootroot00000000000000libStatGen-1.0.14/general/obj/README.txt000066400000000000000000000000621254730101300175100ustar00rootroot00000000000000This contains the objects for the above directory.libStatGen-1.0.14/general/test/000077500000000000000000000000001254730101300162215ustar00rootroot00000000000000libStatGen-1.0.14/general/test/Chromosome_test.cpp000066400000000000000000000016351254730101300221040ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software 
Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "Chromosome.h" TEST(ChromosomeTest, staticLookupTest) { // GenomeSequence *gs = NULL; // Chromosome c(gs, 0U); // quick sanity check... } libStatGen-1.0.14/general/test/GenomeSequence_test.cpp000066400000000000000000000204751254730101300226770ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "GenomeSequence.h" const char* RM_BS_REFERENCE = "rm -f ./phiX-bs.umfa"; const char* RM_CS_REFERENCE = "rm -f ./phiX-cs.umfa"; const char* REFERENCE_NAME = "./phiX.fa"; TEST(GenomeSequenceTest, staticLookupTest) { GenomeSequence s; // quick sanity check... 
EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'A']], 'A'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'a']], 'A'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'T']], 'T'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 't']], 'T'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'C']], 'C'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'c']], 'C'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'G']], 'G'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'g']], 'G'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'N']], 'N'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'n']], 'N'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'M']], 'M'); EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'm']], 'M'); EXPECT_EQ(GenomeSequence::base2int[(int) 'N'], 4); EXPECT_EQ(GenomeSequence::base2int[(int) 'n'], 4); EXPECT_EQ(GenomeSequence::base2int[(int) 'A'], 0); EXPECT_EQ(GenomeSequence::base2int[(int) 'a'], 0); EXPECT_EQ(GenomeSequence::base2int[(int) 'T'], 3); EXPECT_EQ(GenomeSequence::base2int[(int) 't'], 3); EXPECT_EQ(GenomeSequence::base2int[(int) 'C'], 1); EXPECT_EQ(GenomeSequence::base2int[(int) 'c'], 1); EXPECT_EQ(GenomeSequence::base2int[(int) 'G'], 2); EXPECT_EQ(GenomeSequence::base2int[(int) 'g'], 2); } TEST(GenomeSequenceTest, testBaseSpaceReference) { GenomeSequence s; int exitCode = system(RM_BS_REFERENCE); EXPECT_EQ(exitCode, 0); s.setReferenceName(REFERENCE_NAME); bool rc = s.create(false); EXPECT_EQ(rc, false); EXPECT_EQ(s[0], 'G'); EXPECT_EQ(s[1], 'A'); EXPECT_EQ(s[2], 'G'); EXPECT_EQ(s[s.getNumberBases()-3], 'G'); EXPECT_EQ(s[s.getNumberBases()-2], 'C'); EXPECT_EQ(s[s.getNumberBases()-1], 'A'); EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker s.close(); } TEST(GenomeSequenceTest, testColorSpaceReference) { 
GenomeSequence s; int exitCode = system(RM_CS_REFERENCE); EXPECT_EQ(exitCode, 0); s.setReferenceName(REFERENCE_NAME); bool rc = s.create(true); // NB: I did not calculate these expected values, I just // read them from the converted genome and set them here. // So in theory, they should be checked by hand to ensure // that they are correct. EXPECT_EQ(rc, false); EXPECT_EQ(s[0], 'N'); // in color space, first symbol is unknown EXPECT_EQ(s[1], '2'); EXPECT_EQ(s[2], '2'); EXPECT_EQ(s[s.getNumberBases()-3], '1'); EXPECT_EQ(s[s.getNumberBases()-2], '3'); EXPECT_EQ(s[s.getNumberBases()-1], '1'); EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker s.close(); } #if 0 void simplestExample(void) { GenomeSequence reference; genomeIndex_t index; // a particular reference is set by: // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa") // // In the above example, the suffix .fa is stripped and replaced with .umfa, // which contains the actual file being opened. // if (reference.open()) { perror("GenomeSequence::open"); exit(1); } index = 1000000000; // 10^9 // // Write the base at the given index. Here, index is 0 based, // and is across the whole genome, as all chromosomes are sequentially // concatenated, so the allowed range is // // 0.. 
(reference.getChromosomeStart(last) + reference.getChromosomeSize(last)) // // (where int last = reference.getChromosomeCount() - 1;) // std::cout << "base[" << index << "] = " << reference[index] << std::endl; // // Example for finding chromosome and one based chromosome position given // and absolute position on the genome in 'index': // int chr = reference.getChromosome(index); genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl; // // Example for finding an absolute genome index position when the // chromosome name and one based position are known: // const char *chromosomeName = "5"; chr = reference.getChromosome(chromosomeName); // 0-based chrIndex = 100000; // 1-based index = reference.getChromosomeStart(chr) + chrIndex - 1; std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl; reference.close(); } void testGenomeSequence(void) { GenomeSequence reference; #if 0 std::string referenceName = "someotherreference"; if (reference.setFastaName(referenceName)) { std::cerr << "failed to open reference file " << referenceName << std::endl; exit(1); } #endif std::cerr << "open and prefetch the reference genome: "; // open it if (reference.open()) { exit(1); } std::cerr << "done!" 
<< std::endl; // // For the human genome, genomeIndex ranges from 0 to 3.2x10^9 // genomeIndex_t genomeIndex; // 0 based unsigned int chromosomeIndex; // 1 based unsigned int chromosome; // 0..23 or so std::string chromosomeName; // // Here we'll start with a chromosome name, then obtain the genome // index, and use it to find the base we want: // chromosomeName = "2"; chromosomeIndex = 1234567; // this call is slow (string search for chromsomeName): genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex); assert(genomeIndex!=INVALID_GENOME_INDEX); std::cout << "Chromosome " << chromosomeName << ", index "; std::cout << chromosomeIndex << " contains base " << reference[genomeIndex]; std::cout << " at genome index position " << genomeIndex << std::endl; // // now reverse it - given a genomeIndex from above, find the chromosome // name and index: // // slow (binary search on genomeIndex): chromosome = reference.getChromosome(genomeIndex); unsigned int newChromosomeIndex; // not slow: newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1; assert(chromosomeIndex == newChromosomeIndex); // more testing... 
at least test and use PackedRead: // PackedRead pr; pr.set("ATCGATCG", 0); assert(pr.size()==8); assert(pr[0]==GenomeSequence::base2int[(int) 'A']); assert(pr[1]==GenomeSequence::base2int[(int) 'T']); assert(pr[2]==GenomeSequence::base2int[(int) 'C']); assert(pr[3]==GenomeSequence::base2int[(int) 'G']); pr.set("ATCGATCG", 1); assert(pr.size()==9); pr.set("", 0); assert(pr.size()==0); pr.set("", 1); assert(pr.size()==1); pr.set("", 2); assert(pr.size()==2); pr.set("", 3); assert(pr.size()==3); assert(pr[0]==GenomeSequence::base2int[(int) 'N']); assert(pr[1]==GenomeSequence::base2int[(int) 'N']); assert(pr[2]==GenomeSequence::base2int[(int) 'N']); pr.set("C", 1); assert(pr.size()==2); assert(pr[0]==GenomeSequence::base2int[(int) 'N']); assert(pr[1]==GenomeSequence::base2int[(int) 'C']); } #endif libStatGen-1.0.14/general/test/GreedyTupleAligner_test.cpp000066400000000000000000000401611254730101300235210ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "GreedyTupleAligner.h" #include #include #include #include // design a mock class for GenomeSequence class MockGenomeSequence{ public: MockGenomeSequence(std::string sequence) : sequence(sequence) {}; char operator[] (int index) const { if (index < 0 || index >= (int)sequence.size()) { std::cerr << "exceeds boundary! 
at " << __FILE__ << ":" <<__LINE__ << std::endl; return 'N'; } return sequence[index]; } char& operator[] (int index) { if (index < 0 || index >= (int)sequence.size()) { std::cerr << "exceeds boundary! at " << __FILE__ << ":" <<__LINE__ << std::endl; return sequence[0]; } return sequence[index]; } int getNumberBases() const{ return sequence.size(); } private: std::string sequence; }; void printRefQueryCigar(const std::vector& prettyPrintReference, const std::vector& prettyPrintQuery, CigarRoller& cs, std::ostream& out){ out << " ref = "; for(std::vector::const_iterator i=prettyPrintReference.begin(); i ga(wt); ga.Align(query, strlen(query), reference, 0, strlen(ref), cs, matchPosition); } void computePrettyOutput (std::vector& prettyPrintReference, const char* ref, std::vector& prettyPrintQuery, const char* query, const int matchPosition, const CigarRoller& expectedCigar) { const char* pRef = ref; const char* pQuery = query; for (int index = 0; index < matchPosition; index++){ prettyPrintReference.push_back(*pRef++); prettyPrintQuery.push_back(' '); } for (int i = 0; i< expectedCigar.size(); i++) { switch( expectedCigar[i].operation) { case CigarRoller::mismatch: case CigarRoller::match: for (unsigned int j = 0; j < expectedCigar[i].count; j++){ prettyPrintReference.push_back(*pRef++); prettyPrintQuery.push_back(*pQuery++); } break; case CigarRoller::del: for (unsigned int j = 0; j < expectedCigar[i].count; j++){ prettyPrintReference.push_back(*pRef++); prettyPrintQuery.push_back(' '); } break; case CigarRoller::insert: for (unsigned int j = 0; j < expectedCigar[i].count; j++){ prettyPrintReference.push_back(' '); prettyPrintQuery.push_back(*pQuery++); } break; default: break; } } /// end for while (*pRef !='\0') prettyPrintReference.push_back(*pRef++); while (*pQuery !='\0') prettyPrintReference.push_back(*pQuery++); } bool verifyAlign(const char *query, const char *ref, const char *expectedCigarString, std::ostream& out = std::cout) { 
out.seekp(std::ios_base::beg); CigarRoller cs; int matchPosition; CigarRoller expectedCigar(expectedCigarString); runAlign(query, ref, cs, matchPosition); if (matchPosition < 0) { fprintf(stderr, "No match in %s, %d \n", __FILE__, __LINE__); return false; } std::vector prettyPrintReference,prettyPrintQuery; computePrettyOutput( prettyPrintReference, ref, prettyPrintQuery, query, matchPosition, expectedCigar); const char *str = cs.getString(); if((unsigned int)expectedCigar.getExpectedQueryBaseCount() != strlen(query)) { printRefQueryCigar(prettyPrintReference, prettyPrintQuery, cs, out); out << std::endl; out << "Expected Cigar string length " << expectedCigar.getExpectedQueryBaseCount() << " does not match the length of the query " << strlen(query) << ". Please fix test case." << std::endl; return false; } if((unsigned int)cs.getExpectedQueryBaseCount() != strlen(query)) { printRefQueryCigar(prettyPrintReference, prettyPrintQuery, cs, out); out << std::endl; out << "Query Length of " << strlen(query) << " does not match computed cigar string length of " << cs.getExpectedQueryBaseCount() << std::endl; return false; } if (strcmp(expectedCigarString, str) == 0) { // printf("[Correct Answer = %s] \n", expectedCigarString) ; return true; } else { printRefQueryCigar(prettyPrintReference, prettyPrintQuery, cs, out); out << "[Correct Answer = " << expectedCigarString << "] --------------------- Wrong!" 
<< std::endl; return false; } return true; } TEST(GreedyTupleAlignerTest, AlignToShortReference) { std::stringstream ss(std::stringstream::out); #if 0 //exact align EXPECT_TRUE( verifyAlign("12345", "123456789", "5M") ) << ss.str().c_str(); EXPECT_TRUE( verifyAlign("23456", "123456789", "5M") ) << ss.str().c_str(); //mismatch EXPECT_TRUE( verifyAlign("123B567", "123456789", "7M") ) << ss.str().c_str(); EXPECT_TRUE( verifyAlign("234D678", "123456789", "7M") ) << ss.str().c_str(); // del EXPECT_TRUE( verifyAlign("123467890","1234567890", "4M1D5M") ) << ss.str().c_str(); EXPECT_TRUE( verifyAlign("123467890","B1234567890B", "4M1D5M") ) << ss.str().c_str(); // ins EXPECT_TRUE( verifyAlign("12345067890","1234567890", "5M1I5M") ) << ss.str().c_str(); EXPECT_TRUE( verifyAlign("12345067890","BBBB1234567890BBBB", "5M1I5M") ) << ss.str().c_str(); // soft clip EXPECT_TRUE( verifyAlign("1234", "1235", "3M1S") ) << ss.str().c_str(); // The following will treat as two mismatches // EXPECT_TRUE( verifyAlign("123456700", "123456789", "7M2S", ss) ) << ss.str().c_str(); #endif EXPECT_TRUE( verifyAlign("1023456700", "123456789","1I7M2S") ) << ss.str().c_str(); } TEST(GreedyTupleTestAligner, AlignToLongReference) { std::stringstream ss(std::stringstream::out); EXPECT_TRUE( 
verifyAlign("TTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATACGTCATCGGTCAGAAATTGGG","CCGAGATTGTGCCATTGCACTCCTGCCTGGGTAACAGAGTCAGACCCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAGATTAGGTTTTATAGATGGAAAATTCACAGCTCTCTCCAGATCAGAAATCTCCAAGAGTAAATTAGTGTCTTAAAGGGGTTGTAATAACTTTCCTATGTGACTAAGTGCATTATTAATCAATTTTTCTATGATCAAGTACTCCTTTACATACCTGCTAATACAATTTTTGATATGAAATCAGTCCTAGAGGGAATCAATGTAAGATACAGACTTGATGAGTGCTTGCAGTTTTTTATTGACAATCTGAAGAATGACTTGACTCTAAATTGCAGCTCAAGGCTTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATACAGGAATCAAATCTGTCTAGCCTCTCTTTTTGGCAAGGTTAAGAACAATTCCACTTCATCCTAATCCCAATGATTCCTGCCGACCCTCTTCCAAAAACTATTTAAAGACATGTTCTTCAAAGTTATATTTGTCTTTCCTTCAGGGAGAAAAAGAATACCAATCACTTATAATATGGAAACTAGCAGAAATGGGTCACATAAGTCATCTGTCAGAAATTGGGAAAATAGAGTAGGTCAGTCTTTCCAGTCATGGTACTTTTACCTTCAATCA", "88M200D20M") ) << ss.str().c_str(); // This reads cigar string is the wrong length. By that I mean that the number // of matches listed in the cigar should be the same as the length of // the original read. 
// EXPECT_TRUE( verifyAlign("GTGAAACTCCATCTCAAAAATAAGTAAATAAATAAATACATACATAGGCACAGTGCAGTTGTTAGTCAGAATTAGGTCACACTGGATTAGGGTGAGTACTTAATGCAACAGGTCTGGGG","GTGCCAGAGTTTAATTAATAGGATAAGGTTATGAGTCAGACTGTGTACCCCAAAAAAGATATGTTGAACTCCTAAGCCCCTGAACCACAGAATGGGATCCTATTCAGAAATAGGCACAGTGTCCGGGCACCATGGCTCACACTGGTAATCCCAGCACTCTGGGAGGCTGAGGTGGGTGCATCACCTGAGGTCAGGAGTTTGAGACCAGCCTGGCCAACATGGTGAAACCCCATCTCTACTAAAAATACAAACAGAACAGTTAGCCAGGTGTGGTGGTGGGCACCTGTAATCCCAGCTACTTGGGAGGCTGAGACAGGAGAATGGCTTGAACCCAGGAGGTGGAGGTTGCAGTGAGCCGAGATCGTGCCATTGCACTTCAGCCTGGGCCACAAGAGTGAAACTCCATCTCAAAAATAAGTAAATAAATAAATACATACGTAGGCACAGTGCAGTTGTTGTTAGTTAGAATTAGGTCACACTGGATTAGGGTGAGTCCTTAATCCAACAGGTCTGGTGTCCTTACAAATAGACAAATACACAGAAGGAACATGGCCACATGGAGATACAGACACACCAAAACATCATATTGAGATGTGGGCAAAGATTGGAGAGACACTTCTCCAAGTCAAGGAACATCTGGGACTACCCAGAAACTGTAAGAGGCAGAGAAAGGTCCTTCCCTGTAGGCTTTAGAGGAACATGGCCCTGCCAACATCTTGATCTTGGATTTCCAGCCTCCAGCATGTGAGACAAGTTTCTGGGTTTTTTTGGAGACAGAGTCTCACTCTTGTCACCCAGGCTGGAGTGCAGTGGCATGAACTTGGCTCACTGCAACCTCCTCCCAGGATCAAGGTATTGTCCTGCCTCAGCCTCCCGAGTAGCTGGGATGACAGGGGCCCGCCACCACGCCAGCTCATTTTTGTATTTTTTACTAGAGAAGGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAAGTGATCCACCCGCCTTGGCCTCCCAAAGTGCTAGGATTACAGGTGTGAGCCACTGCGCCTGGCAAGTTTCTGTTGCCTTAAGCCACTCTTTCTGTGGTAATTTGTTATCATGGCCCTAAGAAATGACTAGAGAGAGAAAGCAAATCCCTTTGTTTCTGCATTTACTGAAACAGATGAATAGATTTCTAGCTCCCTTGGGGTCTGAACTTTTAAAAGAGAGATTTCTTATACATATGATAATCATGATATTGT", "63M3D56M") ) << ss.str().c_str(); EXPECT_TRUE( 
verifyAlign("ATATTGTTTTTTTCAATGCATATCAAAACAATGTTTACAATATACTACAGCCTAAGTGTGCAATAGCATTATGTGTAGAAATGCACATACCATAATTAGTTTTTTTTTTTGAAAAAACT","GTTCCAAAAGATTATATTTGTTAGGTTAGAGAATTTTAACTTATTTATATAATGGAGATTTTCTAATACTGAGAATACCTTAATTCTTATTGTAAGCCTACTTAACAGTGACAAAATGTTATTATAACGTGGTATTGAAATTAATATGATAGTATTTTATATGGATATTTGCATATGCAATTGACATATATTGTATATACAATATATAACTGTGTATTATATATTATATTTATATAATGTTATATTGTATATGAATATATTTGAATTATATGTATATACATATATATAGGCATTCATCAGAAATATTGCAGGTTTGGTTTCAGACGACTATAATAAAGTGAATATTGCAATAAAGCGAATCACAAGAAATTATTGTTTTTTTCAATGCATATCAAAACAATGTTTACAATATACTACAGCCTAAGTGTGCAATAGCATTATGTGTAGAAATGCACATACCATAATTAGTTTTTTTTTTGAAAAAACTGTTAATGATTATCTGAGCCTTCAGTGAGTTGTAATCTTTTCATGGTGGAGGATCATACCTCTACGTTGATGTCAGCTGACTGATCAGGGTAGTAGTTGCTGAAGGCTTGGGTGGCTGTGGCAATTTCTTAAAATAGGATAACAATGGCATTTACCACATTAATTGACTCCTTCTTTCACAAAAGATTTCTCTGTCTCATGCAATGCTGTTTGACAGCATTTTCCCCACAGTAGAATTTCTTTAAAAATTGG", "109M249D10M") ) << ss.str().c_str(); EXPECT_TRUE( verifyAlign("CCAGACTATCTCAAGCAATCAACAGATTTAATGTAAGGAGTGTCAAAATCTGAATGATGCTTTTTGCAGAAATAGAAAATCCCTTTCTAATATTTTTATATTTTTGAC","TTATCGAGGCTGGCGGATTTTGTGAGGCCAGGAGTTCAAGACCAGCCTGGCCAACATGGCAAAACTCTGTCTCTACTAAAAATACAAAAGTTAGCTGGGCATAGTGGCACATGTCTATAGTTCTAGCTATGTGGGAGGCTGAGACACGAGAATCGCTTGAACCCAGGAGGTGGAGGTTGTGGTGAGCCGAGCTCACGCCATTGCTCTCCAGCCTGGGCAACAGAGCAAGACTGTCTCAAAAACAAAAACAAAAAACACAAAAACTACAAGACTTTTATGAAATAACTTAAGGAAGATATAAATAAATGGAAAGATATCCCATGTTCCTGACTTGGAAGACTTAATTTTGTTAAGATGTCCATACTATCTCAAGCAATCAACAGATTTAATGTAAGGAGTGTCAAAATCTGAATGATGCTTTTTGCAGAAATAGAAAATCCCTTTCTAATATTTTTATGTAATCTCAAGGGACCCCAAATAGCCAAAAGAATCCTGAAAAAGTAGAATAAAGCTGGAGGACTCATGATTCCTGATTTGAAAACTTACTACCAGATACAATAATCAAAACAGTTCCGTGCTTGTCATAAAGACAAACATATAGACCAATGGAACAGAATAGAGATTACAGGGACAAATCCTCATATATATGGTCAAATGATTTTTGACCAGTGCCAAGATCATTCATGGGTGAAAAGACAATCTTTTCAATAAAAGAT", "99M200D9M") ) << ss.str().c_str(); // this is by no mean works, since our local aligner generates the output // 54M200D34M47D8M7D2M1I5M4S , since local aligner prefer gapped alignment. 
EXPECT_FALSE( verifyAlign("ATGAGGTCAGGAGATGGAGACCATCCTGGCTAACATGGTGAAACCCCATCTCTAAAAAAAGTGTAACAGAGGTGCATACTCAAAACTACAAAAGTCTCGTGAAAGGAA","CAAGAAAAAGAAATAAAATACATTTTAGTAGGAAAGGAAGAAGTTAAATTGTCTCCATTTGGTGACAACATGAGCTTATATGCAGAAAACCTAAAGACTCTACCAAAAAAACTGCTGGAACTGATAAATGAATTCGGTGAAGTCCTAGGGTATAAAATCAATGTACAAAATAAGTGGTGTTTCTATATTCTAATAAATTATTCAAAAGGGAAATTAAGAAATCAATCCCATTTTCAATAGCAACAACAAAAAAAATGACAATGCCAAAGTATAAATTTAACCAAGAAGCTACAAGAGTCTGGGCGCAGTGGCTTATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCATGAGGTCAGGAGATGGAGACCATCCTGGCTAACATGGTGAAACCCCATCTCTACTAAAAAAAAATAATAATAATAATAATAATAATAATAATTAGCCGGGCGTGGTGGTAGGCATCTGTAGTCCCAGCTACTCGGGAGGCTGAGGTAGGAGAATGGCATGAACCTGGGAGGTGGAGCTTGCAGTGAGCAGAGATCACACCCCTGCACTCCAGCCTGGGTGACAGGGCGAGACTCCGTCTCAAAAAAAAAAAAAAAAAAAGTGTAACAGAGTTGCATACTGAAAACTATAAAATTCTGATGAAAGAAAATGAAGCAACAAATAATTAATATAATAAAAAAGTCCATACTATCCAAAAT", "54M200D54M") ) << ss.str().c_str(); } #if 0 // // In this test case, there is no indel that I am aware of, however, the // aligner produces quite a long cigar string with many insertions // and deletions. For now, I filter this case out later, but it would // be nice if it would limit itself to one or a small number of indels. 
// EXPECT_TRUE( verifyAlign("GAATCAATACGCTCGGGATGCAGCGCCTAGCCGTTGGTTTGAGAATGGTTCTCTAGAGTTATCTTCACCCTCTACCTTGTGTGGCACTATTTCTTCTATGACCTTGAC","TGGCTCAAGACCTGACCTTGTGCACGTCTTGGATGCCAGTTCTATTCCCCTCACAGGCCATATGAATCCTGTCCTTTCTGCCTCAAAATGCCCATCCAGAGCCTCTACATTGATTAGCTTTTCCCTCCCTTCCAGAAAAGTTCAAAGGCTACCTCCTCCTTGAAGCCTTCACAAATACCTTAATCTAACTGTTTATAACCCTCTGCCATCTTAGCACTGTGGAAAATACATAAACTTGGGGTTAGAACATCATCAGTTTTAATGTAAGCACCATCCTTTCTAGCTGTACAGCCCTCCTGAGCCTTAGTTCTTACATCTTGAAGATGGAACCAGCTCAACAAGCATAGGGATGTAGCAAGAATCAAGACACTGTAGATGCAGCACCTGGCCGGTGGTAAGAGCTTGGTTATCACAAGTTATCTTCACCCTCTACCTTGTGTGGCACTATTTCTTCTATGACCTTGACTGCTCTCTGCTCTGATCTGGAAGTTCGCTGGGAAAAGGTGTCCCCTTTTTTATTACCTACCGGGAGAACCATGAGTGATGCTCTACTTGTAGTATATATACCCTGAGATGATTATTCTTAAAGACTAGTTCTCATGACTTGAGAGTTTGCTCTGTGTTAGGTACCATTCTAACACTGGATGTTGACTATGTATGTTATTTAATACTTCCATCAACCCCATAATGTAGGGAGAATCATTATGCCCATTTTA", "108M") ) << ss.str().c_str(); #endif TEST(GreedyTupleTestAligner, EquivalentAlignment) { std::stringstream ss(std::stringstream::out); // The test cases here are different to what we expected, // but hte alignment are both valid. 
See the example: // ref: AAAT TGGGG (4M5D5M) // ref: AAA TTGGGG (3M5D6M) // read: AAAT CCCC TTGGGG // We obtain 89M200D19M, the correct is 87M200D21M, // the 2 matched bases shifted to the right // if you check the output, you will see the 2 bases can be put before '200D' or after '200D' // and they are essentially the same EXPECT_FALSE( verifyAlign("TTTTCTTTTCAAAAATTTAAAAGTGACATACAAAATTATATGTGTATGTACAACAAAAGCTTAACTATAACACCTTGTTACATACTTTGGAATTGAAAGGCAGGAATG","CAGCACCCTAATTCACTATGCCCTAAGCTTCAAGGGCTTCAGAGTAAGCTCTCAGTGGAGTCTGATTGGAATCCCTCTTCGCCAGCTTGTGAGGTATGGGGCTAGGTTCCACAATATTCCCTTTGAGGGAGTAGATCTTCCAGCCTTCTGGGGCATGCTCTGAAAGTCCTCTTTGCAGAAGTAGCTCTTTAAAATCATATTCTCTTTCCAATTTGACCTCTTTTTTTATCCTTGTTCTGTCCATGCTGTCCAAAGCATCTTGGACTAAGTTTTGACTTTTTTTTTAAGTGCTGCATTTCCATTTGACATTTTACCTTTGTAAATTTCTATTTTTTTACCTTTGTGACTTATTAAAATATTTTCTTTTCAAAAATTTAAAAGTGACATACAAAATTATATGTGTATGTACAACAAAAGCTTAACTATAACACCTTGTTACATACTTTGCTATCCAGGCCACTGATCCTTTCTTACATAGTAAGTCAGCTATAGTTCATTAGCTTACAGTTTTTAGATACAAGTCTTAATCCATCCCTTCTCCTTTTGTATTCTTTACTTTCTGCAATATTTAAGACTTTTTGCGTTCTGACTAAAAGAAACCACCTGAAATTGGCATATGCAACTGTTCATGAATGAGAACTCGCATGGAATTGAAAGGCAGGAATGCAGCTTGACCTTAGAATGGATTTGATCCAGGAACTAGAAGGTGGGTAGGA", "87M200D21M") ) << ss.str().c_str(); // for the same reason as before EXPECT_FALSE( 
verifyAlign("AACAGTTGAGAGGTACTAAAATTGAGTTTTCTTGAAAAATATATTTAATCTAAAGTACTGAAAATTTGGGGGAAAATGCTTAAGGTCATATTCCTTTTTTGAAAAGAT","TCATCTTTCTCCCATACTGGCTGTTTCCTGCCCTCAAACACTGGACTCCAAGTTCTTCAGCTTGTGGACTCTTGGACCTACAACCAGTGGTCTGCCAGGGCCCTTTGGGCCTTCGGCCACAGACTGATGGCTACACTGTCGGCTCCCCTACTTTTGAGGTTTTGTGTCTTGGACTGGCTTTCTTGCTCCTCAGCTTGCAGACAGCCTACTGTAGGACTTCACTTTGTGACTATTTGAGTCAATACTCCTTAATAAACACCCTTTCATATATACATATATCCTATTAGTCCTGTTCCTCTAGAGAACCCTAATACAGTGTTGTACATTGAAATAAATATAATTATTCTGGTTTTGGTTGAACAGTTGAGAGGTACTAAAATTGAGTTTTCTTGAAAAATATATTTAATCTAAAGTACTGAAAATTTGGGGGAAAATGCTTCTGTAAATCCTAAGTTATTATTTCTTCAACTATATTCTGTAGTTAATTTCTCCAGCAATTCTTAATTTCAGCACAAATTAGCCACTGTTTGAATTAGGAATACTGAATCGTCTCCATTGCAGTGCAGTTAATAAGTCATTTCTTGATGAAGTAGTCCATGTAGGACTTGAAATCTTGTCTTTTTCATGATACATTATCATAAGGTCATATTCCTTTTTTGAAAAGATTGATGATACTATTCTGAAAGACACTAGTAGAGTTAGGCTTGGTTTTATGA", "80M200D28M") ) << ss.str().c_str(); } libStatGen-1.0.14/general/test/Makefile000066400000000000000000000056201254730101300176640ustar00rootroot00000000000000 include ../../Makefiles/Makefile.include .DEFAULT_GOAL := all SUBDIRS=inputFileTest cigar string memoryMapArrayTest packedVectorTest referenceSequenceTest nonOverlapRegions baseUtilitiesTest trimSequence reusableVector OPTFLAG?=-O0 .PHONY: $(SUBDIRS) all test clean all: TARGET = all test: TARGET = test clean: TARGET = clean all test: $(SUBDIRS) clean: $(SUBDIRS) # clean current directory rm -f *.o *~ rm -f phiX-bs.umfa phiX-cs.umfa gtest $(SUBDIRS): @$(MAKE) OPTFLAG="$(OPTFLAG)" -C $@ $(TARGET) ###################################################################### GTEST_COMPILE_FLAG=-DGTEST_HAS_TR1_TUPLE=0 -D GTEST_HAS_RTTI=0 LIBRARY=$(REQ_LIBS) .c.o : $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -o $@ -c $*.c -DVERSION=\""$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) .cpp.o : $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -o $@ -c $*.cpp -DVERSION="\"$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) # Google 
test part GTEST_ROOT=../../../gtest-1.5.0 GTEST_INC = $(GTEST_ROOT)/include GTEST_SRC = $(GTEST_ROOT)/src/gtest-all.cc GTEST_OBJ = $(GTEST_ROOT)/gtest-all.o $(GTEST_OBJ): $(GTEST_SRC) g++ -c -o $@ $< -I$(GTEST_INC) -I$(GTEST_ROOT) #TOOLTEST=$(TOOLBASE:=_test) TOOLTEST=gtest TrimSequence_test GreedyTupleAligner_test STLUtilities_test GenomeSequence_test Chromosome_test TOOLTESTHDR=$(TOOLTEST:=.h) TOOLTESTSRC=$(TOOLTEST:=.cpp) TOOLTESTOBJ=$(TOOLTESTSRC:.cpp=.o) TOOLOBJ_NOMAIN=$(TOOLOBJ:Main.o=) GenomeSequence_test.o: GenomeSequence_test.cpp ../GenomeSequence.h ../GenomeSequence.cpp $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -o $@ -c $< -DVERSION="\"$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) STLUtilities_test.o: STLUtilities_test.cpp ../STLUtilities.h ../STLUtilities.cpp $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -std=c++0x -o $@ -c $< -DVERSION="\"$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) TrimSequence_test.o: TrimSequence_test.cpp ../TrimSequence.h $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -o $@ -c $< -DVERSION="\"$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) GreedyTupleAligner_test.o: GreedyTupleAligner_test.cpp ../GreedyTupleAligner.h $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -o $@ -c $< -DVERSION="\"$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) Chromosome_test.o: Chromosome_test.cpp ../Chromosome.h $(CXX) $(CFLAGS) $(GTEST_COMPILE_FLAG) -o $@ -c $< -DVERSION="\"$(VERSION)\"" -DDATE="\"${DATE}\"" -DNODE="\"${NODE}\"" -DUSER="\"${USER}\"" -I$(GTEST_INC) gtest_build: $(TOOLTESTOBJ) $(TOOLOBJ) $(GTEST_OBJ) g++ $(TOOLTESTOBJ) $(TOOLOBJ_NOMAIN) $(GTEST_OBJ) $(LIBRARY) -lm -lz -lssl -lpthread -lcrypto -o ./gtest ../../libStatGen_debug.a gtest: gtest_build ./gtest --gtest_color=yes include $(MAKEFILES_PATH)/Makefile.footer 
libStatGen-1.0.14/general/test/STLUtilities_test.cpp000066400000000000000000000163641254730101300223340ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "STLUtilities.h" // #define _STLUTILITIES_BENCHMARK_ // This can turn on the benchmark of STLUtilities class and String class // #ifdef _STLUTILITIES_BENCHMARK_ #include "Performance.h" #include "Random.h" #include "StringBasics.h" #endif /* _STLUTILITIES_BENCHMARK_ */ #include TEST(STLUtilitiesTest, tSTLUtilitiesTest) { #if 0 std::string test; std::string::iterator result; test = "445566"; result = trimSequence(test, '5', true); EXPECT_EQ(result - test.begin() , 2); test = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'A', true); EXPECT_TRUE(result == test.begin()); #endif using namespace STLUtilities; // for overloaded std::string << operator std::string toot; toot << "double: " << 0.123456 << " LL: " << -5LL << " UL: " << 999UL << " char: " << '!'; EXPECT_TRUE(toot == "double: 0.123456 LL: -5 UL: 999 char: !"); // same result as above using different methods toot.clear(); append(toot, "double: "); append(toot, 0.123456); append(toot, " LL: "); append(toot, -5LL); append(toot, " UL: "); append(toot, 999UL); append(toot, " char: "); append(toot, (char) (' ' + 1)); EXPECT_TRUE(toot == "double: 0.123456 LL: -5 UL: 999 char: !"); toot.clear(); 
std::vector v; v.push_back(1); v.push_back(2); v.push_back(3); v.push_back(4); v.push_back(5); toot = "array: "; append(toot, v, "\t", true); EXPECT_TRUE(toot == "array: 0: 1\t1: 2\t2: 3\t3: 4\t4: 5"); std::vector tokens; Tokenize(tokens, "ab\tcd\tefg\thi\tjk", '\t'); EXPECT_EQ(tokens.size(), 5U); EXPECT_TRUE(tokens[0] == "ab"); EXPECT_TRUE(tokens[1] == "cd"); EXPECT_TRUE(tokens[2] == "efg"); EXPECT_TRUE(tokens[3] == "hi"); EXPECT_TRUE(tokens[4] == "jk"); Tokenize(tokens, "ab\tcd\tefg\thi\tjk\t", '\t'); EXPECT_EQ(tokens.size(), 6U); EXPECT_TRUE(tokens[5] == ""); // a single tab splits two empty fields, so should see two tokens here: Tokenize(tokens, "\t", '\t'); EXPECT_EQ(tokens.size(), 2U); EXPECT_TRUE(tokens[0] == ""); EXPECT_TRUE(tokens[1] == ""); Tokenize(tokens, "bahbah", '\t'); EXPECT_EQ(tokens.size(), 1U); EXPECT_TRUE(tokens[0] == "bahbah"); // // no data on the line is the same as a single empty field. // the reason is we don't want to have a file with a single // column of data, but two separate values for .size(). Better // to let the caller simply say 'if tokens[0]==""' // Tokenize(tokens, "", '\t'); EXPECT_EQ(tokens.size(), 1U); EXPECT_TRUE(tokens[0] == ""); #if 0 toot = ""; append(toot, tokens, '\t'); std::cout << toot << std::endl; exit(0); #endif } // // Variadic templates necessary for reasonable printf implementation // are only supported as an experimental feature that in theory is // subject to changes in the future draft standard for C++. // // Only defined when the g++ option -std=c++0x is used. // #if defined(__GXX_EXPERIMENTAL_CXX0X__) TEST(STLUtilitiesTestPrintf, tSTLUtilitiesTestPrintf) { using namespace STLUtilities; // for overloaded std::string << operator std::string result; sprintf(result, "Hello, world!"); EXPECT_TRUE(result=="Hello, world!"); sprintf(result, "n = %20.5lXXX", 123ULL); EXPECT_TRUE(result=="n = 7bXX"); sprintf(result, "hello, world! %20sand boo", "well then"); EXPECT_TRUE(result=="hello, world! 
well thenand boo"); sprintf(result, "addr = %08xXXX", 1234); EXPECT_TRUE(result=="addr = 000004d2XXX"); sprintf(result, "Hello, world!! Imagine: %d!", 2345.1234); EXPECT_TRUE(result=="Hello, world!! Imagine: 2345.12!"); } #endif #ifdef _STLUTILITIES_BENCHMARK_ // // Compare StringBasics.h String with std::string and STLUtilities append methods // // NB: these are mostly inline with the exception of String::operator +(char c), which // is a function call, so as currently implemented, String is at a disadvantage. // // However, all of these potentially suffer from limitations in g++, as I've noticed // that it at times is unable to inline more than a few levels of nested functions // deep even if all are trivially short and inlined. // TEST(STLUtilitiesTestPrintf, Benchmark1) { using namespace STLUtilities; // for overloaded std::string << operator std::string result; Random random; int range = 'z' - 'A'; unsigned int round = 1e6; Timing timing; random.Reset(0); timing.start(); for (unsigned int i = 0; i < round; i++) { result << (char)('A' + ( random.NextInt() % range)); } timing.end(); std::cout << "STLUtilities " << round << " times takes " << timing.interval() << " second " << std::endl; String s; random.Reset(0); timing.start(); for (unsigned int i = 0; i < round; i++) { s += (char)('A' + ( random.NextInt() % range)); } timing.end(); std::cout << "String " << round << " times takes " << timing.interval() << " second " << std::endl; EXPECT_EQ(result, s.c_str()); std::string st; random.Reset(0); timing.start(); for (unsigned int i = 0; i < round; i++) { st += (char)('A' + ( random.NextInt() % range)); } timing.end(); std::cout << "std::string " << round << " times takes " << timing.interval() << " second " << std::endl; EXPECT_EQ(result, st); } TEST(STLUtilitiesTestPrintf, Benchmark2) { using namespace STLUtilities; // for overloaded std::string << operator std::string result; Random random; unsigned int round = 1e6; Timing timing; timing.start(); for (unsigned 
int i = 0; i < round; i++) { result = ""; for(int j=0; j<15; j++) result << (char) 'A'; } timing.end(); std::cout << "STLUtilities " << round << " times takes " << timing.interval() << " second " << std::endl; String s; timing.start(); for (unsigned int i = 0; i < round; i++) { s = ""; for(int j=0; j<15; j++) s += 'A'; } timing.end(); std::cout << "String " << round << " times takes " << timing.interval() << " second " << std::endl; EXPECT_EQ(result, s.c_str()); std::string st; timing.start(); for (unsigned int i = 0; i < round; i++) { st = ""; for(int j=0; j<15; j++) st += 'A'; } timing.end(); std::cout << "std::string " << round << " times takes " << timing.interval() << " second " << std::endl; EXPECT_EQ(result, st); } #endif /* _STLUTILITIES_BENCHMARK_ */ libStatGen-1.0.14/general/test/TrimSequence_test.cpp000066400000000000000000000073731254730101300224020ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include TEST(TrimSequenceTest, trimSequenceTest) { std::string test; std::string::iterator result; test = "445566"; result = trimSequence(test, '5', true); EXPECT_EQ(result - test.begin() , 2); test = "445554555"; result = trimSequence(test, '5', true); EXPECT_EQ(result - test.begin(), 6); test = "4455545556"; result = trimSequence(test, '5', true); EXPECT_EQ(result - test.begin(), 6); test = "44555455566"; result = trimSequence(test, '5', true); EXPECT_EQ(result - test.begin(), 6); test = "665544"; result = trimSequence(test, '5', false); EXPECT_EQ(test.end() - result , 2); test = "555455544"; result = trimSequence(test, '5', false); EXPECT_EQ(test.end() - result, 6); test = "6555455544"; result = trimSequence(test, '5', false); EXPECT_EQ(test.end() - result, 6); // Paul's test cases in TrimSequence.cpp // // from the left: // test = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'A', true); EXPECT_TRUE(result == test.begin()); test = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, '~', true); EXPECT_TRUE(result == test.end()); test = "AAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'B', true); EXPECT_TRUE(result == (test.begin() + 5)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'B', true); EXPECT_TRUE(result == (test.begin() + 8)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'F', true); EXPECT_TRUE(result == (test.begin() + 12)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, '@', true); EXPECT_TRUE(result == (test.begin() + 0)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, '@', true); EXPECT_TRUE(result == (test.begin() + 0)); test = "AAAFAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'F', true); EXPECT_TRUE(result == (test.begin() + 12)); // // from the right: // test = "ZYXWVUTSRQPONMLKJIHGFEDCBA"; result = trimSequence(test, 'A', false); EXPECT_TRUE(result == test.end()); test = 
"ZYXWVUTSRQPONMLKJIHGFEDCBA"; result = trimSequence(test, '~', false); EXPECT_TRUE(result == test.begin()); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAA"; result = trimSequence(test, 'B', false); EXPECT_TRUE(result == (test.end() - 5)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAA"; result = trimSequence(test, 'B', false); EXPECT_TRUE(result == (test.end() - 7)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA"; result = trimSequence(test, 'F', false); EXPECT_TRUE(result == (test.end() - 12)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA"; result = trimSequence(test, '@', false); EXPECT_TRUE(result == (test.end() + 0)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAFAAA"; result = trimSequence(test, 'F', false); EXPECT_TRUE(result == (test.end() - 12)); }; libStatGen-1.0.14/general/test/baseUtilitiesTest/000077500000000000000000000000001254730101300216675ustar00rootroot00000000000000libStatGen-1.0.14/general/test/baseUtilitiesTest/.gitignore000066400000000000000000000000231254730101300236520ustar00rootroot00000000000000baseUtilitiesTest libStatGen-1.0.14/general/test/baseUtilitiesTest/BaseUtilitiesTest.cpp000066400000000000000000000024661254730101300260110ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "BaseUtilitiesTest.h" #include #include int main(int argc, char ** argv) { testReverseComplement(); } void testReverseComplement() { // Test odd number of bases. std::string testString = "ACGTAACCTTGGG"; std::string expectedReverse = "CCCAAGGTTACGT"; BaseUtilities::reverseComplement(testString); assert(testString == expectedReverse); // Test even number of bases. testString = "ACGTAACCTGGG"; expectedReverse = "CCCAGGTTACGT"; BaseUtilities::reverseComplement(testString); assert(testString == expectedReverse); } libStatGen-1.0.14/general/test/baseUtilitiesTest/BaseUtilitiesTest.h000066400000000000000000000014561254730101300254540ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "BaseUtilities.h" void testReverseComplement(); libStatGen-1.0.14/general/test/baseUtilitiesTest/Makefile000066400000000000000000000002011254730101300233200ustar00rootroot00000000000000EXE = baseUtilitiesTest TOOLBASE = BaseUtilitiesTest TEST_COMMAND= ./baseUtilitiesTest include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/general/test/cigar/000077500000000000000000000000001254730101300173065ustar00rootroot00000000000000libStatGen-1.0.14/general/test/cigar/.gitignore000066400000000000000000000000201254730101300212660ustar00rootroot00000000000000cigarRollerTest libStatGen-1.0.14/general/test/cigar/CigarRollerTest.cpp000066400000000000000000001640501254730101300230650ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ // put TEST below here, so that makedepend will see the .h, so that we // can get a clean dependency for SmithWaterman.o, so that we can at least // compile the header when we change it. #include #include "Generic.h" #include "CigarRollerTest.h" // // Test the obvious cases. // Add non-obvious ones as bugs come up. // int CigarRollerTest::test(void) { int failures = 0, testNum = 0; Cigar::CigarOperator op; // const char *str; //////////////////////////////////// // Test foundInReference static methods. 
check(failures, ++testNum, "foundInReference(none)", false, Cigar::foundInReference(Cigar::none)); check(failures, ++testNum, "foundInReference(match)", true, Cigar::foundInReference(Cigar::match)); check(failures, ++testNum, "foundInReference(mismatch)", true, Cigar::foundInReference(Cigar::mismatch)); check(failures, ++testNum, "foundInReference(insert)", false, Cigar::foundInReference(Cigar::insert)); check(failures, ++testNum, "foundInReference(del)", true, Cigar::foundInReference(Cigar::del)); check(failures, ++testNum, "foundInReference(skip)", true, Cigar::foundInReference(Cigar::skip)); check(failures, ++testNum, "foundInReference(softClip)", false, Cigar::foundInReference(Cigar::softClip)); check(failures, ++testNum, "foundInReference(hardClip)", false, Cigar::foundInReference(Cigar::hardClip)); check(failures, ++testNum, "foundInReference(pad)", false, Cigar::foundInReference(Cigar::pad)); check(failures, ++testNum, "foundInReference('?')", false, Cigar::foundInReference('?')); check(failures, ++testNum, "foundInReference('z')", false, Cigar::foundInReference('z')); check(failures, ++testNum, "foundInReference('M')", true, Cigar::foundInReference('M')); check(failures, ++testNum, "foundInReference('X')", true, Cigar::foundInReference('X')); check(failures, ++testNum, "foundInReference('=')", true, Cigar::foundInReference('=')); check(failures, ++testNum, "foundInReference('I')", false, Cigar::foundInReference('I')); check(failures, ++testNum, "foundInReference('D')", true, Cigar::foundInReference('D')); check(failures, ++testNum, "foundInReference('N')", true, Cigar::foundInReference('N')); check(failures, ++testNum, "foundInReference('S')", false, Cigar::foundInReference('S')); check(failures, ++testNum, "foundInReference('H')", false, Cigar::foundInReference('H')); check(failures, ++testNum, "foundInReference('P')", false, Cigar::foundInReference('P')); op.count = 3; op.operation = Cigar::none; check(failures, ++testNum, "foundInReference(none)", false, 
Cigar::foundInReference(op)); op.operation = Cigar::match; check(failures, ++testNum, "foundInReference(match)", true, Cigar::foundInReference(op)); op.operation = Cigar::mismatch; check(failures, ++testNum, "foundInReference(mismatch)", true, Cigar::foundInReference(op)); op.operation = Cigar::insert; check(failures, ++testNum, "foundInReference(insert)", false, Cigar::foundInReference(op)); op.operation = Cigar::del; check(failures, ++testNum, "foundInReference(del)", true, Cigar::foundInReference(op)); op.operation = Cigar::skip; check(failures, ++testNum, "foundInReference(skip)", true, Cigar::foundInReference(op)); op.operation = Cigar::softClip; check(failures, ++testNum, "foundInReference(softClip)", false, Cigar::foundInReference(op)); op.operation = Cigar::hardClip; check(failures, ++testNum, "foundInReference(hardClip)", false, Cigar::foundInReference(op)); op.operation = Cigar::pad; check(failures, ++testNum, "foundInReference(pad)", false, Cigar::foundInReference(op)); //////////////////////////////////// // Test foundInQuery static methods. 
check(failures, ++testNum, "foundInQuery(none)", false, Cigar::foundInQuery(Cigar::none)); check(failures, ++testNum, "foundInQuery(match)", true, Cigar::foundInQuery(Cigar::match)); check(failures, ++testNum, "foundInQuery(mismatch)", true, Cigar::foundInQuery(Cigar::mismatch)); check(failures, ++testNum, "foundInQuery(insert)", true, Cigar::foundInQuery(Cigar::insert)); check(failures, ++testNum, "foundInQuery(del)", false, Cigar::foundInQuery(Cigar::del)); check(failures, ++testNum, "foundInQuery(skip)", false, Cigar::foundInQuery(Cigar::skip)); check(failures, ++testNum, "foundInQuery(softClip)", true, Cigar::foundInQuery(Cigar::softClip)); check(failures, ++testNum, "foundInQuery(hardClip)", false, Cigar::foundInQuery(Cigar::hardClip)); check(failures, ++testNum, "foundInQuery(pad)", false, Cigar::foundInQuery(Cigar::pad)); check(failures, ++testNum, "foundInQuery('?')", false, Cigar::foundInQuery('?')); check(failures, ++testNum, "foundInQuery('z')", false, Cigar::foundInQuery('z')); check(failures, ++testNum, "foundInQuery('M')", true, Cigar::foundInQuery('M')); check(failures, ++testNum, "foundInQuery('X')", true, Cigar::foundInQuery('X')); check(failures, ++testNum, "foundInQuery('=')", true, Cigar::foundInQuery('=')); check(failures, ++testNum, "foundInQuery('I')", true, Cigar::foundInQuery('I')); check(failures, ++testNum, "foundInQuery('D')", false, Cigar::foundInQuery('D')); check(failures, ++testNum, "foundInQuery('N')", false, Cigar::foundInQuery('N')); check(failures, ++testNum, "foundInQuery('S')", true, Cigar::foundInQuery('S')); check(failures, ++testNum, "foundInQuery('H')", false, Cigar::foundInQuery('H')); check(failures, ++testNum, "foundInQuery('P')", false, Cigar::foundInQuery('P')); op.count = 3; op.operation = Cigar::none; check(failures, ++testNum, "foundInQuery(none)", false, Cigar::foundInQuery(op)); op.operation = Cigar::match; check(failures, ++testNum, "foundInQuery(match)", true, Cigar::foundInQuery(op)); op.operation = 
Cigar::mismatch; check(failures, ++testNum, "foundInQuery(mismatch)", true, Cigar::foundInQuery(op)); op.operation = Cigar::insert; check(failures, ++testNum, "foundInQuery(insert)", true, Cigar::foundInQuery(op)); op.operation = Cigar::del; check(failures, ++testNum, "foundInQuery(del)", false, Cigar::foundInQuery(op)); op.operation = Cigar::skip; check(failures, ++testNum, "foundInQuery(skip)", false, Cigar::foundInQuery(op)); op.operation = Cigar::softClip; check(failures, ++testNum, "foundInQuery(softClip)", true, Cigar::foundInQuery(op)); op.operation = Cigar::hardClip; check(failures, ++testNum, "foundInQuery(hardClip)", false, Cigar::foundInQuery(op)); op.operation = Cigar::pad; check(failures, ++testNum, "foundInQuery(pad)", false, Cigar::foundInQuery(op)); //////////////////////////////////// // Test isClip static methods. check(failures, ++testNum, "isClip(none)", false, Cigar::isClip(Cigar::none)); check(failures, ++testNum, "isClip(match)", false, Cigar::isClip(Cigar::match)); check(failures, ++testNum, "isClip(mismatch)", false, Cigar::isClip(Cigar::mismatch)); check(failures, ++testNum, "isClip(insert)", false, Cigar::isClip(Cigar::insert)); check(failures, ++testNum, "isClip(del)", false, Cigar::isClip(Cigar::del)); check(failures, ++testNum, "isClip(skip)", false, Cigar::isClip(Cigar::skip)); check(failures, ++testNum, "isClip(softClip)", true, Cigar::isClip(Cigar::softClip)); check(failures, ++testNum, "isClip(hardClip)", true, Cigar::isClip(Cigar::hardClip)); check(failures, ++testNum, "isClip(pad)", false, Cigar::isClip(Cigar::pad)); check(failures, ++testNum, "isClip('?')", false, Cigar::isClip('?')); check(failures, ++testNum, "isClip('z')", false, Cigar::isClip('z')); check(failures, ++testNum, "isClip('M')", false, Cigar::isClip('M')); check(failures, ++testNum, "isClip('X')", false, Cigar::isClip('X')); check(failures, ++testNum, "isClip('=')", false, Cigar::isClip('=')); check(failures, ++testNum, "isClip('I')", false, Cigar::isClip('I')); 
check(failures, ++testNum, "isClip('D')", false, Cigar::isClip('D')); check(failures, ++testNum, "isClip('N')", false, Cigar::isClip('N')); check(failures, ++testNum, "isClip('S')", true, Cigar::isClip('S')); check(failures, ++testNum, "isClip('H')", true, Cigar::isClip('H')); check(failures, ++testNum, "isClip('P')", false, Cigar::isClip('P')); op.count = 3; op.operation = Cigar::none; check(failures, ++testNum, "isClip(none)", false, Cigar::isClip(op)); op.operation = Cigar::match; check(failures, ++testNum, "isClip(match)", false, Cigar::isClip(op)); op.operation = Cigar::mismatch; check(failures, ++testNum, "isClip(mismatch)", false, Cigar::isClip(op)); op.operation = Cigar::insert; check(failures, ++testNum, "isClip(insert)", false, Cigar::isClip(op)); op.operation = Cigar::del; check(failures, ++testNum, "isClip(del)", false, Cigar::isClip(op)); op.operation = Cigar::skip; check(failures, ++testNum, "isClip(skip)", false, Cigar::isClip(op)); op.operation = Cigar::softClip; check(failures, ++testNum, "isClip(softClip)", true, Cigar::isClip(op)); op.operation = Cigar::hardClip; check(failures, ++testNum, "isClip(hardClip)", true, Cigar::isClip(op)); op.operation = Cigar::pad; check(failures, ++testNum, "isClip(pad)", false, Cigar::isClip(op)); /////////////////////////////// // Create the CigarRoller. 
CigarRoller cigar; // const char *str; String str; std::string result; cigar.getCigarString(str); result = str.c_str(); // result = str = cigar.getString(); delete str; check(failures, ++testNum, "constructor", result, ""); // test empty case op.operation = CigarRoller::match; op.count = 10; cigar += op; op.count=5; cigar += op; op.count=5; op.operation = CigarRoller::mismatch; // test that match/mismatch get combined cigar += op; op.count=5; op.operation = CigarRoller::insert; cigar += op; op.count=5; op.operation = CigarRoller::insert; cigar += op; op.count=5; op.operation = CigarRoller::del; cigar += op; op.count=5; op.operation = CigarRoller::mismatch; cigar += op; op.count=5; op.operation = CigarRoller::match; cigar += op; op.count=5; op.operation = CigarRoller::skip; cigar += op; op.count=5; op.operation = CigarRoller::match; cigar += op; op.count=2; op.operation = CigarRoller::pad; cigar += op; op.count=3; op.operation = CigarRoller::match; cigar += op; cigar.getCigarString(str); result = str.c_str(); // result = str = cigar.getString(); delete str; check(failures, ++testNum, "match combining", "20M10I5D10M5N5M2P3M", result); check(failures, ++testNum, "length check", 8, cigar.size()); ////////////////////////////////////////////////////////////////////////////////////////////////////////// // Test getRefOffset, getQueryIndex, getRefPosition, & getQueryIndex(that takes ref position) // Validate the reference offsets when passing in a query index, // and the query offsets when passing in a query index. // Spot check the offsets out of order - to verify order doesn't matter. 
check(failures, ++testNum, "getRefOffset(20)", -1, cigar.getRefOffset(20)); check(failures, ++testNum, "getRefOffset(30)", 25, cigar.getRefOffset(30)); check(failures, ++testNum, "getRefOffset(46)", 46, cigar.getRefOffset(46)); check(failures, ++testNum, "getRefOffset(0)", 0, cigar.getRefOffset(0)); check(failures, ++testNum, "getRefPosition(20, 5)", -1, cigar.getRefPosition(20, 5)); check(failures, ++testNum, "getRefPosition(30, 5)", 30, cigar.getRefPosition(30, 5)); check(failures, ++testNum, "getRefPosition(46, 5)", 51, cigar.getRefPosition(46, 5)); check(failures, ++testNum, "getRefPosition(0, 5)", 5, cigar.getRefPosition(0, 5)); check(failures, ++testNum, "getQueryIndex(20)", CigarRoller::INDEX_NA, cigar.getQueryIndex(20)); check(failures, ++testNum, "getQueryIndex(30)", 35, cigar.getQueryIndex(30)); check(failures, ++testNum, "getQueryIndex(35)", Cigar::INDEX_NA, cigar.getQueryIndex(35)); check(failures, ++testNum, "getQueryIndex(46)", 46, cigar.getQueryIndex(46)); check(failures, ++testNum, "getQueryIndex(0)", 0, cigar.getQueryIndex(0)); check(failures, ++testNum, "getQueryIndex(25, 5)", -1, cigar.getQueryIndex(20)); check(failures, ++testNum, "getQueryIndex(35, 5)", 35, cigar.getQueryIndex(30)); check(failures, ++testNum, "getQueryIndex(40, 5)", -1, cigar.getQueryIndex(35)); check(failures, ++testNum, "getQueryIndex(51, 5)", 46, cigar.getQueryIndex(46)); check(failures, ++testNum, "getQueryIndex(5, 5)", 0, cigar.getQueryIndex(0)); int i = 0; int queryIndex = 0; int refOffset = 0; // Validate the 20 matches. 
for(i = 0; i < 20; i++) { check(failures, ++testNum, "getRefOffset(queryIndex)", refOffset, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getQueryIndex(refOffset)", queryIndex, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", refOffset + 5, cigar.getRefPosition(queryIndex, 5)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", queryIndex, cigar.getQueryIndex(refOffset + 5, 5)); ++queryIndex; ++refOffset; } // Validate the 10 Inserts - exist in query, but not in the reference. for(i = 0; i < 10; i++) { check(failures, ++testNum, "getRefOffset(queryIndex)", -1, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", -1, cigar.getRefPosition(queryIndex, 5)); ++queryIndex; } // Validate the 5 Deletions - exist in the reference, but not the query. for(i = 0; i < 5; i++) { check(failures, ++testNum, "getQueryIndex(refOffset)", -1, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", -1, cigar.getQueryIndex(refOffset + 5, 5)); refOffset++; } // Validate the 10 matches. for(i = 0; i < 10; i++) { check(failures, ++testNum, "getRefOffset(queryIndex)", refOffset, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getQueryIndex(refOffset)", queryIndex, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", refOffset + 5, cigar.getRefPosition(queryIndex, 5)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", queryIndex, cigar.getQueryIndex(refOffset + 5, 5)); ++queryIndex; ++refOffset; } // Validate the 5 Skips - exist in the reference, but not the query. for(i = 0; i < 5; i++) { check(failures, ++testNum, "getQueryIndex(refOffset)", -1, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", -1, cigar.getQueryIndex(refOffset + 5, 5)); refOffset++; } // Validate the 5 matches. 
for(i = 0; i < 5; i++) { check(failures, ++testNum, "getRefOffset(queryIndex)", refOffset, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getQueryIndex(refOffset)", queryIndex, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", refOffset + 5, cigar.getRefPosition(queryIndex, 5)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", queryIndex, cigar.getQueryIndex(refOffset + 5, 5)); ++queryIndex; ++refOffset; } // Nothing to validate for the 2 pads since they do not show up in either the reference or the query. // Validate the 3 matches. for(i = 0; i < 3; i++) { check(failures, ++testNum, "getRefOffset(queryIndex)", refOffset, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getQueryIndex(refOffset)", queryIndex, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", refOffset + 5, cigar.getRefPosition(queryIndex, 5)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", queryIndex, cigar.getQueryIndex(refOffset + 5, 5)); ++queryIndex; ++refOffset; } // Validate that if you go beyond the end, -1 is returned. 
check(failures, ++testNum, "getRefOffset(queryIndex)", -1, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getQueryIndex(refOffset)", -1, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", -1, cigar.getRefPosition(queryIndex, 5)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", -1, cigar.getQueryIndex(refOffset + 5, 5)); ++queryIndex; ++refOffset; check(failures, ++testNum, "getRefOffset(queryIndex)", -1, cigar.getRefOffset(queryIndex)); check(failures, ++testNum, "getQueryIndex(refOffset)", -1, cigar.getQueryIndex(refOffset)); check(failures, ++testNum, "getRefPosition(queryIndex, 5)", -1, cigar.getRefPosition(queryIndex, 5)); check(failures, ++testNum, "getQueryIndex(refPosition, 5)", -1, cigar.getQueryIndex(refOffset + 5, 5)); //////////////////////////////////////////////////////////////////////// // Test getNumOverlaps // When query starts at position 5: // Overlaps are at 5-24, 30-39, 45-49, 50-52 // Test with region [1-5) where query starts at position 5 = 0 overlaps. check(failures, ++testNum, "getNumOverlaps(1, 5, 5)", (uint32_t)0, cigar.getNumOverlaps(1, 5, 5)); // Test with region [53-101) where query starts at position 5 = 0 overlaps. check(failures, ++testNum, "getNumOverlaps(53, 101, 5)", (uint32_t)0, cigar.getNumOverlaps(53, 101, 5)); // Test with region [53-10) where query starts at position 5 = 0 overlaps. check(failures, ++testNum, "getNumOverlaps(53, 10, 5)", (uint32_t)0, cigar.getNumOverlaps(53, 10, 5)); // Test with region [35-10) where query starts at position 5 = 0 overlaps. check(failures, ++testNum, "getNumOverlaps(35, 10, 5)", (uint32_t)0, cigar.getNumOverlaps(35, 10, 5)); // Test with region [35-1) where query starts at position 5 = 0 overlaps. check(failures, ++testNum, "getNumOverlaps(35, 1, 5)", (uint32_t)0, cigar.getNumOverlaps(35, 1, 5)); // Test with region [1-6) where query starts at position 5 - 1 overlap = pos 5. 
check(failures, ++testNum, "getNumOverlaps(1, 6, 5)", (uint32_t)1, cigar.getNumOverlaps(1, 6, 5)); // Test with region [25-30) where query has only DELETIONS from the reference = 0 overlaps. check(failures, ++testNum, "getNumOverlaps(25, 30, 5)", (uint32_t)0, cigar.getNumOverlaps(25, 30, 5)); // Test with region [24-30) where query has only a match at position 24 = 1 overlap. check(failures, ++testNum, "getNumOverlaps(24, 30, 5)", (uint32_t)1, cigar.getNumOverlaps(24, 30, 5)); // Test with region [25-31) where query has only a match at position 30 = 1 overlap. check(failures, ++testNum, "getNumOverlaps(25, 31, 5)", (uint32_t)1, cigar.getNumOverlaps(25, 31, 5)); // Test with region [1-31), match pos 5-24 & 30 = 21 overlaps check(failures, ++testNum, "getNumOverlaps(1, 31, 5)", (uint32_t)21, cigar.getNumOverlaps(1, 31, 5)); // Test a region that covers the entire range [1-101), match pos 5-24, 30-39, 45-49, & 50-52 = 38 overlaps check(failures, ++testNum, "getNumOverlaps(1, 101, 5)", (uint32_t)38, cigar.getNumOverlaps(1, 101, 5)); // Test a region that covers the entire range [-1--1), (whole region) match pos 5-24, 30-39, 45-49, & 50-52 = 38 overlaps check(failures, ++testNum, "getNumOverlaps(-1, -1, 5)", (uint32_t)38, cigar.getNumOverlaps(-1, -1, 5)); // Test a region that covers the entire range [6-52), match pos 6-24, 30-39, 45-49, & 50-51 = 36 overlaps check(failures, ++testNum, "getNumOverlaps(6, 52, 5)", (uint32_t)36, cigar.getNumOverlaps(6, 52, 5)); // Test with region [40-45) where query has only SKIPS from the reference = 0 overlaps. 
check(failures, ++testNum, "getNumOverlaps(40, 45, 5)", (uint32_t)0, cigar.getNumOverlaps(40, 45, 5)); // Test a region that covers the range [-1-10), (whole region) match pos 5-9 = 5 overlaps check(failures, ++testNum, "getNumOverlaps(-1, 10, 5)", (uint32_t)5, cigar.getNumOverlaps(-1, 10, 5)); // Test a region that covers the range [50--1), (whole region) match pos 50-52 = 3 overlaps check(failures, ++testNum, "getNumOverlaps(50, -1, 5)", (uint32_t)3, cigar.getNumOverlaps(50, -1, 5)); //////////////////////////////////////////////////////////////////////////// // Test a new CIGAR. cigar.Set("4M10N4M3I2M4D3M"); String expectedResult = "4M10N4M3I2M4D3M"; String cigarString = "HI"; cigar.getCigarString(cigarString); check(failures, ++testNum, "getCigarString", expectedResult, cigarString); // 4M10N4M3I2M4D3M // 11111111112222222222 // ExtIndex: 012345678901234567890123456789 // ExtCigar: MMMMNNNNNNNNNNMMMMIIIMMDDDDMMM // 111 111 // QueryIndx:0123 456789012 345 // 11111111 112222222 // RefOffset:012345678901234567 890123456 // 1111111111222 222222233 // RefPos: 567890123456789012 345678901 // Test getExpandedCigarIndex & getCigar op check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(0, 5)", -1, cigar.getExpandedCigarIndexFromRefPos(0,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(1, 5)", -1, cigar.getExpandedCigarIndexFromRefPos(1,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(2, 5)", -1, cigar.getExpandedCigarIndexFromRefPos(2,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(3, 5)", -1, cigar.getExpandedCigarIndexFromRefPos(3,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(4, 5)", -1, cigar.getExpandedCigarIndexFromRefPos(4,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(5, 5)", 0, cigar.getExpandedCigarIndexFromRefPos(5,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(6, 5)", 1, cigar.getExpandedCigarIndexFromRefPos(6,5)); check(failures, ++testNum, 
"getExpandedCigarIndexFromRefPos(7, 5)", 2, cigar.getExpandedCigarIndexFromRefPos(7,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(8, 5)", 3, cigar.getExpandedCigarIndexFromRefPos(8,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(9, 5)", 4, cigar.getExpandedCigarIndexFromRefPos(9,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(10, 5)", 5, cigar.getExpandedCigarIndexFromRefPos(10,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(11, 5)", 6, cigar.getExpandedCigarIndexFromRefPos(11,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(12, 5)", 7, cigar.getExpandedCigarIndexFromRefPos(12,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(13, 5)", 8, cigar.getExpandedCigarIndexFromRefPos(13,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(14, 5)", 9, cigar.getExpandedCigarIndexFromRefPos(14,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(15, 5)", 10, cigar.getExpandedCigarIndexFromRefPos(15,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(16, 5)", 11, cigar.getExpandedCigarIndexFromRefPos(16,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(17, 5)", 12, cigar.getExpandedCigarIndexFromRefPos(17,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(18, 5)", 13, cigar.getExpandedCigarIndexFromRefPos(18,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(19, 5)", 14, cigar.getExpandedCigarIndexFromRefPos(19,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(20, 5)", 15, cigar.getExpandedCigarIndexFromRefPos(20,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(21, 5)", 16, cigar.getExpandedCigarIndexFromRefPos(21,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(22, 5)", 17, cigar.getExpandedCigarIndexFromRefPos(22,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(23, 5)", 21, cigar.getExpandedCigarIndexFromRefPos(23,5)); check(failures, 
++testNum, "getExpandedCigarIndexFromRefPos(24, 5)", 22, cigar.getExpandedCigarIndexFromRefPos(24,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(25, 5)", 23, cigar.getExpandedCigarIndexFromRefPos(25,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(26, 5)", 24, cigar.getExpandedCigarIndexFromRefPos(26,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(27, 5)", 25, cigar.getExpandedCigarIndexFromRefPos(27,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(28, 5)", 26, cigar.getExpandedCigarIndexFromRefPos(28,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(29, 5)", 27, cigar.getExpandedCigarIndexFromRefPos(29,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(30, 5)", 28, cigar.getExpandedCigarIndexFromRefPos(30,5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefPos(31, 5)", 29, cigar.getExpandedCigarIndexFromRefPos(31,5)); // check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(0)", 0, cigar.getExpandedCigarIndexFromRefOffset(0)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(1)", 1, cigar.getExpandedCigarIndexFromRefOffset(1)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(2)", 2, cigar.getExpandedCigarIndexFromRefOffset(2)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(3)", 3, cigar.getExpandedCigarIndexFromRefOffset(3)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(4)", 4, cigar.getExpandedCigarIndexFromRefOffset(4)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(5)", 5, cigar.getExpandedCigarIndexFromRefOffset(5)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(6)", 6, cigar.getExpandedCigarIndexFromRefOffset(6)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(7)", 7, cigar.getExpandedCigarIndexFromRefOffset(7)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(8)", 8, cigar.getExpandedCigarIndexFromRefOffset(8)); 
check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(9)", 9, cigar.getExpandedCigarIndexFromRefOffset(9)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(10)", 10, cigar.getExpandedCigarIndexFromRefOffset(10)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(11)", 11, cigar.getExpandedCigarIndexFromRefOffset(11)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(12)", 12, cigar.getExpandedCigarIndexFromRefOffset(12)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(13)", 13, cigar.getExpandedCigarIndexFromRefOffset(13)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(14)", 14, cigar.getExpandedCigarIndexFromRefOffset(14)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(15)", 15, cigar.getExpandedCigarIndexFromRefOffset(15)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(16)", 16, cigar.getExpandedCigarIndexFromRefOffset(16)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(17)", 17, cigar.getExpandedCigarIndexFromRefOffset(17)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(18)", 21, cigar.getExpandedCigarIndexFromRefOffset(18)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(19)", 22, cigar.getExpandedCigarIndexFromRefOffset(19)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(20)", 23, cigar.getExpandedCigarIndexFromRefOffset(20)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(21)", 24, cigar.getExpandedCigarIndexFromRefOffset(21)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(22)", 25, cigar.getExpandedCigarIndexFromRefOffset(22)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(23)", 26, cigar.getExpandedCigarIndexFromRefOffset(23)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(24)", 27, cigar.getExpandedCigarIndexFromRefOffset(24)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(25)", 28, 
cigar.getExpandedCigarIndexFromRefOffset(25)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(26)", 29, cigar.getExpandedCigarIndexFromRefOffset(26)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(27)", -1, cigar.getExpandedCigarIndexFromRefOffset(27)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(28)", -1, cigar.getExpandedCigarIndexFromRefOffset(28)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(29)", -1, cigar.getExpandedCigarIndexFromRefOffset(29)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(30)", -1, cigar.getExpandedCigarIndexFromRefOffset(30)); check(failures, ++testNum, "getExpandedCigarIndexFromRefOffset(31)", -1, cigar.getExpandedCigarIndexFromRefOffset(31)); // check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(0)", 0, cigar.getExpandedCigarIndexFromQueryIndex(0)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(1)", 1, cigar.getExpandedCigarIndexFromQueryIndex(1)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(2)", 2, cigar.getExpandedCigarIndexFromQueryIndex(2)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(3)", 3, cigar.getExpandedCigarIndexFromQueryIndex(3)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(4)", 14, cigar.getExpandedCigarIndexFromQueryIndex(4)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(5)", 15, cigar.getExpandedCigarIndexFromQueryIndex(5)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(6)", 16, cigar.getExpandedCigarIndexFromQueryIndex(6)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(7)", 17, cigar.getExpandedCigarIndexFromQueryIndex(7)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(8)", 18, cigar.getExpandedCigarIndexFromQueryIndex(8)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(9)", 19, cigar.getExpandedCigarIndexFromQueryIndex(9)); check(failures, ++testNum, 
"getExpandedCigarIndexFromQueryIndex(10)", 20, cigar.getExpandedCigarIndexFromQueryIndex(10)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(11)", 21, cigar.getExpandedCigarIndexFromQueryIndex(11)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(12)", 22, cigar.getExpandedCigarIndexFromQueryIndex(12)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(13)", 27, cigar.getExpandedCigarIndexFromQueryIndex(13)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(14)", 28, cigar.getExpandedCigarIndexFromQueryIndex(14)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(15)", 29, cigar.getExpandedCigarIndexFromQueryIndex(15)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(16)", -1, cigar.getExpandedCigarIndexFromQueryIndex(16)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(17)", -1, cigar.getExpandedCigarIndexFromQueryIndex(17)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(18)", -1, cigar.getExpandedCigarIndexFromQueryIndex(18)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(19)", -1, cigar.getExpandedCigarIndexFromQueryIndex(19)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(20)", -1, cigar.getExpandedCigarIndexFromQueryIndex(20)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(21)", -1, cigar.getExpandedCigarIndexFromQueryIndex(21)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(22)", -1, cigar.getExpandedCigarIndexFromQueryIndex(22)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(23)", -1, cigar.getExpandedCigarIndexFromQueryIndex(23)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(24)", -1, cigar.getExpandedCigarIndexFromQueryIndex(24)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(25)", -1, cigar.getExpandedCigarIndexFromQueryIndex(25)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(26)", -1, 
cigar.getExpandedCigarIndexFromQueryIndex(26)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(27)", -1, cigar.getExpandedCigarIndexFromQueryIndex(27)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(28)", -1, cigar.getExpandedCigarIndexFromQueryIndex(28)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(29)", -1, cigar.getExpandedCigarIndexFromQueryIndex(29)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(30)", -1, cigar.getExpandedCigarIndexFromQueryIndex(30)); check(failures, ++testNum, "getExpandedCigarIndexFromQueryIndex(31)", -1, cigar.getExpandedCigarIndexFromQueryIndex(31)); // Test getCigarCharOp. check(failures, ++testNum, "getCigarCharOp(-1)", '?', cigar.getCigarCharOp(-1)); check(failures, ++testNum, "getCigarCharOp(0)", 'M', cigar.getCigarCharOp(0)); check(failures, ++testNum, "getCigarCharOp(1)", 'M', cigar.getCigarCharOp(1)); check(failures, ++testNum, "getCigarCharOp(2)", 'M', cigar.getCigarCharOp(2)); check(failures, ++testNum, "getCigarCharOp(3)", 'M', cigar.getCigarCharOp(3)); check(failures, ++testNum, "getCigarCharOp(4)", 'N', cigar.getCigarCharOp(4)); check(failures, ++testNum, "getCigarCharOp(5)", 'N', cigar.getCigarCharOp(5)); check(failures, ++testNum, "getCigarCharOp(6)", 'N', cigar.getCigarCharOp(6)); check(failures, ++testNum, "getCigarCharOp(7)", 'N', cigar.getCigarCharOp(7)); check(failures, ++testNum, "getCigarCharOp(8)", 'N', cigar.getCigarCharOp(8)); check(failures, ++testNum, "getCigarCharOp(9)", 'N', cigar.getCigarCharOp(9)); check(failures, ++testNum, "getCigarCharOp(10)", 'N', cigar.getCigarCharOp(10)); check(failures, ++testNum, "getCigarCharOp(11)", 'N', cigar.getCigarCharOp(11)); check(failures, ++testNum, "getCigarCharOp(12)", 'N', cigar.getCigarCharOp(12)); check(failures, ++testNum, "getCigarCharOp(13)", 'N', cigar.getCigarCharOp(13)); check(failures, ++testNum, "getCigarCharOp(14)", 'M', cigar.getCigarCharOp(14)); check(failures, ++testNum, 
"getCigarCharOp(15)", 'M', cigar.getCigarCharOp(15)); check(failures, ++testNum, "getCigarCharOp(16)", 'M', cigar.getCigarCharOp(16)); check(failures, ++testNum, "getCigarCharOp(17)", 'M', cigar.getCigarCharOp(17)); check(failures, ++testNum, "getCigarCharOp(18)", 'I', cigar.getCigarCharOp(18)); check(failures, ++testNum, "getCigarCharOp(19)", 'I', cigar.getCigarCharOp(19)); check(failures, ++testNum, "getCigarCharOp(20)", 'I', cigar.getCigarCharOp(20)); check(failures, ++testNum, "getCigarCharOp(21)", 'M', cigar.getCigarCharOp(21)); check(failures, ++testNum, "getCigarCharOp(22)", 'M', cigar.getCigarCharOp(22)); check(failures, ++testNum, "getCigarCharOp(23)", 'D', cigar.getCigarCharOp(23)); check(failures, ++testNum, "getCigarCharOp(24)", 'D', cigar.getCigarCharOp(24)); check(failures, ++testNum, "getCigarCharOp(25)", 'D', cigar.getCigarCharOp(25)); check(failures, ++testNum, "getCigarCharOp(26)", 'D', cigar.getCigarCharOp(26)); check(failures, ++testNum, "getCigarCharOp(27)", 'M', cigar.getCigarCharOp(27)); check(failures, ++testNum, "getCigarCharOp(28)", 'M', cigar.getCigarCharOp(28)); check(failures, ++testNum, "getCigarCharOp(29)", 'M', cigar.getCigarCharOp(29)); check(failures, ++testNum, "getCigarCharOp(30)", '?', cigar.getCigarCharOp(30)); // Test getCigarCharOpFromQueryIndex. 
check(failures, ++testNum, "getCigarCharOpFromQueryIndex(-1)", '?', cigar.getCigarCharOpFromQueryIndex(-1)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(0)", 'M', cigar.getCigarCharOpFromQueryIndex(0)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(1)", 'M', cigar.getCigarCharOpFromQueryIndex(1)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(2)", 'M', cigar.getCigarCharOpFromQueryIndex(2)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(3)", 'M', cigar.getCigarCharOpFromQueryIndex(3)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(4)", 'M', cigar.getCigarCharOpFromQueryIndex(4)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(5)", 'M', cigar.getCigarCharOpFromQueryIndex(5)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(6)", 'M', cigar.getCigarCharOpFromQueryIndex(6)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(7)", 'M', cigar.getCigarCharOpFromQueryIndex(7)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(8)", 'I', cigar.getCigarCharOpFromQueryIndex(8)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(9)", 'I', cigar.getCigarCharOpFromQueryIndex(9)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(10)", 'I', cigar.getCigarCharOpFromQueryIndex(10)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(11)", 'M', cigar.getCigarCharOpFromQueryIndex(11)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(12)", 'M', cigar.getCigarCharOpFromQueryIndex(12)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(13)", 'M', cigar.getCigarCharOpFromQueryIndex(13)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(14)", 'M', cigar.getCigarCharOpFromQueryIndex(14)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(15)", 'M', cigar.getCigarCharOpFromQueryIndex(15)); check(failures, ++testNum, "getCigarCharOpFromQueryIndex(16)", '?', cigar.getCigarCharOpFromQueryIndex(16)); check(failures, ++testNum, 
"getCigarCharOpFromQueryIndex(17)", '?', cigar.getCigarCharOpFromQueryIndex(17)); // Test getCigarCharOpFromRefOffset. check(failures, ++testNum, "getCigarCharOpFromRefOffset(-1)", '?', cigar.getCigarCharOpFromRefOffset(-1)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(0)", 'M', cigar.getCigarCharOpFromRefOffset(0)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(1)", 'M', cigar.getCigarCharOpFromRefOffset(1)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(2)", 'M', cigar.getCigarCharOpFromRefOffset(2)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(3)", 'M', cigar.getCigarCharOpFromRefOffset(3)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(4)", 'N', cigar.getCigarCharOpFromRefOffset(4)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(5)", 'N', cigar.getCigarCharOpFromRefOffset(5)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(6)", 'N', cigar.getCigarCharOpFromRefOffset(6)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(7)", 'N', cigar.getCigarCharOpFromRefOffset(7)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(8)", 'N', cigar.getCigarCharOpFromRefOffset(8)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(9)", 'N', cigar.getCigarCharOpFromRefOffset(9)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(10)", 'N', cigar.getCigarCharOpFromRefOffset(10)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(11)", 'N', cigar.getCigarCharOpFromRefOffset(11)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(12)", 'N', cigar.getCigarCharOpFromRefOffset(12)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(13)", 'N', cigar.getCigarCharOpFromRefOffset(13)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(14)", 'M', cigar.getCigarCharOpFromRefOffset(14)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(15)", 'M', cigar.getCigarCharOpFromRefOffset(15)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(16)", 'M', 
cigar.getCigarCharOpFromRefOffset(16)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(17)", 'M', cigar.getCigarCharOpFromRefOffset(17)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(18)", 'M', cigar.getCigarCharOpFromRefOffset(18)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(19)", 'M', cigar.getCigarCharOpFromRefOffset(19)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(20)", 'D', cigar.getCigarCharOpFromRefOffset(20)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(21)", 'D', cigar.getCigarCharOpFromRefOffset(21)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(22)", 'D', cigar.getCigarCharOpFromRefOffset(22)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(23)", 'D', cigar.getCigarCharOpFromRefOffset(23)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(24)", 'M', cigar.getCigarCharOpFromRefOffset(24)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(25)", 'M', cigar.getCigarCharOpFromRefOffset(25)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(26)", 'M', cigar.getCigarCharOpFromRefOffset(26)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(27)", '?', cigar.getCigarCharOpFromRefOffset(27)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(28)", '?', cigar.getCigarCharOpFromRefOffset(28)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(29)", '?', cigar.getCigarCharOpFromRefOffset(29)); check(failures, ++testNum, "getCigarCharOpFromRefOffset(30)", '?', cigar.getCigarCharOpFromRefOffset(30)); // Test getCigarCharOpFromRefPos. 
check(failures, ++testNum, "getCigarCharOpFromRefPos(-1, 5)", '?', cigar.getCigarCharOpFromRefPos(-1,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(0, 5)", '?', cigar.getCigarCharOpFromRefPos(0,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(1, 5)", '?', cigar.getCigarCharOpFromRefPos(1,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(2, 5)", '?', cigar.getCigarCharOpFromRefPos(2,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(3, 5)", '?', cigar.getCigarCharOpFromRefPos(3,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(4, 5)", '?', cigar.getCigarCharOpFromRefPos(4,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(5, 5)", 'M', cigar.getCigarCharOpFromRefPos(5,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(6, 5)", 'M', cigar.getCigarCharOpFromRefPos(6,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(7, 5)", 'M', cigar.getCigarCharOpFromRefPos(7,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(8, 5)", 'M', cigar.getCigarCharOpFromRefPos(8,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(9, 5)", 'N', cigar.getCigarCharOpFromRefPos(9,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(10, 5)", 'N', cigar.getCigarCharOpFromRefPos(10,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(11, 5)", 'N', cigar.getCigarCharOpFromRefPos(11,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(12, 5)", 'N', cigar.getCigarCharOpFromRefPos(12,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(13, 5)", 'N', cigar.getCigarCharOpFromRefPos(13,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(14, 5)", 'N', cigar.getCigarCharOpFromRefPos(14,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(15, 5)", 'N', cigar.getCigarCharOpFromRefPos(15,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(16, 5)", 'N', cigar.getCigarCharOpFromRefPos(16,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(17, 5)", 'N', cigar.getCigarCharOpFromRefPos(17,5)); 
check(failures, ++testNum, "getCigarCharOpFromRefPos(18, 5)", 'N', cigar.getCigarCharOpFromRefPos(18,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(19, 5)", 'M', cigar.getCigarCharOpFromRefPos(19,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(20, 5)", 'M', cigar.getCigarCharOpFromRefPos(20,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(21, 5)", 'M', cigar.getCigarCharOpFromRefPos(21,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(22, 5)", 'M', cigar.getCigarCharOpFromRefPos(22,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(23, 5)", 'M', cigar.getCigarCharOpFromRefPos(23,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(24, 5)", 'M', cigar.getCigarCharOpFromRefPos(24,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(25, 5)", 'D', cigar.getCigarCharOpFromRefPos(25,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(26, 5)", 'D', cigar.getCigarCharOpFromRefPos(26,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(27, 5)", 'D', cigar.getCigarCharOpFromRefPos(27,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(28, 5)", 'D', cigar.getCigarCharOpFromRefPos(28,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(29, 5)", 'M', cigar.getCigarCharOpFromRefPos(29,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(30, 5)", 'M', cigar.getCigarCharOpFromRefPos(30,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(31, 5)", 'M', cigar.getCigarCharOpFromRefPos(31,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(32, 5)", '?', cigar.getCigarCharOpFromRefPos(32,5)); check(failures, ++testNum, "getCigarCharOpFromRefPos(33, 5)", '?', cigar.getCigarCharOpFromRefPos(33,5)); //////////////////////// // Test getNumOverlaps. 
check(failures, ++testNum, "getNumOverlaps(5,32,5)", (uint32_t)13, cigar.getNumOverlaps(5,32,5)); check(failures, ++testNum, "getNumOverlaps(5,31,5)", (uint32_t)12, cigar.getNumOverlaps(5,31,5)); check(failures, ++testNum, "getNumOverlaps(0,100,5)", (uint32_t)13, cigar.getNumOverlaps(0,100,5)); check(failures, ++testNum, "getNumOverlaps(-1, -1,5)", (uint32_t)13, cigar.getNumOverlaps(-1, -1,5)); check(failures, ++testNum, "getNumOverlaps(-1,10,5)", (uint32_t)4, cigar.getNumOverlaps(-1,10,5)); check(failures, ++testNum, "getNumOverlaps(10,-1,5)", (uint32_t)9, cigar.getNumOverlaps(10,-1,5)); check(failures, ++testNum, "getNumOverlaps(9,19,5)", (uint32_t)0, cigar.getNumOverlaps(9,19,5)); check(failures, ++testNum, "getNumOverlaps(9,20,5)", (uint32_t)1, cigar.getNumOverlaps(9,20,5)); check(failures, ++testNum, "getNumOverlaps(9,6,5)", (uint32_t)0, cigar.getNumOverlaps(9,6,5)); check(failures, ++testNum, "getNumOverlaps(0,5,5)", (uint32_t)0, cigar.getNumOverlaps(0,5,5)); check(failures, ++testNum, "getNumOverlaps(32,40,5)", (uint32_t)0, cigar.getNumOverlaps(32,40,5)); check(failures, ++testNum, "getNumOverlaps(0,5,1)", (uint32_t)4, cigar.getNumOverlaps(0,5,1)); check(failures, ++testNum, "getNumOverlaps(32,40,32)", (uint32_t)4, cigar.getNumOverlaps(32,40,32)); // Get Query Index for reference offset 0 - 27 // 4M check(failures, ++testNum, "getQueryIndex(0)", 0, cigar.getQueryIndex(0)); check(failures, ++testNum, "getQueryIndex(1)", 1, cigar.getQueryIndex(1)); check(failures, ++testNum, "getQueryIndex(2)", 2, cigar.getQueryIndex(2)); check(failures, ++testNum, "getQueryIndex(3)", 3, cigar.getQueryIndex(3)); // 10N check(failures, ++testNum, "getQueryIndex(4)", -1, cigar.getQueryIndex(4)); check(failures, ++testNum, "getQueryIndex(5)", -1, cigar.getQueryIndex(5)); check(failures, ++testNum, "getQueryIndex(6)", -1, cigar.getQueryIndex(6)); check(failures, ++testNum, "getQueryIndex(7)", -1, cigar.getQueryIndex(7)); check(failures, ++testNum, "getQueryIndex(8)", -1, 
cigar.getQueryIndex(8)); check(failures, ++testNum, "getQueryIndex(9)", -1, cigar.getQueryIndex(9)); check(failures, ++testNum, "getQueryIndex(10)", -1, cigar.getQueryIndex(10)); check(failures, ++testNum, "getQueryIndex(11)", -1, cigar.getQueryIndex(11)); check(failures, ++testNum, "getQueryIndex(12)", -1, cigar.getQueryIndex(12)); check(failures, ++testNum, "getQueryIndex(13)", -1, cigar.getQueryIndex(13)); // 4M check(failures, ++testNum, "getQueryIndex(14)", 4, cigar.getQueryIndex(14)); check(failures, ++testNum, "getQueryIndex(15)", 5, cigar.getQueryIndex(15)); check(failures, ++testNum, "getQueryIndex(16)", 6, cigar.getQueryIndex(16)); check(failures, ++testNum, "getQueryIndex(17)", 7, cigar.getQueryIndex(17)); // 3I - nothing to check - not in reference - covers query indices 8-10 // 2M check(failures, ++testNum, "getQueryIndex(18)", 11, cigar.getQueryIndex(18)); check(failures, ++testNum, "getQueryIndex(19)", 12, cigar.getQueryIndex(19)); // 4D check(failures, ++testNum, "getQueryIndex(20)", -1, cigar.getQueryIndex(20)); check(failures, ++testNum, "getQueryIndex(21)", -1, cigar.getQueryIndex(21)); check(failures, ++testNum, "getQueryIndex(22)", -1, cigar.getQueryIndex(22)); check(failures, ++testNum, "getQueryIndex(23)", -1, cigar.getQueryIndex(23)); // 3M check(failures, ++testNum, "getQueryIndex(24)", 13, cigar.getQueryIndex(24)); check(failures, ++testNum, "getQueryIndex(25)", 14, cigar.getQueryIndex(25)); check(failures, ++testNum, "getQueryIndex(26)", 15, cigar.getQueryIndex(26)); // Get Query Index for reference positions 0-33 // N/A check(failures, ++testNum, "getQueryIndex(0, 5)", -1, cigar.getQueryIndex(0, 5)); check(failures, ++testNum, "getQueryIndex(1, 5)", -1, cigar.getQueryIndex(1, 5)); check(failures, ++testNum, "getQueryIndex(2, 5)", -1, cigar.getQueryIndex(2, 5)); check(failures, ++testNum, "getQueryIndex(3, 5)", -1, cigar.getQueryIndex(3, 5)); check(failures, ++testNum, "getQueryIndex(4, 5)", -1, cigar.getQueryIndex(4, 5)); // 4M 
check(failures, ++testNum, "getQueryIndex(5, 5)", 0, cigar.getQueryIndex(5, 5)); check(failures, ++testNum, "getQueryIndex(6, 5)", 1, cigar.getQueryIndex(6, 5)); check(failures, ++testNum, "getQueryIndex(7, 5)", 2, cigar.getQueryIndex(7, 5)); check(failures, ++testNum, "getQueryIndex(8, 5)", 3, cigar.getQueryIndex(8, 5)); // 10N check(failures, ++testNum, "getQueryIndex(9, 5)", -1, cigar.getQueryIndex(9, 5)); check(failures, ++testNum, "getQueryIndex(10, 5)", -1, cigar.getQueryIndex(10, 5)); check(failures, ++testNum, "getQueryIndex(11, 5)", -1, cigar.getQueryIndex(11, 5)); check(failures, ++testNum, "getQueryIndex(12, 5)", -1, cigar.getQueryIndex(12, 5)); check(failures, ++testNum, "getQueryIndex(13, 5)", -1, cigar.getQueryIndex(13, 5)); check(failures, ++testNum, "getQueryIndex(14, 5)", -1, cigar.getQueryIndex(14, 5)); check(failures, ++testNum, "getQueryIndex(15, 5)", -1, cigar.getQueryIndex(15, 5)); check(failures, ++testNum, "getQueryIndex(16, 5)", -1, cigar.getQueryIndex(16, 5)); check(failures, ++testNum, "getQueryIndex(17, 5)", -1, cigar.getQueryIndex(17, 5)); check(failures, ++testNum, "getQueryIndex(18, 5)", -1, cigar.getQueryIndex(18, 5)); // 4M check(failures, ++testNum, "getQueryIndex(19, 5)", 4, cigar.getQueryIndex(19, 5)); check(failures, ++testNum, "getQueryIndex(20, 5)", 5, cigar.getQueryIndex(20, 5)); check(failures, ++testNum, "getQueryIndex(21, 5)", 6, cigar.getQueryIndex(21, 5)); check(failures, ++testNum, "getQueryIndex(22, 5)", 7, cigar.getQueryIndex(22, 5)); // 3I - nothing to check - not in reference - covers query indices 8-10 // 2M check(failures, ++testNum, "getQueryIndex(23, 5)", 11, cigar.getQueryIndex(23, 5)); check(failures, ++testNum, "getQueryIndex(24, 5)", 12, cigar.getQueryIndex(24, 5)); // 4D check(failures, ++testNum, "getQueryIndex(25, 5)", -1, cigar.getQueryIndex(25, 5)); check(failures, ++testNum, "getQueryIndex(26, 5)", -1, cigar.getQueryIndex(26, 5)); check(failures, ++testNum, "getQueryIndex(27, 5)", -1, 
cigar.getQueryIndex(27, 5)); check(failures, ++testNum, "getQueryIndex(28, 5)", -1, cigar.getQueryIndex(28, 5)); // 3M check(failures, ++testNum, "getQueryIndex(29, 5)", 13, cigar.getQueryIndex(29, 5)); check(failures, ++testNum, "getQueryIndex(30, 5)", 14, cigar.getQueryIndex(30, 5)); check(failures, ++testNum, "getQueryIndex(31, 5)", 15, cigar.getQueryIndex(31, 5)); // Get reference offset for query index 0 - 17 // 4M check(failures, ++testNum, "getRefOffset(0)", 0, cigar.getRefOffset(0)); check(failures, ++testNum, "getRefOffset(1)", 1, cigar.getRefOffset(1)); check(failures, ++testNum, "getRefOffset(2)", 2, cigar.getRefOffset(2)); check(failures, ++testNum, "getRefOffset(3)", 3, cigar.getRefOffset(3)); // 10N - nothing to check - not in query - covers ref offsets 4-13 // 4M check(failures, ++testNum, "getRefOffset(4)", 14, cigar.getRefOffset(4)); check(failures, ++testNum, "getRefOffset(5)", 15, cigar.getRefOffset(5)); check(failures, ++testNum, "getRefOffset(6)", 16, cigar.getRefOffset(6)); check(failures, ++testNum, "getRefOffset(7)", 17, cigar.getRefOffset(7)); // 3I check(failures, ++testNum, "getRefOffset(8)", -1, cigar.getRefOffset(8)); check(failures, ++testNum, "getRefOffset(9)", -1, cigar.getRefOffset(9)); check(failures, ++testNum, "getRefOffset(10)", -1, cigar.getRefOffset(10)); // 2M check(failures, ++testNum, "getRefOffset(11)", 18, cigar.getRefOffset(11)); check(failures, ++testNum, "getRefOffset(12)", 19, cigar.getRefOffset(12)); // 4D - nothing to check - not in query - covers ref offsets 20-23 // 3M check(failures, ++testNum, "getRefOffset(13)", 24, cigar.getRefOffset(13)); check(failures, ++testNum, "getRefOffset(14)", 25, cigar.getRefOffset(14)); check(failures, ++testNum, "getRefOffset(15)", 26, cigar.getRefOffset(15)); // Get reference position for query index 0 - 17 // 4M check(failures, ++testNum, "getRefPosition(0, 5)", 5, cigar.getRefPosition(0, 5)); check(failures, ++testNum, "getRefPosition(1, 5)", 6, cigar.getRefPosition(1, 5)); 
check(failures, ++testNum, "getRefPosition(2, 5)", 7, cigar.getRefPosition(2, 5)); check(failures, ++testNum, "getRefPosition(3, 5)", 8, cigar.getRefPosition(3, 5)); // 10N - nothing to check - not in query - covers ref offsets 4-13 // 4M check(failures, ++testNum, "getRefPosition(4, 5)", 19, cigar.getRefPosition(4, 5)); check(failures, ++testNum, "getRefPosition(5, 5)", 20, cigar.getRefPosition(5, 5)); check(failures, ++testNum, "getRefPosition(6, 5)", 21, cigar.getRefPosition(6, 5)); check(failures, ++testNum, "getRefPosition(7, 5)", 22, cigar.getRefPosition(7, 5)); // 3I check(failures, ++testNum, "getRefPosition(8, 5)", -1, cigar.getRefPosition(8, 5)); check(failures, ++testNum, "getRefPosition(9, 5)", -1, cigar.getRefPosition(9, 5)); check(failures, ++testNum, "getRefPosition(10, 5)", -1, cigar.getRefPosition(10, 5)); // 2M check(failures, ++testNum, "getRefPosition(11, 5)", 23, cigar.getRefPosition(11, 5)); check(failures, ++testNum, "getRefPosition(12, 5)", 24, cigar.getRefPosition(12, 5)); // 4D - nothing to check - not in query - covers ref pos 25-28 // 3M check(failures, ++testNum, "getRefPosition(13, 5)", 29, cigar.getRefPosition(13, 5)); check(failures, ++testNum, "getRefPosition(14, 5)", 30, cigar.getRefPosition(14, 5)); check(failures, ++testNum, "getRefPosition(15, 5)", 31, cigar.getRefPosition(15, 5)); //////////////////////////////////////////////////////////////////////////// // Test a new CIGAR set by buffer. 
// 2S 3M 1I 2M 1D 1M 2P 1M 3N 1M 3H uint32_t cigarBuf[] = {0x24, // 2S = 2 << 4 | 4 0x30, // 3M = 3 << 4 | 0 0x11, // 1I = 1 << 4 | 1 0x20, // 2M = 2 << 4 | 0 0x12, // 1D = 1 << 4 | 2 0x10, // 1M = 1 << 4 | 0 0x26, // 2P = 2 << 4 | 6 0x10, // 1m = 1 << 4 | 0 0x33, // 3N = 3 << 4 | 3 0x10, // 1M = 1 << 4 | 0 0x35}; // 3H = 3 << 4 | 5 cigar.Set(cigarBuf, 11); cigarString = "HI"; cigar.getCigarString(cigarString); expectedResult = "2S3M1I2M1D1M2P1M3N1M3H"; check(failures, ++testNum, "getCigarString", expectedResult, cigarString); check(failures, ++testNum, "getNumEndClips", 3, cigar.getNumEndClips()); check(failures, ++testNum, "getNumBeginClips", 2, cigar.getNumBeginClips()); std::cout << "\nCigarRoller PASS: " << testNum - failures << " FAIL: " << failures << std::endl; // return the number of failures. return(failures); } int main(int argc, const char **argv) { CigarRollerTest roller; bool showAllCasesFlag = false; int opt; while(( opt = getopt(argc, (char **) argv, "v")) != -1) { switch(opt) { case 'v': showAllCasesFlag = true; break; default: std::cerr << "usage: testSW [-v]" << std::endl; exit(1); } } if(showAllCasesFlag) { } // // do cigar roller tests first // return(roller.test()); // CIGAR explanation - for backward SW runs, the corresponding // CIGAR string is generated from the back of the string to the // front. Recall that the soft clipping is only done at the // "end" of the string, taking direction into account. // Comment out this result since it doesn't refelct the results of test. 
// cout << endl << "Total Errors found: " << errors << endl; } libStatGen-1.0.14/general/test/cigar/CigarRollerTest.h000066400000000000000000000014731254730101300225310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "CigarRoller.h" class CigarRollerTest { public: int test(); private: }; libStatGen-1.0.14/general/test/cigar/Makefile000066400000000000000000000002531254730101300207460ustar00rootroot00000000000000PATH_TO_BASE=../../.. 
EXE = cigarRollerTest TOOLBASE = CigarRollerTest TEST_COMMAND= mkdir -p results; \ ./cigarRollerTest include $(PATH_TO_BASE)/Makefiles/Makefile.testlibStatGen-1.0.14/general/test/dbsnp/000077500000000000000000000000001254730101300173275ustar00rootroot00000000000000libStatGen-1.0.14/general/test/dbsnp/.gitignore000066400000000000000000000000231254730101300213120ustar00rootroot00000000000000dbsnpTest results/ libStatGen-1.0.14/general/test/dbsnp/Main.cpp000066400000000000000000000040011254730101300207120ustar00rootroot00000000000000#include #include #include "GenomeSequence.h" #include "InputFile.h" void readDbsnp(mmapArrayBool_t& dbSNP, const char* fileName, GenomeSequence& ref); int main(int argc, char ** argv) { // time_t startTime; // time_t endTime; // startTime = time(NULL); GenomeSequence* refPtr = new GenomeSequence("testFiles/chr1_partial.fa"); // endTime = time(NULL); // std::cerr << "Time to read reference: " << endTime - startTime << std::endl; if(refPtr == NULL) { std::cerr << "Failed to read the reference\n"; return(-1); } std::cerr << "\nStandard VCF DBSNP test\n"; mmapArrayBool_t dbsnpArray1; const char* dbsnpFileName = "testFiles/dbsnp.vcf"; // startTime = time(NULL); refPtr->loadDBSNP(dbsnpArray1, dbsnpFileName); // endTime = time(NULL); // std::cerr << "Time to read dbsnp through reference: " << endTime - startTime << std::endl; genomeIndex_t mapPos = refPtr->getGenomePosition("1", 10233); std::cerr << "dbsnp " << mapPos << ": " << dbsnpArray1[mapPos] << std::endl; std::cerr << "dbsnp " << mapPos+1 << ": " << dbsnpArray1[mapPos+1] << std::endl; std::cerr << "dbsnp " << mapPos+2 << ": " << dbsnpArray1[mapPos+2] << std::endl; std::cerr << "\nGZIP VCF DBSNP test\n"; mmapArrayBool_t dbsnpArray2; dbsnpFileName = "testFiles/dbsnp.vcf.gz"; // startTime = time(NULL); refPtr->loadDBSNP(dbsnpArray2, dbsnpFileName); // endTime = time(NULL); // std::cerr << "Time to read dbsnp through reference: " << endTime - startTime << std::endl; mapPos = 
refPtr->getGenomePosition("1", 10233); std::cerr << "dbsnp " << mapPos << ": " << dbsnpArray2[mapPos] << std::endl; std::cerr << "dbsnp " << mapPos+1 << ": " << dbsnpArray2[mapPos+1] << std::endl; std::cerr << "dbsnp " << mapPos+2 << ": " << dbsnpArray2[mapPos+2] << std::endl; return(0); } libStatGen-1.0.14/general/test/dbsnp/Makefile000066400000000000000000000003271254730101300207710ustar00rootroot00000000000000PATH_TO_BASE=../../.. EXE = dbsnpTest SRCONLY = Main.cpp TEST_COMMAND=@mkdir -p results; ./dbsnpTest 2> results/results.txt && diff expected/results.txt results/results.txt include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/general/test/dbsnp/expected/000077500000000000000000000000001254730101300211305ustar00rootroot00000000000000libStatGen-1.0.14/general/test/dbsnp/expected/results.txt000066400000000000000000000006571254730101300234020ustar00rootroot00000000000000 Standard VCF DBSNP test Load dbSNP file 'testFiles/dbsnp.vcf': (as text file) GenomeSequence::populateDBSNP: ignored 1 SNP positions due to invalid format of line. DONE! dbsnp 10232: 0 dbsnp 10233: 1 dbsnp 10234: 0 GZIP VCF DBSNP test Load dbSNP file 'testFiles/dbsnp.vcf.gz': (as text file) GenomeSequence::populateDBSNP: ignored 1 SNP positions due to invalid format of line. DONE! 
dbsnp 10232: 0 dbsnp 10233: 1 dbsnp 10234: 0 libStatGen-1.0.14/general/test/dbsnp/testFiles/000077500000000000000000000000001254730101300212715ustar00rootroot00000000000000libStatGen-1.0.14/general/test/dbsnp/testFiles/chr1_partial-bs.umfa000066400000000000000000000144741254730101300251300ustar00rootroot000000000000003y12*IPmktrostmktrost-laptop--2b57eeb883c6c7078241876e733e49501DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD3DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD11111111111111111111111111!!2 "!!!"#1" 12!32##"!!1!!2!"##1" !!2#"!!!  !!"!"!"!!!  !!"!"!"!!!  
!!"!"!#1 !!""#"!#  !!"!"!#1!!""#"!#  !!"!"!#1 !!""#"!#  1"""2#"""!##3  "!"#"""" "#!#!!1!"#"!!"12!!"0!!33 ""3"#12"#1"!"1 "111#12"##1"!""1 "11#1120 #2"!1# 1 "0#"23"#2"1# 1""#"3 1"!30 "" #123" 1#libStatGen-1.0.14/general/test/dbsnp/testFiles/chr1_partial.fa000066400000000000000000000267641254730101300241710ustar00rootroot00000000000000>1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNACTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTAACCCTAACCCTAACCCTA ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAA CCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCC TAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCC TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC CCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGG TCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC GTTGCAAAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCG CAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCA GAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGA CACATGCTAGCGCGTCGGGGTGGAGGCGTGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGG CGCAGGCGCAGAGACACATGCTACCGCGTCCAGGGGTGGAGGCGTGGCGCAGGCGCAGAG AGGCGCACCGCGCCGGCGCAGGCGCAGAGACACATGCTAGCGCGTCCAGGGGTGGAGGCG TGGCGCAGGCGCAGAGACGCAAGCCTACGGGCGGGGGTTGGGGGGGCGTGTGTTGCAGGA GCAAAGTCGCACGGCGCCGGGCTGGGGCGGGGGGAGGGTGGCGCCGTGCACGCGCAGAAA CTCACGTCACGGTGGCGCGGCGCAGAGACGGGTAGAACCTCAGTAATCCGAAAAGCCGGG 
ATCGACCGCCCCTTGCTTGCAGCCGGGCACTACAGGACCCGCTTGCTCACGGTGCTGTGC CAGGGCGCCCCCTGCTGGCGACTAGGGCAACTGCAGGGCTCTCTTGCTTAGAGTGGTGGC CAGCGCCCCCTGCTGGCGCCGGGGCACTGCAGGGCCCTCTTGCTTACTGTATAGTGGTGG CACGCCGCCTGCTGGCAGCTAGGGACATTGCAGGGTCCTCTTGCTCAAGGTGTAGTGGCA GCACGCCCACCTGCTGGCAGCTGGGGACACTGCCGGGCCCTCTTGCTCCAACAGTACTGG CGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTCTCAGTAGACTCCTAAATATG libStatGen-1.0.14/general/test/dbsnp/testFiles/dbsnp.vcf000066400000000000000000000372611254730101300231100ustar00rootroot00000000000000##fileformat=VCFv4.1 ##FILTER= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO=5% minor allele frequency in 1+ populations"> ##INFO=5% minor allele frequency in each and all populations"> ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO=SubSNP->Batch.link_out"> ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##LeftAlignVariants="analysis_type=LeftAlignVariants input_file=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false disable_experimental_low_memory_sharding=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=00-All.vcf) 
out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub filter_mismatching_base_and_quals=false" ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##dbSNP_BUILD_ID=135 ##fileDate=20111104 ##phasing=partial ##reference=GRCh37.3 ##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta ##source=dbSNP ##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf #CHROM POS ID REF ALT QUAL FILTER INFO 1 10144 rs144773400 TA T . PASS ASP;RSPOS=10145;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10234 rs145599635 C T . PASS ASP;RSPOS=10234;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 1 10248 rs148908337 A T . PASS ASP;RSPOS=10248;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 1 10254 rs140194106 TA T . PASS ASP;RSPOS=10255;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10291 rs145427775 C T . 
PASS ASP;RSPOS=10291;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 1 10327 rs112750067 T C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10327;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132 1 10329 rs150969722 AC A . PASS ASP;RSPOS=10330;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10351 rs145072688 CTA C,CA . PASS ASP;RSPOS=10352;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10382 rs147093981 AAC A,AC . PASS ASP;RSPOS=10383;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10433 rs56289060 A AC . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10433;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 1 10439 rs112766696 AC A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10440;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=132 1 10439 rs138941843 AC A . PASS ASP;RSPOS=10440;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 1 10440 rs112155239 C A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10440;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132 1 10492 rs55998931 C T . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.0617001828153565;RSPOS=10492;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040000000100;WGT=0;dbSNPBuildID=129 1 10519 rs62636508 G C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10519;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129 1 10583 rs58108140 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.270566727605119;KGPilot123;RSPOS=10583;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=129 1 10611 rs189107123 C G . PASS KGPilot123;RSPOS=10611;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 1 10828 rs10218492 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10828;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119 1 10904 rs10218493 G A . 
PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10904;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=119 1 10927 rs10218527 A G . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10927;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119 1 10938 rs28853987 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10938;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125 1 11014 rs28484712 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11014;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125 1 11022 rs28775022 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11022;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125 1 11081 rs10218495 G T . PASS CFL;GENEINFO=LOC100652771:100652771;GNO;RSPOS=11081;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=119 1 11863 rs187669455 C A . PASS RSPOS=11863;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135 libStatGen-1.0.14/general/test/dbsnp/testFiles/dbsnp.vcf.gz000066400000000000000000000104161254730101300235200ustar00rootroot00000000000000+IPdbsnp.vcf[sȲ+6uu0\M\Ĺ[NQ X=# b֖cou{| ZKݻۻf^{mx]A׻`'p(϶WuO\]S;[P^|YnsV_^( CsouƉ#8YM$+!9{Md.SFL6N7Vu/#KD@6O6N̑$ 2C%ޣ禎DPnu%iW$KbrlL3|g0~A WBqy(9wyB.+wV}Q0%OD%B~߿}FyP>r@ lEW gu.H"{>1OFO`M"JD< D)/CNld$ V׉DČBn*Msp?::yee[lJ3qVh%Ra~L㫢ymʽEJOW;'B:ODδ9o*\#bM3Y0d˞4N59r{qo=DG-(mKE2RcY(+. 
ՋR1#CVkDP:PBP@N厰$4#AZ==IX$h1+JrNJRξXuc~:lu6>L&7Tp^gV 8yt|l l</ٰR*^oAit0t[%.(kr!*a!ItYSkL7 xILtυ}F;gVS=Tk{HEFql&[!N 2ZdD<:_TǕ}0/idURS +v&[y[ )bcѥ*sY}~ +fn LH'i`ƈJ5ՋycmSy72OΏ(2]Nn.iG 8\lYmZVnC!FFyC.qEh:-We:GQTT=!C@;VW Dt5yi`gQ䭽= : >RJƃ>DNCsőM$>ֲvH06q| P3 )PaD9TR+,E鳁n!PxA҆hH[x>5i SrgG 4[׫RI,uBN'?ciJNM6‘r()\g^zԽ"Rg490u&KvL#wQΣL!B|4}khaRR>ϳ 5,kJ b_L1/lbu Y"J ?wvg[E<}b|(oq /yޝ7c qK9nm~tb:lVBgu2GRgYuYJYN矔,O*渊Y-҄Fη@mV"g\Ld&r5pDМqW)߲QNHp;Sw:"tmb= JDu@O)*?,@dNߎ)_R~pME:^gVf.m߸<:~b֜G)T~ލ[۝W͋}Iu\mvv7wf_/g:gjj'Gl8qX%`!<dnrW̳qv@s?y15mqpjnoϾx0P֤A6_RⴃpKc5D̑x<.GPՕ\mLf8rvlȗq[|t7_.wஷ>Oz3~M|!sʗy ;G98^Lmќ-"8s/G_@2m(z$A,po˗ھd1@=.6kKruhZD $Mx IEz~0ES|dMw3yv6-ӗ۟%|[JT/_EGf_Kq(Hn/R h,ER6:ZJ`i̷tRh"@Sy(_1j$"(%=92qy=J9 KafuS4:zB@ѵ) G{FYslY>@{a`rQ#=5]RF(G]pR ӽMY똺ai ̦Fp~ZM5Q2CzPSK!VV֢hnqzT lΖr)Э_~]w PKtIg0Z+U]}U}M;RbZ(kuujl)w(ju,v}Nie[i6_?3OSO:&K rݠ7#t5s,,Kǣk{]ݟt7H*! y}!5`q5Oq sW,^+۴u :?zwEDv˻1Mw4Р/rCdMkD1~IRciƸ36:}rQ[0N=|x%ԭ?m_3,,_?CЊ$ xQ+[jUm辠TժJ×gjQ-[TltRzKzvRj1gl_Mj˙}54͗koxTU1ItgBe +ӥ AA̛RrP6%h.)TUzCzng5p{}^fKM ݅%7:tffK}C(Ь ː`Ϋ1oJsPva??iC'ɧMѩ-IMOMqKUJXgt߹mK-ɐMhphSVXjg恥:C1TCFu\)oa%2В% ~.3n00@xp@#o\! lY2 ПяVDBu@tECAV+0A[xRaKӞ,$%•xU@췩&Xg&(btI4+1Ĥ(A,:0Y{;8'IqY5\ UCJD & TSC>libStatGen-1.0.14/general/test/gtest.cpp000066400000000000000000000034051254730101300200550ustar00rootroot00000000000000/* * Copyright (c) 2009 Regents of the University of Michigan * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include class GlobalTestEnvironment : public ::testing::Environment { public: virtual ~GlobalTestEnvironment(); virtual void SetUp(); virtual void TearDown(); //public: // }; GlobalTestEnvironment::~GlobalTestEnvironment() { } void GlobalTestEnvironment::SetUp() { } void GlobalTestEnvironment::TearDown() { } ::testing::Environment* const globalEnvironment = ::testing::AddGlobalTestEnvironment(new GlobalTestEnvironment); int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } libStatGen-1.0.14/general/test/inputFileTest/000077500000000000000000000000001254730101300210205ustar00rootroot00000000000000libStatGen-1.0.14/general/test/inputFileTest/.gitignore000066400000000000000000000000271254730101300230070ustar00rootroot00000000000000inputFileTest results/ libStatGen-1.0.14/general/test/inputFileTest/InputFileTest.cpp000066400000000000000000001200341254730101300242630ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "InputFileTest.h" #include #include #include "StringBasics.h" void testAdditional(const char *extension); void testWrite(); int main(int argc, char ** argv) { IFILE_Test myFile; myFile.test(); testWrite(); std::cout << "\nAdditional Tests: " << std::endl; testAdditional("txt"); #ifdef __ZLIB_AVAILABLE__ testAdditional("gz"); #endif } const int IFILE_Test::TEST_FILE_SIZE = 37; const int IFILE_Test::BGZF_TEST_FILE_SIZE = 93; const std::string IFILE_Test::TEST_FILE_CONTENTS = "ABCDabcd1234\nEFGefg567\nhijklHIJKL8910"; void IFILE_Test::test() { std::cout << "\nUncompressedFileType Tests:" << std::endl; testAll("txt"); #ifdef __ZLIB_AVAILABLE__ std::cout << "\nGzipFileType Tests:" << std::endl; testAll("gz"); std::cout << "\nBgzfFileType Tests:" << std::endl; testAll("bam"); std::cout << "\n.glf file Tests:" << std::endl; testAll("glf"); #endif } void IFILE_Test::testAll(const char* extension) { test_readFromFile(extension); test_readTilChar(extension); test_ifeof_ifrewind(extension); test_ifread_ifgetc(extension); test_ifclose(extension); test_ifseek(extension); test_noExistRead(extension); } void IFILE_Test::test_readFromFile(const char* extension) { // First open the test file. openFile(extension); // Verify the file successfully opened. assert(myFileTypePtr != NULL); assert(isOpen()); assert(myFileTypePtr->isOpen()); // Track how many bytes are read by each call. int numBytesRead = 0; // Track the total number of the bytes that have been read from the file // at any given point. int totalBytesPreviouslyRead = 0; // Test readFromFile. 
numBytesRead = readFromFile(myTestBuffer, 4); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[0]); assert(myTestBuffer[1] == TEST_FILE_CONTENTS[1]); assert(myTestBuffer[2] == TEST_FILE_CONTENTS[2]); assert(myTestBuffer[3] == TEST_FILE_CONTENTS[3]); assert(numBytesRead == 4); totalBytesPreviouslyRead += numBytesRead; // This read should not have affected the internal buffer. assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); // Should not be at eof assert(myFileTypePtr->eof() == false); assert(ifeof() == false); // Read again to verify that the next characters could be read. numBytesRead = readFromFile(myTestBuffer, 2); // Read 2 more characters from the test file. assert(myTestBuffer[0] == TEST_FILE_CONTENTS[4]); assert(myTestBuffer[1] == TEST_FILE_CONTENTS[5]); assert(myTestBuffer[2] == TEST_FILE_CONTENTS[2]); assert(myTestBuffer[3] == TEST_FILE_CONTENTS[3]); assert(numBytesRead == 2); totalBytesPreviouslyRead += numBytesRead; // This read should not have affected the internal buffer. assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); // Should not be at eof assert(myFileTypePtr->eof() == false); assert(ifeof() == false); // Read the rest of the file. // Determine expected results for reading the rest of the file by // taking the substring starting after what had been previously read. numBytesRead = readFromFile(myTestBuffer, MAX_TEST_BUFFER_SIZE); // Read the rest of the file, so the number of bytes read is // what was left in the file. assert(numBytesRead == (TEST_FILE_SIZE - totalBytesPreviouslyRead)); assert(numBytesRead != MAX_TEST_BUFFER_SIZE); for(int i = 0; i < numBytesRead; i++) { assert(myTestBuffer[i] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead+i]); } totalBytesPreviouslyRead += numBytesRead; assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); // Try to read one more time, making sure it doesn't read anything. 
numBytesRead = readFromFile(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should be at eof assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); ifclose(); std::cout << " Passed test_readFromFile" << std::endl; } void IFILE_Test::test_readTilChar(const char* extension) { // First open the test file. openFile(extension); // Verify the file successfully opened. assert(myFileTypePtr != NULL); assert(isOpen()); assert(myFileTypePtr->isOpen()); // Track position of ending char found. int pos = 0; // Test readTilChar. std::string output = ""; std::string endChars = "a5d"; pos = readTilChar(endChars, output); assert(pos == 0); // read til a assert(output == "ABCD"); output.clear(); pos = readTilChar(endChars, output); assert(pos == 2); // read til d assert(output == "bc"); pos = readTilChar(endChars, output); assert(pos == 1); // read til 5 assert(output == "bc1234\nEFGefg"); output.clear(); pos = readTilChar(endChars, output); assert(pos == -1); // read til 5 assert(output == "67\nhijklHIJKL8910"); ifrewind(); // Test readTilChar. pos = readTilChar(endChars); assert(pos == 0); // read til a pos = readTilChar(endChars); assert(pos == 2); // read til d pos = readTilChar(endChars); assert(pos == 1); // read til 5 pos = readTilChar(endChars); assert(pos == -1); // read til 5 ifclose(); std::cout << " Passed test_readTilChar" << std::endl; } void IFILE_Test::test_ifeof_ifrewind(const char* extension) { // First open the test file. openFile(extension); // Verify the file successfully opened. assert(myFileTypePtr != NULL); assert(isOpen()); assert(myFileTypePtr->isOpen()); // Not at eof - verify that it reports not eof. assert(ifeof() == false); // Track the total number of the bytes that have been read from the file // at any given point. int totalBytesPreviouslyRead = 0; int numBytesRead = 0; ////////////////////////////////////////////////////////////// // Test doing reads from file without IFILE internal buffering. 
disableBuffering(); // Verify position in file. assert(iftell() == 0); // Read a character from the file. numBytesRead = readFromFile(myTestBuffer, 1); assert(numBytesRead == 1); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); // Now that we have tested based on the previous total bytes read, // increment the count. totalBytesPreviouslyRead += numBytesRead; // Not at eof assert(ifeof() == false); // Perform char read. char readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); // Now that we have tested based on the previous total bytes read, // increment the count. ++totalBytesPreviouslyRead; // Not at eof assert(ifeof() == false); assert(iftell() == totalBytesPreviouslyRead); // Now read the rest. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == (TEST_FILE_SIZE - totalBytesPreviouslyRead)); // Hit the end of the file before reading the entire requested size. assert(numBytesRead != MAX_TEST_BUFFER_SIZE); // Now that we have tested based on the previous total bytes read, // increment the count. totalBytesPreviouslyRead += numBytesRead; assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); numBytesRead = readFromFile(myTestBuffer, 1); assert(numBytesRead == 0); // Now it registers eof assert(ifeof() != 0); // bgzf files use a specialized return value for iftell that // is not just straight file offset. if((strcmp(extension, "bam") == 0) || (strcmp(extension, "glf") == 0)) { assert(iftell() == (BGZF_TEST_FILE_SIZE << 16)); } else { assert(iftell() == TEST_FILE_SIZE); } /////////////////////////////////// // Test doing IFILE buffered reads. // rewind the file and verify that it no longer registers eof. ifrewind(); totalBytesPreviouslyRead = 0; // No longer at eof assert(ifeof() == false); // Verify position in file. assert(iftell() == 0); // Buffer reads - may have been disabled for iftell to work for bgzf. bufferReads(); // Read a character from the file. 
numBytesRead = readFromFile(myTestBuffer, 1); assert(numBytesRead == 1); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); // Now that we have tested based on the previous total bytes read, // increment the count. totalBytesPreviouslyRead += numBytesRead; // Not at eof assert(ifeof() == false); // Perform char read. readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); // Now that we have tested based on the previous total bytes read, // increment the count. ++totalBytesPreviouslyRead; // Not at eof assert(ifeof() == false); // bgzf files use a specialized return value for iftell that // is not just straight file offset. if((strcmp(extension, "bam") == 0) || (strcmp(extension, "glf") == 0)) { bool caught = false; try { assert(iftell() == totalBytesPreviouslyRead); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "IFILE: CANNOT use buffered reads and tell for BGZF files") == 0); } assert(caught); } else { assert(iftell() == totalBytesPreviouslyRead); } // Now read the rest. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == (TEST_FILE_SIZE - totalBytesPreviouslyRead)); // Now that we have tested based on the previous total bytes read, // increment the count. totalBytesPreviouslyRead += numBytesRead; // Registers eof. assert(ifeof() != 0); // Read past eof. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Eof. assert(ifeof() != 0); // bgzf files use a specialized return value for iftell that // is not just straight file offset. 
if((strcmp(extension, "bam") == 0) || (strcmp(extension, "glf") == 0)) { bool caught = false; try { assert(iftell() == (BGZF_TEST_FILE_SIZE << 16)); } catch (std::exception& e) { caught = true; assert(strcmp(e.what(), "IFILE: CANNOT use buffered reads and tell for BGZF files") == 0); } assert(caught); disableBuffering(); assert(iftell() == (BGZF_TEST_FILE_SIZE << 16)); } else { assert(iftell() == TEST_FILE_SIZE); } // Verify that after rewind, eof is no longer registered. ifrewind(); // reset since we are back to the beginning of the file. totalBytesPreviouslyRead = 0; // No longer at eof assert(ifeof() == false); // Verify position in file. assert(iftell() == 0); // Verify properly works even if already at the beginning. ifrewind(); // reset since we are back to the beginning of the file. totalBytesPreviouslyRead = 0; // Not eof assert(ifeof() == false); // Verify position in file. assert(iftell() == 0); // Buffer reads - may have been disabled for iftell to work for bgzf. bufferReads(); ////////////////////// // Close the test file. ifclose(); std::cout << " Passed test_ifeof_ifrewind" << std::endl; } void IFILE_Test::test_ifread_ifgetc(const char* extension) { // First open the test file. openFile(extension); // Verify the file successfully opened. assert(myFileTypePtr != NULL); assert(isOpen()); assert(myFileTypePtr->isOpen()); int numBytesRead = 0; int totalBytesPreviouslyRead = 0; //////////////////////////////////// // Test reading entire file at once. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == TEST_FILE_SIZE); for(int i = 0; i < TEST_FILE_SIZE; i++) { assert(myTestBuffer[i] == TEST_FILE_CONTENTS[i]); } totalBytesPreviouslyRead += numBytesRead; // Should affect the IFILE buffer assert(myCurrentBufferSize == TEST_FILE_SIZE); assert(myBufferIndex == TEST_FILE_SIZE); assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); // Try reading at end of file twice. 
numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; ////////////////////////////////////////// // Test reading entire file using getc. // Loop through reading the file. char readChar; for(int index = 0; index < TEST_FILE_SIZE; index++) { // Read a character. readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[index]); // Should affect the IFILE buffer assert(myCurrentBufferSize == TEST_FILE_SIZE); assert(myBufferIndex == index+1); } // Now that we have read the file, try reading again at eof. readChar = ifgetc(); assert(readChar == EOF); assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); // Try again at eof. // Now that we have read the file, try reading again at eof. readChar = ifgetc(); assert(readChar == EOF); assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; //////////////////////////////////////////////// // Test reading just the beginning of the file. numBytesRead = ifread(myTestBuffer, 4); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[0]); assert(myTestBuffer[1] == TEST_FILE_CONTENTS[1]); assert(myTestBuffer[2] == TEST_FILE_CONTENTS[2]); assert(myTestBuffer[3] == TEST_FILE_CONTENTS[3]); assert(numBytesRead == 4); totalBytesPreviouslyRead += numBytesRead; // This read should have affected the internal buffer. assert(myCurrentBufferSize == TEST_FILE_SIZE); assert(myBufferIndex == 4); // Should not be at eof assert(ifeof() == false); // Test reading rest of file. 
numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == (TEST_FILE_SIZE - (int)totalBytesPreviouslyRead)); // Verify contents of what read. for(int i = 0; i < numBytesRead; i++) { assert(myTestBuffer[i] == TEST_FILE_CONTENTS[i + totalBytesPreviouslyRead]); } totalBytesPreviouslyRead += numBytesRead; // Try at end of file twice. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; ////////////////////////////////////// // Test reading just the beginning. numBytesRead = ifread(myTestBuffer, 4); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[0]); assert(myTestBuffer[1] == TEST_FILE_CONTENTS[1]); assert(myTestBuffer[2] == TEST_FILE_CONTENTS[2]); assert(myTestBuffer[3] == TEST_FILE_CONTENTS[3]); assert(numBytesRead == 4); totalBytesPreviouslyRead += numBytesRead; // This read should have affected the internal buffer. assert(myCurrentBufferSize == TEST_FILE_SIZE); assert(myBufferIndex == 4); // Should not be at eof assert(ifeof() == false); // Test doing 2 getc. readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); int bufferSize = TEST_FILE_SIZE; assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 5); totalBytesPreviouslyRead++; readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 6); totalBytesPreviouslyRead++; // Test reading rest of file. 
numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == (TEST_FILE_SIZE - (int)totalBytesPreviouslyRead)); // Verify contents of what read. for(int i = 0; i < numBytesRead; i++) { assert(myTestBuffer[i] == TEST_FILE_CONTENTS[i + totalBytesPreviouslyRead]); } totalBytesPreviouslyRead += numBytesRead; // Try at end of file twice. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); ////////////////////////////////// // Start with 2 getc. readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); bufferSize = TEST_FILE_SIZE; assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 1); totalBytesPreviouslyRead++; readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 2); totalBytesPreviouslyRead++; // Test reading part of the rest of the file. numBytesRead = ifread(myTestBuffer, 4); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); assert(myTestBuffer[1] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead + 1]); assert(myTestBuffer[2] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead + 2]); assert(myTestBuffer[3] == TEST_FILE_CONTENTS[totalBytesPreviouslyRead + 3]); assert(numBytesRead == 4); totalBytesPreviouslyRead += numBytesRead; // This read should have affected the internal buffer. 
assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == totalBytesPreviouslyRead); // Should not be at eof assert(ifeof() == false); // Test reading 2 char with getc. readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); assert(myCurrentBufferSize == bufferSize); totalBytesPreviouslyRead++; assert(myBufferIndex == totalBytesPreviouslyRead); readChar = ifgetc(); assert(readChar == TEST_FILE_CONTENTS[totalBytesPreviouslyRead]); assert(myCurrentBufferSize == bufferSize); totalBytesPreviouslyRead++; assert(myBufferIndex == totalBytesPreviouslyRead); // Test reading rest of file. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == (TEST_FILE_SIZE - (int)totalBytesPreviouslyRead)); // Verify contents of what read. for(int i = 0; i < numBytesRead; i++) { assert(myTestBuffer[i] == TEST_FILE_CONTENTS[i + totalBytesPreviouslyRead]); } totalBytesPreviouslyRead += numBytesRead; assert(myBufferIndex == 0); assert(myCurrentBufferSize == 0); // Try at end of file twice. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(myTestBuffer, MAX_TEST_BUFFER_SIZE); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); ////////////// // Close the test file. ifclose(); //////////////////////////////////////////////////////////////////////// // Repeat the test on a test file that is larger than the IFILE // buffer size. // First open the test file. openLargeFile(extension); // This file contains DEFAULT_BUFFER_SIZE of '0's followed by "12345" // The size of the file is DEFAULT_BUFFER_SIZE + 5. 
int largeTestFileSize = DEFAULT_BUFFER_SIZE + 5; char largeBuffer[largeTestFileSize + 5]; // Verify the file successfully opened. assert(myFileTypePtr != NULL); assert(isOpen()); assert(myFileTypePtr->isOpen()); numBytesRead = 0; totalBytesPreviouslyRead = 0; //////////////////////////////////// // Test reading part of the file, then more then the buffer size, // then the rest of the file (test buffer handling when read // available and directly into the file, then read more). numBytesRead = ifread(largeBuffer, 2); assert(numBytesRead == 2); numBytesRead = ifread(largeBuffer + 2, DEFAULT_BUFFER_SIZE * 3); assert(numBytesRead == DEFAULT_BUFFER_SIZE + 3); // Should be at the end of the file. assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); numBytesRead = ifread(largeBuffer + DEFAULT_BUFFER_SIZE + 3, 2); assert(numBytesRead == 0); // Validate all the 0s for(unsigned int i = 0; i < DEFAULT_BUFFER_SIZE; i++) { assert(largeBuffer[i] == '0'); } // Now validate the "12345" assert(largeBuffer[DEFAULT_BUFFER_SIZE] == '1'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+1] == '2'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+2] == '3'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+3] == '4'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+4] == '5'); totalBytesPreviouslyRead += numBytesRead; // Should affect the IFILE buffer - 0 because read // is bigger than the buffer, so just read directly // into the largeBuffer. assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); // Try reading at end of file twice. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. 
numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; //////////////////////////////////// // Test reading entire file at once. numBytesRead = ifread(largeBuffer, largeTestFileSize + 4); assert(numBytesRead == largeTestFileSize); // Validate all the 0s for(unsigned int i = 0; i < DEFAULT_BUFFER_SIZE; i++) { assert(largeBuffer[i] == '0'); } // Now validate the "12345" assert(largeBuffer[DEFAULT_BUFFER_SIZE] == '1'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+1] == '2'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+2] == '3'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+3] == '4'); assert(largeBuffer[DEFAULT_BUFFER_SIZE+4] == '5'); totalBytesPreviouslyRead += numBytesRead; // Should affect the IFILE buffer - 0 because read // is bigger than the buffer, so just read directly // into the largeBuffer. assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(myFileTypePtr->eof() != 0); assert(ifeof() != 0); // Try reading at end of file twice. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; ////////////////////////////////////////// // Test reading entire file using getc. // Loop through reading the file. // First loop through verifying the 0's for(int index = 0; index < (int)DEFAULT_BUFFER_SIZE; index++) { // Read a character. 
readChar = ifgetc(); assert(readChar == '0'); // Should affect the IFILE buffer assert(myCurrentBufferSize == (int)DEFAULT_BUFFER_SIZE); assert(myBufferIndex == index+1); } // Now read the 12345. readChar = ifgetc(); assert(readChar == '1'); // Should affect the IFILE buffer assert(myCurrentBufferSize == 5); assert(myBufferIndex == 1); readChar = ifgetc(); assert(readChar == '2'); // Should affect the IFILE buffer assert(myCurrentBufferSize == 5); assert(myBufferIndex == 2); readChar = ifgetc(); assert(readChar == '3'); // Should affect the IFILE buffer assert(myCurrentBufferSize == 5); assert(myBufferIndex == 3); readChar = ifgetc(); assert(readChar == '4'); // Should affect the IFILE buffer assert(myCurrentBufferSize == 5); assert(myBufferIndex == 4); readChar = ifgetc(); assert(readChar == '5'); // Should affect the IFILE buffer assert(myCurrentBufferSize == 5); assert(myBufferIndex == 5); // Now that we have read the file, try reading again at eof. readChar = ifgetc(); assert(readChar == EOF); assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); // Try again at eof. // Now that we have read the file, try reading again at eof. readChar = ifgetc(); assert(readChar == EOF); assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; //////////////////////////////////////////////// // Test reading just the beginning of the file. numBytesRead = ifread(largeBuffer, 4); assert(largeBuffer[0] == '0'); assert(largeBuffer[1] == '0'); assert(largeBuffer[2] == '0'); assert(largeBuffer[3] == '0'); assert(numBytesRead == 4); totalBytesPreviouslyRead += numBytesRead; // This read should have affected the internal buffer. assert(myCurrentBufferSize == (int)DEFAULT_BUFFER_SIZE); assert(myBufferIndex == 4); // Should not be at eof assert(ifeof() == false); // Test reading rest of file. 
numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == (largeTestFileSize - (int)totalBytesPreviouslyRead)); // Verify contents of what read. First check the 0's for(int i = 0; i < (numBytesRead-5); i++) { assert(largeBuffer[i] == '0'); } // Check the 12345 assert(largeBuffer[numBytesRead - 5] == '1'); assert(largeBuffer[numBytesRead - 5 + 1] == '2'); assert(largeBuffer[numBytesRead - 5 + 2] == '3'); assert(largeBuffer[numBytesRead - 5 + 3] == '4'); assert(largeBuffer[numBytesRead - 5 + 4] == '5'); totalBytesPreviouslyRead += numBytesRead; // Try at end of file twice. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Trying to read at the end cleared the buffer.. assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Should affect the IFILE buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; ////////////////////////////////////// // Test reading just the beginning. numBytesRead = ifread(largeBuffer, 2); assert(largeBuffer[0] == '0'); assert(largeBuffer[1] == '0'); assert(numBytesRead == 2); totalBytesPreviouslyRead += numBytesRead; // This read should have affected the internal buffer. assert(myCurrentBufferSize == (int)DEFAULT_BUFFER_SIZE); assert(myBufferIndex == 2); // Should not be at eof assert(ifeof() == false); // Test doing 2 getc. readChar = ifgetc(); assert(readChar == '0'); bufferSize = DEFAULT_BUFFER_SIZE; assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 3); totalBytesPreviouslyRead++; readChar = ifgetc(); assert(readChar == '0'); assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 4); totalBytesPreviouslyRead++; // Test reading rest of file. 
numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == (largeTestFileSize - (int)totalBytesPreviouslyRead)); // Verify contents of what read. // All except the last 5 should be '0' for(int i = 0; i < numBytesRead - 5; i++) { assert(largeBuffer[i] == '0'); } assert(largeBuffer[numBytesRead - 5] == '1'); assert(largeBuffer[numBytesRead - 4] == '2'); assert(largeBuffer[numBytesRead - 3] == '3'); assert(largeBuffer[numBytesRead - 2] == '4'); assert(largeBuffer[numBytesRead - 1] == '5'); totalBytesPreviouslyRead += numBytesRead; // Try at end of file twice. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Reading at the end clears the buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Reading at the end clears the buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); ////////////////////////////////// // Start with 2 getc. readChar = ifgetc(); assert(readChar == '0'); bufferSize = DEFAULT_BUFFER_SIZE; assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 1); totalBytesPreviouslyRead++; readChar = ifgetc(); assert(readChar == '0'); assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == 2); totalBytesPreviouslyRead++; // Test reading part of the rest of the file. numBytesRead = ifread(myTestBuffer, 2); assert(myTestBuffer[0] == '0'); assert(myTestBuffer[1] == '0'); assert(numBytesRead == 2); totalBytesPreviouslyRead += numBytesRead; // This read should have affected the internal buffer. assert(myCurrentBufferSize == bufferSize); assert(myBufferIndex == totalBytesPreviouslyRead); // Should not be at eof assert(ifeof() == false); // Test reading 2 char with getc. 
readChar = ifgetc(); assert(readChar == '0'); assert(myCurrentBufferSize == bufferSize); totalBytesPreviouslyRead++; assert(myBufferIndex == totalBytesPreviouslyRead); readChar = ifgetc(); assert(readChar == '0'); assert(myCurrentBufferSize == bufferSize); totalBytesPreviouslyRead++; assert(myBufferIndex == totalBytesPreviouslyRead); // Test reading rest of file. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == (largeTestFileSize - (int)totalBytesPreviouslyRead)); // Verify contents of what read. for(int i = 0; i < numBytesRead - 5; i++) { assert(largeBuffer[i] == '0'); } // Verify the 12345 assert(largeBuffer[numBytesRead - 5] == '1'); assert(largeBuffer[numBytesRead - 5 + 1] == '2'); assert(largeBuffer[numBytesRead - 5 + 2] == '3'); assert(largeBuffer[numBytesRead - 5 + 3] == '4'); assert(largeBuffer[numBytesRead - 5 + 4] == '5'); totalBytesPreviouslyRead += numBytesRead; bufferSize = 5; assert(myBufferIndex == bufferSize); assert(myCurrentBufferSize == bufferSize); // Try at end of file twice. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Reading at the end clears the buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // 2nd read attempt at eof. numBytesRead = ifread(largeBuffer, largeTestFileSize); assert(numBytesRead == 0); // Reading at the end clears the buffer assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); assert(ifeof() != 0); // RESET ifrewind(); totalBytesPreviouslyRead = 0; assert(myCurrentBufferSize == 0); assert(myBufferIndex == 0); ifclose(); std::cout << " Passed test_ifread_ifgetc" << std::endl; } // Test closing a file. void IFILE_Test::test_ifclose(const char* extension) { // First open the test file. openFile(extension); // Verify the file successfully opened. 
assert(myFileTypePtr != NULL); assert(isOpen()); assert(myFileTypePtr->isOpen()); ifclose(); assert(myFileTypePtr == NULL); assert(isOpen() == false); std::cout << " Passed test_ifclose" << std::endl; } void IFILE_Test::test_ifseek(const char* extension) { disableBuffering(); // First open the test file. openFile(extension); // Read a character from the file. int numBytesRead = readFromFile(myTestBuffer, 1); assert(numBytesRead == 1); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[0]); // Get the current position. long int currentPos = iftell(); // Read the next character from the file. numBytesRead = readFromFile(myTestBuffer, 1); assert(numBytesRead == 1); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[1]); // Seek to just before the character that was just read and read again // Should be the same character. assert(ifseek(currentPos, SEEK_SET) == true); numBytesRead = readFromFile(myTestBuffer, 1); assert(numBytesRead == 1); assert(myTestBuffer[0] == TEST_FILE_CONTENTS[1]); ifclose(); assert(myFileTypePtr == NULL); assert(isOpen() == false); // Buffer reads - may have been disabled for iftell to work for bgzf. bufferReads(); std::cout << " Passed test_ifseek" << std::endl; } void IFILE_Test::test_noExistRead(const char* extension) { openNoExistFile(extension); } // Open a file for testing. void IFILE_Test::openFile(const char* extension) { std::string filename = "data/InputFileTest."; filename += extension; assert(InputFile::openFile(filename.c_str(), "rb", InputFile::DEFAULT) == true); } // Open a file for testing. 
void IFILE_Test::openLargeFile(const char* extension) { std::string filename = "data/InputFileTestLarge."; filename += extension; assert(InputFile::openFile(filename.data(), "rb", InputFile::DEFAULT) == true); } void IFILE_Test::openNoExistFile(const char* extension) { std::string filename = "data/noExist."; filename += extension; assert(InputFile::openFile(filename.data(), "rb", InputFile::DEFAULT) == false); } void testWrite() { std::string filenameNoExt = "results/InputFileTest."; std::string filename = filenameNoExt + "glf"; IFILE filePtr = ifopen(filename.c_str(), "wt"); assert(filePtr != NULL); assert(ifwrite(filePtr, IFILE_Test::TEST_FILE_CONTENTS.c_str(), IFILE_Test::TEST_FILE_CONTENTS.length()) == IFILE_Test::TEST_FILE_CONTENTS.length()); assert(ifclose(filePtr) == 0); filename = "results/uncompressedFile.glf"; filePtr = ifopen(filename.c_str(), "wt", InputFile::UNCOMPRESSED); assert(filePtr != NULL); assert(ifwrite(filePtr, IFILE_Test::TEST_FILE_CONTENTS.c_str(), IFILE_Test::TEST_FILE_CONTENTS.length()) == IFILE_Test::TEST_FILE_CONTENTS.length()); assert(ifclose(filePtr) == 0); filename = "results/bgzfFile.glf"; filePtr = ifopen(filename.c_str(), "wt", InputFile::BGZF); assert(filePtr != NULL); assert(ifwrite(filePtr, IFILE_Test::TEST_FILE_CONTENTS.c_str(), IFILE_Test::TEST_FILE_CONTENTS.length()) == IFILE_Test::TEST_FILE_CONTENTS.length()); assert(ifclose(filePtr) == 0); filename = "results/gzipFile.glf"; filePtr = ifopen(filename.c_str(), "wt", InputFile::GZIP); assert(filePtr != NULL); assert(ifwrite(filePtr, IFILE_Test::TEST_FILE_CONTENTS.c_str(), IFILE_Test::TEST_FILE_CONTENTS.length()) ==IFILE_Test:: TEST_FILE_CONTENTS.length()); assert(ifclose(filePtr) == 0); filename = "results/defaultFile.glf"; filePtr = ifopen(filename.c_str(), "wt"); assert(filePtr != NULL); assert(ifwrite(filePtr, IFILE_Test::TEST_FILE_CONTENTS.c_str(), IFILE_Test::TEST_FILE_CONTENTS.length()) == IFILE_Test::TEST_FILE_CONTENTS.length()); assert(ifclose(filePtr) == 0); filename 
= "results/defaultFile.gz"; filePtr = ifopen(filename.c_str(), "wt"); assert(filePtr != NULL); assert(ifwrite(filePtr, IFILE_Test::TEST_FILE_CONTENTS.c_str(), IFILE_Test::TEST_FILE_CONTENTS.length()) == IFILE_Test::TEST_FILE_CONTENTS.length()); assert(ifclose(filePtr) == 0); filename = "results/textFile.gz"; unsigned int myuint = 99; int myint = -99; char mychar = 'z'; filePtr = ifopen(filename.c_str(), "wt"); (*filePtr) << "Hello\n"; (*filePtr) << "Hello." << 3 << ' ' << -2 << "How are you"; (*filePtr) << "?" << "\n"; std::string mytext = "Bye\n"; (*filePtr) << mytext; (*filePtr) << 3.125 << mychar; (*filePtr) << myuint; (*filePtr) << mychar; (*filePtr) << myint; String myString = "Good Bye!\n"; (*filePtr) << myString; assert(ifclose(filePtr) == 0); filename = "results/textFile1.gz"; InputFile& fileRef = *(ifopen(filename.c_str(), "wt")); fileRef << "Hello\n"; fileRef << "Hello." << 3 << ' ' << -2 << "How are you"; fileRef << "?" << "\n"; fileRef << mytext; fileRef << 3.125 << mychar; fileRef << myuint; fileRef << mychar; fileRef << myint; fileRef << myString; InputFile* fileRefPtr = &fileRef; assert(ifclose(fileRefPtr) == 0); assert(fileRefPtr == NULL); // TODO - automatically verify that the files were written in the // correct format - rather than hand checking. } void testAdditional(const char* extension) { std::string fileName = "data/InputFileTest2."; fileName += extension; IFILE testFile = ifopen(fileName.c_str(), "r"); assert(testFile != NULL); std::string buffer = "989"; std::string stopChars = "C5F2"; // Test readTilChar that stores the string. assert(testFile->readTilChar(stopChars, buffer) == 0); assert(buffer == "989AB"); buffer.clear(); assert(testFile->readTilChar(stopChars, buffer) == 2); assert(buffer == "DE"); assert(testFile->readTilChar(stopChars, buffer) == 3); assert(buffer == "DEG\tabcdefg\n1"); // Test readTilChar that discards the string. 
assert(testFile->readTilChar(stopChars) == 1); buffer.clear(); buffer = "t"; assert(testFile->readTilTab(buffer) == 1); assert(buffer == "t6"); assert(testFile->readTilTab(buffer) == 0); assert(buffer == "t6hijklm"); assert(testFile->readTilTab(buffer) == 0); assert(buffer == "t6hijklm1"); assert(testFile->readTilTab(buffer) == 1); assert(buffer == "t6hijklm1NOP"); assert(testFile->readLine(buffer) == 0); assert(buffer == "t6hijklm1NOPQRST\tUVW"); assert(testFile->readTilTab(buffer) == 0); assert(buffer == "t6hijklm1NOPQRST\tUVW"); buffer.clear(); assert(testFile->discardLine() == 0); assert(testFile->readLine(buffer) == -1); assert(buffer == "@#$"); assert(testFile->discardLine() == -1); assert(testFile->readTilTab(buffer) == -1); assert(testFile->readTilChar(stopChars, buffer) == -1); assert(testFile->readTilChar(stopChars) == -1); assert(buffer == "@#$"); ifclose(testFile); } libStatGen-1.0.14/general/test/inputFileTest/InputFileTest.h000066400000000000000000000034401254730101300237310ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/
// NOTE(review): the header name of the first include was lost when this text
// was extracted - presumably <string>, since std::string is used below; confirm.
#include
#include "InputFile.h"

// Test fixture for InputFile. It derives from InputFile so that the tests
// can reach the inherited members directly.
class IFILE_Test : public InputFile
{
public:
    // Entry point of the fixture.
    void test();

    // Expected test-file sizes and contents; the values are defined outside
    // this declaration.
    static const int TEST_FILE_SIZE;
    static const int BGZF_TEST_FILE_SIZE;
    static const std::string TEST_FILE_CONTENTS;

private:
    // Each test takes the extension of the data file to run against.
    void testAll(const char* extension);
    void test_readFromFile(const char* extension);
    void test_readTilChar(const char* extension);

    // Tested together because they are used to test each other.
    void test_ifeof_ifrewind(const char* extension);

    // Tested together to verify that each can be called successfully after
    // the other has been called.
    void test_ifread_ifgetc(const char* extension);

    void test_ifclose(const char* extension);

    void test_ifseek(const char* extension);

    void test_noExistRead(const char *extension);

    // Helpers that open a data file for the given extension.
    void openFile(const char* extension);
    void openLargeFile(const char* extension);
    void openNoExistFile(const char* extension);

    // Buffer used for reading into.
    static const int MAX_TEST_BUFFER_SIZE = 100;
    char myTestBuffer[MAX_TEST_BUFFER_SIZE];
};
libStatGen-1.0.14/general/test/inputFileTest/Makefile000066400000000000000000000023711254730101300224630ustar00rootroot00000000000000EXE = inputFileTest TOOLBASE = InputFileTest ifeq ($(ZLIB_AVAIL), 0) TEST_COMMAND= mkdir -p results && \ ./inputFileTest > results/results.log && \ diff data/InputFileTest.txt results/bgzfFile.glf && \ diff data/InputFileTest.txt results/defaultFile.glf && \ diff data/InputFileTest.txt results/defaultFile.gz && \ diff data/InputFileTest.txt results/gzipFile.glf && \ diff data/InputFileTest.txt results/InputFileTest.glf && \ diff data/InputFileTest.txt results/uncompressedFile.glf && \ diff data/textFile.txt results/textFile.gz && \ diff data/textFile.txt results/textFile1.gz && \ diff results/results.log expected/resultsNoZlib.log else TEST_COMMAND= mkdir -p results && \ ./inputFileTest > results/results.log && \ diff data/InputFileTest.bam results/bgzfFile.glf && \ diff data/InputFileTest.txt results/defaultFile.glf && \ diff data/InputFileTest.gz 
results/defaultFile.gz && \ diff data/InputFileTest.gz results/gzipFile.glf && \ diff data/InputFileTest.txt results/InputFileTest.glf && \ diff data/InputFileTest.txt results/uncompressedFile.glf && \ diff data/textFile.gz results/textFile.gz && \ diff data/textFile.gz results/textFile1.gz && \ diff results/results.log expected/results.log endif include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/general/test/inputFileTest/data/000077500000000000000000000000001254730101300217315ustar00rootroot00000000000000libStatGen-1.0.14/general/test/inputFileTest/data/InputFileTest.bam000066400000000000000000000001351254730101300251500ustar00rootroot00000000000000BC@strvILJN1426rusOMK75344<%BClibStatGen-1.0.14/general/test/inputFileTest/data/InputFileTest.glf000066400000000000000000000001351254730101300251610ustar00rootroot00000000000000BC@strvILJN1426rusOMK75344<%BClibStatGen-1.0.14/general/test/inputFileTest/data/InputFileTest.gz000066400000000000000000000000711254730101300250300ustar00rootroot00000000000000strvILJN1426rusOMK75344<%libStatGen-1.0.14/general/test/inputFileTest/data/InputFileTest.txt000066400000000000000000000000451254730101300252300ustar00rootroot00000000000000ABCDabcd1234 EFGefg567 hijklHIJKL8910libStatGen-1.0.14/general/test/inputFileTest/data/InputFileTest2.gz000066400000000000000000000001401254730101300251070ustar00rootroot00000000000000&CIPInputFileTest2.txtstrvqusLLJNIMK2426152  ⊈4rPVQLm9libStatGen-1.0.14/general/test/inputFileTest/data/InputFileTest2.txt000066400000000000000000000000711254730101300253110ustar00rootroot00000000000000ABCDEFG abcdefg 123456 hijklm 1 NOP QRST UVW XYZ 789 @#$libStatGen-1.0.14/general/test/inputFileTest/data/InputFileTestLarge.bam000066400000000000000000000002461254730101300261260ustar00rootroot00000000000000BCh  j QIhBC 34261:BClibStatGen-1.0.14/general/test/inputFileTest/data/InputFileTestLarge.glf000066400000000000000000000002461254730101300261370ustar00rootroot00000000000000BCh  j QIhBC 
34261:BClibStatGen-1.0.14/general/test/inputFileTest/data/InputFileTestLarge.gz000066400000000000000000000001751254730101300260100ustar00rootroot00000000000000] RInputFileTestLarge.txtJf z*4libStatGen-1.0.14/general/test/inputFileTest/data/InputFileTestLarge..0.14/general/test/inputFileTest/data/textFile.gz000066400000000000000000000001071254730101300240550ustar00rootroot00000000000000Hz F E \N\zFUU) @AE.zI7libStatGen-1.0.14/general/test/inputFileTest/data/textFile.txt000066400000000000000000000000671254730101300242610ustar00rootroot00000000000000Hello Hello.3 -2How are you? Bye 3.125z99z-99Good Bye! libStatGen-1.0.14/general/test/inputFileTest/expected/000077500000000000000000000000001254730101300226215ustar00rootroot00000000000000libStatGen-1.0.14/general/test/inputFileTest/expected/results.log000066400000000000000000000013211254730101300250220ustar00rootroot00000000000000 UncompressedFileType Tests: Passed test_readFromFile Passed test_readTilChar Passed test_ifeof_ifrewind Passed test_ifread_ifgetc Passed test_ifclose Passed test_ifseek GzipFileType Tests: Passed test_readFromFile Passed test_readTilChar Passed test_ifeof_ifrewind Passed test_ifread_ifgetc Passed test_ifclose Passed test_ifseek BgzfFileType Tests: Passed test_readFromFile Passed test_readTilChar Passed test_ifeof_ifrewind Passed test_ifread_ifgetc Passed test_ifclose Passed test_ifseek .glf file Tests: Passed test_readFromFile Passed test_readTilChar Passed test_ifeof_ifrewind Passed test_ifread_ifgetc Passed test_ifclose Passed test_ifseek Additional Tests: libStatGen-1.0.14/general/test/inputFileTest/expected/resultsNoZlib.log000066400000000000000000000003121254730101300261370ustar00rootroot00000000000000 UncompressedFileType Tests: Passed test_readFromFile Passed test_readTilChar Passed test_ifeof_ifrewind Passed test_ifread_ifgetc Passed test_ifclose Passed test_ifseek Additional Tests: 
libStatGen-1.0.14/general/test/memoryMapArrayTest/000077500000000000000000000000001254730101300220265ustar00rootroot00000000000000libStatGen-1.0.14/general/test/memoryMapArrayTest/.gitignore000066400000000000000000000000231254730101300240110ustar00rootroot00000000000000memoryMapArrayTest libStatGen-1.0.14/general/test/memoryMapArrayTest/Makefile000066400000000000000000000003471254730101300234720ustar00rootroot00000000000000PATH_TO_BASE=../../.. EXE = memoryMapArrayTest TOOLBASE = MemoryMapArrayTest TEST_COMMAND= mkdir -p results; \ ./memoryMapArrayTest include $(PATH_TO_BASE)/Makefiles/Makefile.test obj/MemoryMapArrayTest.o: ../../MemoryMapArray.h libStatGen-1.0.14/general/test/memoryMapArrayTest/MemoryMapArrayTest.cpp000066400000000000000000000310711254730101300263010ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "Generic.h" #include #include "MemoryMapArray.h" #include "MemoryMapArrayTest.h" #include #include #define TEST_FILE_NAME "results/testMemoryMapArray.vector" class MemoryMapArrayTest : public UnitTest { public: MemoryMapArrayTest(const char *title) : UnitTest(title) {;} void testBool(); void test2Bit(); void test4Bit(); void test32Bit(); void test() { testBool(); test2Bit(); test4Bit(); test32Bit(); } }; void MemoryMapArrayTest::testBool(void) { mmapArrayBool_t testVector; // ignore return code here if(unlink(TEST_FILE_NAME) == 0) { // Nothing to do, just deleting previous test file } check(m_failures, ++m_testNum, "Create 1 bit vector file", 0, testVector.create(TEST_FILE_NAME, 11)); testVector.set(0,0); testVector.set(1,1); testVector.set(2,0); testVector.set(3,1); testVector.set(4,1); testVector.set(5,0); testVector.set(6,1); testVector.set(7,0); testVector.set(8,0); testVector.set(9,0); testVector.set(10,1); check(m_failures, ++m_testNum, "Access 1 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 1 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 1 bit element 2", 0U, testVector[2]); check(m_failures, ++m_testNum, "Access 1 bit element 3", 1U, testVector[3]); check(m_failures, ++m_testNum, "Access 1 bit element 4", 1U, testVector[4]); check(m_failures, ++m_testNum, "Access 1 bit element 5", 0U, testVector[5]); check(m_failures, ++m_testNum, "Access 1 bit element 6", 1U, testVector[6]); check(m_failures, ++m_testNum, "Access 1 bit element 7", 0U, testVector[7]); check(m_failures, ++m_testNum, "Access 1 bit element 8", 0U, testVector[8]); check(m_failures, ++m_testNum, "Access 1 bit element 9", 0U, testVector[9]); check(m_failures, ++m_testNum, "Access 1 bit element 10", 1U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Re-open vector file", false, testVector.open(TEST_FILE_NAME)); check(m_failures, 
++m_testNum, "Access 1 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 1 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 1 bit element 2", 0U, testVector[2]); check(m_failures, ++m_testNum, "Access 1 bit element 3", 1U, testVector[3]); check(m_failures, ++m_testNum, "Access 1 bit element 4", 1U, testVector[4]); check(m_failures, ++m_testNum, "Access 1 bit element 5", 0U, testVector[5]); check(m_failures, ++m_testNum, "Access 1 bit element 6", 1U, testVector[6]); check(m_failures, ++m_testNum, "Access 1 bit element 7", 0U, testVector[7]); check(m_failures, ++m_testNum, "Access 1 bit element 8", 0U, testVector[8]); check(m_failures, ++m_testNum, "Access 1 bit element 9", 0U, testVector[9]); check(m_failures, ++m_testNum, "Access 1 bit element 10", 1U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Unlink vector file", 0, unlink(TEST_FILE_NAME)); } void MemoryMapArrayTest::test2Bit(void) { mmapArray2Bit_t testVector; // ignore return code here if(unlink(TEST_FILE_NAME) == 0) { // Nothing to do, just deleting previous test file } check(m_failures, ++m_testNum, "Create 2 bit vector file", 0, testVector.create(TEST_FILE_NAME, 11)); testVector.set(0,0); testVector.set(1,1); testVector.set(2,2); testVector.set(3,3); testVector.set(4,3); testVector.set(5,2); testVector.set(6,1); testVector.set(7,0); testVector.set(8,2); testVector.set(9,1); testVector.set(10,3); check(m_failures, ++m_testNum, "Access 2 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 2 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 2 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 2 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 2 bit element 4", 3U, testVector[4]); check(m_failures, ++m_testNum, "Access 2 bit element 5", 2U, testVector[5]); check(m_failures, 
++m_testNum, "Access 2 bit element 6", 1U, testVector[6]); check(m_failures, ++m_testNum, "Access 2 bit element 7", 0U, testVector[7]); check(m_failures, ++m_testNum, "Access 2 bit element 8", 2U, testVector[8]); check(m_failures, ++m_testNum, "Access 2 bit element 9", 1U, testVector[9]); check(m_failures, ++m_testNum, "Access 2 bit element 10", 3U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Re-open vector file", false, testVector.open(TEST_FILE_NAME)); check(m_failures, ++m_testNum, "Access 2 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 2 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 2 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 2 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 2 bit element 4", 3U, testVector[4]); check(m_failures, ++m_testNum, "Access 2 bit element 5", 2U, testVector[5]); check(m_failures, ++m_testNum, "Access 2 bit element 6", 1U, testVector[6]); check(m_failures, ++m_testNum, "Access 2 bit element 7", 0U, testVector[7]); check(m_failures, ++m_testNum, "Access 2 bit element 8", 2U, testVector[8]); check(m_failures, ++m_testNum, "Access 2 bit element 9", 1U, testVector[9]); check(m_failures, ++m_testNum, "Access 2 bit element 10", 3U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Unlink vector file", 0, unlink(TEST_FILE_NAME)); } void MemoryMapArrayTest::test4Bit(void) { mmapArray4Bit_t testVector; // ignore return code here if(unlink(TEST_FILE_NAME) == 0) { // Nothing to do, just deleting previous test file } check(m_failures, ++m_testNum, "Create 4 bit vector file", 0, testVector.create(TEST_FILE_NAME, 11)); testVector.set(0,0); testVector.set(1,1); testVector.set(2,2); testVector.set(3,3); testVector.set(4,4); testVector.set(5,5); testVector.set(6,6); 
testVector.set(7,7); testVector.set(8,8); testVector.set(9,9); testVector.set(10,10); check(m_failures, ++m_testNum, "Access 4 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 4 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 4 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 4 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 4 bit element 4", 4U, testVector[4]); check(m_failures, ++m_testNum, "Access 4 bit element 5", 5U, testVector[5]); check(m_failures, ++m_testNum, "Access 4 bit element 6", 6U, testVector[6]); check(m_failures, ++m_testNum, "Access 4 bit element 7", 7U, testVector[7]); check(m_failures, ++m_testNum, "Access 4 bit element 8", 8U, testVector[8]); check(m_failures, ++m_testNum, "Access 4 bit element 9", 9U, testVector[9]); check(m_failures, ++m_testNum, "Access 4 bit element 10", 10U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Re-open vector file", false, testVector.open(TEST_FILE_NAME)); check(m_failures, ++m_testNum, "Access 4 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 4 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 4 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 4 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 4 bit element 4", 4U, testVector[4]); check(m_failures, ++m_testNum, "Access 4 bit element 5", 5U, testVector[5]); check(m_failures, ++m_testNum, "Access 4 bit element 6", 6U, testVector[6]); check(m_failures, ++m_testNum, "Access 4 bit element 7", 7U, testVector[7]); check(m_failures, ++m_testNum, "Access 4 bit element 8", 8U, testVector[8]); check(m_failures, ++m_testNum, "Access 4 bit element 9", 9U, testVector[9]); check(m_failures, ++m_testNum, "Access 4 bit element 10", 10U, testVector[10]); check(m_failures, ++m_testNum, "Close 
vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Unlink vector file", 0, unlink(TEST_FILE_NAME)); } void MemoryMapArrayTest::test32Bit(void) { mmapArrayUint32_t testVector; // ignore return code here if(unlink(TEST_FILE_NAME) == 0) { // Nothing to do, just deleting previous test file } check(m_failures, ++m_testNum, "Create 32 bit vector file", 0, testVector.create(TEST_FILE_NAME, 11)); testVector.set(0,0); testVector.set(1,1); testVector.set(2,2); testVector.set(3,3); testVector.set(4,4); testVector.set(5,5); testVector.set(6,6); testVector.set(7,7); testVector.set(8,8); testVector.set(9,9); testVector.set(10,10); check(m_failures, ++m_testNum, "Access 32 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 32 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 32 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 32 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 32 bit element 4", 4U, testVector[4]); check(m_failures, ++m_testNum, "Access 32 bit element 5", 5U, testVector[5]); check(m_failures, ++m_testNum, "Access 32 bit element 6", 6U, testVector[6]); check(m_failures, ++m_testNum, "Access 32 bit element 7", 7U, testVector[7]); check(m_failures, ++m_testNum, "Access 32 bit element 8", 8U, testVector[8]); check(m_failures, ++m_testNum, "Access 32 bit element 9", 9U, testVector[9]); check(m_failures, ++m_testNum, "Access 32 bit element 10", 10U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Re-open vector file", false, testVector.open(TEST_FILE_NAME)); check(m_failures, ++m_testNum, "Access 32 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 32 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 32 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 32 bit element 3", 3U, testVector[3]); 
check(m_failures, ++m_testNum, "Access 32 bit element 4", 4U, testVector[4]); check(m_failures, ++m_testNum, "Access 32 bit element 5", 5U, testVector[5]); check(m_failures, ++m_testNum, "Access 32 bit element 6", 6U, testVector[6]); check(m_failures, ++m_testNum, "Access 32 bit element 7", 7U, testVector[7]); check(m_failures, ++m_testNum, "Access 32 bit element 8", 8U, testVector[8]); check(m_failures, ++m_testNum, "Access 32 bit element 9", 9U, testVector[9]); check(m_failures, ++m_testNum, "Access 32 bit element 10", 10U, testVector[10]); check(m_failures, ++m_testNum, "Close vector file", false, testVector.close()); check(m_failures, ++m_testNum, "Unlink vector file", 0, unlink(TEST_FILE_NAME)); } int main(int argc, char **argv) { MemoryMapArrayTest test("MemoryMapArrayTest"); #if 0 bool showAllCasesFlag = false; int opt; while(( opt = getopt(argc, (char **) argv, "v")) != -1) { switch(opt) { case 'v': showAllCasesFlag = true; break; default: std::cerr << "usage: testSW [-v]" << std::endl; exit(1); } } #endif test.test(); std::cout << test; exit(test.getFailureCount()); } libStatGen-1.0.14/general/test/memoryMapArrayTest/MemoryMapArrayTest.h000066400000000000000000000025171254730101300257510ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include class UnitTest { protected: std::string m_title; int m_failures; int m_testNum; public: UnitTest(const char *title) : m_title(title), m_failures(0), m_testNum(0) {;}; void test(); int getPassCount() {return m_testNum - m_failures;} int getFailureCount() {return m_failures;} const std::string getTitle() const {return m_title;} }; std::ostream &operator << (std::ostream &stream, UnitTest &test) { stream << test.getTitle() << " PASS: " << test.getPassCount() << " FAIL: " << test.getFailureCount() << std::endl; return stream; } libStatGen-1.0.14/general/test/nonOverlapRegions/000077500000000000000000000000001254730101300216735ustar00rootroot00000000000000libStatGen-1.0.14/general/test/nonOverlapRegions/.gitignore000066400000000000000000000000261254730101300236610ustar00rootroot00000000000000nonOverlapRegionsTest libStatGen-1.0.14/general/test/nonOverlapRegions/Makefile000066400000000000000000000003761254730101300233410ustar00rootroot00000000000000EXE = nonOverlapRegionsTest TOOLBASE = NonOverlapRegionsTest TEST_COMMAND= ./nonOverlapRegionsTest 2> results/NonOverlapRegionsTest.txt && diff results/NonOverlapRegionsTest.txt expectedNonOverlapRegionsTest.txt include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/general/test/nonOverlapRegions/NonOverlapRegionsTest.cpp000066400000000000000000000402621254730101300266550ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "NonOverlapRegions.h" #include "NonOverlapRegionsTest.h" #include #include int main(int argc, char ** argv) { NonOverlapRegionsTest myTest; myTest.test(); } void NonOverlapRegionsTest::test() { testPos(); testChrom(); } void NonOverlapRegionsTest::testChrom() { NonOverlapRegions reg; // Assert that the regions are empty. assert(reg.myRegions.size() == 0); // Verify no regions. for(int i = 0; i < 30; i++) { assert(reg.inRegion("a", i) == false); assert(reg.inRegion("3", i) == false); } // The chromosomes checked for were added. assert(reg.myRegions.size() == 2); assert(reg.myRegions["a"].myRegions.size() == 0); assert(reg.myRegions["a"].myRegionIter == reg.myRegions["a"].myRegions.end()); assert(reg.myRegions["3"].myRegions.size() == 0); assert(reg.myRegions["3"].myRegionIter == reg.myRegions["3"].myRegions.end()); // Add a region. reg.add("3", 13, 15); // Verify regions. assert(reg.myRegions.size() == 2); for(int i = 0; i < 30; i++) { assert(reg.inRegion("a", i) == false); if((i >= 13) && (i < 15)) { assert(reg.inRegion("3", i) == true); } else { assert(reg.inRegion("3", i) == false); } } // Add a region. reg.add("a", 1, 5); // Verify regions. assert(reg.myRegions.size() == 2); for(int i = 0; i < 30; i++) { if((i >= 1) && (i < 5)) { assert(reg.inRegion("a", i) == true); } else { assert(reg.inRegion("a", i) == false); } if((i >= 13) && (i < 15)) { assert(reg.inRegion("3", i) == true); } else { assert(reg.inRegion("3", i) == false); } } } void NonOverlapRegionsTest::testPos() { NonOverlapRegionPos reg; std::list< std::pair >::iterator iter; // Assert that the regions are empty. assert(reg.myRegions.empty()); assert(reg.myRegionIter == reg.myRegions.end()); assert(reg.myTmpIter == reg.myRegions.end()); // Verify regions. for(int i = 0; i < 30; i++) { assert(reg.inRegion(i) == false); } // Add a region reg.add(13, 15); // Verify regions. 
assert(reg.myRegions.size() == 1); assert(reg.myRegionIter->first == 13); assert(reg.myRegionIter->second == 15); for(int i = 0; i < 30; i++) { if((i >= 13) && (i < 15)) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Insert before this. reg.add(4,6); assert(reg.myRegions.size() == 2); assert(reg.myRegionIter->first == 4); assert(reg.myRegionIter->second == 6); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 13); assert(iter->second == 15); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 13) && (i < 15))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Insert at the end. reg.add(22,26); assert(reg.myRegions.size() == 3); assert(reg.myRegionIter->first == 22); assert(reg.myRegionIter->second == 26); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 13); assert(iter->second == 15); ++iter; assert(iter->first == 22); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 13) && (i < 15)) || ((i >= 22) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Insert in the middle. 
reg.add(8,9); assert(reg.myRegions.size() == 4); assert(reg.myRegionIter->first == 8); assert(reg.myRegionIter->second == 9); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 13); assert(iter->second == 15); ++iter; assert(iter->first == 22); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 13) && (i < 15)) || ((i >= 22) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Insert start does not overlap, but the end does. reg.add(20,24); assert(reg.myRegions.size() == 4); assert(reg.myRegionIter->first == 20); assert(reg.myRegionIter->second == 26); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 13); assert(iter->second == 15); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 13) && (i < 15)) || ((i >= 20) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add another region reg.add(18,19); assert(reg.myRegions.size() == 5); assert(reg.myRegionIter->first == 18); assert(reg.myRegionIter->second == 19); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 13); assert(iter->second == 15); ++iter; assert(iter->first == 18); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 13) && 
(i < 15)) || ((i >= 18) && (i < 19)) || ((i >= 20) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Start is not in, but overlap two others (ending not at the end). reg.add(12,19); assert(reg.myRegions.size() == 4); assert(reg.myRegionIter->first == 12); assert(reg.myRegionIter->second == 19); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Completely in region to left. reg.add(5,6); assert(reg.myRegions.size() == 4); assert(reg.myRegionIter->first == 4); assert(reg.myRegionIter->second == 6); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Completely in region to right. 
reg.add(22,24); assert(reg.myRegions.size() == 4); assert(reg.myRegionIter->first == 20); assert(reg.myRegionIter->second == 26); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add region to right. reg.add(28,29); assert(reg.myRegions.size() == 5); assert(reg.myRegionIter->first == 28); assert(reg.myRegionIter->second == 29); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 9); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter->first == 28); assert(iter->second == 29); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 9)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26)) || ((i >= 28) && (i < 29))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add region to left, start is in the region, and end extends past. 
reg.add(8,10); assert(reg.myRegions.size() == 5); assert(reg.myRegionIter->first == 8); assert(reg.myRegionIter->second == 10); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 6); ++iter; assert(iter->first == 8); assert(iter->second == 10); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter->first == 28); assert(iter->second == 29); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 6)) || ((i >= 8) && (i < 10)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26)) || ((i >= 28) && (i < 29))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add region start is in the region, and end extends past and overlaps // the next region. reg.add(5,9); assert(reg.myRegions.size() == 4); assert(reg.myRegionIter->first == 4); assert(reg.myRegionIter->second == 10); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 10); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter->first == 28); assert(iter->second == 29); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 10)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26)) || ((i >= 28) && (i < 29))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add region start is in the region, and end extends past and overlaps // the next region. 
reg.add(10,11); assert(reg.myRegions.size() == 5); assert(reg.myRegionIter->first == 10); assert(reg.myRegionIter->second == 11); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 10); ++iter; assert(iter->first == 10); assert(iter->second == 11); ++iter; assert(iter->first == 12); assert(iter->second == 19); ++iter; assert(iter->first == 20); assert(iter->second == 26); ++iter; assert(iter->first == 28); assert(iter->second == 29); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 10)) || ((i >= 10) && (i < 11)) || ((i >= 12) && (i < 19)) || ((i >= 20) && (i < 26)) || ((i >= 28) && (i < 29))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add region start is in the region, and end extends past and overlaps // the next 2 regions. reg.add(10,24); assert(reg.myRegions.size() == 3); assert(reg.myRegionIter->first == 10); assert(reg.myRegionIter->second == 26); iter = reg.myRegions.begin(); assert(iter->first == 4); assert(iter->second == 10); ++iter; assert(iter->first == 10); assert(iter->second == 26); ++iter; assert(iter->first == 28); assert(iter->second == 29); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 30; i++) { if(((i >= 4) && (i < 10)) || ((i >= 10) && (i < 26)) || ((i >= 28) && (i < 29))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add region start outside of a region and ends at the end. 
reg.add(2,30); assert(reg.myRegions.size() == 1); assert(reg.myRegionIter->first == 2); assert(reg.myRegionIter->second == 30); iter = reg.myRegions.begin(); assert(iter->first == 2); assert(iter->second == 30); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 50; i++) { if(((i >= 2) && (i < 30))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add invalid region (start = end) reg.add(40,40); assert(reg.myRegions.size() == 1); iter = reg.myRegions.begin(); assert(iter->first == 2); assert(iter->second == 30); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 50; i++) { if(((i >= 2) && (i < 30))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } // Add invalid region (start < end) reg.add(40, 38); assert(reg.myRegions.size() == 1); iter = reg.myRegions.begin(); assert(iter->first == 2); assert(iter->second == 30); ++iter; assert(iter == reg.myRegions.end()); for(int i = 0; i < 50; i++) { if(((i >= 2) && (i < 30))) { assert(reg.inRegion(i) == true); } else { assert(reg.inRegion(i) == false); } } } libStatGen-1.0.14/general/test/nonOverlapRegions/NonOverlapRegionsTest.h000066400000000000000000000016461254730101300263250ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __NONOVERLAP_REGIONS_TEST_H__ #define __NONOVERLAP_REGIONS_TEST_H__ class NonOverlapRegionsTest { public: void test(); private: void testChrom(); void testPos(); }; #endif libStatGen-1.0.14/general/test/nonOverlapRegions/expectedNonOverlapRegionsTest.txt000066400000000000000000000002261254730101300304300ustar00rootroot00000000000000NonOverlapRegionPos::add: Invalid Range, start must be < end, but 40 >= 40 NonOverlapRegionPos::add: Invalid Range, start must be < end, but 40 >= 38 libStatGen-1.0.14/general/test/nonOverlapRegions/results/000077500000000000000000000000001254730101300233745ustar00rootroot00000000000000libStatGen-1.0.14/general/test/nonOverlapRegions/results/.gitignore000066400000000000000000000000051254730101300253570ustar00rootroot00000000000000*.txtlibStatGen-1.0.14/general/test/packedVectorTest/000077500000000000000000000000001254730101300214735ustar00rootroot00000000000000libStatGen-1.0.14/general/test/packedVectorTest/.gitignore000066400000000000000000000000211254730101300234540ustar00rootroot00000000000000packedVectorTest libStatGen-1.0.14/general/test/packedVectorTest/Makefile000066400000000000000000000003361254730101300231350ustar00rootroot00000000000000PATH_TO_BASE=../../.. EXE = packedVectorTest TOOLBASE = PackedVectorTest TEST_COMMAND= ./packedVectorTest include $(PATH_TO_BASE)/Makefiles/Makefile.test obj/PackedVectorTest.o: PackedVectorTest.cpp ../../PackedVector.h libStatGen-1.0.14/general/test/packedVectorTest/PackedVectorTest.cpp000066400000000000000000000155601254730101300254200ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "Generic.h" #include #include "PackedVector.h" #include "UnitTest.h" #include #include class PackedArrayTest : public UnitTest { public: PackedArrayTest(const char *title) : UnitTest(title) {;} void testBool(); void test2Bit(); void test4Bit(); void testResize(); void test() { testBool(); test2Bit(); test4Bit(); testResize(); } }; void PackedArrayTest::testBool(void) { PackedVectorBool_t testVector; testVector.resize(11); testVector.set(0,0); testVector.set(1,1); testVector.set(2,0); testVector.set(3,1); testVector.set(4,1); testVector.set(5,0); testVector.set(6,1); testVector.set(7,0); testVector.set(8,0); testVector.set(9,0); testVector.set(10,1); check(m_failures, ++m_testNum, "Access 1 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 1 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 1 bit element 2", 0U, testVector[2]); check(m_failures, ++m_testNum, "Access 1 bit element 3", 1U, testVector[3]); check(m_failures, ++m_testNum, "Access 1 bit element 4", 1U, testVector[4]); check(m_failures, ++m_testNum, "Access 1 bit element 5", 0U, testVector[5]); check(m_failures, ++m_testNum, "Access 1 bit element 6", 1U, testVector[6]); check(m_failures, ++m_testNum, "Access 1 bit element 7", 0U, testVector[7]); check(m_failures, ++m_testNum, "Access 1 bit element 8", 0U, testVector[8]); check(m_failures, ++m_testNum, "Access 1 bit element 9", 0U, testVector[9]); check(m_failures, ++m_testNum, "Access 1 bit element 10", 1U, testVector[10]); } void PackedArrayTest::test2Bit(void) { PackedVector2Bit_t testVector; testVector.resize(11); testVector.set(0,0); 
testVector.set(1,1); testVector.set(2,2); testVector.set(3,3); testVector.set(4,3); testVector.set(5,2); testVector.set(6,1); testVector.set(7,0); testVector.set(8,2); testVector.set(9,1); testVector.set(10,3); check(m_failures, ++m_testNum, "Access 2 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 2 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 2 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 2 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 2 bit element 4", 3U, testVector[4]); check(m_failures, ++m_testNum, "Access 2 bit element 5", 2U, testVector[5]); check(m_failures, ++m_testNum, "Access 2 bit element 6", 1U, testVector[6]); check(m_failures, ++m_testNum, "Access 2 bit element 7", 0U, testVector[7]); check(m_failures, ++m_testNum, "Access 2 bit element 8", 2U, testVector[8]); check(m_failures, ++m_testNum, "Access 2 bit element 9", 1U, testVector[9]); check(m_failures, ++m_testNum, "Access 2 bit element 10", 3U, testVector[10]); } void PackedArrayTest::test4Bit(void) { PackedVector4Bit_t testVector; testVector.resize(11); testVector.set(0,0); testVector.set(1,1); testVector.set(2,2); testVector.set(3,3); testVector.set(4,4); testVector.set(5,5); testVector.set(6,6); testVector.set(7,7); testVector.set(8,8); testVector.set(9,9); testVector.set(10,10); check(m_failures, ++m_testNum, "Access 4 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 4 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 4 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 4 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 4 bit element 4", 4U, testVector[4]); check(m_failures, ++m_testNum, "Access 4 bit element 5", 5U, testVector[5]); check(m_failures, ++m_testNum, "Access 4 bit element 6", 6U, testVector[6]); check(m_failures, ++m_testNum, "Access 4 bit element 7", 7U, 
testVector[7]); check(m_failures, ++m_testNum, "Access 4 bit element 8", 8U, testVector[8]); check(m_failures, ++m_testNum, "Access 4 bit element 9", 9U, testVector[9]); check(m_failures, ++m_testNum, "Access 4 bit element 10", 10U, testVector[10]); } void PackedArrayTest::testResize(void) { PackedVector4Bit_t testVector; testVector.resize(0); check(m_failures, ++m_testNum, "New size is 0", 0U, testVector.size()); testVector.push_back(0); testVector.push_back(1); testVector.push_back(2); testVector.push_back(3); testVector.push_back(4); testVector.push_back(5); testVector.push_back(6); testVector.push_back(7); testVector.push_back(8); testVector.push_back(9); testVector.push_back(10); check(m_failures, ++m_testNum, "New size is 11", 11U, testVector.size()); check(m_failures, ++m_testNum, "Access 4 bit element 0", 0U, testVector[0]); check(m_failures, ++m_testNum, "Access 4 bit element 1", 1U, testVector[1]); check(m_failures, ++m_testNum, "Access 4 bit element 2", 2U, testVector[2]); check(m_failures, ++m_testNum, "Access 4 bit element 3", 3U, testVector[3]); check(m_failures, ++m_testNum, "Access 4 bit element 4", 4U, testVector[4]); check(m_failures, ++m_testNum, "Access 4 bit element 5", 5U, testVector[5]); check(m_failures, ++m_testNum, "Access 4 bit element 6", 6U, testVector[6]); check(m_failures, ++m_testNum, "Access 4 bit element 7", 7U, testVector[7]); check(m_failures, ++m_testNum, "Access 4 bit element 8", 8U, testVector[8]); check(m_failures, ++m_testNum, "Access 4 bit element 9", 9U, testVector[9]); check(m_failures, ++m_testNum, "Access 4 bit element 10", 10U, testVector[10]); } int main(int argc, char **argv) { PackedArrayTest test("PackedArrayTest"); #if 0 bool showAllCasesFlag = false; int opt; while(( opt = getopt(argc, (char **) argv, "v")) != -1) { switch(opt) { case 'v': showAllCasesFlag = true; break; default: std::cerr << "usage: testSW [-v]" << std::endl; exit(1); } } #endif test.test(); std::cout << test; exit(test.getFailureCount()); } 
libStatGen-1.0.14/general/test/packedVectorTest/PackedVectorTest.h000066400000000000000000000000001254730101300250440ustar00rootroot00000000000000libStatGen-1.0.14/general/test/phiX.fa000066400000000000000000000126471254730101300174530ustar00rootroot00000000000000>1 phiX: http://www.genome.jp/dbget-bin/www_bget?refseq+NC_001422 GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAA AAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGAC TGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTT GCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAG TGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTT CATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGAT GCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCG AAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTG CTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCT TTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATCCCGTCAACA TTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCG TCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAA ACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAG TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTT GCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATT TTAATTGCAGGGGCTTCGGCCCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGG CGCCGAGCGTATGCCGCATGACCTTTCCCATCTTGGCTTCCTTGCTGGTCAGATTGGTCG TCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGT TGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTACTGTAGACAT TTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGC CGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGG TTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGC TAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAA AAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGACGACTTC TACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGCATACTGACCA 
AGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGG CTATGATGTTGATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACA GACCTATAAACATTCTGTGCCGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCT TGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCAGTACCTTAACGCTAAAGG TGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCG TGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGC TGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCG CCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGT TAAATTTAATGTGACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTC GTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCC GCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGCTTAGGAGTTTAATCATGTTT CAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGTTCTCACTTCT GTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATG GATACATCTGTCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGAT GCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACC CTCCCGACTGCCTATGATGTTTATCCTTTGAATGGTCGCCATGATGGTGGTTATTATACC GTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGCCGGGCAATAACGTTTATGTT GGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGTTTCGCTGAAT CAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAG GCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATA CTGTAGGCATGGGTGATGCTGGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACC CTGATGAGGCCGCCCCTAGTTTTGTTTCTGGTGCTATGGCTAAAGCTGGTAAAGGACTTC TTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGATAAGTTGCTTGATTTGGTTG GACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGCTGCTG CATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAG AGATTGCCGAGATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTT CACGCCAGAATACGAAAGACCAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGA 
AGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGC AGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCAAACGGCTGGTCAGTATTTTA CCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCATC AGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAG CTGTTGCCGATACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTA ATTTGTCTAGGAAATAACCGTCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTC CAAATCTTGGAGGCTTTTTTATGGTTCGTTCTTATTACCCTTCTGAATGTCACGCTGATT ATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTCTA CTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGCATCAAGCTCT TGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGT TAATGGATGAATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTA TAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGC AGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATT ACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATTGCTGGAGGCCTCCACTATGA AATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTG ATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTG CCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTG AGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTC CTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATA GCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGG ACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAG CTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGC TGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAA TGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACG ACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGC TGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCA AATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA 
libStatGen-1.0.14/general/test/referenceSequenceTest/000077500000000000000000000000001254730101300225105ustar00rootroot00000000000000libStatGen-1.0.14/general/test/referenceSequenceTest/.gitignore000066400000000000000000000000261254730101300244760ustar00rootroot00000000000000referenceSequenceTest libStatGen-1.0.14/general/test/referenceSequenceTest/Makefile000066400000000000000000000003671254730101300241560ustar00rootroot00000000000000PATH_TO_BASE=../../.. EXE = referenceSequenceTest TOOLBASE = ReferenceSequenceTest TEST_COMMAND= ./referenceSequenceTest include $(PATH_TO_BASE)/Makefiles/Makefile.test obj/PackedVectorTest.o: ReferenceSequenceTest.cpp ../../ReferenceSequence.h libStatGen-1.0.14/general/test/referenceSequenceTest/ReferenceSequenceTest.cpp000066400000000000000000000133461254730101300274520ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "Generic.h" #include #include "ReferenceSequence.h" #include "UnitTest.h" #include #include class ReferenceSequenceTest : public UnitTest { public: ReferenceSequenceTest(const char *title) : UnitTest(title) {;} void test1(); void test2(); void test3(); void humanGenomeTest1(); void test() { test1(); test2(); test3(); // This test is very slow: // humanGenomeTest1(); } }; void ReferenceSequenceTest::test1(void) { std::string sequence("ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTG"); std::string word; word="ACTG"; check(m_failures, ++m_testNum, "Test wordMatch with std::string", true, Sequence::wordMatch(sequence, 4, word)); std::stringstream output; Sequence::printNearbyWords(output, sequence, 8, word, 4); std::string expect("\ word 'ACTG' found -4 away from position 8.\n\ word 'ACTG' found 0 away from position 8.\n\ "); check(m_failures, ++m_testNum, "Test printNearbyWords with std::string", expect, output.str()); Sequence::getString(sequence, 4, 4, word); check(m_failures, ++m_testNum, "Test getString with std::string", "ACTG", word); Sequence::getHighLightedString(sequence, 0, 12, word, 4, 8); check(m_failures, ++m_testNum, "Test getHighLightedStribng with std::string", "ACTGactgACTG",word); #if 0 // busted test - don't know why output.clear(); output.str(std::string()); // Sequence::printBaseContext(std::cout, sequence, 8, 4); Sequence::printBaseContext(output, sequence, 8, 4); expect="\ index: 8\n\ ACTGACTGA\n\ ^\n\ "; check(m_failures, ++m_testNum, "Test printBaseContext with std::string", expect, output.str()); #endif std::string result; std::string read("ACTGZZZZACTG"); expect = " ^^^^ "; Sequence::getMismatchHatString(sequence, 4, result, read); check(m_failures, ++m_testNum, "Test getMismatchHatString with std::string", expect, result); read="ACTG"; std::string quality(""); size_t location = Sequence::simpleLocalAligner(sequence, 0, read, quality, 12); check(m_failures, ++m_testNum, "Test simpleLocalAligner with std::string", (size_t) 0, 
location); read="ACNG"; int misMatches = Sequence::getMismatchCount(sequence, 0, read); check(m_failures, ++m_testNum, "Test getMismatchCount with std::string", 1, misMatches); read="ACNG"; quality="$$$$"; int sumQ = Sequence::getSumQ(sequence, 0, read, quality); check(m_failures, ++m_testNum, "Test getSumQ with std::string", 3, sumQ); } void ReferenceSequenceTest::test2(void) { PackedSequenceData sequence; std::string word; sequence.push_back('A'); sequence.push_back('C'); sequence.push_back('T'); sequence.push_back('G'); sequence.push_back('A'); sequence.push_back('C'); sequence.push_back('T'); sequence.push_back('G'); sequence.push_back('A'); sequence.push_back('C'); sequence.push_back('T'); sequence.push_back('G'); sequence.push_back('A'); sequence.push_back('C'); sequence.push_back('T'); sequence.push_back('G'); Sequence::getString(sequence, 4, 4, word); check(m_failures, ++m_testNum, "Test getString with PackedSequenceData", "ACTG", word); std::cout << "test2 sequence utilization is " << sequence.getUtilization() * 100 << "% - expect around 6.25%" << std::endl; } void ReferenceSequenceTest::test3(void) { std::vector chromosomeSequence; std::vector chromosomeNames; bool result = loadFastaFile("../phiX.fa", chromosomeSequence, chromosomeNames); if(result) { std::cout << "../phiX.fa not found - skipping these tests." 
<< std::endl; return; } std::cout << "phiX reference utilization is " << chromosomeSequence[0].getUtilization() * 100 << "% - expect around 96.8%" << std::endl; check(m_failures, ++m_testNum, "Test loadFastaFile with PackedSequenceData", (size_t) 1, chromosomeNames.size()); check(m_failures, ++m_testNum, "Test loadFastaFile with PackedSequenceData", (size_t) 1, chromosomeSequence.size()); check(m_failures, ++m_testNum, "Test loadFastaFile with PackedSequenceData", "1", chromosomeNames[0]); std::string word; Sequence::getString(chromosomeSequence[0], 60, 10, word); check(m_failures, ++m_testNum, "Test loadFastaFile with PackedSequenceData", "AAATTATCTT", word); } void ReferenceSequenceTest::humanGenomeTest1(void) { std::vector chromosomeSequence; std::vector chromosomeNames; #define HUMAN_GENOME "/data/local/ref/karma.ref/human.g1k.v37.fa" bool result = loadFastaFile(HUMAN_GENOME, chromosomeSequence, chromosomeNames); if(result) { std::cout << HUMAN_GENOME << " not found - skipping these tests." 
<< std::endl; return; } } int main(int argc, char **argv) { ReferenceSequenceTest test("ReferenceSequenceTest"); test.test(); std::cout << test; exit(test.getFailureCount()); } libStatGen-1.0.14/general/test/referenceSequenceTest/ReferenceSequenceTest.h000066400000000000000000000000001254730101300270760ustar00rootroot00000000000000libStatGen-1.0.14/general/test/reusableVector/000077500000000000000000000000001254730101300212065ustar00rootroot00000000000000libStatGen-1.0.14/general/test/reusableVector/.gitignore000066400000000000000000000000231254730101300231710ustar00rootroot00000000000000reusableVectorTest libStatGen-1.0.14/general/test/reusableVector/Makefile000066400000000000000000000002041254730101300226420ustar00rootroot00000000000000EXE = reusableVectorTest TOOLBASE = ReusableVectorTest TEST_COMMAND= ./reusableVectorTest include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/general/test/reusableVector/ReusableVectorTest.cpp000066400000000000000000000141641254730101300255050ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "ReusableVector.h" #include "ReusableVectorTest.h" #include #include #include int ReusableVectorTestDataType::ourValue = 0; int ReusableVectorTestDataType::ourNumDestructs = 0; int main(int argc, char ** argv) { ReusableVectorTest myTest; myTest.test(); } void ReusableVectorTest::test() { assert(ReusableVectorTestDataType::ourNumDestructs == 0); testReuse(); assert(ReusableVectorTestDataType::ourNumDestructs == 8); } void ReusableVectorTest::testReuse() { ReusableVector testVector; ReusableVector testVector2; ReusableVectorTestDataType* dataPtr = NULL; assert(testVector.size() == 0); assert(testInvalidGetIndex(testVector, 0)); assert(testInvalidGetIndex(testVector, 1)); testVector.reset(); assert(testVector.size() == 0); assert(testInvalidGetIndex(testVector, 0)); assert(testInvalidGetIndex(testVector, 1)); // Get three data pointers and check they are each new. dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 0); assert(dataPtr->ourValue == 1); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 1); assert(dataPtr->ourValue == 2); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 2); assert(dataPtr->ourValue == 3); assert(testVector.size() == 3); // Check a 2nd test vector. assert(testVector2.size() == 0); assert(testInvalidGetIndex(testVector2, 0)); assert(testInvalidGetIndex(testVector2, 1)); testVector2.reset(); assert(testVector2.size() == 0); assert(testInvalidGetIndex(testVector2, 0)); assert(testInvalidGetIndex(testVector2, 1)); // Get data pointers and check they are each new. dataPtr = &(testVector2.getNextEmpty()); assert(dataPtr->myValue == 3); assert(dataPtr->ourValue == 4); dataPtr = &(testVector2.getNextEmpty()); assert(dataPtr->myValue == 4); assert(dataPtr->ourValue == 5); assert(testVector2.size() == 2); // Test the get accessor. 
assert(testVector2.get(1).myValue == 4); assert(testVector2.get(0).myValue == 3); assert(testInvalidGetIndex(testVector2, 2)); // Test the get accessor with the first vector. assert(testVector.get(1).myValue == 1); assert(testVector.get(0).myValue == 0); assert(testVector.get(2).myValue == 2); assert(testInvalidGetIndex(testVector, 3)); // Clear the 1st vector. testVector.clear(); assert(testVector.size() == 0); assert(testInvalidGetIndex(testVector, 0)); assert(testInvalidGetIndex(testVector, 1)); // Check the data values are reused. dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 0); assert(dataPtr->ourValue == 5); assert(testVector.size() == 1); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 1); assert(dataPtr->ourValue == 5); assert(testVector.size() == 2); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 2); assert(dataPtr->ourValue == 5); assert(testVector.size() == 3); // Test allocating a new value. dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 5); assert(dataPtr->ourValue == 6); assert(testVector.size() == 4); // Clear both vectors. testVector2.clear(); testVector.reset(); assert(testVector.size() == 0); assert(testInvalidGetIndex(testVector, 0)); assert(testInvalidGetIndex(testVector, 1)); assert(testVector2.size() == 0); assert(testInvalidGetIndex(testVector2, 0)); assert(testInvalidGetIndex(testVector2, 1)); // Get values for the vectors and verify they are reused. 
dataPtr = &(testVector2.getNextEmpty()); assert(dataPtr->myValue == 3); assert(dataPtr->ourValue == 6); assert(testVector2.size() == 1); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 0); assert(dataPtr->ourValue == 6); assert(testVector.size() == 1); dataPtr = &(testVector2.getNextEmpty()); assert(dataPtr->myValue == 4); assert(dataPtr->ourValue == 6); assert(testVector2.size() == 2); dataPtr = &(testVector2.getNextEmpty()); assert(dataPtr->myValue == 6); assert(dataPtr->ourValue == 7); assert(testVector2.size() == 3); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 1); assert(dataPtr->ourValue == 7); assert(testVector.size() == 2); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 2); assert(dataPtr->ourValue == 7); assert(testVector.size() == 3); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 5); assert(dataPtr->ourValue == 7); assert(testVector.size() == 4); dataPtr = &(testVector.getNextEmpty()); assert(dataPtr->myValue == 7); assert(dataPtr->ourValue == 8); assert(testVector.size() == 5); } bool ReusableVectorTest::testInvalidGetIndex(ReusableVector& testVector, int index) { bool caught = false; try { testVector.get(index); } catch(std::exception& e) { caught = true; assert(strcmp(e.what(), "ReusableVector::get called with out of range index.") == 0); } return(caught); } ReusableVectorTestDataType::ReusableVectorTestDataType() { myValue = ourValue++; } ReusableVectorTestDataType::~ReusableVectorTestDataType() { ++ourNumDestructs; } libStatGen-1.0.14/general/test/reusableVector/ReusableVectorTest.h000066400000000000000000000024751254730101300251540ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __REUSABLE_VECTOR_TEST_H__ #define __REUSABLE_VECTOR_TEST_H__ class ReusableVectorTestDataType; class ReusableVectorTest { public: void test(); private: void testReuse(); bool testInvalidGetIndex(ReusableVector& testVector, int index); }; class ReusableVectorTestDataType { public: ReusableVectorTestDataType(); ~ReusableVectorTestDataType(); void clear() {} static int ourValue; int myValue; static int ourNumDestructs; private: ReusableVectorTestDataType(const ReusableVectorTestDataType& other); }; #endif libStatGen-1.0.14/general/test/string/000077500000000000000000000000001254730101300175275ustar00rootroot00000000000000libStatGen-1.0.14/general/test/string/.gitignore000066400000000000000000000000131254730101300215110ustar00rootroot00000000000000stringTest libStatGen-1.0.14/general/test/string/Makefile000066400000000000000000000002031254730101300211620ustar00rootroot00000000000000EXE = stringTest TOOLBASE = StringTest TEST_COMMAND= mkdir -p results && \ ./stringTest include ../../../Makefiles/Makefile.testlibStatGen-1.0.14/general/test/string/StringTest.cpp000066400000000000000000000067201254730101300223460ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "StringTest.h" #include int main(int argc, char ** argv) { testAsInteger(); testReadLine(); } void testAsInteger() { // Test AsInteger with ints & negative ints. String intString = "123"; String negIntString = "-123"; assert(intString.AsInteger() == 123); assert(negIntString.AsInteger() == -123); // Run the same tests with AsInteger that returns a bool and takes // in a long to set. long retValue; assert(intString.AsInteger(retValue)); assert(retValue == 123); assert(negIntString.AsInteger(retValue)); assert(retValue == -123); // Strings that are not integers // For AsInteger, it returns just the starting integer portion. // For AsInteger that returns a bool and a long set, it returns false // and sets the long to the starting int. String nonIntString = "abd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); nonIntString = "12ab33"; assert(nonIntString.AsInteger() == 12); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 12); nonIntString = "as12ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); // Negatives are only recognized as the first characer. 
nonIntString = "-12ab3a4sd"; assert(nonIntString.AsInteger() == -12); assert(!nonIntString.AsInteger(retValue)); assert(retValue == -12); nonIntString = "-as12ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); nonIntString = "as-12ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); nonIntString = "as12-ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); } int temp1 = 0; void testReadLine() { IFILE filePtr = ifopen("testFiles/testFile.txt", "rb"); assert(filePtr != NULL); String line = ""; line.ReadLine(filePtr); assert(line == " Hello, I am a testFile. "); line.Trim(); assert(line == "Hello, I am a testFile."); // Does not compile in current version, but compiles in old verison. // This can be added back in to ensure that it will catch the difference // in return value for ReadLine (now: int; used to be: string&) // testMethod(line.ReadLine(filePtr)); line.ReadLine(filePtr); assert(temp1 == 0); testMethod(line); assert(temp1 == 1); // line.ReadLine(filePtr).Trim(); line.ReadLine(filePtr); line.Trim(); assert(line == "ThirdLine."); ifclose(filePtr); } void testMethod(String temp) { temp1 = 1; } libStatGen-1.0.14/general/test/string/StringTest.h000066400000000000000000000015071254730101300220110ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "StringBasics.h" void testAsInteger(); void testReadLine(); void testMethod(String temp); libStatGen-1.0.14/general/test/string/testFiles/000077500000000000000000000000001254730101300214715ustar00rootroot00000000000000libStatGen-1.0.14/general/test/string/testFiles/testFile.txt000066400000000000000000000001011254730101300240010ustar00rootroot00000000000000 Hello, I am a testFile. This is my 2nd line. ThirdLine. libStatGen-1.0.14/general/test/test_memmap_data.txt000066400000000000000000000002671254730101300222730ustar00rootroot00000000000000This is a test file for testing memmap.cpp - it does not have anything useful or interesting in it. If the size of this file changes, be sure to update memmap::test in memmap.cpp... libStatGen-1.0.14/general/test/trimSequence/000077500000000000000000000000001254730101300206655ustar00rootroot00000000000000libStatGen-1.0.14/general/test/trimSequence/.gitignore000066400000000000000000000000141254730101300226500ustar00rootroot00000000000000trimSequencelibStatGen-1.0.14/general/test/trimSequence/Makefile000066400000000000000000000002211254730101300223200ustar00rootroot00000000000000PATH_TO_BASE=../../.. EXE = trimSequence SRCONLY = TrimSequence.cpp TEST_COMMAND= ./trimSequence include $(PATH_TO_BASE)/Makefiles/Makefile.testlibStatGen-1.0.14/general/test/trimSequence/TrimSequence.cpp000066400000000000000000000067441254730101300240100ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "TrimSequence.h" #include #include #include #include int main(int argc, const char **argv) { std::string test; std::string::iterator result; // // from the left: // test = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'A', true); assert(result == test.begin()); test = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, '~', true); assert(result == test.end()); test = "AAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'B', true); assert(result == (test.begin() + 5)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'B', true); assert(result == (test.begin() + 8)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'F', true); assert(result == (test.begin() + 12)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, '@', true); assert(result == (test.begin() + 0)); test = "AAAAAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, '@', true); assert(result == (test.begin() + 0)); test = "AAAFAAAABCDEFGHIJKLMNOPQRSTUVWXYZ"; result = trimSequence(test, 'F', true); assert(result == (test.begin() + 12)); // trim left 12 bases, and untrimmed bases are 'FG' (turn bug into this test cass) test = "AAAFAAAABCDEFG"; result = trimSequence(test, 'F', true); assert(result == (test.begin() + 12)); // // from the right: // test = "ZYXWVUTSRQPONMLKJIHGFEDCBA"; result = trimSequence(test, 'A', false); assert(result == test.end()); test = "ZYXWVUTSRQPONMLKJIHGFEDCBA"; result = trimSequence(test, '~', false); assert(result == test.begin()); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAA"; result = trimSequence(test, 'B', false); 
assert(result == (test.end() - 5)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAA"; result = trimSequence(test, 'B', false); assert(result == (test.end() - 7)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA"; result = trimSequence(test, 'F', false); assert(result == (test.end() - 12)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAAAAA"; result = trimSequence(test, '@', false); assert(result == (test.end() + 0)); test = "ZYXWVUTSRQPONMLKJIHGFEDCBAAAAFAAA"; result = trimSequence(test, 'F', false); assert(result == (test.end() - 12)); test = "#################################"; result = trimSequence(test, 'F', false); assert(result == (test.begin())); #if 0 // TODO: add explanation why this test case should trim 5 right most bases? test = ">BC@>28B==>=>@8(>0309261/;6=@"; result = trimSequence(test, '0', false); assert(result == (test.end())-5); #endif exit(0); } libStatGen-1.0.14/glf/000077500000000000000000000000001254730101300143755ustar00rootroot00000000000000libStatGen-1.0.14/glf/COPYING000066400000000000000000001045141254730101300154350ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>. libStatGen-1.0.14/glf/GlfException.cpp000066400000000000000000000026601254730101300174740ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ #include "GlfException.h" GlfException::GlfException() : myStatus() { myStatus.setStatus(GlfStatus::UNKNOWN, "Failed operating on a GLF."); } GlfException::GlfException(const std::string& errorMsg) : myStatus() { myStatus.setStatus(GlfStatus::UNKNOWN, errorMsg.c_str()); } GlfException::GlfException(GlfStatus::Status status, const std::string& errorMsg) : myStatus() { myStatus.setStatus(status, errorMsg.c_str()); } GlfException::GlfException(const GlfStatus& status) : myStatus() { myStatus.addError(status); } GlfException::~GlfException() throw() { } const char* GlfException::what() const throw() { return(myStatus.getStatusMessage()); } libStatGen-1.0.14/glf/GlfException.h000066400000000000000000000040431254730101300171360ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GLF_EXCEPTION_H__ #define __GLF_EXCEPTION_H__ #include // stdexcept header file #include "GlfStatus.h" /// GlfException objects should be thrown by functions that operate on /// Glf files for exceptions. class GlfException : public std::exception { public: /// Constructor that sets the exception to a default status /// and error message. GlfException(); /// Constructor that sets the exception to a default status /// and the specified error message. /// \param what_arg error message associated with this exception. 
GlfException(const std::string& what_arg); /// Constructor that sets the exception to the specified status /// and error message. /// \param status glf status associated with this exception. /// \param errorMsg error message associated with this exception. GlfException(GlfStatus::Status status, const std::string& errorMsg); /// Constructor that sets the exception to the specified status. /// \param status glf status associated with this exception. GlfException(const GlfStatus& status); virtual ~GlfException() throw(); /// Returns the error message of this exception. /// \return errror message virtual const char* what() const throw(); private: GlfStatus myStatus; }; // end class GlfException #endif libStatGen-1.0.14/glf/GlfFile.cpp000066400000000000000000000357431254730101300164250ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include "GlfFile.h" #include "GlfException.h" // Constructor, init variables. GlfFile::GlfFile() : myFilePtr(NULL), myEndMarker() { resetFile(); } // Constructor, init variables and open the specified file based on the // specified mode (READ/WRITE). Default is READ.. GlfFile::GlfFile(const char* filename, OpenType mode) : myFilePtr(NULL), myEndMarker() { resetFile(); bool openStatus = true; if(mode == READ) { // open the file for read. 
openStatus = openForRead(filename); } else { // open the file for write. openStatus = openForWrite(filename); } if(!openStatus) { // Failed to open the file - print error and abort. fprintf(stderr, "%s\n", getStatusMessage()); std::cerr << "FAILURE - EXITING!!!" << std::endl; exit(-1); } } GlfFile::~GlfFile() { resetFile(); } // Open a glf file for reading with the specified filename. bool GlfFile::openForRead(const char * filename) { // Reset for any previously operated on files. resetFile(); myFilePtr = ifopen(filename, "rb"); if (myFilePtr == NULL) { std::string errorMessage = "Failed to Open "; errorMessage += filename; errorMessage += " for reading"; myStatus.setStatus(GlfStatus::FAIL_IO, errorMessage.c_str()); throw(GlfException(myStatus)); return(false); } myIsOpenForRead = true; // Successfully opened the file. myStatus = GlfStatus::SUCCESS; return(true); } // Open a glf file for reading with the specified filename and read the // header into the specified header. bool GlfFile::openForRead(const char * filename, GlfHeader& header) { if(!openForRead(filename)) { return(false); } // Read the header if(!readHeader(header)) { return(false); } return(true); } // Open a glf file for writing with the specified filename. bool GlfFile::openForWrite(const char * filename, bool compressed) { // Reset for any previously operated on files. resetFile(); if(compressed) { myFilePtr = ifopen(filename, "wb", InputFile::BGZF); } else { myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED); } if (myFilePtr == NULL) { std::string errorMessage = "Failed to Open "; errorMessage += filename; errorMessage += " for writing"; myStatus.setStatus(GlfStatus::FAIL_IO, errorMessage.c_str()); throw(GlfException(myStatus)); return(false); } myIsOpenForWrite = true; // Successfully opened the file. myStatus = GlfStatus::SUCCESS; return(true); } // Close the file if there is one open. 
void GlfFile::close() { // Resetting the file will close it if it is open, and // will reset all other variables. resetFile(); } // Returns whether or not the end of the file has been reached. // return: int - true = EOF; false = not eof. bool GlfFile::isEOF() { if (myFilePtr != NULL) { // File Pointer is set, so return if eof. return(ifeof(myFilePtr)); } // File pointer is not set, so return true, eof. return true; } // Read the header from the currently opened file. bool GlfFile::readHeader(GlfHeader& header) { if(myIsOpenForRead == false) { // File is not open for read myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read header since the file is not open for reading"); throw(GlfException(myStatus)); return(false); } if(myNextSection != HEADER) { // The header has already been read. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read header since it has already been read."); throw(GlfException(myStatus)); return(false); } if(header.read(myFilePtr)) { // The header has now been successfully read. myNextSection = REF_SECTION; myStatus = GlfStatus::SUCCESS; return(true); } myStatus.setStatus(GlfStatus::UNKNOWN, "Failed to read the header."); throw(GlfException(myStatus)); return(false); } // Write the header to the currently opened file. bool GlfFile::writeHeader(GlfHeader& header) { if(myIsOpenForWrite == false) { // File is not open for write // -OR- // The header has already been written. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write header since the file is not open for writing"); throw(GlfException(myStatus)); return(false); } if(myNextSection != HEADER) { // The header has already been written. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write header since it has already been written"); throw(GlfException(myStatus)); return(false); } if(header.write(myFilePtr)) { // The header has now been successfully written. myNextSection = REF_SECTION; myStatus = GlfStatus::SUCCESS; return(true); } // return the status. 
myStatus.setStatus(GlfStatus::UNKNOWN, "Failed to write the header."); throw(GlfException(myStatus)); return(false); } // Gets the next reference section from the file & stores it in the // passed in section. It will read until a new section is found. bool GlfFile::getNextRefSection(GlfRefSection& refSection) { if(myIsOpenForRead == false) { // File is not open for read myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read reference section since the file is not open for reading"); throw(GlfException(myStatus)); return(false); } if(myNextSection == HEADER) { // The header has not yet been read. // TODO - maybe just read the header. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read reference section since the header has not been read."); throw(GlfException(myStatus)); return(false); } // Keep reading until the next section is found. if(myNextSection == RECORD) { GlfRecord record; while(getNextRecord(record)) { // Nothing to do, with the record. } } // Check for end of file. If end of file, return false. if(isEOF()) { return(false); } if(myNextSection != REF_SECTION) { // Failed reading all the records, so throw exception. myStatus.setStatus(GlfStatus::FAIL_IO, "Failed to get to a reference section."); throw(GlfException(myStatus)); return(false); } // Ready to read the section: if(refSection.read(myFilePtr)) { myStatus = GlfStatus::SUCCESS; // Next a record should be read. myNextSection = RECORD; return(true); } // If it is the EOF, just return false. if(isEOF()) { return(false); } myStatus.setStatus(GlfStatus::UNKNOWN, "Failed reading a reference section from the file."); throw(GlfException(myStatus)); return(false); } // Write the reference section to the file. 
bool GlfFile::writeRefSection(const GlfRefSection& refSection) { if(myIsOpenForWrite == false) { // File is not open for write myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write reference section since the file is not open for writing"); throw(GlfException(myStatus)); return(false); } if(myNextSection == HEADER) { // The header has not been written. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write reference section since the header has not been written"); throw(GlfException(myStatus)); return(false); } if(myNextSection == RECORD) { // did not write a end marker record, so write one now. if(!writeRecord(myEndMarker)) { // Failed to write the end marker record. myStatus.setStatus(GlfStatus::FAIL_IO, "Failed to write end of chromosome/section marker."); throw(GlfException(myStatus)); return(false); } } if(myNextSection != REF_SECTION) { // Not ready to write a reference section. myStatus.setStatus(GlfStatus::FAIL_IO, "Not ready for a chromosome/section header."); throw(GlfException(myStatus)); return(false); } if(refSection.write(myFilePtr)) { myStatus = GlfStatus::SUCCESS; // A reference section has now been successfully written. myNextSection = RECORD; return(true); } // return the status. myStatus.setStatus(GlfStatus::UNKNOWN, "Failed writing a reference section to the file."); throw(GlfException(myStatus)); return(false); } // Gets the next reference section from the file & stores it in the // passed in record. bool GlfFile::getNextRecord(GlfRecord& record) { if(myIsOpenForRead == false) { // File is not open for read myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read reference section since the file is not open for reading"); throw(GlfException(myStatus)); return(false); } if(myNextSection == HEADER) { // The header has not yet been read. 
myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read reference section since the header has not been read."); throw(GlfException(myStatus)); return(false); } if(myNextSection == REF_SECTION) { // The reference section has not yet been read. // TODO - maybe just read the reference section. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot read record since a reference section has not been read."); throw(GlfException(myStatus)); return(false); } // Check for end of file. If end of file, return false. if(isEOF()) { return(false); } // Read the record. if(record.read(myFilePtr)) { myStatus = GlfStatus::SUCCESS; if(record.getRecordType() != 0) { return(true); } else { // Not an error, so no exception thrown, but no more records. // The next thing is a reference section. myNextSection = REF_SECTION; return(false); } } myStatus.setStatus(GlfStatus::UNKNOWN, "Failed reading a record from the file."); throw(GlfException(myStatus)); return(false); } // Write the reference section to the file. bool GlfFile::writeRecord(const GlfRecord& record) { if(myIsOpenForWrite == false) { // File is not open for write // -OR- // The header has already been written. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write record since the file is not open for writing"); throw(GlfException(myStatus)); return(false); } if(myNextSection == HEADER) { // The header has not been written. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write record since the header has not been written"); throw(GlfException(myStatus)); return(false); } if(myNextSection != RECORD) { // The header has not been written. myStatus.setStatus(GlfStatus::FAIL_ORDER, "Cannot write record since a reference section has not been written"); throw(GlfException(myStatus)); return(false); } if(record.write(myFilePtr)) { myStatus = GlfStatus::SUCCESS; // The record has now been successfully written. // Check if it was the end marker - if so, set that next a // reference section is expected. 
if(record.getRecordType() == 0) { myNextSection = REF_SECTION; } return(true); } // return the status. myStatus.setStatus(GlfStatus::UNKNOWN, "Failed writing a record to the file."); throw(GlfException(myStatus)); return(false); } // Return the number of records that have been read/written so far. uint32_t GlfFile::getCurrentRecordCount() { return(myRecordCount); } // Reset variables for each file. void GlfFile::resetFile() { // Close the file. if (myFilePtr != NULL) { // If we already have an open file, close it. // First check if this is a write file and an end record needs to // be written, which is the case if the state is RECORD. if(myIsOpenForWrite && (myNextSection == RECORD)) { if(!writeRecord(myEndMarker)) { // Failed to write the end marker record. myStatus.setStatus(GlfStatus::FAIL_IO, "Failed to write end of chromosome/section marker."); throw(GlfException(myStatus)); } } ifclose(myFilePtr); myFilePtr = NULL; } myIsOpenForRead = false; myIsOpenForWrite = false; myRecordCount = 0; myStatus = GlfStatus::SUCCESS; myNextSection = HEADER; } // Default Constructor. GlfFileReader::GlfFileReader() { } // Constructor that opens the specified file for read. GlfFileReader::GlfFileReader(const char* filename) { if(!openForRead(filename)) { // Failed to open for reading - print error and abort. fprintf(stderr, "%s\n", getStatusMessage()); std::cerr << "FAILURE - EXITING!!!" << std::endl; exit(-1); } } GlfFileReader::~GlfFileReader() { } // Default Constructor. GlfFileWriter::GlfFileWriter() { } // Constructor that opens the specified file for write. GlfFileWriter::GlfFileWriter(const char* filename) { if(!openForWrite(filename)) { // Failed to open for reading - print error and abort. fprintf(stderr, "%s\n", getStatusMessage()); std::cerr << "FAILURE - EXITING!!!" 
<< std::endl; exit(-1); } } GlfFileWriter::~GlfFileWriter() { } libStatGen-1.0.14/glf/GlfFile.h000066400000000000000000000155571254730101300160730ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GLF_FILE_H__ #define __GLF_FILE_H__ #include "InputFile.h" #include "GlfHeader.h" #include "GlfRefSection.h" #include "GlfRecord.h" #include "GlfStatus.h" /// This class allows a user to easily read/write a GLF file. class GlfFile { public: /// Enum for indicating whether to open the file for read or write. enum OpenType { READ, ///< open for reading. WRITE ///< open for writing. }; /// Default Constructor. GlfFile(); /// Constructor that opens the specified file based on the specified mode /// (READ/WRITE). Default is READ. /// \param filename name of the file to open. /// \param mode mode to use for opening the file (defaults to READ). GlfFile(const char* filename, OpenType mode = READ); /// Closes the file if there is one open, adding an end marker record /// if there is a previous section and one has not already been written. virtual ~GlfFile(); /// Open a glf file for reading with the specified filename. /// \param filename glf file to open for reading. /// \return true = success; false = failure. 
bool openForRead(const char * filename); /// Open a glf file for reading with the specified filename and read the /// header into the specified header. /// \param filename glf file to open for reading. /// \param header header object to populate with the file's glf header. /// \return true = success; false = failure. bool openForRead(const char * filename, GlfHeader& header); /// Open a glf file for writing with the specified filename. /// \param filename glf file to open for writing. /// \param compressed whether or not to compress the file, defaults to true /// \return true = success; false = failure. bool openForWrite(const char * filename, bool compressed = true); /// Close the file if there is one open, adding an end marker record /// if there is a previous section and one has not already been written. void close(); /// Returns whether or not the end of the file has been reached. /// \return true = EOF; false = not eof. /// If the file is not open, true is returned. bool isEOF(); /// Reads the header section from the file and stores it in /// the passed in header. /// \param header header object to populate with the file's glf header. /// \return true = success; false = failure. bool readHeader(GlfHeader& header); /// Writes the specified header into the file. /// \param header header object to write into the file. /// \return true = success; false = failure. bool writeHeader(GlfHeader& header); /// Gets the next reference section from the file & stores it in the /// passed in section, consuming records until a new section is found. /// \param refSection object to populate with the file's next reference /// section. /// \return true = section was successfully set. /// false = section was not successfully set. bool getNextRefSection(GlfRefSection& refSection); /// Write the reference section to the file, adding an end marker record /// if there is a previous section and one has not already been written. 
/// \param refSection reference section to write to the file. /// \return true = succes; false = failure. bool writeRefSection(const GlfRefSection& refSection); /// Gets the nextrecord from the file & stores it in the /// passed in record. /// \param record object to populate with the file's next record. /// \return true = record was successfully set. /// false = record not successfully set or for the endMarker record. bool getNextRecord(GlfRecord& record); /// Writes the specified record into the file. /// \param record record to write to the file. /// \return true = success; false = failure. bool writeRecord(const GlfRecord& record); /// Return the number of records that have been read/written so far. /// \return number of records that have been read/written so far. uint32_t getCurrentRecordCount(); /// Get the Status of the last call that sets status. /// To remain backwards compatable - will be removed later. inline GlfStatus::Status getFailure() { return(getStatus()); } /// Get the Status of the last call that sets status. /// \return status of the last method that sets a status. inline GlfStatus::Status getStatus() { return(myStatus.getStatus()); } /// Get the Status of the last call that sets status. /// \return status message of the last method that sets a status. inline const char* getStatusMessage() { return(myStatus.getStatusMessage()); } private: /// reset this file including all its attributes. void resetFile(); /// Pointer to the file IFILE myFilePtr; /// Flag to indicate if a file is open for reading. bool myIsOpenForRead; /// Flag to indicate if a file is open for writing. bool myIsOpenForWrite; /// End marker that is inserted when writing files if a new section /// is specified without one or if the file is closed without writing /// an endMarker. GlfRecord myEndMarker; /// Track the state of this file as to what it is expecting to read next. 
enum EXPECTED_SECTION { HEADER, REF_SECTION, RECORD } myNextSection; /// Keep count of the number of records that have been read/written so far. uint32_t myRecordCount; /// The status of the last GlfFile command. GlfStatus myStatus; }; class GlfFileReader : public GlfFile { public: /// Default Constructor. GlfFileReader(); /// Constructor that opens the specified file for read. /// \param filename file to open for reading. GlfFileReader(const char* filename); virtual ~GlfFileReader(); }; class GlfFileWriter : public GlfFile { public: /// Default Constructor. GlfFileWriter(); /// Constructor that opens the specified file for write. /// \param filename file to open for writing. GlfFileWriter(const char* filename); virtual ~GlfFileWriter(); }; #endif libStatGen-1.0.14/glf/GlfHeader.cpp000077500000000000000000000136211254730101300167300ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "GlfHeader.h" #include "GlfStatus.h" #include "GlfException.h" #include "StringBasics.h" const std::string GlfHeader::GLF_MAGIC = "GLF\3"; GlfHeader::GlfHeader() : myText() { resetHeader(); } GlfHeader::~GlfHeader() { resetHeader(); } // Copy Constructor GlfHeader::GlfHeader(const GlfHeader& header) : myText() { copy(header); } // Overload operator = to copy the passed in header into this header. 
GlfHeader & GlfHeader::operator = (const GlfHeader& header) { copy(header); return(*this); } bool GlfHeader::copy(const GlfHeader& header) { // Check to see if the passed in value is the same as this. if(this == &header) { return(true); } resetHeader(); // Copy the header. myText = header.myText; return(true); } // Reset the header for a new entry, clearing out previous values. void GlfHeader::resetHeader() { myText.reset(); } // Read the header from the specified file. Assumes the file is in // the correct position for reading the header. bool GlfHeader::read(IFILE filePtr) { if((filePtr == NULL) || (filePtr->isOpen() == false)) { // File is not open, return failure. std::string errorString = "Failed to read the header since the file is not open."; throw(GlfException(GlfStatus::FAIL_ORDER, errorString)); return(false); } // Read the magic int numRead = 0; char magic[GLF_MAGIC_LEN]; numRead = ifread(filePtr, &magic, GLF_MAGIC_LEN); if(numRead != GLF_MAGIC_LEN) { String errorMsg = "Failed to read the magic number ("; errorMsg += GLF_MAGIC_LEN; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Read the header length. int32_t headerLen = 0; int byteLen = sizeof(int32_t); numRead = ifread(filePtr, &headerLen, byteLen); if(numRead != byteLen) { String errorMsg = "Failed to read the length of the header text ("; errorMsg += byteLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Read the header from the file. numRead = myText.readFromFile(filePtr, headerLen); if(numRead != headerLen) { String errorMsg = "Failed to read the header text ("; errorMsg += headerLen; errorMsg += " bytes). 
Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Successfully read, return success. return(true); } // Write the header to the specified file. bool GlfHeader::write(IFILE filePtr) const { if((filePtr == NULL) || (filePtr->isOpen() == false)) { // File is not open, return failure. std::string errorString = "Failed to write the header since the file is not open."; throw(GlfException(GlfStatus::FAIL_ORDER, errorString)); return(false); } int numWrite = 0; // Write the magic numWrite = ifwrite(filePtr, GLF_MAGIC.c_str(), GLF_MAGIC_LEN); if(numWrite != GLF_MAGIC_LEN) { String errorMsg = "Failed to write the magic number ("; errorMsg += GLF_MAGIC_LEN; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Write the header length. int32_t headerLen = myText.length(); int byteLen = sizeof(int32_t); numWrite = ifwrite(filePtr, &headerLen, byteLen); if(numWrite != byteLen) { String errorMsg = "Failed to write the length of the header text ("; errorMsg += byteLen; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Write the header to the file. numWrite = ifwrite(filePtr, myText.c_str(), headerLen); if(numWrite != headerLen) { String errorMsg = "Failed to write the header text ("; errorMsg += headerLen; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Successfully wrote, return success. return(true); } // Set the passed in string to the text string stored in this header. 
bool GlfHeader::getHeaderTextString(std::string& text) { text = myText.c_str(); return(true); } // Set the header to the passed in string. bool GlfHeader::setHeaderTextString(const std::string& text) { myText = text; return(true); } libStatGen-1.0.14/glf/GlfHeader.h000077500000000000000000000054001254730101300163710ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GLF_HEADER_H__ #define __GLF_HEADER_H__ #include #include "InputFile.h" #include "CharBuffer.h" /// This class allows a user to easily get/set the fields in a GLF header. /// The GlfHeader contains: /// - Variable length text string class GlfHeader { public: GlfHeader(); ~GlfHeader(); /// Copy Constructor /// \param header glfheader to copy into this one. GlfHeader(const GlfHeader& header); /// Overload operator= to copy the passed in header into this header. /// \param header glfheader to copy into this one. GlfHeader & operator = (const GlfHeader& header); /// Copy the passed in header into this header. /// \param header glfheader to copy into this one. bool copy(const GlfHeader& header); /// Clear this header back to the default setting. void resetHeader(); /// Read the header from the specified file (file MUST be in /// the correct position for reading the header). /// \param filePtr file to read from that is in the correct position. 
/// \return true if the header was successfully read from the /// file, false if not. bool read(IFILE filePtr); /// Write the header to the specified file. /// \param filePtr file to write to that is in the correct position. /// \return true if the header was successfully written to the /// file, false if not. bool write(IFILE filePtr) const; /// Set the passed in string to the text string stored in this header. /// \param text string to populate with the header text string. /// \return true if text was successfully returned, false if not. bool getHeaderTextString(std::string& text); /// Set the header to the passed in string. /// \param text header text to assign to this header. /// \return true if the text was successfully set, false if not. bool setHeaderTextString(const std::string& text); private: int32_t myTextLen; CharBuffer myText; static const std::string GLF_MAGIC; static const int GLF_MAGIC_LEN = 4; }; #endif libStatGen-1.0.14/glf/GlfRecord.cpp000066400000000000000000000420741254730101300167570ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "GlfRecord.h" #include "GlfException.h" #include "StringBasics.h" std::string GlfRecord::REF_BASE_CHAR = "XACMGRSVTQYHKSVN"; GlfRecord::GlfRecord() { reset(); } GlfRecord::~GlfRecord() { reset(); } // Reset the record for a new entry, clearing out previous values. void GlfRecord::reset() { myRecTypeRefBase = 0; myRec1Base.offset = 0; myRec1Base.min_depth = 0; myRec1Base.rmsMapQ = 0; for(int i = 0; i < 10; i++) { myRec1Base.lk[i] = 0; } myRec2Base.offset = 0; myRec2Base.min_depth = 0; myRec2Base.rmsMapQ = 0; myRec2Base.lkHom1 = 0; myRec2Base.lkHom2 = 0; myRec2Base.lkHet = 0; myRec2Base.indelLen1 = 0; myRec2Base.indelLen2 = 0; myIndelSeq1.reset(); myIndelSeq2.reset(); } // Read the record from the specified file. Assumes the file is in // the correct position for reading the record. bool GlfRecord::read(IFILE filePtr) { // Read the record type and reference base. int numRead = 0; int byteLen = sizeof(uint8_t); numRead = ifread(filePtr, &myRecTypeRefBase, byteLen); if(numRead != byteLen) { String errorMsg = "Failed to read the record type & reference base ("; errorMsg += byteLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // TODO, split up by types of records... switch(getRecordType()) { case 0: // Last record. // Nothing more to read. break; case 1: // Read type 1. readType1(filePtr); break; case 2: // Read type 2. readType2(filePtr); break; default: String errorMsg = "Failed to read the record: unknown type: "; errorMsg += getRecordType(); std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::INVALID, errorString)); return(false); break; }; // Successfully read, return success. return(true); } // Write the record to the specified file. bool GlfRecord::write(IFILE filePtr) const { // TODO, split up by types of records... 
switch(getRecordType()) { case 0: writeRtypeRef(filePtr); break; case 1: // write type 1. writeType1(filePtr); break; case 2: // write type 2. writeType2(filePtr); break; default: // unknown type, return error. String errorMsg = "Failed to write the record: unknown type: "; errorMsg += getRecordType(); std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::INVALID, errorString)); return(false); break; }; return(true); } void GlfRecord::print() const { std::cout << "record_type: " << getRecordType() << "; ref_base: " << getRefBase() << "; ref_base_char: " << getRefBaseChar() << "\n"; // TODO, split up by types of records... switch(getRecordType()) { case 0: break; case 1: // print type 1. std::cout << "\toffset: " << myRec1Base.offset << "; min_lk: " << (myRec1Base.min_depth >> 24) << "; read_depth: " << (myRec1Base.min_depth & 0xFFFFFF) << "; rmsMapQ: " << (int)myRec1Base.rmsMapQ; for(int i = 0; i < 10; ++i) { std::cout << "; lk[" << i << "]: " << (int)myRec1Base.lk[i]; } std::cout << "\n"; break; case 2: // print type 2. 
std::cout << "\toffset: " << myRec2Base.offset << "; min_lk: " << (myRec2Base.min_depth >> 24) << "; read_depth: " << (myRec2Base.min_depth & 0xFFFFF) << "; rmsMapQ: " << (int)myRec2Base.rmsMapQ << "; lkHom1: " << (int)myRec2Base.lkHom1 << "; lkHom2: " << (int)myRec2Base.lkHom2 << "; lkHet: " << (int)myRec2Base.lkHet << "; indelLen1: " << myRec2Base.indelLen1 << "; indelLen2: " << myRec2Base.indelLen2 << "; myIndelSeq1: " << myIndelSeq1.c_str() << "; myIndelSeq2: " << myIndelSeq2.c_str() << "\n"; break; default: break; }; } bool GlfRecord::setRtypeRef(uint8_t rtypeRef) { myRecTypeRefBase = rtypeRef; return(true); } bool GlfRecord::setRecordType(uint8_t recType) { myRecTypeRefBase = (myRecTypeRefBase & REF_BASE_MASK) | (recType << REC_TYPE_SHIFT); return(true); } bool GlfRecord::setRefBaseInt(uint8_t refBase) { myRecTypeRefBase = (myRecTypeRefBase & REC_TYPE_MASK) | (refBase & REF_BASE_MASK); return(true); } // bool GlfRecord::setRefBaseChar(char refBase) // { // uint8_t refBaseInt = REF_BASE_CHAR_TO_INT[refBase]; // return(setRefBaseInt(refBaseInt)); // } bool GlfRecord::setOffset(uint32_t offset) { myRec1Base.offset = offset; myRec2Base.offset = offset; return(true); } bool GlfRecord::setMinDepth(uint32_t minDepth) { myRec1Base.min_depth = minDepth; myRec2Base.min_depth = minDepth; return(true); } bool GlfRecord::setMinLk(uint8_t minLk) { setMinDepth((myRec1Base.min_depth & READ_DEPTH_MASK) | (minLk << MIN_LK_SHIFT)); return(true); } bool GlfRecord::setReadDepth(uint32_t readDepth) { setMinDepth((myRec1Base.min_depth & MIN_LK_MASK) | (readDepth & READ_DEPTH_MASK)); return(true); } bool GlfRecord::setRmsMapQ(uint8_t rmsMapQ) { myRec1Base.rmsMapQ = rmsMapQ; myRec2Base.rmsMapQ = rmsMapQ; return(true); } // Accessors to get the gneric values. char GlfRecord::getRefBaseChar() const { int index = myRecTypeRefBase & REF_BASE_MASK; if((index > REF_BASE_MAX) || (index < 0)) { // TODO throw exception. 
return('N'); } return(REF_BASE_CHAR[index]); } uint32_t GlfRecord::getOffset() const { if(getRecordType() == 1) { return(myRec1Base.offset); } else if(getRecordType() == 2) { return(myRec2Base.offset); } throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getOffset for Record not of type 1 or 2.")); return(0); } uint32_t GlfRecord::getMinDepth() const { if(getRecordType() == 1) { return(myRec1Base.min_depth); } else if(getRecordType() == 2) { return(myRec2Base.min_depth); } throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getMinDepth for Record not of type 1 or 2.")); return(0); } uint8_t GlfRecord::getMinLk() const { if(getRecordType() == 1) { return(myRec1Base.min_depth >> MIN_LK_SHIFT); } else if(getRecordType() == 2) { return(myRec2Base.min_depth >> MIN_LK_SHIFT); } throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getMinLk for Record not of type 1 or 2.")); return(0); } uint32_t GlfRecord::getReadDepth() const { if(getRecordType() == 1) { return(myRec1Base.min_depth & READ_DEPTH_MASK); } else if(getRecordType() == 2) { return(myRec2Base.min_depth & READ_DEPTH_MASK); } throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getReadDepth for Record not of type 1 or 2.")); return(0); } uint8_t GlfRecord::getRmsMapQ() const { if(getRecordType() == 1) { return(myRec1Base.rmsMapQ); } else if(getRecordType() == 2) { return(myRec2Base.rmsMapQ); } throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getRmsMapQ for Record not of type 1 or 2.")); return(0); } // Accessors for getting record type 1 bool GlfRecord::setLk(int index, uint8_t value) { if((index < 0) || (index >= NUM_REC1_LIKELIHOOD)) { // Out of range. throw(GlfException(GlfStatus::UNKNOWN, "Trying to set Record Type 1 likelihood position< 0 or >= 10.")); return(false); } // In range. 
myRec1Base.lk[index] = value; return(true); } uint8_t GlfRecord::getLk(int index) { if(getRecordType() != 1) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getLk for Record not of type 1.")); return(0); } if((index < 0) || (index >= NUM_REC1_LIKELIHOOD)) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getLk for index < 0 or >= 10.")); return(0); } return(myRec1Base.lk[index]); } // Accessors for getting record type 2 bool GlfRecord::setLkHom1(uint8_t lk) { myRec2Base.lkHom1 = lk; return(true); } bool GlfRecord::setLkHom2(uint8_t lk) { myRec2Base.lkHom2 = lk; return(true); } bool GlfRecord::setLkHet(uint8_t lk) { myRec2Base.lkHet = lk; return(true); } bool GlfRecord::setInsertionIndel1(const std::string& indelSeq) { myRec2Base.indelLen1 = indelSeq.length(); myIndelSeq1 = indelSeq; return(true); } bool GlfRecord::setDeletionIndel1(const std::string& indelSeq) { myRec2Base.indelLen1 = 0 - (indelSeq.length()); myIndelSeq1 = indelSeq; return(true); } bool GlfRecord::setInsertionIndel2(const std::string& indelSeq) { myRec2Base.indelLen2 = indelSeq.length(); myIndelSeq2 = indelSeq; return(true); } bool GlfRecord::setDeletionIndel2(const std::string& indelSeq) { myRec2Base.indelLen2 = 0 - (indelSeq.length()); myIndelSeq2 = indelSeq; return(true); } uint8_t GlfRecord::getLkHom1() { if(getRecordType() != 2) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getLkHom1 for Record not of type 2.")); return(0); } return(myRec2Base.lkHom1); } uint8_t GlfRecord::getLkHom2() { if(getRecordType() != 2) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getLkHom2 for Record not of type 2.")); return(0); } return(myRec2Base.lkHom2); } uint8_t GlfRecord::getLkHet() { if(getRecordType() != 2) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getLkHet for Record not of type 2.")); return(0); } return(myRec2Base.lkHet); } int16_t GlfRecord::getIndel1(std::string& indelSeq) { if(getRecordType() != 2) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call 
getIndel1 for Record not of type 2.")); return(0); } indelSeq = myIndelSeq1.c_str(); return(myRec2Base.indelLen1); } int16_t GlfRecord::getIndel2(std::string& indelSeq) { if(getRecordType() != 2) { throw(GlfException(GlfStatus::UNKNOWN, "Tried to call getIndel2 for Record not of type 2.")); return(0); } indelSeq = myIndelSeq2.c_str(); return(myRec2Base.indelLen2); } void GlfRecord::readType1(IFILE filePtr) { // Read record type 1 information. int numRead = 0; numRead = ifread(filePtr, &myRec1Base, REC1_BASE_SIZE); if(numRead != REC1_BASE_SIZE) { String errorMsg = "Failed to read record of type 1 ("; errorMsg += REC1_BASE_SIZE; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } // Record type 1 is fixed size and has no additional variable length // fields, so done reading. } void GlfRecord::readType2(IFILE filePtr) { // Read record type 2 information. int numRead = 0; numRead = ifread(filePtr, &myRec2Base, REC2_BASE_SIZE); if(numRead != REC2_BASE_SIZE) { String errorMsg = "Failed to read record of type 2 base info ("; errorMsg += REC2_BASE_SIZE; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } // Record type 2 has 2 additional variable length fields. Read those // fields. int16_t len = abs(myRec2Base.indelLen1); numRead = myIndelSeq1.readFromFile(filePtr, len); if(numRead != len) { String errorMsg = "Failed to read record of type 2, 1st indel sequence ("; errorMsg += len; errorMsg += " bytes). 
Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } len = abs(myRec2Base.indelLen2); numRead = myIndelSeq2.readFromFile(filePtr, len); if(numRead != len) { String errorMsg = "Failed to read record of type 2, 2nd indel sequence ("; errorMsg += len; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } } void GlfRecord::writeRtypeRef(IFILE filePtr) const { int byteLen = sizeof(myRecTypeRefBase); int numWrite = ifwrite(filePtr, &myRecTypeRefBase, byteLen); if(numWrite != byteLen) { String errorMsg = "Failed to write the length of the record type and reference base ("; errorMsg += byteLen; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } } void GlfRecord::writeType1(IFILE filePtr) const { // Write the generic record field that all records have. writeRtypeRef(filePtr); // Record type 1 is fixed size and has no additional variable length // fields, so just write the base info. int numWrite = ifwrite(filePtr, &myRec1Base, REC1_BASE_SIZE); if(numWrite != REC1_BASE_SIZE) { // failed to write. String errorMsg = "Failed to write record of type 1 ("; errorMsg += REC1_BASE_SIZE; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } // Done writing the record. } void GlfRecord::writeType2(IFILE filePtr) const { // Write the generic record field that all records have. writeRtypeRef(filePtr); // Write the record type 2 base info. int numWrite = ifwrite(filePtr, &myRec2Base, REC2_BASE_SIZE); if(numWrite != REC2_BASE_SIZE) { // failed to write. 
String errorMsg = "Failed to write record of type 2 base info ("; errorMsg += REC2_BASE_SIZE; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } // Record type 2 has 2 additional variable length fields. Write those // fields. int len = myIndelSeq1.length(); numWrite = ifwrite(filePtr, myIndelSeq1.c_str(), len); if(numWrite != len) { // failed to write. String errorMsg = "Failed to write record of type 2, 1st indel sequence ("; errorMsg += len; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } len = myIndelSeq2.length(); numWrite = ifwrite(filePtr, myIndelSeq2.c_str(), len); if(numWrite != len) { // failed to write. String errorMsg = "Failed to write record of type 2, 2nd indel sequence ("; errorMsg += len; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); } // Done writing the record. } libStatGen-1.0.14/glf/GlfRecord.h000077500000000000000000000273361254730101300164330ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __GLF_RECORD_H__ #define __GLF_RECORD_H__ #include #include #include "InputFile.h" #include "CharBuffer.h" /// This class allows a user to easily get/set the fields in a GLF record. class GlfRecord { public: /// Constructor GlfRecord(); /// Destructor ~GlfRecord(); // // Copy Constructor // GlfRecord(const GlfRecord& record); // // Overload operator = to copy the passed in record into this record. // GlfRecord & operator = (const GlfRecord& record); // // Overload operator = to copy the passed in record into this record. // bool copy(const GlfRecord& record); /// Clear this record back to the default setting. void reset(); /// Read the record from the specified file (file MUST be in /// the correct position for reading a record). /// \param filePtr file to read from that is in the correct position. /// \return true if the record was successfully read from the file (even /// if it is an endMarker), false if it was not successfully read. bool read(IFILE filePtr); /// Write the record to the specified file. /// \param filePtr file to write to that is in the correct position. /// \return true if the record was successfully written to the /// file, false if not. bool write(IFILE filePtr) const; /// Print the reference section in a readable format. void print() const; /// @name Generic Accessors for Record Types 1 & 2 //@{ /// Set the record type and reference base. /// \param rtypeRef record type & reference base. Formatted as: /// record_type<<4|numeric_ref_base. /// \return true if the record type and reference base were successfully /// set, false if not. bool setRtypeRef(uint8_t rtypeRef); /// Set the record type. /// \param recType record type: 1 - simple likelihood record, /// 2 - indel likelihood record, 0 - end maker /// \return true if the record type was successfully set, false if not. bool setRecordType(uint8_t recType); /// Set the reference base from an integer value. /// \param refBase integer representation of the reference base. 
/// \anchor BaseCharacterIntMap /// /// /// ///
Int Value0123456789101112131415
Character BaseXACMGRSVTWYHKDBN
/// \return true if the reference base was successfully set, false if not. bool setRefBaseInt(uint8_t refBase); // TODO bool setRefBaseChar(char refBase); /// Set the offset from the precedent record. /// 0-based coordinate of the record minus the coordinate of the /// precedent record. For the first record in a reference sequence, /// the previous coordinate is 0. /// For insertions between x & x+1, the coordinate is x. /// For deletions between x & y, the coordinate is x. /// \param offset offset from the precedent record. /// \return true if successfully set, false if not. bool setOffset(uint32_t offset); /// Set the minimum likelihood and the read depth. /// \param minDepth minimum likelihood and read depth. Formatted as: /// min_lk<<24|read_dpeth. (min_lk capped at 255) /// \return true if successfully set, false if not. bool setMinDepth(uint32_t minDepth); /// Set the minimum likelihood. /// \param minLk minimum likelihood (capped at 255). /// \return true if successfully set, false if not. bool setMinLk(uint8_t minLk); /// Set the the read depth. /// \param readDepth read depth. /// \return true if successfully set, false if not. bool setReadDepth(uint32_t readDepth); /// Set the RMS of mapping qualities of reads covering the site. /// \param rmsMapQ RMS of mapping qualities /// \return true if successfully set, false if not. bool setRmsMapQ(uint8_t rmsMapQ); /// Return the record type. /// \return record type for this record: 0 - endMarker, /// 1 - simple likelihood, 2 - indel likelihood inline int getRecordType() const { return(myRecTypeRefBase >> REC_TYPE_SHIFT); } /// Return the reference base as an integer. /// \return integer representation of the reference base. /// See: \ref BaseCharacterIntMap inline int getRefBase() const { return(myRecTypeRefBase & REF_BASE_MASK); } /// Return the reference base as a character. /// \return character representation of the reference base. char getRefBaseChar() const; /// Return the offset from the precedent record. 
/// \return offset from the precedent record. uint32_t getOffset() const; /// Return the minimum likelihood and read depth. Formatted as: /// min_lk<<24|read_dpeth. (min_lk capped at 255) /// \return minimum likelihood and read depth uint32_t getMinDepth() const; /// Return the minimum likelihood /// \return minimum likelihood uint8_t getMinLk() const; /// Return the read depth. /// \return read depth uint32_t getReadDepth() const; /// Return the RMS of mapping qualities of reads covering the site. /// \return RMS of maping qualities. uint8_t getRmsMapQ() const; //@} /// @name Record Type 1 Accessors /// Record Type 1: Simple Likelihood Record //@{ //bool setType1(all fields for type 1); /// Set the likelihood for the specified genotype. /// Throws an exception if index is out of range. /// \param index index for the genotype for which the likelihood is /// being set. /// \anchor GenotypeIndexTable /// /// /// ///
Index0123456789
GenotypeAAACAGATCCCGCTGGGTTT
/// \param value likelihood for the genotype at the specified index. /// \return true if successfully set, false if not. bool setLk(int index, uint8_t value); //bool getType1(all fields for type 1); /// Get the likelihood for the specified genotype index. /// Throws an exception if index is out of range. /// \param index index of the genotype for which the likelihood should /// be returned. See: \ref GenotypeIndexTable /// \return likelihood of the specified index. uint8_t getLk(int index); //@} /// @name Record Type 2 Accessors /// Record Type2: Indel Likelihood Record //@{ // bool setType2(all fields for type 2); /// Set the likelihood of the first homozygous indel allele. /// \param lk likelihood of the 1st homozygous indel allele (capped at 255) /// \return true if successfully set, false if not. bool setLkHom1(uint8_t lk); /// Set the likelihood of the 2nd homozygous indel allele. /// \param lk likelihood of the 2nd homozygous indel allele (capped at 255) /// \return true if successfully set, false if not. bool setLkHom2(uint8_t lk); /// Set the likelihood of a heterozygote. /// \param lk likelihood of a heterozygote (capped at 255) /// \return true if successfully set, false if not. bool setLkHet(uint8_t lk); /// Set the sequence of the first indel allele if the /// first indel is an insertion. /// \param indelSeq sequence of the first indel allele (insertion). /// \return true if successfully set, false if not. bool setInsertionIndel1(const std::string& indelSeq); /// Set the sequence of the first indel allele if the /// first indel is an deletion. /// \param indelSeq sequence of the first indel allele (deletion). /// \return true if successfully set, false if not. bool setDeletionIndel1(const std::string& indelSeq); /// Set the sequence of the 2nd indel allele if the /// 2nd indel is an insertion. /// \param indelSeq sequence of the 2nd indel allele (insertion). /// \return true if successfully set, false if not. 
bool setInsertionIndel2(const std::string& indelSeq); /// Set the sequence of the 2nd indel allele if the /// 2nd indel is an deletion. /// \param indelSeq sequence of the 2nd indel allele (deletion). /// \return true if successfully set, false if not. bool setDeletionIndel2(const std::string& indelSeq); // bool setType2(all fields for type 2); /// Return the likelihood of the 1st homozygous indel allele. /// \return likelihood of the 1st homozygous indel allele. uint8_t getLkHom1(); /// Return the likelihood of the 2nd homozygous indel allele. /// \return likelihood of the 2nd homozygous indel allele. uint8_t getLkHom2(); /// Return the likelihood of a heterozygote. /// \return likelihood of a hetereozygote. uint8_t getLkHet(); /// Get the sequence and length (+:ins, -:del) of the 1st indel allele. /// \param indelSeq string to set with the sequence of the 1st indel allele /// \return length of the 1st indel allele /// (positive=insertion; negative=deletion; 0=no-indel) int16_t getIndel1(std::string& indelSeq); /// Get the sequence and length (+:ins, -:del) of the 2nd indel allele. /// \param indelSeq string to set with the sequence of the 2nd indel allele /// \return length of the 2nd indel allele /// (positive=insertion; negative=deletion; 0=no-indel) int16_t getIndel2(std::string& indelSeq); //@} private: // Read a record of record type 1. void readType1(IFILE filePtr); // Read a record of record type 2. void readType2(IFILE filePtr); // Write the rtyperef field. void writeRtypeRef(IFILE filePtr) const; // Write a record of record type 1. void writeType1(IFILE filePtr) const; // Write a record of record type 2. void writeType2(IFILE filePtr) const; // Contains record_type and ref_base. 
uint8_t myRecTypeRefBase; static const uint8_t REC_TYPE_SHIFT = 4; static const uint8_t REF_BASE_MASK = 0xF; static const uint8_t REC_TYPE_MASK = 0xF0; static const uint32_t MIN_LK_SHIFT = 24; static const uint32_t READ_DEPTH_MASK = 0xFFFFFF; static const uint32_t MIN_LK_MASK = 0xFF000000; static const char REF_BASE_MAX = 15; static std::string REF_BASE_CHAR; static const int NUM_REC1_LIKELIHOOD = 10; struct { uint32_t offset; uint32_t min_depth; uint8_t rmsMapQ; uint8_t lk[GlfRecord::NUM_REC1_LIKELIHOOD]; } myRec1Base; static const int REC1_BASE_SIZE = 19; struct { uint32_t offset; uint32_t min_depth; uint8_t rmsMapQ; uint8_t lkHom1; uint8_t lkHom2; uint8_t lkHet; int16_t indelLen1; int16_t indelLen2; } myRec2Base; // TODO rest of rec 2. CharBuffer myIndelSeq1; CharBuffer myIndelSeq2; static const int REC2_BASE_SIZE = 16; }; #endif libStatGen-1.0.14/glf/GlfRefSection.cpp000066400000000000000000000140061254730101300175740ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "GlfRefSection.h" #include "GlfException.h" #include "StringBasics.h" GlfRefSection::GlfRefSection() : myRefName() { resetRefSection(); } GlfRefSection::~GlfRefSection() { resetRefSection(); } // Copy Constructor GlfRefSection::GlfRefSection(const GlfRefSection& refSection) : myRefName() { copy(refSection); } // Overload operator = to copy the passed in refSection into this refSection. GlfRefSection & GlfRefSection::operator = (const GlfRefSection& refSection) { copy(refSection); return(*this); } bool GlfRefSection::copy(const GlfRefSection& refSection) { // Check to see if the passed in value is the same as this. if(this == &refSection) { return(true); } resetRefSection(); // Copy the refSection. myRefName = refSection.myRefName; myRefLen = refSection.myRefLen; return(true); } // Reset the refSection for a new entry, clearing out previous values. void GlfRefSection::resetRefSection() { myRefName.reset(); myRefLen = 0; } // Read the refSection from the specified file. Assumes the file is in // the correct position for reading the refSection. bool GlfRefSection::read(IFILE filePtr) { // Read the reference sequence name length int numRead = 0; int32_t refNameLen = 0; int byteLen = sizeof(int32_t); numRead = ifread(filePtr, &refNameLen, byteLen); if(numRead != byteLen) { // If no bytes were read and it is the end of the file, then return // false, but do not throw an exception. This is not an error, just // the end of the file. if((numRead == 0) && ifeof(filePtr)) { return(false); } String errorMsg = "Failed to read the length of the reference sequence name ("; errorMsg += byteLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Read the refSection from the file. 
numRead = myRefName.readFromFile(filePtr, refNameLen); if(numRead != refNameLen) { String errorMsg = "Failed to read the reference sequence name ("; errorMsg += refNameLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Read the ref length. byteLen = sizeof(uint32_t); numRead = ifread(filePtr, &myRefLen, byteLen); if(numRead != byteLen) { String errorMsg = "Failed to read the reference sequence length ("; errorMsg += byteLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Successfully read, return success. return(true); } // Write the refSection to the specified file. bool GlfRefSection::write(IFILE filePtr) const { int refNameLen = myRefName.length(); int byteLen = sizeof(int32_t); int numWrite = ifwrite(filePtr, &refNameLen, byteLen); if(numWrite != byteLen) { String errorMsg = "Failed to write the length of the reference sequence name ("; errorMsg += byteLen; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } numWrite = ifwrite(filePtr, myRefName.c_str(), refNameLen); if(numWrite != refNameLen) { String errorMsg = "Failed to write the reference sequence name ("; errorMsg += refNameLen; errorMsg += " bytes). 
Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Write the length of the reference sequence byteLen = sizeof(uint32_t); numWrite = ifwrite(filePtr, &myRefLen, byteLen); if(numWrite != byteLen) { String errorMsg = "Failed to write the reference sequence length ("; errorMsg += byteLen; errorMsg += " bytes). Only wrote "; errorMsg += numWrite; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Successfully wrote, return success. return(true); } bool GlfRefSection::getName(std::string& name) const { name = myRefName.c_str(); return(true); } uint32_t GlfRefSection::getRefLen() const { return(myRefLen); } bool GlfRefSection::setName(const std::string& name) { myRefName = name; return(true); } bool GlfRefSection::setRefLen(uint32_t refLen) { myRefLen = refLen; return(true); } void GlfRefSection::print() const { std::cout << "l_name: " << myRefName.length() << "; name: " << myRefName.c_str() << "; ref_len: " << myRefLen << "\n"; } libStatGen-1.0.14/glf/GlfRefSection.h000066400000000000000000000064371254730101300172520ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __GLF_REFSECTION_H__ #define __GLF_REFSECTION_H__ #include #include "InputFile.h" #include "CharBuffer.h" ///This class allows a user to easily get/set the fields in a /// GLF section/chromosome header. /// The GlfRefSection contains: /// - Reference Sequence Name /// - Reference Sequence Length class GlfRefSection { public: GlfRefSection(); ~GlfRefSection(); /// Copy Constructor /// \param refSection reference section to copy into this one. GlfRefSection(const GlfRefSection& refSection); /// Overload operator= to copy the passed in refSection into this one. /// \param refSection reference section to copy into this one. GlfRefSection & operator = (const GlfRefSection& refSection); /// Copy the passed in refSection into this refSection. /// \param refSection reference section to copy into this one. bool copy(const GlfRefSection& refSection); /// Clear this reference section back to the default setting. void resetRefSection(); /// Read the refSection from the specified file (file MUST be in /// the correct position for reading a refSection). /// \param filePtr file to read from that is in the correct position. /// \return true if the reference section was successfully read from the /// file, false if not. bool read(IFILE filePtr); /// Write the refSection to the specified file. /// \param filePtr file to write to that is in the correct position. /// \return true if the reference section was successfully written to the /// file, false if not. bool write(IFILE filePtr) const; ///////////// // Accessors. /// Get the reference name. /// \param name string to populate with the reference name. /// \return true if the name was successfully returned, false if not. bool getName(std::string& name) const; /// Get the length of the reference sequence. /// \return reference sequence length for this reference section. uint32_t getRefLen() const; /// Set the reference name. /// \param name reference name to set this section to. 
/// \return true if the name was successfully set, false if not. bool setName(const std::string& name); /// Set the length of the reference sequence. /// \param refLen reference sequence length to set this section to. /// \return true if the length was successfully set, false if not. bool setRefLen(uint32_t refLen); /// Print the reference section in a readable format. void print() const; private: CharBuffer myRefName; uint32_t myRefLen; }; #endif libStatGen-1.0.14/glf/GlfStatus.cpp000066400000000000000000000067511254730101300170260ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "GlfStatus.h" const char* GlfStatus::enumStatusString[] = { "SUCCESS", "UNKNOWN", "FAIL_IO", "FAIL_ORDER", "FAIL_PARSE", "INVALID", "FAIL_MEM" }; const char* GlfStatus::getStatusString(GlfStatus::Status statusEnum) { return(enumStatusString[statusEnum]); } // Returns whether or not it is "safe" to keep processing the file // after the specified status return. bool GlfStatus::isContinuableStatus(GlfStatus::Status status) { if(status == GlfStatus::SUCCESS || status == GlfStatus::FAIL_PARSE || status == GlfStatus::INVALID) { // The status is such that file processing can continue. 
return(true); } // UNKNOWN, FAIL_IO, FAIL_ORDER, FAIL_MEM return(false); } // Constructor GlfStatus::GlfStatus() { reset(); } // Destructor GlfStatus::~GlfStatus() { } // Resets this status. void GlfStatus::reset() { myType = UNKNOWN; myMessage = ""; } // Set the status with the specified values. void GlfStatus::setStatus(Status newStatus, const char* newMessage) { myType = newStatus; myMessage = getStatusString(newStatus); myMessage += ": "; myMessage += newMessage; } // Adds the specified error message to the status message. // Sets the status to newStatus if the current status is SUCCESS. void GlfStatus::addError(Status newStatus, const char* newMessage) { if(myType == GlfStatus::SUCCESS) { myType = newStatus; } else { myMessage += "\n"; } myMessage += getStatusString(newStatus); myMessage += ": "; myMessage += newMessage; } // Adds the specified status to the status message. // Sets the status to newStatus if the current status is SUCCESS. void GlfStatus::addError(GlfStatus newStatus) { if(myType == GlfStatus::SUCCESS) { myType = newStatus.myType; } else { myMessage += "\n"; } myMessage += newStatus.myMessage; } // Return the enum for this status. GlfStatus::Status GlfStatus::getStatus() const { return(myType); } // Return the status message. const char* GlfStatus::getStatusMessage() const { return(myMessage.c_str()); } // Overload operator = to set the glf status type to the // passed in status and to clear the message string. GlfStatus & GlfStatus::operator = (GlfStatus::Status newStatus) { reset(); myType = newStatus; return(*this); } // Overload operator != to determine if the passed in type is not equal // to this status's type. bool GlfStatus::operator != (const GlfStatus::Status& compStatus) const { return(compStatus != myType); } // Overload operator != to determine if the passed in type is equal // to this status's type. 
bool GlfStatus::operator == (const GlfStatus::Status& compStatus) const { return(compStatus == myType); } libStatGen-1.0.14/glf/GlfStatus.h000066400000000000000000000107461254730101300164720ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __GLF_STATUS_H__ #define __GLF_STATUS_H__ #include /// This class is used to track the status results of some methods in the /// GLF classes using the status enum that is defined in this class to /// describe the return value of a method. class GlfStatus { public: /// Return value enum for the GlfFile class methods. enum Status { SUCCESS = 0, ///< method completed successfully. UNKNOWN, ///< unknown result (default value should never be used) FAIL_IO, ///< method failed due to an I/O issue. FAIL_ORDER, ///< method failed because it was called out of order, ///< like trying to read a file without opening it for ///< read or trying to read a record before the header. FAIL_PARSE, ///< failed to parse a record/header - invalid format. INVALID, ///< invalid. FAIL_MEM ///< fail a memory allocation. }; /// Returns the string representation of the specified enum. 
/// \param statusEnum enum to convert to a string /// \return string representation of the enum static const char* getStatusString(GlfStatus::Status statusEnum); /// Returns whether or not it is "safe" to keep processing the file /// after the specified status return. /// \param status enum to check if it is "safe" to continue processing. /// \return whether or not it is "safe" to keep processing the file /// after receiving the specified enum. static bool isContinuableStatus(GlfStatus::Status status); /// Constructor GlfStatus(); /// Destructor ~GlfStatus(); /// Resets this status. void reset(); /// Set the status with the specified values. /// \param newStatus new status to set this object to. /// \param newMessage message associated with the new status void setStatus(Status newStatus, const char* newMessage); /// Adds the specified error message to the status message, setting /// the status to newStatus if the current status is SUCCESS. /// \param newStatus status to add to this object. /// \param newMessage message to add to this object void addError(Status newStatus, const char* newMessage); /// Adds the specified status to the status message, setting /// the status to newStatus if the current status is SUCCESS. /// \param newStatus status to add to this object. void addError(GlfStatus newStatus); /// Return the enum for this status. /// \return enum for this status object. Status getStatus() const; /// Return the status message. /// \return status message associate with this status object. const char* getStatusMessage() const; /// Overload operator = to set the glf status type to the /// passed in status and to clear the message string. /// \param newStatus new status to set this object to. /// \return this object. GlfStatus & operator = (Status newStatus); // Overload operator = to set the glf status. // GlfStatus & operator = (GlfStatus newStatus); /// Overload operator != to determine if the passed in type is not equal /// to this status's type. 
/// \param compStatus status enum to compare this status object to. /// \return true if they are not equal, false if they are. bool operator != (const GlfStatus::Status& compStatus) const; /// Overload operator != to determine if the passed in type is equal /// to this status's type. /// \param compStatus status enum to compare this status object to. /// \return true if they are equal, false if they are not. bool operator == (const GlfStatus::Status& compStatus) const; private: static const char* enumStatusString[]; Status myType; std::string myMessage; }; #endif libStatGen-1.0.14/glf/Makefile000066400000000000000000000001561254730101300160370ustar00rootroot00000000000000TOOLBASE = GlfException GlfFile GlfHeader GlfRecord GlfRefSection GlfStatus include ../Makefiles/Makefile.liblibStatGen-1.0.14/glf/Makefile.depends000066400000000000000000000064771254730101300174740ustar00rootroot00000000000000# DO NOT DELETE $(OBJDIR_OPT)/GlfException.o: GlfException.h GlfStatus.h $(OBJDIR_OPT)/GlfFile.o: GlfFile.h ../include/InputFile.h $(OBJDIR_OPT)/GlfFile.o: ../include/FileType.h GlfHeader.h $(OBJDIR_OPT)/GlfFile.o: ../include/CharBuffer.h GlfRefSection.h GlfRecord.h $(OBJDIR_OPT)/GlfFile.o: GlfStatus.h GlfException.h $(OBJDIR_OPT)/GlfHeader.o: GlfHeader.h ../include/InputFile.h $(OBJDIR_OPT)/GlfHeader.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_OPT)/GlfHeader.o: GlfStatus.h GlfException.h $(OBJDIR_OPT)/GlfHeader.o: ../include/StringBasics.h $(OBJDIR_OPT)/GlfRecord.o: GlfRecord.h ../include/InputFile.h $(OBJDIR_OPT)/GlfRecord.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_OPT)/GlfRecord.o: GlfException.h GlfStatus.h $(OBJDIR_OPT)/GlfRecord.o: ../include/StringBasics.h $(OBJDIR_OPT)/GlfRefSection.o: GlfRefSection.h ../include/InputFile.h $(OBJDIR_OPT)/GlfRefSection.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_OPT)/GlfRefSection.o: GlfException.h GlfStatus.h $(OBJDIR_OPT)/GlfRefSection.o: ../include/StringBasics.h $(OBJDIR_OPT)/GlfStatus.o: 
GlfStatus.h $(OBJDIR_DEBUG)/GlfException.o: GlfException.h GlfStatus.h $(OBJDIR_DEBUG)/GlfFile.o: GlfFile.h ../include/InputFile.h $(OBJDIR_DEBUG)/GlfFile.o: ../include/FileType.h GlfHeader.h $(OBJDIR_DEBUG)/GlfFile.o: ../include/CharBuffer.h GlfRefSection.h $(OBJDIR_DEBUG)/GlfFile.o: GlfRecord.h GlfStatus.h GlfException.h $(OBJDIR_DEBUG)/GlfHeader.o: GlfHeader.h ../include/InputFile.h $(OBJDIR_DEBUG)/GlfHeader.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_DEBUG)/GlfHeader.o: GlfStatus.h GlfException.h $(OBJDIR_DEBUG)/GlfHeader.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/GlfRecord.o: GlfRecord.h ../include/InputFile.h $(OBJDIR_DEBUG)/GlfRecord.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_DEBUG)/GlfRecord.o: GlfException.h GlfStatus.h $(OBJDIR_DEBUG)/GlfRecord.o: ../include/StringBasics.h $(OBJDIR_DEBUG)/GlfRefSection.o: GlfRefSection.h ../include/InputFile.h $(OBJDIR_DEBUG)/GlfRefSection.o: ../include/FileType.h $(OBJDIR_DEBUG)/GlfRefSection.o: ../include/CharBuffer.h GlfException.h $(OBJDIR_DEBUG)/GlfRefSection.o: GlfStatus.h ../include/StringBasics.h $(OBJDIR_DEBUG)/GlfStatus.o: GlfStatus.h $(OBJDIR_PROFILE)/GlfException.o: GlfException.h GlfStatus.h $(OBJDIR_PROFILE)/GlfFile.o: GlfFile.h ../include/InputFile.h $(OBJDIR_PROFILE)/GlfFile.o: ../include/FileType.h GlfHeader.h $(OBJDIR_PROFILE)/GlfFile.o: ../include/CharBuffer.h GlfRefSection.h $(OBJDIR_PROFILE)/GlfFile.o: GlfRecord.h GlfStatus.h GlfException.h $(OBJDIR_PROFILE)/GlfHeader.o: GlfHeader.h ../include/InputFile.h $(OBJDIR_PROFILE)/GlfHeader.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_PROFILE)/GlfHeader.o: GlfStatus.h GlfException.h $(OBJDIR_PROFILE)/GlfHeader.o: ../include/StringBasics.h $(OBJDIR_PROFILE)/GlfRecord.o: GlfRecord.h ../include/InputFile.h $(OBJDIR_PROFILE)/GlfRecord.o: ../include/FileType.h ../include/CharBuffer.h $(OBJDIR_PROFILE)/GlfRecord.o: GlfException.h GlfStatus.h $(OBJDIR_PROFILE)/GlfRecord.o: ../include/StringBasics.h 
$(OBJDIR_PROFILE)/GlfRefSection.o: GlfRefSection.h ../include/InputFile.h $(OBJDIR_PROFILE)/GlfRefSection.o: ../include/FileType.h $(OBJDIR_PROFILE)/GlfRefSection.o: ../include/CharBuffer.h GlfException.h $(OBJDIR_PROFILE)/GlfRefSection.o: GlfStatus.h ../include/StringBasics.h $(OBJDIR_PROFILE)/GlfStatus.o: GlfStatus.h libStatGen-1.0.14/glf/test/000077500000000000000000000000001254730101300153545ustar00rootroot00000000000000libStatGen-1.0.14/glf/test/.gitignore000066400000000000000000000000071254730101300173410ustar00rootroot00000000000000glfTestlibStatGen-1.0.14/glf/test/Main.cpp000066400000000000000000000015701254730101300167470ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "ReadFiles.h" #include "WriteFiles.h" //#include "SamFileTest.h" int main(int argc, char ** argv) { testReadGlf(); testWrite(); } libStatGen-1.0.14/glf/test/Makefile000066400000000000000000000002131254730101300170100ustar00rootroot00000000000000EXE = glfTest SRCONLY = Main.cpp TOOLBASE = ReadFiles Validate WriteFiles TEST_COMMAND = ./glfTest include ../../Makefiles/Makefile.testlibStatGen-1.0.14/glf/test/ReadFiles.cpp000066400000000000000000000027031254730101300177200ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "ReadFiles.h" #include "Validate.h" #include "GlfException.h" #include void testReadGlf() { GlfFile inGlf; assert(inGlf.openForRead("testFiles/testGlf.glf")); // Read the GLF Header. GlfHeader glfHeader; assert(inGlf.readHeader(glfHeader)); validateHeader(glfHeader); // TODO, validate the rest of the file. // GlfRecord glfRecord; // assert(inGlf.ReadRecord(glfHeader, glfRecord) == true); // validateRead1(glfRecord); // Try opening a file that doesn't exist. 
bool exceptionCaught = false; try { inGlf.openForRead("testFiles/unknown"); } catch(GlfException e) { exceptionCaught = true; } assert(exceptionCaught); } libStatGen-1.0.14/glf/test/ReadFiles.h000066400000000000000000000013671254730101300173720ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ void testReadGlf(); libStatGen-1.0.14/glf/test/Validate.cpp000066400000000000000000000024331254730101300176130ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "Validate.h" #include //const std::string GLF_HEADER_TEXT = ""; void validateRead1(GlfRecord& glfRecord) { ////////////////////////////////////////// // Validate Record 1 // Create record structure for validating. 
} void validateHeader(GlfHeader& glfHeader) { //////////////////////////////////////////////////////// // Get the text from the header and verify it is the expected value. std::string textString = "DUMMY"; assert(glfHeader.getHeaderTextString(textString)); assert(textString == GLF_HEADER_TEXT); } libStatGen-1.0.14/glf/test/Validate.h000066400000000000000000000015701254730101300172610ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "GlfFile.h" void validateRead1(GlfRecord& glfRecord); void validateHeader(GlfHeader& glfHeader); const std::string GLF_HEADER_TEXT = ""; libStatGen-1.0.14/glf/test/WriteFiles.cpp000066400000000000000000000404201254730101300201350ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "WriteFiles.h" #include "Validate.h" #include void testWrite() { TestWrite writeTest; writeTest.testWrite(); } const std::string TestWrite::HEADER_TEXT1 = "This is my 1st test header."; const std::string TestWrite::SEC1_REFNAME = "This is my 1st RefName"; const std::string TestWrite::SEC1REC2_INDELSEQ1 = "AC"; const std::string TestWrite::SEC1REC2_INDELSEQ2 = "TCA"; const std::string TestWrite::SEC2_REFNAME = "This is my 2nd RefName"; const std::string TestWrite::HEADER_TEXT2 = "This is my 2nd test header."; const std::string TestWrite::HEADER_TEXT3 = "This is my 3rd test header."; void TestWrite::testWrite() { GlfFile glfOut; std::string testFile = "results/MyTestOut1.glf"; assert(glfOut.openForWrite(testFile.c_str(), false)); // Create a glf header. GlfHeader glfHeader; GlfRefSection glfSection; GlfRecord record; // Test writing refsection with no header - exception bool caughtException = false; try { assert(glfOut.writeRefSection(glfSection) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Test writing record with no header - exception. caughtException = false; try { assert(glfOut.writeRecord(record) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Write the header. writeHeader(glfOut, 1); // Test writing record with no refsection - exception. 
caughtException = false; try { assert(glfOut.writeRecord(record) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); ////////////////////////////////////////////// writeRefSection1(glfOut); // Test writing header after refSection - exception caughtException = false; try { assert(glfOut.writeHeader(glfHeader) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); writeSec1Record1(glfOut); // Test writing header after record - exception caughtException = false; try { assert(glfOut.writeHeader(glfHeader) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); writeSec1Record2(glfOut); writeEndMarker(glfOut); writeRefSection2(glfOut); writeSec2Record1(glfOut); writeEndMarker(glfOut); //////////////////// // Close the file. glfOut.close(); ////////////////////////////////////////////// // Validate the just written file. GlfFile glfIn; assert(glfIn.openForRead(testFile.c_str())); readHeader(glfIn, 1); readRefSection1(glfIn); readSec1Record1(glfIn); readSec1Record2(glfIn); readEndMarker(glfIn); readRefSection2(glfIn); readSec2Record1(glfIn); readEndMarker(glfIn); checkEOF(glfIn); //////////////////////////////// // NEW FILE testFile = "results/MyTestOut2.glf"; assert(glfOut.openForWrite(testFile.c_str())); writeHeader(glfOut, 2); writeRefSection1(glfOut); writeSec1Record1(glfOut); writeSec1Record2(glfOut); // Test writing new section without end of section marker - auto-added. writeRefSection2(glfOut); writeSec2Record1(glfOut); // Test closing file with no end of section marker - auto-added. glfOut.close(); ////////////////////////////////////////////// // Validate the just written file. 
assert(glfIn.openForRead(testFile.c_str())); readHeader(glfIn, 2); readRefSection1(glfIn); readSec1Record1(glfIn); readSec1Record2(glfIn); readEndMarker(glfIn); readRefSection2(glfIn); readSec2Record1(glfIn); readEndMarker(glfIn); checkEOF(glfIn); //////////////////////////////// // NEW FILE testFile = "results/MyTestOut3.glf"; { GlfFile glfOutScoped; assert(glfOutScoped.openForWrite(testFile.c_str())); writeHeader(glfOutScoped, 3); writeRefSection1(glfOutScoped); writeSec1Record1(glfOutScoped); writeSec1Record2(glfOutScoped); // Test writing new section without end of section marker - auto-added. writeRefSection2(glfOutScoped); writeSec2Record1(glfOutScoped); // Test just letting the file go out of scope with no end // of section marker - auto added. } ////////////////////////////////////////////// // Validate the just written file. assert(glfIn.openForRead(testFile.c_str())); // Test reading refsection with no header - exception. caughtException = false; try { assert(glfIn.getNextRefSection(glfSection) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Test reading record with no header - exception. caughtException = false; try { assert(glfIn.getNextRecord(record) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); readHeader(glfIn, 3); // Test reading record with no reference section - exception. 
caughtException = false; try { assert(glfIn.getNextRecord(record) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Test reading header after already read - exception caughtException = false; try { assert(glfIn.readHeader(glfHeader) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); readRefSection1(glfIn); readSec1Record1(glfIn); readSec1Record2(glfIn); readEndMarker(glfIn); readRefSection2(glfIn); readSec2Record1(glfIn); readEndMarker(glfIn); checkEOF(glfIn); // Read again, but text reading next refsection before //end of current section - consumes the rest of the records. assert(glfIn.openForRead(testFile.c_str())); readHeader(glfIn, 3); readRefSection1(glfIn); readRefSection2(glfIn); readSec2Record1(glfIn); readEndMarker(glfIn); checkEOF(glfIn); } void TestWrite::writeHeader(GlfFile& glfOut, int headerNum) { GlfHeader glfHeader; std::string headerString = "t"; std::string expectedHeader = ""; if(headerNum == 1) { expectedHeader = HEADER_TEXT1; } else if(headerNum == 2) { expectedHeader = HEADER_TEXT2; } else if(headerNum == 3) { expectedHeader = HEADER_TEXT3; } assert(glfHeader.getHeaderTextString(headerString)); assert(headerString == ""); assert(glfHeader.setHeaderTextString(expectedHeader)); assert(glfHeader.getHeaderTextString(headerString)); assert(headerString == expectedHeader); assert(glfOut.writeHeader(glfHeader)); } void TestWrite::writeRefSection1(GlfFile& glfOut) { GlfRefSection glfSection; //////////////////////////////// // Write the reference section. std::string refNameString = ""; // Check the default settings (no data has been set yet). assert(glfSection.getName(refNameString)); assert(refNameString == ""); assert(glfSection.getRefLen() == 0); // Set the reference name. assert(glfSection.setName(SEC1_REFNAME)); // Check properly set. 
assert(glfSection.getName(refNameString)); assert(refNameString == SEC1_REFNAME); assert(glfSection.getRefLen() == 0); // Set the reference sequence length. assert(glfSection.setRefLen(SEC1_REFLEN)); // Check properly set. assert(glfSection.getRefLen() == SEC1_REFLEN); assert(glfSection.getName(refNameString)); assert(refNameString == SEC1_REFNAME); // Write the reference section assert(glfOut.writeRefSection(glfSection)); } void TestWrite::writeSec1Record1(GlfFile& glfOut) { GlfRecord record; assert(record.setRecordType(SEC1REC1_RECTYPE)); assert(record.setRefBaseInt(SEC1REC1_REFBASE)); assert(record.setOffset(SEC1REC1_OFFSET)); assert(record.setMinLk(SEC1REC1_MINLK)); assert(record.setReadDepth(SEC1REC1_READDEPTH)); assert(record.setRmsMapQ(SEC1REC1_RMSMAPQ)); assert(glfOut.writeRecord(record)); // Verify the settings of record 1. assert(record.getRecordType() == SEC1REC1_RECTYPE); assert(record.getRefBase() == SEC1REC1_REFBASE); assert(record.getOffset() == SEC1REC1_OFFSET); assert(record.getMinLk() == SEC1REC1_MINLK); assert(record.getReadDepth() == SEC1REC1_READDEPTH); assert(record.getRmsMapQ() == SEC1REC1_RMSMAPQ); } void TestWrite::writeSec1Record2(GlfFile& glfOut) { ////////////////////////////////////////////// // Write a record of type 2. GlfRecord record; assert(record.setRecordType(SEC1REC2_RECTYPE)); assert(record.setRefBaseInt(SEC1REC2_REFBASE)); assert(record.setOffset(SEC1REC2_OFFSET)); assert(record.setMinLk(SEC1REC2_MINLK)); assert(record.setReadDepth(SEC1REC2_READDEPTH)); assert(record.setRmsMapQ(SEC1REC2_RMSMAPQ)); assert(record.setLkHom1(SEC1REC2_LKHOM1)); assert(record.setLkHom2(SEC1REC2_LKHOM2)); assert(record.setLkHet(SEC1REC2_LKHET)); assert(record.setInsertionIndel1(SEC1REC2_INDELSEQ1)); assert(record.setDeletionIndel2(SEC1REC2_INDELSEQ2)); assert(glfOut.writeRecord(record)); // Verify the settings of record 2. 
std::string indelSeq = ""; assert(record.getRecordType() == SEC1REC2_RECTYPE); assert(record.getRefBase() == SEC1REC2_REFBASE); assert(record.getOffset() == SEC1REC2_OFFSET); assert(record.getMinLk() == SEC1REC2_MINLK); assert(record.getReadDepth() == SEC1REC2_READDEPTH); assert(record.getRmsMapQ() == SEC1REC2_RMSMAPQ); assert(record.getLkHom1() == SEC1REC2_LKHOM1); assert(record.getLkHom2() == SEC1REC2_LKHOM2); assert(record.getLkHet() == SEC1REC2_LKHET); assert(record.getIndel1(indelSeq) == SEC1REC2_INDELLEN1); assert(indelSeq == SEC1REC2_INDELSEQ1); assert(record.getIndel2(indelSeq) == SEC1REC2_INDELLEN2); assert(indelSeq == SEC1REC2_INDELSEQ2); } void TestWrite::writeEndMarker(GlfFile& glfOut) { ////////////////////////////////////////////// // Write a record of type 0. GlfRecord record; assert(glfOut.writeRecord(record)); // Verify the settings of the types. assert(record.getRecordType() == 0); assert(record.getRefBase() == 0); } void TestWrite::writeRefSection2(GlfFile& glfOut) { GlfRefSection glfSection; //////////////////////////////// // Write the reference section. std::string refNameString = ""; // Check the default settings (no data has been set yet). assert(glfSection.getName(refNameString)); assert(refNameString == ""); assert(glfSection.getRefLen() == 0); // Set the reference name. assert(glfSection.setName(SEC2_REFNAME)); // Check properly set. assert(glfSection.getName(refNameString)); assert(refNameString == SEC2_REFNAME); assert(glfSection.getRefLen() == 0); // Set the reference sequence length. assert(glfSection.setRefLen(SEC2_REFLEN)); // Check properly set. 
assert(glfSection.getRefLen() == SEC2_REFLEN); assert(glfSection.getName(refNameString)); assert(refNameString == SEC2_REFNAME); // Write the reference section assert(glfOut.writeRefSection(glfSection)); } void TestWrite::writeSec2Record1(GlfFile& glfOut) { GlfRecord record; assert(record.setRecordType(SEC2REC1_RECTYPE)); assert(record.setRefBaseInt(SEC2REC1_REFBASE)); assert(record.setOffset(SEC2REC1_OFFSET)); assert(record.setMinLk(SEC2REC1_MINLK)); assert(record.setReadDepth(SEC2REC1_READDEPTH)); assert(record.setRmsMapQ(SEC2REC1_RMSMAPQ)); assert(glfOut.writeRecord(record)); // Verify the settings of record 1. assert(record.getRecordType() == SEC2REC1_RECTYPE); assert(record.getRefBase() == SEC2REC1_REFBASE); assert(record.getOffset() == SEC2REC1_OFFSET); assert(record.getMinLk() == SEC2REC1_MINLK); assert(record.getReadDepth() == SEC2REC1_READDEPTH); assert(record.getRmsMapQ() == SEC2REC1_RMSMAPQ); } void TestWrite::readHeader(GlfFile& glfIn, int headerNum) { GlfHeader glfHeader; std::string expectedHeader = ""; std::string headerString; if(headerNum == 1) { expectedHeader = HEADER_TEXT1; } else if(headerNum == 2) { expectedHeader = HEADER_TEXT2; } else if(headerNum == 3) { expectedHeader = HEADER_TEXT3; } // Check the header string. assert(glfIn.readHeader(glfHeader)); assert(glfHeader.getHeaderTextString(headerString)); assert(headerString == expectedHeader); } void TestWrite::readRefSection1(GlfFile& glfIn) { GlfRefSection glfSection; std::string refNameString; // Check the reference section. assert(glfIn.getNextRefSection(glfSection)); assert(glfSection.getName(refNameString)); assert(refNameString == SEC1_REFNAME); assert(glfSection.getRefLen() == SEC1_REFLEN); } void TestWrite::readSec1Record1(GlfFile& glfIn) { GlfRecord record; // Check the record of type 1. 
assert(glfIn.getNextRecord(record)); assert(record.getRecordType() == SEC1REC1_RECTYPE); assert(record.getRefBase() == SEC1REC1_REFBASE); assert(record.getOffset() == SEC1REC1_OFFSET); assert(record.getMinLk() == SEC1REC1_MINLK); assert(record.getReadDepth() == SEC1REC1_READDEPTH); assert(record.getRmsMapQ() == SEC1REC1_RMSMAPQ); } void TestWrite::readSec1Record2(GlfFile& glfIn) { GlfRecord record; std::string indelSeq; //Check the record of type 2. assert(glfIn.getNextRecord(record)); assert(record.getRecordType() == SEC1REC2_RECTYPE); assert(record.getRefBase() == SEC1REC2_REFBASE); assert(record.getOffset() == SEC1REC2_OFFSET); assert(record.getMinLk() == SEC1REC2_MINLK); assert(record.getReadDepth() == SEC1REC2_READDEPTH); assert(record.getRmsMapQ() == SEC1REC2_RMSMAPQ); assert(record.getLkHom1() == SEC1REC2_LKHOM1); assert(record.getLkHom2() == SEC1REC2_LKHOM2); assert(record.getLkHet() == SEC1REC2_LKHET); assert(record.getIndel1(indelSeq) == SEC1REC2_INDELLEN1); assert(indelSeq == SEC1REC2_INDELSEQ1); assert(record.getIndel2(indelSeq) == SEC1REC2_INDELLEN2); assert(indelSeq == SEC1REC2_INDELSEQ2); } void TestWrite::readEndMarker(GlfFile& glfIn) { GlfRecord record; // Check the record of type 0. // False, since there are no more records in this section. assert(glfIn.getNextRecord(record) == false); assert(record.getRecordType() == 0); assert(record.getRefBase() == 0); } void TestWrite::readRefSection2(GlfFile& glfIn) { GlfRefSection glfSection; std::string refNameString; // Check the reference section. assert(glfIn.getNextRefSection(glfSection)); assert(glfSection.getName(refNameString)); assert(refNameString == SEC2_REFNAME); assert(glfSection.getRefLen() == SEC2_REFLEN); } void TestWrite::readSec2Record1(GlfFile& glfIn) { GlfRecord record; // Check the record of type 1. 
assert(glfIn.getNextRecord(record)); assert(record.getRecordType() == SEC2REC1_RECTYPE); assert(record.getRefBase() == SEC2REC1_REFBASE); assert(record.getOffset() == SEC2REC1_OFFSET); assert(record.getMinLk() == SEC2REC1_MINLK); assert(record.getReadDepth() == SEC2REC1_READDEPTH); assert(record.getRmsMapQ() == SEC2REC1_RMSMAPQ); } void TestWrite::checkEOF(GlfFile& glfIn) { GlfHeader glfHeader; GlfRefSection glfSection; GlfRecord record; // Check end of file - no more refsections assert(glfIn.getNextRefSection(glfSection) == false); assert(glfIn.isEOF()); } libStatGen-1.0.14/glf/test/WriteFiles.h000066400000000000000000000064421254730101300176100ustar00rootroot00000000000000/* * Copyright (C) 2010 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __WRITE_FILES_H__ #define __WRITE_FILES_H__ #include #include "GlfFile.h" void testWrite(); void testHeaderWrite(); void testWriteCopiedHeader(); class TestWrite { public: void testWrite(); private: void writeHeader(GlfFile& glfOut, int headerNum); void writeRefSection1(GlfFile& glfOut); void writeSec1Record1(GlfFile& glfOut); void writeSec1Record2(GlfFile& glfOut); void writeEndMarker(GlfFile& glfOut); void writeRefSection2(GlfFile& glfOut); void writeSec2Record1(GlfFile& glfOut); void readHeader(GlfFile& glfIn, int headerNum); void readRefSection1(GlfFile& glfIn); void readSec1Record1(GlfFile& glfIn); void readSec1Record2(GlfFile& glfIn); void readEndMarker(GlfFile& glfIn); void readRefSection2(GlfFile& glfIn); void readSec2Record1(GlfFile& glfIn); void checkEOF(GlfFile& glfIn); // 1st file header values: static const std::string HEADER_TEXT1; // SEC1 values: static const std::string SEC1_REFNAME; static const uint32_t SEC1_REFLEN = 200; // SEC1REC1 values: static const uint8_t SEC1REC1_RECTYPE = 1; static const uint8_t SEC1REC1_REFBASE = 4; static const uint32_t SEC1REC1_OFFSET = 99; static const uint32_t SEC1REC1_MINLK = 55; static const uint32_t SEC1REC1_READDEPTH = 31; static const uint8_t SEC1REC1_RMSMAPQ = 25; // SEC1REC2 values: static const uint8_t SEC1REC2_RECTYPE = 2; static const uint8_t SEC1REC2_REFBASE = 1; static const uint32_t SEC1REC2_OFFSET = 6; static const uint32_t SEC1REC2_MINLK = 44; static const uint32_t SEC1REC2_READDEPTH = 66; static const uint8_t SEC1REC2_RMSMAPQ = 32; static const uint8_t SEC1REC2_LKHOM1 = 98; static const uint8_t SEC1REC2_LKHOM2 = 86; static const uint8_t SEC1REC2_LKHET = 73; static const int16_t SEC1REC2_INDELLEN1 = 2; static const int16_t SEC1REC2_INDELLEN2 = -3; static const std::string SEC1REC2_INDELSEQ1; static const std::string SEC1REC2_INDELSEQ2; // SEC2 values static const std::string SEC2_REFNAME; static const uint32_t SEC2_REFLEN = 102; // SEC2REC1 values: static const uint8_t SEC2REC1_RECTYPE = 1; 
static const uint8_t SEC2REC1_REFBASE = 2; static const uint32_t SEC2REC1_OFFSET = 50; static const uint32_t SEC2REC1_MINLK = 55; static const uint32_t SEC2REC1_READDEPTH = 31; static const uint8_t SEC2REC1_RMSMAPQ = 25; // 2nd file header. static const std::string HEADER_TEXT2; // 3rd file header. static const std::string HEADER_TEXT3; }; #endif libStatGen-1.0.14/glf/test/results/000077500000000000000000000000001254730101300170555ustar00rootroot00000000000000libStatGen-1.0.14/glf/test/results/.gitignore000066400000000000000000000000121254730101300210360ustar00rootroot00000000000000MyTestOut*libStatGen-1.0.14/glf/test/testFiles/000077500000000000000000000000001254730101300173165ustar00rootroot00000000000000libStatGen-1.0.14/glf/test/testFiles/testGlf.glf000066400000000000000000000001141254730101300214140ustar00rootroot00000000000000sqcf& 6dHcd`2ϐBXDc"ƇE ,bnb zlibStatGen-1.0.14/include/000077500000000000000000000000001254730101300152505ustar00rootroot00000000000000libStatGen-1.0.14/include/.gitignore000066400000000000000000000000031254730101300172310ustar00rootroot00000000000000*.hlibStatGen-1.0.14/samtools/000077500000000000000000000000001254730101300154665ustar00rootroot00000000000000libStatGen-1.0.14/samtools/COPYING000066400000000000000000000020751254730101300165250ustar00rootroot00000000000000The MIT License Copyright (c) 2008-2009 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.libStatGen-1.0.14/samtools/Makefile000066400000000000000000000001231254730101300171220ustar00rootroot00000000000000TOOLBASE = bgzf knetfile HDRONLY = khash.h bam.h include ../Makefiles/Makefile.liblibStatGen-1.0.14/samtools/Makefile.depends000066400000000000000000000000201254730101300205370ustar00rootroot00000000000000# DO NOT DELETE libStatGen-1.0.14/samtools/README.txt000066400000000000000000000001031254730101300171560ustar00rootroot00000000000000These files are based on samtools version 981. (retrieved 7/26/11) libStatGen-1.0.14/samtools/bam.h000066400000000000000000000043001254730101300163730ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008-2010 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Contact: Heng Li */ #ifndef BAM_BAM_H #define BAM_BAM_H /*! @header BAM library provides I/O and various operations on manipulating files in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ #include /* * Only small section is pulled out that we use. * 4/29/2011 - Mary Kate Trost */ /*! @abstract Calculate the minimum bin that contains a region [beg,end). @param beg start of the region, 0-based @param end end of the region, 0-based @return bin */ static inline int bam_reg2bin(uint32_t beg, uint32_t end) { --end; if (beg>>14 == end>>14) return 4681 + (beg>>14); if (beg>>17 == end>>17) return 585 + (beg>>17); if (beg>>20 == end>>20) return 73 + (beg>>20); if (beg>>23 == end>>23) return 9 + (beg>>23); if (beg>>26 == end>>26) return 1 + (beg>>26); return 0; } #endif libStatGen-1.0.14/samtools/bgzf.c000066400000000000000000000435341254730101300165730ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all 
copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifdef __ZLIB_AVAILABLE__ #include #include #include #include #include #include #include "bgzf.h" #ifdef _USE_KNETFILE #include "knetfile.h" typedef knetFile *_bgzf_file_t; #define _bgzf_open(fn, mode) knet_open(fn, mode) #define _bgzf_dopen(fp, mode) knet_dopen(fp, mode) #define _bgzf_close(fp) knet_close(fp) #define _bgzf_fileno(fp) ((fp)->fd) #define _bgzf_tell(fp) knet_tell(fp) #define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence) #define _bgzf_read(fp, buf, len) knet_read(fp, buf, len) #define _bgzf_write(fp, buf, len) knet_write(fp, buf, len) #else // ~defined(_USE_KNETFILE) #if defined(_WIN32) || defined(_MSC_VER) #define ftello(fp) ftell(fp) #define fseeko(fp, offset, whence) fseek(fp, offset, whence) #else // ~defined(_WIN32) extern off_t ftello(FILE *stream); extern int fseeko(FILE *stream, off_t offset, int whence); #endif // ~defined(_WIN32) typedef FILE *_bgzf_file_t; #define _bgzf_open(fn, mode) fopen(fn, mode) #define _bgzf_dopen(fp, mode) fdopen(fp, mode) #define _bgzf_close(fp) fclose(fp) #define _bgzf_fileno(fp) fileno(fp) #define _bgzf_tell(fp) ftello(fp) #define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence) #define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp) #define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp) #endif // ~define(_USE_KNETFILE) #define BLOCK_HEADER_LENGTH 18 #define BLOCK_FOOTER_LENGTH 8 /* BGZF/GZIP header (speciallized from RFC 1952; little endian): 
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ */ static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; #ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; #include "khash.h" KHASH_MAP_INIT_INT64(cache, cache_t) #endif static inline void packInt16(uint8_t *buffer, uint16_t value) { buffer[0] = value; buffer[1] = value >> 8; } static inline int unpackInt16(const uint8_t *buffer) { return buffer[0] | buffer[1] << 8; } static inline void packInt32(uint8_t *buffer, uint32_t value) { buffer[0] = value; buffer[1] = value >> 8; buffer[2] = value >> 16; buffer[3] = value >> 24; } static BGZF *bgzf_read_init() { BGZF *fp; fp = calloc(1, sizeof(BGZF)); fp->open_mode = 'r'; fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE); fp->compressed_block = malloc(BGZF_BLOCK_SIZE); #ifdef BGZF_CACHE fp->cache = kh_init(cache); #endif return fp; } static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level { BGZF *fp; fp = calloc(1, sizeof(BGZF)); fp->open_mode = 'w'; fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE); fp->compressed_block = malloc(BGZF_BLOCK_SIZE); fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; return fp; } // get the compress level from the mode string static int mode2level(const char *__restrict mode) { int i, compress_level = -1; for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; if (mode[i]) compress_level = (int)mode[i] - '0'; if (strchr(mode, 'u')) compress_level = 0; return compress_level; } BGZF *bgzf_open(const char *path, const char *mode) { BGZF *fp = 0; if (strchr(mode, 'r') || strchr(mode, 'R')) { _bgzf_file_t fpr; if ((fpr = _bgzf_open(path, "r")) == 0) return 0; fp = bgzf_read_init(); fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'W')) { FILE *fpw; if ((fpw = fopen(path, "w")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; } else if (strchr(mode, 'a') || strchr(mode, 'A')) { FILE *fpw; if ((fpw = fopen(path, "r+")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; // Check for trailing EOF block. if(bgzf_check_EOF(fp)) { // Overwrite the trailing EOF. _bgzf_seek(fp->fp, -28, SEEK_END); } else { // No trailing EOF block, so go to the end _bgzf_seek(fp->fp, 0, SEEK_END); } } return fp; } BGZF *bgzf_dopen(int fd, const char *mode) { BGZF *fp = 0; if (strchr(mode, 'r') || strchr(mode, 'R')) { _bgzf_file_t fpr; if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; fp = bgzf_read_init(); fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'W')) { FILE *fpw; if ((fpw = fdopen(fd, "w")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; } return fp; } // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. 
static int deflate_block(BGZF *fp, int block_length) { uint8_t *buffer = fp->compressed_block; int buffer_size = BGZF_BLOCK_SIZE; int input_length = block_length; int compressed_length = 0; int remaining; uint32_t crc; assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block while (1) { // loop to retry for blocks that do not compress enough int status; z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->uncompressed_block; zs.avail_in = input_length; zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer if (status != Z_OK) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } status = deflate(&zs, Z_FINISH); if (status != Z_STREAM_END) { // not compressed enough deflateEnd(&zs); // reset the stream if (status == Z_OK) { // reduce the size and recompress input_length -= 1024; assert(input_length > 0); // logically, this should not happen continue; } fp->errcode |= BGZF_ERR_ZLIB; return -1; } if (deflateEnd(&zs) != Z_OK) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } compressed_length = zs.total_out; compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; assert(compressed_length <= BGZF_BLOCK_SIZE); break; } assert(compressed_length > 0); packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes crc = crc32(0L, NULL, 0L); crc = crc32(crc, fp->uncompressed_block, input_length); packInt32((uint8_t*)&buffer[compressed_length-8], crc); packInt32((uint8_t*)&buffer[compressed_length-4], input_length); remaining = block_length - input_length; if (remaining > 0) { assert(remaining <= input_length); memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining); } fp->block_offset = 
remaining; return compressed_length; } // Inflate the block in fp->compressed_block into fp->uncompressed_block static int inflate_block(BGZF* fp, int block_length) { z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->compressed_block + 18; zs.avail_in = block_length - 16; zs.next_out = fp->uncompressed_block; zs.avail_out = BGZF_BLOCK_SIZE; if (inflateInit2(&zs, -15) != Z_OK) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { inflateEnd(&zs); fp->errcode |= BGZF_ERR_ZLIB; return -1; } if (inflateEnd(&zs) != Z_OK) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } return zs.total_out; } static int check_header(const uint8_t *header) { return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 && unpackInt16((uint8_t*)&header[10]) == 6 && header[12] == 'B' && header[13] == 'C' && unpackInt16((uint8_t*)&header[14]) == 2); } #ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; khash_t(cache) *h = (khash_t(cache)*)fp->cache; if (fp->open_mode != 'r') return; for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) free(kh_val(h, k).block); kh_destroy(cache, h); } static int load_block_from_cache(BGZF *fp, int64_t block_address) { khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; k = kh_get(cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; fp->block_address = block_address; fp->block_length = p->size; memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE); _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); return p->size; } static void cache_block(BGZF *fp, int size) { int ret; khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; if (BGZF_BLOCK_SIZE >= fp->cache_size) return; if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) { /* A better way would be to remove the oldest block in the * cache, but here we remove a random one for simplicity. 
This * should not have a big impact on performance. */ for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) break; if (k < kh_end(h)) { free(kh_val(h, k).block); kh_del(cache, h, k); } } k = kh_put(cache, h, fp->block_address, &ret); if (ret == 0) return; // if this happens, a bug! p = &kh_val(h, k); p->size = fp->block_length; p->end_offset = fp->block_address + size; p->block = malloc(BGZF_BLOCK_SIZE); memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE); } #else static void free_cache(BGZF *fp) {} static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} static void cache_block(BGZF *fp, int size) {} #endif int bgzf_read_block(BGZF *fp) { uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; int count, size = 0, block_length, remaining; int64_t block_address; block_address = _bgzf_tell((_bgzf_file_t)fp->fp); if (load_block_from_cache(fp, block_address)) return 0; count = _bgzf_read(fp->fp, header, sizeof(header)); if (count == 0) { // no data read fp->block_length = 0; return 0; } if (count != sizeof(header) || !check_header(header)) { fp->errcode |= BGZF_ERR_HEADER; return -1; } size = count; block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" compressed_block = (uint8_t*)fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); remaining = block_length - BLOCK_HEADER_LENGTH; count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); if (count != remaining) { fp->errcode |= BGZF_ERR_IO; return -1; } size += count; if ((count = inflate_block(fp, block_length)) < 0) return -1; if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. 
fp->block_address = block_address; fp->block_length = count; cache_block(fp, size); return 0; } ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) { ssize_t bytes_read = 0; uint8_t *output = data; if (length <= 0) return 0; assert(fp->open_mode == 'r'); while (bytes_read < length) { int copy_length, available = fp->block_length - fp->block_offset; uint8_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) return -1; available = fp->block_length - fp->block_offset; if (available <= 0) break; } copy_length = length - bytes_read < available? length - bytes_read : available; buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; bytes_read += copy_length; } if (fp->block_offset == fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = fp->block_length = 0; } return bytes_read; } int bgzf_flush(BGZF *fp) { assert(fp->open_mode == 'w'); while (fp->block_offset > 0) { int block_length; block_length = deflate_block(fp, fp->block_offset); if (block_length < 0) return -1; if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) { fp->errcode |= BGZF_ERR_IO; // possibly truncated file return -1; } fp->block_address += block_length; } return 0; } int bgzf_flush_try(BGZF *fp, ssize_t size) { if (fp->block_offset + size > BGZF_BLOCK_SIZE) return bgzf_flush(fp); return -1; } ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length) { const uint8_t *input = data; int block_length = BGZF_BLOCK_SIZE, bytes_written; assert(fp->open_mode == 'w'); input = data; bytes_written = 0; while (bytes_written < length) { uint8_t* buffer = fp->uncompressed_block; int copy_length = block_length - fp->block_offset < length - bytes_written? 
block_length - fp->block_offset : length - bytes_written; memcpy(buffer + fp->block_offset, input, copy_length); fp->block_offset += copy_length; input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length && bgzf_flush(fp)) break; } return bytes_written; } int bgzf_close(BGZF* fp) { int ret, count, block_length; if (fp == 0) return -1; if (fp->open_mode == 'w') { if (bgzf_flush(fp) != 0) return -1; block_length = deflate_block(fp, 0); // write an empty block count = fwrite(fp->compressed_block, 1, block_length, fp->fp); if(count != 0) { // Something was written } if (fflush(fp->fp) != 0) { fp->errcode |= BGZF_ERR_IO; return -1; } } ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp); if (ret != 0) return -1; free(fp->uncompressed_block); free(fp->compressed_block); free_cache(fp); free(fp); return 0; } void bgzf_set_cache_size(BGZF *fp, int cache_size) { if (fp) fp->cache_size = cache_size; } int bgzf_check_EOF(BGZF *fp) { static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; // Last 28 bytes of an uncompressed bgzf file which are different // from the last 28 bytes of compressed bgzf files. 
static uint8_t magic2[28] = "\4\0\0\0\0\0\377\6\0\102\103\2\0\036\0\1\0\0\377\377\0\0\0\0\0\0\0\0"; uint8_t buf[28]; off_t offset; offset = _bgzf_tell((_bgzf_file_t)fp->fp); if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; int count = _bgzf_read(fp->fp, buf, 28); if(count != 28) { fp->errcode |= BGZF_ERR_IO; // possibly truncated file return(0); } _bgzf_seek(fp->fp, offset, SEEK_SET); if((memcmp(magic, buf, 28) == 0) || (memcmp(magic2, buf, 28) == 0)) { return(1); } return(0); } int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { int block_offset; int64_t block_address; if (fp->open_mode != 'r' || where != SEEK_SET) { fp->errcode |= BGZF_ERR_MISUSE; return -1; } block_offset = pos & 0xFFFF; block_address = pos >> 16; if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { fp->errcode |= BGZF_ERR_IO; return -1; } fp->block_length = 0; // indicates current block has not been loaded fp->block_address = block_address; fp->block_offset = block_offset; return 0; } int bgzf_is_bgzf(const char *fn) { uint8_t buf[16]; int n; _bgzf_file_t fp; if ((fp = _bgzf_open(fn, "r")) == 0) return 0; n = _bgzf_read(fp, buf, 16); _bgzf_close(fp); if (n != 16) return 0; return memcmp(g_magic, buf, 16) == 0? 
1 : 0; } int bgzf_getc(BGZF *fp) { int c; if (fp->block_offset >= fp->block_length) { if (bgzf_read_block(fp) != 0) return -2; /* error */ if (fp->block_length == 0) return -1; /* end-of-file */ } c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; if (fp->block_offset == fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = 0; fp->block_length = 0; } return c; } #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif int bgzf_getline(BGZF *fp, int delim, kstring_t *str) { int l, state = 0; unsigned char *buf = (unsigned char*)fp->uncompressed_block; str->l = 0; do { if (fp->block_offset >= fp->block_length) { if (bgzf_read_block(fp) != 0) { state = -2; break; } if (fp->block_length == 0) { state = -1; break; } } for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); if (l < fp->block_length) state = 1; l -= fp->block_offset; if (str->l + l + 1 >= str->m) { str->m = str->l + l + 2; kroundup32(str->m); str->s = (char*)realloc(str->s, str->m); } memcpy(str->s + str->l, buf + fp->block_offset, l); str->l += l; fp->block_offset += l + 1; if (fp->block_offset >= fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = 0; fp->block_length = 0; } } while (state == 0); if (str->l == 0 && state < 0) return state; str->s[str->l] = 0; return str->l; } #endif libStatGen-1.0.14/samtools/bgzf.h000066400000000000000000000132521254730101300165720ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the 
/*
 * State of one open BGZF stream.
 */
typedef struct {
	/* open_mode: compared against 'r' and 'w' by the read/write routines.
	 * compress_level: presumably the zlib level parsed from the mode string
	 *                 (see bgzf_dopen) -- TODO confirm against bgzf_open.
	 * errcode: bitwise OR of the BGZF_ERR_* flags. */
	int open_mode:8, compress_level:8, errcode:16;
	/* value stored by bgzf_set_cache_size() */
	int cache_size;
	/* block_length: bytes available in uncompressed_block when reading
	 *               (0 means the current block has not been loaded).
	 * block_offset: current position inside uncompressed_block. */
	int block_length, block_offset;
	/* file offset of the current block; upper bits of the bgzf_tell() value */
	int64_t block_address;
	void *uncompressed_block, *compressed_block;
	void *cache; // a pointer to a hash table
	void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
} BGZF;
/**
 * Return a virtual file pointer to the current location in the file.
 * No interpretation of the value should be made, other than that a
 * subsequent call to bgzf_seek can be used to position the file at the
 * same point.  Return value is non-negative on success.
 *
 * The value is the block address shifted left by 16 bits, OR-ed with the
 * low 16 bits of the offset inside the block.  The argument is fully
 * parenthesized so that any pointer expression (e.g. `*pp`) may be passed.
 */
#define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
/**
 * Read one byte from a BGZF file. It is faster than bgzf_read()
 * @param fp     BGZF file handler
 * @return       byte read; -1 on end-of-file; -2 on error
 */
int bgzf_getc(BGZF *fp);

/**
 * Read one line from a BGZF file. It is faster than bgzf_getc()
 *
 * @param fp     BGZF file handler
 * @param delim  delimiter
 * @param str    string to write to; must be initialized
 * @return       length of the string (0 for an empty line); when nothing
 *               was read, -1 on end-of-file and <= -2 on error
 */
int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
*/ int bgzf_read_block(BGZF *fp); #ifdef __cplusplus } #endif #endif libStatGen-1.0.14/samtools/khash.h000066400000000000000000000427041254730101300167440ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2011-02-14 (0.2.5): * Allow to declare global functions. 
/*
 * Bucket state flags: two bits per bucket, sixteen buckets per 32-bit word.
 * For bucket i the pair sits in word i>>4 at bit position (i&0xf)<<1:
 *   bit value 2 -> bucket is empty (never used)
 *   bit value 1 -> bucket is deleted
 * Both macro arguments are parenthesized so arbitrary expressions may be
 * passed as the flag array or the bucket index.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))
/*
 * KHASH_INIT2 -- instantiate a hash table type and its functions.
 *
 * Expands to the struct kh_<name>_t plus kh_init_/kh_destroy_/kh_clear_/
 * kh_get_/kh_resize_/kh_put_/kh_del_<name>, each declared with SCOPE.
 *
 *   name          suffix used for the generated identifiers
 *   SCOPE         storage/linkage specifiers for the generated functions
 *   khkey_t       key type
 *   khval_t       value type (vals is only touched when kh_is_map is true)
 *   kh_is_map     non-zero for a map, zero for a set
 *   __hash_func   key -> khint_t hash
 *   __hash_equal  key equality predicate
 *
 * Buckets are probed with double hashing: start at k % n_buckets and step
 * by 1 + k % (n_buckets - 1).  Bucket states live in `flags` (see the
 * __ac_* macros).  kh_get returns n_buckets when the key is absent (0 for
 * a table without buckets).  kh_put stores 1 in *ret when a never-used
 * bucket was taken, 2 when a deleted bucket was reused and 0 when the key
 * was already present.
 *
 * NOTE(review): the results of calloc/malloc/realloc are not checked
 * anywhere in these functions.
 */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	typedef struct {													\
		khint_t n_buckets, size, n_occupied, upper_bound;				\
		khint32_t *flags;												\
		khkey_t *keys;													\
		khval_t *vals;													\
	} kh_##name##_t;													\
	SCOPE kh_##name##_t *kh_init_##name() {								\
		return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));		\
	}																	\
	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
	{																	\
		if (h) {														\
			free(h->keys); free(h->flags);								\
			free(h->vals);												\
			free(h);													\
		}																\
	}																	\
	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
	{																	\
		if (h && h->flags) {											\
			/* 0xaa sets the "empty" bit of every bucket */			\
			memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
			h->size = h->n_occupied = 0;								\
		}																\
	}																	\
	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key)	\
	{																	\
		if (h->n_buckets) {												\
			khint_t inc, k, i, last;									\
			k = __hash_func(key); i = k % h->n_buckets;					\
			inc = 1 + k % (h->n_buckets - 1); last = i;					\
			/* skip deleted buckets and non-matching keys until an empty bucket */ \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
				else i += inc;											\
				if (i == last) return h->n_buckets;						\
			}															\
			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
		} else return 0;												\
	}																	\
	SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{																	\
		khint32_t *new_flags = 0;										\
		khint_t j = 1;													\
		{																\
			/* pick the first listed prime greater than the request */	\
			khint_t t = __ac_HASH_PRIME_SIZE - 1;						\
			while (__ac_prime_list[t] > new_n_buckets) --t;				\
			new_n_buckets = __ac_prime_list[t+1];						\
			/* j = 0: current content would not fit, leave table as is */ \
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	\
			else {														\
				new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t));	\
				memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
				if (h->n_buckets < new_n_buckets) {						\
					h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (kh_is_map)										\
						h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
				}														\
			}															\
		}																\
		if (j) {														\
			/* rehash in place: evict each live entry and re-insert it, */ \
			/* swapping with whatever live entry occupies the target slot */ \
			for (j = 0; j != h->n_buckets; ++j) {						\
				if (__ac_iseither(h->flags, j) == 0) {					\
					khkey_t key = h->keys[j];							\
					khval_t val;										\
					if (kh_is_map) val = h->vals[j];					\
					__ac_set_isdel_true(h->flags, j);					\
					while (1) {											\
						khint_t inc, k, i;								\
						k = __hash_func(key);							\
						i = k % new_n_buckets;							\
						inc = 1 + k % (new_n_buckets - 1);				\
						while (!__ac_isempty(new_flags, i)) {			\
							if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
							else i += inc;								\
						}												\
						__ac_set_isempty_false(new_flags, i);			\
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i);			\
						} else {										\
							h->keys[i] = key;							\
							if (kh_is_map) h->vals[i] = val;			\
							break;										\
						}												\
					}													\
				}														\
			}															\
			if (h->n_buckets > new_n_buckets) {							\
				h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (kh_is_map)											\
					h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
			}															\
			free(h->flags);												\
			h->flags = new_flags;										\
			h->n_buckets = new_n_buckets;								\
			h->n_occupied = h->size;									\
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		}																\
	}																	\
	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{																	\
		khint_t x;														\
		if (h->n_occupied >= h->upper_bound) {							\
			/* many deleted slots: rebuild smaller; otherwise grow */	\
			if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
			else kh_resize_##name(h, h->n_buckets + 1);					\
		}																\
		{																\
			khint_t inc, k, i, site, last;								\
			x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
			if (__ac_isempty(h->flags, i)) x = i;						\
			else {														\
				inc = 1 + k % (h->n_buckets - 1); last = i;				\
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					/* remember a deleted slot that could be reused */	\
					if (__ac_isdel(h->flags, i)) site = i;				\
					if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
					else i += inc;										\
					if (i == last) { x = site; break; }					\
				}														\
				if (x == h->n_buckets) {								\
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i;											\
				}														\
			}															\
		}																\
		if (__ac_isempty(h->flags, x)) {								\
			h->keys[x] = key;											\
			__ac_set_isboth_false(h->flags, x);							\
			++h->size; ++h->n_occupied;									\
			*ret = 1;													\
		} else if (__ac_isdel(h->flags, x)) {							\
			h->keys[x] = key;											\
			__ac_set_isboth_false(h->flags, x);							\
			++h->size;													\
			*ret = 2;													\
		} else *ret = 0;												\
		return x;														\
	}																	\
	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
	{																	\
		/* only live buckets are marked deleted; size drops by one */	\
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
			__ac_set_isdel_true(h->flags, x);							\
			--h->size;													\
		}																\
	}
@function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = *s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! @function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) /* --- END OF HASH FUNCTIONS --- */ /* Other necessary macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. 
/*! @function
  @abstract     Retrieve a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)
/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name)										\
	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t)								\
	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name)										\
	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t)								\
	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

typedef const char *kh_cstr_t;
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name)										\
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
@function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #endif /* __AC_KHASH_H */ libStatGen-1.0.14/samtools/knetfile.c000077500000000000000000000452541254730101300174500ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 by Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Probably I will not do socket programming in the next few years and therefore I decide to heavily annotate this file, for Linux and Windows as well. 
-ac */ /* * Updated 10/22/2013 by Mary Kate Wing * Upgraded to latest version from htslib: develop branch * 1) Fix compile warnings * 2) Add flag to silently fail socket logic */ #include #include #include #include #include #include #include #ifndef _WIN32 #include #include #include #include #else #include #endif #include "knetfile.h" int knetsilent = 0; void knet_silent(int silent) { knetsilent = silent; } /* In winsock.h, the type of a socket is SOCKET, which is: "typedef * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed * integer -1. In knetfile.c, I use "int" for socket type * throughout. This should be improved to avoid confusion. * * In Linux/Mac, recv() and read() do almost the same thing. You can see * in the header file that netread() is simply an alias of read(). In * Windows, however, they are different and using recv() is mandatory. */ /* This function tests if the file handler is ready for reading (or * writing if is_read==0). */ static int socket_wait(int fd, int is_read) { fd_set fds, *fdr = 0, *fdw = 0; struct timeval tv; int ret; tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out FD_ZERO(&fds); FD_SET(fd, &fds); if (is_read) fdr = &fds; else fdw = &fds; ret = select(fd+1, fdr, fdw, 0, &tv); #ifndef _WIN32 if (ret == -1) perror("select"); #else if (ret == 0) { if(!knetsilent) { fprintf(stderr, "select time-out\n"); } } else if (ret == SOCKET_ERROR) { if(!knetsilent) { fprintf(stderr, "select: %d\n", WSAGetLastError()); } } #endif return ret; } #ifndef _WIN32 /* This function does not work with Windows due to the lack of * getaddrinfo() in winsock. It is addapted from an example in "Beej's * Guide to Network Programming" (http://beej.us/guide/bgnet/). 
/* Open a TCP connection to host:port and return the connected socket
 * descriptor, or -1 on failure.
 *
 * This function does not work with Windows due to the lack of
 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
 * Guide to Network Programming" (http://beej.us/guide/bgnet/).
 *
 * On every failure path the socket (if already created) is closed and the
 * address list (if already obtained) is released.  Diagnostics are
 * suppressed when knetsilent is set. */
static int socket_connect(const char *host, const char *port)
{
#define __err_connect(func) \
	do { \
		if (!knetsilent) { perror(func); } \
		if (fd != -1) netclose(fd); \
		if (res) freeaddrinfo(res); \
		return -1; \
	} while (0)

	int on = 1, fd = -1, gai;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;
	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	/* In Unix/Mac, getaddrinfo() is the most convenient way to get
	 * server information.  It reports failure through its return value,
	 * not errno, so gai_strerror() is used; nothing has been allocated
	 * yet, so there is nothing to free. */
	if ((gai = getaddrinfo(host, port, &hints, &res)) != 0) {
		if (!knetsilent) fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai));
		return -1;
	}
	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
	/* The following two setsockopt() are used by ftplib
	 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
	 * necessary. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
	freeaddrinfo(res);
	return fd;
}
*/ static SOCKET socket_connect(const char *host, const char *port) { #define __err_connect(func) \ do { \ if(!knetsilent) {fprintf(stderr, "%s: %d\n", func, WSAGetLastError());} \ return -1; \ } while (0) int on = 1; SOCKET fd; struct linger lng = { 0, 0 }; struct sockaddr_in server; struct hostent *hp = 0; // open socket if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); // get host info if (isalpha(host[0])) hp = gethostbyname(host); else { struct in_addr addr; addr.s_addr = inet_addr(host); hp = gethostbyaddr((char*)&addr, 4, AF_INET); } if (hp == 0) __err_connect("gethost"); // connect server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); server.sin_family= AF_INET; server.sin_port = htons(atoi(port)); if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) return fd; } #endif static off_t my_netread(int fd, void *buf, off_t len) { off_t rest = len, curr, l = 0; /* recv() and read() may not read the required length of data with * one call. They have to be called repeatedly. */ while (rest) { if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading curr = netread(fd, (void*)((char*)buf + l), rest); /* According to the glibc manual, section 13.2, a zero returned * value indicates end-of-file (EOF), which should mean that * read() will not return zero if EOF has not been met but data * are not immediately available. 
/* Read one reply from the FTP control connection into ftp->response.
 *
 * Bytes are read one at a time.  Lines are discarded until one starts
 * with three digits not followed by '-'; that line (with its last two
 * characters cut off) is kept in ftp->response.
 *
 * Returns the numeric value parsed from the start of the kept line, 0 if
 * the control socket did not become readable, and -1 if fewer than two
 * characters were collected or the response buffer could not be grown. */
static int kftp_get_response(knetFile *ftp)
{
#ifndef _WIN32
	unsigned char c;
#else
	char c;
#endif
	int n = 0;
	char *p;
	if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
	/* Stop on end-of-file (0) AND on error (negative): only a return
	 * value of 1 means a byte was actually stored in c. */
	while (netread(ftp->ctrl_fd, &c, 1) == 1) { // FIXME: this is *VERY BAD* for unbuffered I/O
		if (n >= ftp->max_response) {
			/* grow through a temporary so the old buffer is not lost */
			char *grown = (char*)realloc(ftp->response, ftp->max_response? ftp->max_response<<1 : 256);
			if (grown == 0) return -1;
			ftp->response = grown;
			ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
		}
		ftp->response[n++] = c;
		if (c == '\n') {
			if (n >= 4 && isdigit((unsigned char)ftp->response[0]) && isdigit((unsigned char)ftp->response[1])
				&& isdigit((unsigned char)ftp->response[2]) && ftp->response[3] != '-') break;
			n = 0;
			continue;
		}
	}
	if (n < 2) return -1;
	ftp->response[n-2] = 0;
	return strtol(ftp->response, &p, 0);
}
/* Send PASV and record the address announced by the server.
 *
 * The six comma-separated numbers following '(' in the reply are parsed;
 * the first four go to ftp->pasv_ip and the last two form ftp->pasv_port
 * (high byte, low byte).
 *
 * Returns 0 on success and -1 if no reply was obtained, the reply has no
 * '(' or it does not contain six numbers.  On failure pasv_ip and
 * pasv_port are left untouched. */
static int kftp_pasv_prep(knetFile *ftp)
{
	char *p;
	int v[6];
	/* A non-positive result means no fresh reply was read; ftp->response
	 * would then be unset or hold the text of an earlier reply. */
	if (kftp_send_cmd(ftp, "PASV\r\n", 1) <= 0) return -1;
	if (ftp->response == 0) return -1;
	for (p = ftp->response; *p && *p != '('; ++p);
	if (*p != '(') return -1;
	++p;
	/* refuse partial matches: v[] would be copied uninitialized */
	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1;
	memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
	ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
	return 0;
}
*/ fp->port = strdup("21"); fp->host = (char*)calloc(l + 1, 1); if (strchr(mode, 'c')) fp->no_reconnect = 1; strncpy(fp->host, fn + 6, l); fp->retr = (char*)calloc(strlen(p) + 8, 1); sprintf(fp->retr, "RETR %s\r\n", p); fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); sprintf(fp->size_cmd, "SIZE %s\r\n", p); fp->seek_offset = 0; return fp; } // place ->fd at offset off int kftp_connect_file(knetFile *fp) { int ret; long long file_size; if (fp->fd != -1) { netclose(fp->fd); if (fp->no_reconnect) kftp_get_response(fp); } kftp_pasv_prep(fp); kftp_send_cmd(fp, fp->size_cmd, 1); if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) { if(!knetsilent) { fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); } return -1; } fp->file_size = file_size; if (fp->offset>=0) { char tmp[32]; #ifndef _WIN32 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); #else strcpy(tmp, "REST "); int64tostr(tmp + 5, fp->offset); strcat(tmp, "\r\n"); #endif kftp_send_cmd(fp, tmp, 1); } kftp_send_cmd(fp, fp->retr, 0); kftp_pasv_connect(fp); ret = kftp_get_response(fp); if (ret != 150) { if(!knetsilent) { fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); } netclose(fp->fd); fp->fd = -1; return -1; } fp->is_ready = 1; return 0; } /************************** * HTTP specific routines * **************************/ knetFile *khttp_parse_url(const char *fn, const char *mode) { knetFile *fp; char *p, *proxy, *q; int l; if (strstr(fn, "http://") != fn) return 0; // set ->http_host for (p = (char*)fn + 7; *p && *p != '/'; ++p); l = p - fn - 7; fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->http_host = (char*)calloc(l + 1, 1); strncpy(fp->http_host, fn + 7, l); fp->http_host[l] = 0; for (q = fp->http_host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; // get http_proxy proxy = getenv("http_proxy"); // set ->host, ->port and ->path if (proxy == 0) { fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. fp->port = strdup(*q? 
q : "80"); fp->path = strdup(*p? p : "/"); } else { fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); for (q = fp->host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; fp->port = strdup(*q? q : "80"); fp->path = strdup(fn); } fp->type = KNF_TYPE_HTTP; fp->ctrl_fd = fp->fd = -1; fp->seek_offset = 0; return fp; } int khttp_connect_file(knetFile *fp) { int ret, l = 0; char *buf, *p; if (fp->fd != -1) netclose(fp->fd); fp->fd = socket_connect(fp->host, fp->port); buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); l += sprintf(buf + l, "\r\n"); if(netwrite(fp->fd, buf, l) != l) { } l = 0; while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency if (buf[l] == '\n' && l >= 3) if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; ++l; } buf[l] = 0; if (l < 14) { // prematured header netclose(fp->fd); fp->fd = -1; return -1; } ret = strtol(buf + 8, &p, 0); // HTTP return code if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file off_t rest = fp->offset; while (rest) { off_t l = rest < 0x10000? 
rest : 0x10000; rest -= my_netread(fp->fd, buf, l); } } else if (ret != 206 && ret != 200) { free(buf); if(!knetsilent) { fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); } netclose(fp->fd); fp->fd = -1; return -1; } free(buf); fp->is_ready = 1; return 0; } /******************** * Generic routines * ********************/ knetFile *knet_open(const char *fn, const char *mode) { knetFile *fp = 0; if (mode[0] != 'r') { if(!knetsilent) { fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); } return 0; } if (strstr(fn, "ftp://") == fn) { fp = kftp_parse_url(fn, mode); if (fp == 0) return 0; if (kftp_connect(fp) == -1) { knet_close(fp); return 0; } kftp_connect_file(fp); } else if (strstr(fn, "http://") == fn) { fp = khttp_parse_url(fn, mode); if (fp == 0) return 0; khttp_connect_file(fp); } else { // local file #ifdef _WIN32 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may * be undefined on some systems, although it is defined on my * Mac and the Linux I have tested on. 
*/ int fd = open(fn, O_RDONLY | O_BINARY); #else int fd = open(fn, O_RDONLY); #endif if (fd == -1) { perror("open"); return 0; } fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; fp->ctrl_fd = -1; } if (fp && fp->fd == -1) { knet_close(fp); return 0; } return fp; } knetFile *knet_dopen(int fd, const char *mode) { knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; return fp; } ssize_t knet_read(knetFile *fp, void *buf, size_t len) { off_t l = 0; if (fp->fd == -1) return 0; if (fp->type == KNF_TYPE_FTP) { if (fp->is_ready == 0) { if (!fp->no_reconnect) kftp_reconnect(fp); kftp_connect_file(fp); } } else if (fp->type == KNF_TYPE_HTTP) { if (fp->is_ready == 0) khttp_connect_file(fp); } if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX size_t rest = len; ssize_t curr; while (rest) { do { curr = read(fp->fd, (void*)((char*)buf + l), rest); } while (curr < 0 && EINTR == errno); if (curr < 0) return -1; if (curr == 0) break; l += curr; rest -= curr; } } else l = my_netread(fp->fd, buf, len); fp->offset += l; return l; } off_t knet_seek(knetFile *fp, off_t off, int whence) { if (whence == SEEK_SET && off == fp->offset) return 0; if (fp->type == KNF_TYPE_LOCAL) { /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. */ off_t offset = lseek(fp->fd, off, whence); if (offset == -1) return -1; fp->offset = offset; return 0; } else if (fp->type == KNF_TYPE_FTP) { if (whence == SEEK_CUR) fp->offset += off; else if (whence == SEEK_SET) fp->offset = off; else if (whence == SEEK_END) fp->offset = fp->file_size + off; else return -1; fp->is_ready = 0; return 0; } else if (fp->type == KNF_TYPE_HTTP) { if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? if(!knetsilent) { fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. 
Offset is unchanged.\n"); } errno = ESPIPE; return -1; } if (whence == SEEK_CUR) fp->offset += off; else if (whence == SEEK_SET) fp->offset = off; else return -1; fp->is_ready = 0; return 0; } errno = EINVAL; if(!knetsilent) { fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); } return -1; } int knet_close(knetFile *fp) { if (fp == 0) return 0; if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific if (fp->fd != -1) { /* On Linux/Mac, netclose() is an alias of close(), but on * Windows, it is an alias of closesocket(). */ if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); else netclose(fp->fd); } free(fp->host); free(fp->port); free(fp->response); free(fp->retr); // FTP specific free(fp->path); free(fp->http_host); // HTTP specific free(fp); return 0; } #ifdef KNETFILE_MAIN int main(void) { char *buf; knetFile *fp; int type = 4, l; #ifdef _WIN32 knet_win32_init(); #endif buf = calloc(0x100000, 1); if (type == 0) { fp = knet_open("knetfile.c", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 1) { // NCBI FTP, large file fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET); l = knet_read(fp, buf, 255); } else if (type == 2) { fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 3) { fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 4) { fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); knet_read(fp, buf, 10000); knet_seek(fp, 20000, SEEK_SET); knet_seek(fp, 10000, SEEK_SET); l = knet_read(fp, buf+10000, 10000000) + 10000; } if (type != 4 && type != 1) { knet_read(fp, buf, 255); buf[255] = 0; printf("%s\n", buf); } else write(fileno(stdout), buf, l); knet_close(fp); free(buf); return 0; } #endif 
libStatGen-1.0.14/samtools/knetfile.h000066400000000000000000000033611254730101300174430ustar00rootroot00000000000000#ifndef KNETFILE_H #define KNETFILE_H #include #include #ifndef _WIN32 #define netread(fd, ptr, len) read(fd, ptr, len) #define netwrite(fd, ptr, len) write(fd, ptr, len) #define netclose(fd) close(fd) #else #include #define netread(fd, ptr, len) recv(fd, ptr, len, 0) #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) #define netclose(fd) closesocket(fd) #endif // FIXME: currently I/O is unbuffered #define KNF_TYPE_LOCAL 1 #define KNF_TYPE_FTP 2 #define KNF_TYPE_HTTP 3 typedef struct knetFile_s { int type, fd; int64_t offset; char *host, *port; // the following are for FTP only int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; char *response, *retr, *size_cmd; int64_t seek_offset; // for lazy seek int64_t file_size; // the following are for HTTP only char *path, *http_host; } knetFile; #define knet_tell(fp) ((fp)->offset) #define knet_fileno(fp) ((fp)->fd) #ifdef __cplusplus extern "C" { #endif #ifdef _WIN32 int knet_win32_init(); void knet_win32_destroy(); #endif // Pass in non-zero to make knetfile silent (no messages), pass in // 0 to keep any messages (default is 0). void knet_silent(int silent); knetFile *knet_open(const char *fn, const char *mode); /* This only works with local files. */ knetFile *knet_dopen(int fd, const char *mode); /* If ->is_ready==0, this routine updates ->fd; otherwise, it simply reads from ->fd. */ ssize_t knet_read(knetFile *fp, void *buf, size_t len); /* This routine only sets ->offset and ->is_ready=0. It does not communicate with the FTP server. 
*/ off_t knet_seek(knetFile *fp, off_t off, int whence); int knet_close(knetFile *fp); #ifdef __cplusplus } #endif #endif libStatGen-1.0.14/vcf/000077500000000000000000000000001254730101300144035ustar00rootroot00000000000000libStatGen-1.0.14/vcf/Makefile000066400000000000000000000004261254730101300160450ustar00rootroot00000000000000TOOLBASE = VcfFile VcfFileReader VcfFileWriter VcfGenotypeField VcfGenotypeFormat VcfGenotypeSample VcfHeader VcfHelper VcfRecord VcfRecordField VcfRecordFilter VcfRecordGenotype VcfRecordInfo VcfSubsetSamples VcfRecordDiscardRules HDRONLY = include ../Makefiles/Makefile.lib libStatGen-1.0.14/vcf/VcfFile.cpp000066400000000000000000000037551254730101300164370ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfFile.h" VcfFile::VcfFile() { myFilePtr = NULL; mySiteOnly = false; myNumRecords = 0; } VcfFile::~VcfFile() { // Close the file. if (myFilePtr != NULL) { // If we already have an open file, close it. ifclose(myFilePtr); myFilePtr = NULL; } } bool VcfFile::open(const char* filename, const char* mode, InputFile::ifileCompression compressionMode) { // Reset for any previously operated on files. 
reset(); myFilePtr = ifopen(filename, mode, compressionMode); if(myFilePtr == NULL) { std::string errorMessage = "Failed to Open "; errorMessage += filename; errorMessage += " for "; errorMessage += mode; myStatus.setStatus(StatGenStatus::FAIL_IO, errorMessage.c_str()); return(false); } return(true); } void VcfFile::close() { reset(); } void VcfFile::reset() { // Reset the child class. resetFile(); // Close the file. if (myFilePtr != NULL) { // If we already have an open file, close it. ifclose(myFilePtr); myFilePtr = NULL; } myNumRecords = 0; } libStatGen-1.0.14/vcf/VcfFile.h000066400000000000000000000057751254730101300161100ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_FILE_H__ #define __VCF_FILE_H__ #include "InputFile.h" #include "VcfHeader.h" /// This header file provides interface to read/write VCF files. class VcfFile { public: /// Default Constructor, initializes the variables, but does not open /// any files. VcfFile(); /// Destructor virtual ~VcfFile(); /// Open the vcf file with the specified filename, /// overwritten by child classes for read/write. /// \param filename the vcf file to open. /// \param header to be read/written from/to the file /// \return true = success; false = failure. 
virtual bool open(const char* filename, VcfHeader& header) = 0; /// Close the file if it is open. void close(); /// When set to true, read only the first 8 columns, skipping the format /// and genotype fields, so when reading do not store them, and when /// writing do not write them. Defaults to read/write all columns. /// This setting is maintained even when the file is reset/closed. /// \param siteOnly process only the first 8 columns void setSiteOnly(bool siteOnly) {mySiteOnly = siteOnly;} /// Get the number of VCF records that have been processed (read/written) /// so far including any filtered records. int getNumRecords() {return(myNumRecords);} // Get the Status of the last call that sets status. // inline StatGenStatus::Status getStatus() // { // return(myStatus.getStatus()); // } protected: // Open the vcf file with the specified filename // with the specified mode. // \param filename the vcf file to open. // \param mode how to open (r/w). // \return true = success; false = failure. bool open(const char* filename, const char* mode, InputFile::ifileCompression compressionMode = InputFile::DEFAULT); void reset(); virtual void resetFile() = 0; IFILE myFilePtr; StatGenStatus myStatus; bool mySiteOnly; // Number of records read/written so far. Child classes need to set this. int myNumRecords; private: VcfFile(const VcfFile& vcfFile); VcfFile& operator=(const VcfFile& vcfFile); }; #endif libStatGen-1.0.14/vcf/VcfFileReader.cpp000066400000000000000000000400751254730101300175560ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfFileReader.h" VcfFileReader::VcfFileReader() : VcfFile(), myVcfIndex(NULL), myNewSection(false), mySectionChrom(""), mySection1BasedStartPos(-1), mySection1BasedEndPos(-1), mySectionOverlap(false), myRecordDiscardRules(), mySampleSubset(), myUseSubset(false), myMinAltAlleleCount(UNSET_MIN_ALT_ALLELE_COUNT), myAltAlleleCountSubset(NULL), myMinMinorAlleleCount(UNSET_MIN_MINOR_ALLELE_COUNT), myMinorAlleleCountSubset(NULL), myDiscardRules(0), myNumKeptRecords(0), myTotalRead(0) { myFilePtr = NULL; } VcfFileReader::~VcfFileReader() { resetFile(); } bool VcfFileReader::open(const char* filename, VcfHeader& header) { // Close an already open file. close(); myStatus = StatGenStatus::SUCCESS; if(VcfFile::open(filename, "r")) { // Successfully opened, so read the header. if(!header.read(myFilePtr)) { // Failed, so copy the status. myStatus = header.getStatus(); return(false); } } else { // Failed, status set by VcfFile::open. return(false); } // Successfully opened and read the header. return(true); } bool VcfFileReader::open(const char* filename, VcfHeader& header, const char* includeFileName, const char* excludeSample, const char* excludeFileName, const char* delims) { if(!open(filename, header)) { // Failed to open & read header, so return. return(false); } // Successfully opened and read the header, so setup the sample subset // object based on the specified sample files and the header. if(!mySampleSubset.init(header, includeFileName, excludeSample, excludeFileName, delims)) { // Failed to setup the subsetting. 
std::cerr << "VcfFileReader - failed to setup sample subsetting\n"; } myUseSubset = true; // Successfully opened and read the header. return(true); } // Read VCF Index file. bool VcfFileReader::readVcfIndex(const char* vcfIndexFilename) { // Cleanup a previously setup index. if(myVcfIndex != NULL) { delete myVcfIndex; myVcfIndex = NULL; } // Create a new vcf index. myVcfIndex = new Tabix(); StatGenStatus::Status indexStat = myVcfIndex->readIndex(vcfIndexFilename); if(indexStat != StatGenStatus::SUCCESS) { std::string errorMessage = "Failed to read the vcf Index file: "; errorMessage += vcfIndexFilename; myStatus.setStatus(indexStat, errorMessage.c_str()); delete myVcfIndex; myVcfIndex = NULL; return(false); } if(myVcfIndex->getFormat() != Tabix::FORMAT_VCF) { std::string errorMessage = "ERROR: Tabix file not in VCF format: "; errorMessage += vcfIndexFilename; myStatus.setStatus(StatGenStatus::FAIL_PARSE, errorMessage.c_str()); delete myVcfIndex; myVcfIndex = NULL; return(false); } myStatus = StatGenStatus::SUCCESS; return(true); } // Read VCF Index file. bool VcfFileReader::readVcfIndex() { if(myFilePtr == NULL) { // Can't read the vcf index file because the VCF file has not yet been // opened, so we don't know the base filename for the index file. std::string errorMessage = "Failed to read the vcf Index file -" " the VCF file needs to be read first in order to determine" " the index filename."; myStatus.setStatus(StatGenStatus::FAIL_ORDER, errorMessage.c_str()); return(false); } const char* vcfBaseName = myFilePtr->getFileName(); std::string indexName = vcfBaseName; indexName += ".tbi"; bool foundFile = true; std::string failMessage = ""; try { if(readVcfIndex(indexName.c_str()) == false) { foundFile = false; } } catch (std::exception& e) { foundFile = false; failMessage = e.what(); } // Check to see if the index file was found. if(!foundFile) { // Not found - try without the vcf extension. 
// Locate the start of the vcf extension size_t startExt = indexName.find(".vcf"); if(startExt == std::string::npos) { // Could not find the .vcf extension, so just return false since the // call to readVcfIndex set the status. return(false); } // Remove ".vcf" and try reading the index again. indexName.erase(startExt, 4); try { return(readVcfIndex(indexName.c_str())); } catch (std::exception& e) { failMessage += "\n"; failMessage += e.what(); throw(std::runtime_error(failMessage)); return(false); } } return(true); } // return a pointer to the VCF Index file. const Tabix* VcfFileReader::getVcfIndex() { return(myVcfIndex); } bool VcfFileReader::readRecord(VcfRecord& record, VcfSubsetSamples* subset) { myStatus = StatGenStatus::SUCCESS; // Subset the read if there are subsets specified. VcfSubsetSamples* subsetPtr = subset; if((subsetPtr == NULL) && myUseSubset) { subsetPtr = &mySampleSubset; } // Check to see if a new region has been set. If so, setup for that region. bool searchChrom = false; if(myNewSection) { if(myVcfIndex != NULL) { // Have an index file so use if(!processNewSection()) { // processNewSection sets the status appropriately on failure. return(false); } } else if(myTotalRead == 0) { // ReadSection without an index only works if no records // have been read. searchChrom = true; myNewSection = false; } else { myNewSection = false; myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Cannot set read section with no index after reading records"); return(false); } } // Keep looping until a desired record is found. bool recordFound = false; while(!recordFound) { if(!record.read(myFilePtr, mySiteOnly, myRecordDiscardRules, subsetPtr)) { myStatus = record.getStatus(); myTotalRead += myRecordDiscardRules.getNumDiscarded(); myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); return(false); } ++myTotalRead; myTotalRead += myRecordDiscardRules.getNumDiscarded(); // Check to see if the record is in the section. 
// First check the chromosome. if(!mySectionChrom.empty() && (mySectionChrom != record.getChromStr())) { if(searchChrom) { // Still searching for the chromosome, so continue // to the next record. continue; } // Record is not within the correct chromosome, so return failure. myStatus = StatGenStatus::NO_MORE_RECS; return(false); } searchChrom = false; // Check if the record is after the section end if applicable. if((mySection1BasedEndPos != -1) && (record.get1BasedPosition() >= mySection1BasedEndPos)) { myStatus = StatGenStatus::NO_MORE_RECS; return(false); } // Check if the record is prior to the section start if applicable. // Determinine the VCF record end position. // If we are not requiring overlap, then we only need to check // the start position, but if overlap is required, then it needs // to incrment the start by the length-1. int numIncBases = 0; if(mySectionOverlap) { // The VCF record end position is the start position + length of the // reference string - 1. numIncBases = record.getNumRefBases() - 1; } if((mySection1BasedStartPos != -1) && ((record.get1BasedPosition() + numIncBases) < mySection1BasedStartPos)) { // This record is prior to the section, so keep reading. continue; } ++myNumRecords; myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); // Record successfully read, so check to see if it is discarded. if((myDiscardRules & DISCARD_NON_PHASED) && !record.allPhased()) { // Not all samples are phased, so discard this record. continue; } if((myDiscardRules & DISCARD_MISSING_GT) && !record.hasAllGenotypeAlleles()) { // discard missing GTs and this record had missing alleles, // so keep reading. continue; } if((myDiscardRules & DISCARD_FILTERED) && !(record.getFilter().passedAllFilters())) { // Record was filtered, so discard it. continue; } if((myDiscardRules & DISCARD_MULTIPLE_ALTS) && (record.getNumAlts() > 1)) { // Record had multiple alternates, so discard. 
continue; } // Check allele counts for discarding. if(myMinAltAlleleCount != UNSET_MIN_ALT_ALLELE_COUNT) { // Count the number of alternates. int32_t altCount = 0; for(int sampleNum = 0; sampleNum < record.getNumSamples(); sampleNum++) { if((myAltAlleleCountSubset != NULL) && !(myAltAlleleCountSubset->keep(sampleNum))) { // Skip this sample. continue; } for(int gtNum = 0; gtNum < record.getNumGTs(sampleNum); gtNum++) { if(record.getGT(sampleNum, gtNum) > 0) { // Alternate, so increment the count. ++altCount; } } } if(altCount < myMinAltAlleleCount) { // Not enough alternates so continue to the next sample. continue; } } // Check to see if the minimum alternate allele count is met. if(myMinMinorAlleleCount != UNSET_MIN_MINOR_ALLELE_COUNT) { // Get the number of possible alternates. unsigned int numAlts = record.getNumAlts(); // Verify that each allele has the min count. bool failMinorAlleleCount = false; for(unsigned int i = 0; i <= numAlts; i++) { if(record.getAlleleCount(i, myMinorAlleleCountSubset) < myMinMinorAlleleCount) { // Not enough of one gt, so not ok. failMinorAlleleCount = true; break; } } if(failMinorAlleleCount) { // not enough alleles, so continue to the next record. continue; } } // Record was not discarded. recordFound = true; } // Increment the number of kept records. ++myNumKeptRecords; return(true); } bool VcfFileReader::setReadSection(const char* chromName) { return(set1BasedReadSection(chromName, -1, -1)); } bool VcfFileReader::set1BasedReadSection(const char* chromName, int32_t start, int32_t end, bool overlap) { myNewSection = true; mySectionChrom = chromName; mySection1BasedStartPos = start; mySection1BasedEndPos = end; mySectionOverlap = overlap; return(true); } // Returns whether or not the end of the file has been reached. // return: int - true = EOF; false = not eof. bool VcfFileReader::isEOF() { if (myFilePtr != NULL) { // File Pointer is set, so return if eof. 
return(ifeof(myFilePtr)); } // File pointer is not set, so return true, eof. return true; } bool VcfFileReader::setExcludeIDs(const char* filename) { return(myRecordDiscardRules.setExcludeIDs(filename)); } bool VcfFileReader::setIncludeIDs(const char* filename) { return(myRecordDiscardRules.setIncludeIDs(filename)); } void VcfFileReader::addDiscardMinAltAlleleCount(int32_t minAltAlleleCount, VcfSubsetSamples* subset) { myMinAltAlleleCount = minAltAlleleCount; myAltAlleleCountSubset = subset; } void VcfFileReader::rmDiscardMinAltAlleleCount() { myMinAltAlleleCount = UNSET_MIN_ALT_ALLELE_COUNT; myAltAlleleCountSubset = NULL; } void VcfFileReader::addDiscardMinMinorAlleleCount(int32_t minMinorAlleleCount, VcfSubsetSamples* subset) { myMinMinorAlleleCount = minMinorAlleleCount; myMinorAlleleCountSubset = subset; } void VcfFileReader::rmDiscardMinMinorAlleleCount() { myMinMinorAlleleCount = UNSET_MIN_ALT_ALLELE_COUNT; myMinorAlleleCountSubset = NULL; } void VcfFileReader::resetFile() { myRecordDiscardRules.reset(), mySampleSubset.reset(); myUseSubset = false; myNumKeptRecords = 0; myTotalRead = 0; myNewSection = false; mySectionChrom = ""; mySection1BasedStartPos = -1; mySection1BasedEndPos = -1; mySectionOverlap = false; if(myVcfIndex != NULL) { delete myVcfIndex; myVcfIndex = NULL; } } bool VcfFileReader::processNewSection() { myNewSection = false; // Check to see if the index file has been read. 
if(myVcfIndex == NULL) { myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Cannot read section since there is no index file open"); throw(std::runtime_error("SOFTWARE BUG: trying to read a VCF record by section prior to opening the VCF Index file.")); return(false); } if(myFilePtr == NULL) { myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Cannot read section without first opening the VCF file."); throw(std::runtime_error("SOFTWARE BUG: trying to read a VCF record by section prior to opening the VCF file.")); return(false); } // Using random access, so can't buffer myFilePtr->disableBuffering(); uint64_t startPos = 0; // Find where this section starts in the file. if(!myVcfIndex->getStartPos(mySectionChrom.c_str(), mySection1BasedStartPos, startPos)) { // Didn't find the position. myStatus = StatGenStatus::NO_MORE_RECS; return(false); } if(startPos != (uint64_t)iftell(myFilePtr)) { // Seek to the start position. if(ifseek(myFilePtr, startPos, SEEK_SET) != true) { // seek failed, return failure. myStatus.setStatus(StatGenStatus::FAIL_IO, "Failed to seek to the specified section"); return(false); } } return(true); } libStatGen-1.0.14/vcf/VcfFileReader.h000066400000000000000000000253001254730101300172150ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

#ifndef __VCF_FILE_READER_H__
#define __VCF_FILE_READER_H__

#include "VcfFile.h"
#include "VcfRecord.h"
#include "VcfRecordDiscardRules.h"
#include "VcfSubsetSamples.h"
#include "Tabix.h"

// NOTE(review): the header names of the two bare #include directives below
// are not visible in this copy of the file - confirm them against upstream.
#ifdef __GXX_EXPERIMENTAL_CXX0X__
#include
#else
#include
#endif

/// Reader for VCF files.  Declares methods for opening a file together with
/// its header (optionally subsetting samples), reading an index, restricting
/// reads to a chromosome/region, and configuring record discard rules.
class VcfFileReader : public VcfFile
{
public:
    /// Discard rule bit flags.  OR them together and pass the result to
    /// setDiscardRules() or addDiscardRules().
    static const uint64_t DISCARD_NON_PHASED = 0x1;
    static const uint64_t DISCARD_MISSING_GT = 0x2;
    static const uint64_t DISCARD_FILTERED = 0x4;
    static const uint64_t DISCARD_MULTIPLE_ALTS = 0x8;

    /// Default Constructor, initializes the variables, but does not open
    /// any files.
    VcfFileReader();

    /// Destructor
    virtual ~VcfFileReader();

    /// Open the vcf file with the specified filename for reading.
    /// This method does no sample subsetting.
    /// \param filename the vcf file to open for reading.
    /// \param header to be read from the file
    /// \return true = success; false = failure.
    virtual bool open(const char* filename, VcfHeader& header);

    /// Open the vcf file with the specified filename for reading
    /// subsetting the samples in the file to just the samples specified
    /// in the sample file.
    /// \param filename the vcf file to open for reading.
    /// \param header to be read from the file
    /// \param includeFileName file containing samples to keep
    ///                        or NULL if all samples except those specified as
    ///                        excluded should be included.
    /// \param excludeSample sample to be excluded or NULL
    /// \param excludeFileName file containing samples to remove
    ///                        or NULL if there is no file containing samples
    ///                        to exclude.
    /// \param delims delimiters separating the samples in the files ('\n'
    /// is always considered a delimiter even if it isn't specified).  When
    /// any of the specified delimiter characters is found in the file it
    /// indicates the end of a sample name.
    /// \return true = success; false = failure.
    virtual bool open(const char* filename, VcfHeader& header,
                      const char* includeFileName, const char* excludeSample,
                      const char* excludeFileName, const char* delims = "\n");

    /// Read the specified vcf index file.  It must be read prior to setting a
    /// read section, for seeking and reading portions of a vcf file.
    /// \param filename the name of the vcf index file to be read.
    /// \return true = success; false = failure.
    bool readVcfIndex(const char * filename);

    /// Read the vcf index file using the VCF filename as a base.
    /// It must be read prior to setting a read section, for seeking
    /// and reading portions of a vcf file.
    /// Must be read after opening the VCF file since it uses the
    /// VCF filename as a base name for the index file.
    /// First it tries filename.vcf.tbi. If that fails, it tries
    /// it without the .vcf extension, filename.tbi.
    /// \return true = success; false = failure.
    bool readVcfIndex();

    /// Get the VCF index (it must have already been read).
    /// \return a const pointer to the tabix object, or NULL if the
    /// index has not been read.
    const Tabix* getVcfIndex();

    /// Read the next Vcf record from the file until a line passes all
    /// discard rules (if any) or until the end of the file is found.
    /// \param record record to populate with the next record.
    /// \param subset ptr to subset of samples to keep.  This overrides
    /// mySampleSubset that may have been set at open.
    /// \return true if successful, false if not.
    bool readRecord(VcfRecord& record, VcfSubsetSamples* subset = NULL);

    /// Only read the specified chromosome when readRecord is called.
    /// If an index is not used, the read section can only be set prior to
    /// reading any records.
    /// \param chromName chromosome name to read.
    bool setReadSection(const char* chromName);

    /// Only read the specified chromosome/positions when readRecord is called.
    /// If an index is not used, the read section can only be set prior to
    /// reading any records.
    /// \param chromName chromosome name to read.
    /// \param start inclusive 1-based start positions of records that should be
    /// read for this chromosome.
    /// \param end exclusive 1-based end positions of records that should be
    /// read for this chromosome (this position is not read).
    /// \param overlap bool indicating whether or not records overlapping
    /// the specified region should be included even if they do not start
    /// in the region.  False (DEFAULT) means only read records that start
    /// in the region.  True means to also read records whose deletions extend
    /// into the region.
    bool set1BasedReadSection(const char* chromName,
                              int32_t start, int32_t end,
                              bool overlap = false);

    /// Returns whether or not the end of the file has been reached.
    /// \return true = EOF; false = not eof.
    /// If the file is not open, true is returned.
    bool isEOF();

    /// Get the number of VCF records that have been processed (read/written)
    /// so far excluding any discarded records.
    int getNumKeptRecords() {return(myNumKeptRecords);}

    /// Get the number of VCF records that were read, even those outside
    /// a specified region and those discarded.
    int getTotalReadRecords() {return(myTotalRead);}

    /////////////////////////////
    /// @name  Discard Rules Methods
    /// Methods for setting up the automatic discard rules when reading the file
    //@{

    /// When reading records, skip all variants with the ids specified
    /// in the passed in filename.
    /// Returns false, if the file could not be read.
    bool setExcludeIDs(const char* filename);

    /// When reading records, keep only variants with the ids specified
    /// in the passed in filename.
    /// Returns false, if the file could not be read.
    bool setIncludeIDs(const char* filename);

    /// Set which rules should be applied for discarding records.
    /// OR in all discard rules to be applied.
    /// For example:: reader.setDiscardRules(DISCARD_NON_PHASED |
    ///                                      DISCARD_MISSING_GT);
    /// NOTE: Discard rules are NOT reset when a file is reset, closed, or a new
    /// one opened, but are reset when this is called.
    void setDiscardRules(uint64_t discards) { myDiscardRules = discards; }

    /// Add additional discard rule, OR in all additional discard rules to
    /// be applied.
    /// For example:: reader.addDiscardRules(DISCARD_NON_PHASED |
    ///                                      DISCARD_MISSING_GT);
    /// NOTE: Discard rules are NOT reset when a file is reset, closed, or a new
    /// one opened, and are NOT reset when this is called.
    void addDiscardRules(uint64_t discards) { myDiscardRules |= discards; }

    /// Add a discard rule based on the minimum allele count of alternate
    /// alleles in the specified samples.  If the sum of all alternate allele
    /// counts in the specified samples (or in all samples if NULL is passed)
    /// is greater than or equal to the amount specified, the record is kept.
    /// If not, then the record is discarded.
    /// \param minAltAlleleCount minimum count of all alternate alleles for
    /// a record that should be kept, any with fewer will be discarded.
    /// \param subset only count alternate alleles in samples within the
    /// specified subset or NULL if all samples should be counted.  The
    /// pointer is stored in this object, but is not cleaned up by this object.
    void addDiscardMinAltAlleleCount(int32_t minAltAlleleCount,
                                     VcfSubsetSamples* subset);

    /// Remove the discard rule for minimum alternate allele count.
    void rmDiscardMinAltAlleleCount();

    /// Add a discard rule based on the minimum minor allele count in the
    /// specified samples.
    /// NOTE(review): the original comment here was a verbatim copy of the
    /// addDiscardMinAltAlleleCount() description; presumably a record is
    /// kept when its minor allele count is greater than or equal to
    /// minMinorAlleleCount - confirm against the implementation.
    /// \param minMinorAlleleCount minimum minor allele count for
    /// a record that should be kept, any with fewer will be discarded.
    /// \param subset only count alleles in samples within the
    /// specified subset or NULL if all samples should be counted.  The
    /// pointer is stored in this object, but is not cleaned up by this object.
    void addDiscardMinMinorAlleleCount(int32_t minMinorAlleleCount,
                                       VcfSubsetSamples* subset);

    /// Remove the discard rule for minimum minor allele count.
    void rmDiscardMinMinorAlleleCount();

    //@}

protected:
    virtual void resetFile();

private:
    // Copy constructor and assignment are declared private; no definition
    // is visible in this header.
    VcfFileReader(const VcfFileReader& vcfFileReader);
    VcfFileReader& operator=(const VcfFileReader& vcfFileReader);

    // Values marking the allele count thresholds as not set.
    static const int32_t UNSET_MIN_MINOR_ALLELE_COUNT = -1;
    static const int32_t UNSET_MIN_ALT_ALLELE_COUNT = -1;

    // Set1BasedReadSection was called so process the section prior to reading.
    bool processNewSection();

    // New section information.
    Tabix* myVcfIndex;
    bool myNewSection;
    std::string mySectionChrom;
    int32_t mySection1BasedStartPos;
    int32_t mySection1BasedEndPos;
    bool mySectionOverlap;

    VcfRecordDiscardRules myRecordDiscardRules;

    VcfSubsetSamples mySampleSubset;
    bool myUseSubset;

    int32_t myMinAltAlleleCount;
    VcfSubsetSamples* myAltAlleleCountSubset;

    int32_t myMinMinorAlleleCount;
    VcfSubsetSamples* myMinorAlleleCountSubset;

    // Bitwise OR of the DISCARD_* flags set via set/addDiscardRules.
    uint64_t myDiscardRules;

    // Number of records read/written so far that were not discarded.
    int myNumKeptRecords;
    int myTotalRead;
};

#endif
libStatGen-1.0.14/vcf/VcfFileWriter.cpp000066400000000000000000000036511254730101300176270ustar00rootroot00000000000000
/*
 *  Copyright (C) 2010-2012  Regents of the University of Michigan,
 *                           Hyun Min Kang, Matthew Flickenger, Matthew Snyder,
 *                           and Goncalo Abecasis
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/ #include "VcfFileWriter.h" VcfFileWriter::VcfFileWriter() : VcfFile() { } VcfFileWriter::~VcfFileWriter() { } bool VcfFileWriter::open(const char* filename, VcfHeader& header, InputFile::ifileCompression compressionMode) { myStatus = StatGenStatus::SUCCESS; if(VcfFile::open(filename, "w", compressionMode)) { // Successfully opened, so write the header. if(!header.write(myFilePtr)) { // Failed, so copy the status. myStatus = header.getStatus(); return(false); } } else { // Failed, status set by VcfFile::open. return(false); } // Successfully opened and read the header. return(true); } bool VcfFileWriter::open(const char* filename, VcfHeader& header) { return(open(filename, header, InputFile::BGZF)); } bool VcfFileWriter::writeRecord(VcfRecord& record) { if(!record.write(myFilePtr, mySiteOnly)) { myStatus = record.getStatus(); return(false); } ++myNumRecords; return(true); } libStatGen-1.0.14/vcf/VcfFileWriter.h000066400000000000000000000045111254730101300172700ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_FILE_WRITER_H__ #define __VCF_FILE_WRITER_H__ #include "VcfFile.h" #include "VcfRecord.h" /// This header file provides interface to read/write VCF files. 
class VcfFileWriter : public VcfFile
{
public:
    /// Default Constructor, initializes the variables, but does not open
    /// any files.
    VcfFileWriter();

    /// Destructor
    virtual ~VcfFileWriter();

    /// Open the vcf file with the specified filename for writing.
    /// \param filename the vcf file to open for writing.
    /// \param header to be written to the file
    /// \param compressionMode type of compression to use for writing
    /// \return true = success; false = failure.
    bool open(const char* filename, VcfHeader& header,
              InputFile::ifileCompression compressionMode);

    /// Open the vcf file with the specified filename for writing using the
    /// default compression (BGZF).
    /// \param filename the vcf file to open for writing.
    /// \param header to be written to the file
    /// \return true = success; false = failure.
    virtual bool open(const char* filename, VcfHeader& header);

    /// Write the VCF data line to the file.
    /// \param record record to write to the file.
    /// \return true if successfully wrote, false if not.
    bool writeRecord(VcfRecord& record);

protected:
    /// The writer keeps no extra per-file state, so there is nothing to reset.
    virtual void resetFile() {}

private:
    // Copy constructor and assignment are declared private; no definition
    // is visible in this header.
    VcfFileWriter(const VcfFileWriter& vcfFileWriter);
    VcfFileWriter& operator=(const VcfFileWriter& vcfFileWriter);
};

#endif
libStatGen-1.0.14/vcf/VcfGenotypeField.cpp000066400000000000000000000052561254730101300203140ustar00rootroot00000000000000
/*
 *  Copyright (C) 2012  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/ #include "VcfGenotypeField.h" bool VcfGenotypeField::write(IFILE filePtr) { int numWritten = 0; int numExpected = 0; std::string* subFieldPtr = NULL; // Write the tab before this field. numWritten += ifprintf(filePtr, "\t"); ++numExpected; // Loop through and write each subfield. for(int i = 0; i < myGenotypeSubFields.size(); i++) { subFieldPtr = &(myGenotypeSubFields.get(i)); if(i == 0) { // First entry, so no ':' numWritten += ifprintf(filePtr, "%s", subFieldPtr->c_str()); numExpected += subFieldPtr->size(); } else { // Not first entry, so use a ':' numWritten += ifprintf(filePtr, ":%s", subFieldPtr->c_str()); numExpected += 1 + subFieldPtr->size(); } } // End loop through entries. return(numWritten == numExpected); } VcfGenotypeField::SUBFIELD_READ_STATUS VcfGenotypeField::readGenotypeSubField(IFILE filePtr, std::string* stringDest) { if(ifeof(filePtr)) { // End of file, so just return END_OF_RECORD. return(END_OF_RECORD); } static const std::string fieldStopChars = "\n\t:"; // Read/parse the field. int pos = 0; if(stringDest == NULL) { pos = filePtr->readTilChar(fieldStopChars); } else { pos = filePtr->readTilChar(fieldStopChars, *stringDest); } if(pos == 2) { // ':' return(MORE_SUBFIELDS); } else if(pos == 1) { // '\t' return(END_OF_FIELD); } // '\n' or EOF return(END_OF_RECORD); } VcfGenotypeField::SUBFIELD_READ_STATUS VcfGenotypeField::getReadStatus(int stopChar) { if(stopChar >= 2) { // ':' return(MORE_SUBFIELDS); } else if(stopChar == 1) { // '\t' return(END_OF_FIELD); } // '\n' or EOF return(END_OF_RECORD); } libStatGen-1.0.14/vcf/VcfGenotypeField.h000066400000000000000000000041541254730101300177550ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_GENOTYPE_FIELD_H__ #define __VCF_GENOTYPE_FIELD_H__ #include "VcfRecordField.h" #include "ReusableVector.h" class VcfGenotypeField { public: static const int GENOTYPE_INDEX_NA = -1; /// Default Constructor, initializes the variables. VcfGenotypeField() : myGenotypeSubFields() {} /// Destructor virtual ~VcfGenotypeField() {} bool write(IFILE filePtr); inline void reset() { myGenotypeSubFields.reset(); internal_reset(); } void clear() {reset(); } /// Get the number of genotype format fields there are. /// \return the number of genotype format fields. inline int getNumFields() { return(myGenotypeSubFields.size()); } protected: enum SUBFIELD_READ_STATUS { MORE_SUBFIELDS = 0, END_OF_FIELD = 1, END_OF_RECORD = 2, }; // Specify null if the field is not to be stored. static SUBFIELD_READ_STATUS readGenotypeSubField(IFILE filePtr, std::string* stringDest); // The stopCharacters must be in the same order as, "\n\t:". 
static SUBFIELD_READ_STATUS getReadStatus(int stopChar); virtual void internal_reset() = 0; ReusableVector myGenotypeSubFields; private: VcfGenotypeField(const VcfGenotypeField& vcfGenotypeField); VcfGenotypeField& operator=(const VcfGenotypeField& vcfGenotypeField); }; #endif libStatGen-1.0.14/vcf/VcfGenotypeFormat.cpp000066400000000000000000000053341254730101300205160ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfGenotypeFormat.h" #include "VcfRecordGenotype.h" VcfGenotypeFormat::VcfGenotypeFormat() : VcfGenotypeField(), myGTIndex(GENOTYPE_INDEX_NA) { } VcfGenotypeFormat::~VcfGenotypeFormat() { } bool VcfGenotypeFormat::read(IFILE filePtr) { // Clear out any previously set values. reset(); SUBFIELD_READ_STATUS readStatus = MORE_SUBFIELDS; std::string* nextType = &(myGenotypeSubFields.getNextEmpty()); int subFieldIndex = 0; while(readStatus == MORE_SUBFIELDS) { // more subfields to read. readStatus = readGenotypeSubField(filePtr, nextType); // Check if this field should be read/stored. if(!VcfRecordGenotype::storeField(*nextType)) { // Do not read/store this field. myStoreIndices.push_back(false); nextType->clear(); } else { // Check if this is GT. 
if(*nextType == "GT") { myGTIndex = subFieldIndex; } myStoreIndices.push_back(true); nextType = &(myGenotypeSubFields.getNextEmpty()); } ++subFieldIndex; } // Since we didn't use the last type that was retrieved, remove it. myGenotypeSubFields.rmLast(); // Return true if there is a tab - it is just END_OF_FIELD. return(readStatus == END_OF_FIELD); } int VcfGenotypeFormat::getIndex(const std::string& key) { // Search for this field of the genotypeFormat. for(int i = 0; i < myGenotypeSubFields.size(); i++) { if(myGenotypeSubFields.get(i) == key) { // Found the type. return(i); } } // field was not found, so return null. return(GENOTYPE_INDEX_NA); } bool VcfGenotypeFormat::storeIndex(unsigned int index) { if(index > myStoreIndices.size()) { return(false); } return(myStoreIndices[index]); } void VcfGenotypeFormat::internal_reset() { myGTIndex = GENOTYPE_INDEX_NA; myStoreIndices.clear(); } libStatGen-1.0.14/vcf/VcfGenotypeFormat.h000066400000000000000000000046241254730101300201640ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_GENOTYPE_FORMAT_H__ #define __VCF_GENOTYPE_FORMAT_H__ #include "VcfGenotypeField.h" /// This header file provides interface to read/write VCF files. 
class VcfGenotypeFormat : public VcfGenotypeField { public: static const int GENOTYPE_INDEX_NA = -1; /// Default Constructor, initializes the variables. VcfGenotypeFormat(); /// Destructor virtual ~VcfGenotypeFormat(); /// Read the genotype format from the file up until the next \t,\n, or EOF. /// \param filePtr IFILE to read from. /// \return true if a tab ended the field, false if it was \n or EOF. bool read(IFILE filePtr); /// Get the index of the specified key. /// \param key to find the index for. /// \return index of the specified key or GENOTYPE_INDEX_NA if the key /// is not found. int getIndex(const std::string& key); /// Get the GT index, returns GENOTYPE_INDEX_NA if it is not found.. inline int getGTIndex() { return(myGTIndex); } /// Return true if the specified subField should be read/stored. bool storeIndex(unsigned int index); /// Get Original number of fields - this is different than the number /// of stored fields. int getOrigNumFields() { return(myStoreIndices.size()); } protected: /// reset the sample for a new entry. virtual void internal_reset(); private: VcfGenotypeFormat(const VcfGenotypeFormat& vcfGenotypeFormat); VcfGenotypeFormat& operator=(const VcfGenotypeFormat& vcfGenotypeFormat); int myGTIndex; // Set when reading the format by checking the readFields in // VcfRecordGenotype and queried when reading the samples fields. std::vector myStoreIndices; }; #endif libStatGen-1.0.14/vcf/VcfGenotypeSample.cpp000066400000000000000000000224461254730101300205120ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfGenotypeSample.h" #include #include const int VcfGenotypeSample::INVALID_GT = -1; const int VcfGenotypeSample::MISSING_GT = -2; const std::string VcfGenotypeSample::MISSING_FIELD = "."; VcfGenotypeSample::VcfGenotypeSample() : VcfGenotypeField(), myFormatPtr(NULL), myPhased(false), myUnphased(false), myHasAllGenotypeAlleles(false), myNewGT(false), myGTs() { } VcfGenotypeSample::~VcfGenotypeSample() { myFormatPtr = NULL; } bool VcfGenotypeSample::read(IFILE filePtr, VcfGenotypeFormat& format) { static const char* GT_DELIM = "\n\t:|/."; static const int END_GT = 2; // Ends at index 2 or less static const int PHASED_CHAR_POS = 3; static const int UNPHASED_CHAR_POS = 4; static const int MISSING_GT_POS = 5; // Clear out any previously set values. reset(); myFormatPtr = &format; int gtIndex = format.getGTIndex(); // Read the subfields. SUBFIELD_READ_STATUS readStatus = MORE_SUBFIELDS; std::string* nextType = NULL; int subFieldIndex = 0; while(readStatus == MORE_SUBFIELDS) { // Get the field to write into. if(format.storeIndex(subFieldIndex)) { nextType = &(myGenotypeSubFields.getNextEmpty()); // Check if this is the GT field. if(subFieldIndex == gtIndex) { // There is a GT field, so set that all GT fields are there. // if any are missing it will be turned back to false. myHasAllGenotypeAlleles = true; // This is the GT field, so parse manually looking to see if it // is phased and store the genotypes. int stopChar = END_GT + 1; // Read until a new subfield is found. while(stopChar > END_GT) { // TODO have an option to autoparse the genotypes? 
// todo - store the previous nextType len in order to // do string conversion to ints... stopChar = filePtr->readTilChar(GT_DELIM, *nextType); if(stopChar == PHASED_CHAR_POS) { nextType->push_back('|'); myPhased = true; } else if(stopChar == UNPHASED_CHAR_POS) { nextType->push_back('/'); myUnphased = true; } else if(stopChar == MISSING_GT_POS) { nextType->push_back('.'); myHasAllGenotypeAlleles = false; } } // Check if this is the END_GT signal. readStatus = getReadStatus(stopChar); } else { // more subfields to read. readStatus = readGenotypeSubField(filePtr, nextType); } } else { readStatus = readGenotypeSubField(filePtr, NULL); } ++subFieldIndex; } // subFieldIndex contains the number of fields in this sample. if(subFieldIndex > format.getOrigNumFields()) { throw(std::runtime_error("VCF Number of Fields in a Sample does not match the Format.")); } else if(subFieldIndex < format.getOrigNumFields()) { // If there are no fields for this sample, enter the missing value. if(myGenotypeSubFields.size() == 0) { myGenotypeSubFields.getNextEmpty() = MISSING_FIELD; } } // Return true if there is a tab - it is just END_OF_FIELD. return(readStatus == END_OF_FIELD); } bool VcfGenotypeSample::write(IFILE filePtr) { if(myNewGT) { updateGTString(); } return(VcfGenotypeField::write(filePtr)); } const std::string* VcfGenotypeSample::getString(const std::string& key) { if(myFormatPtr == NULL) { return(NULL); } int index = myFormatPtr->getIndex(key); if(index != VcfGenotypeFormat::GENOTYPE_INDEX_NA) { // Check if it is out of range for this sample - means it // is missing for this sample. if(index >= myGenotypeSubFields.size()) { // missing for this sample. return(&MISSING_FIELD); } if((key == "GT") && myNewGT) { updateGTString(); } return(&(myGenotypeSubFields.get(index))); } // key was not found, so return NULL. 
return(NULL); } bool VcfGenotypeSample::setString(const std::string& key, const std::string& value) { if(myFormatPtr == NULL) { return(false); } int index = myFormatPtr->getIndex(key); if(index != VcfGenotypeFormat::GENOTYPE_INDEX_NA) { // Found the type, so set it. myGenotypeSubFields.get(index) = value; if(key == "GT") { myGTs.clear(); myNewGT = false; if(value.find('|') != std::string::npos) { myPhased = true; } if(value.find('/') != std::string::npos) { myUnphased = true; } if(value.find('.') != std::string::npos) { myHasAllGenotypeAlleles = false; } else { myHasAllGenotypeAlleles = true; } } return(true); } // field was not found, so return false. return(false); } int VcfGenotypeSample::getGT(unsigned int index) { if(myGTs.empty()) { if(!parseGT()) { // Failed to parse GT, so return INVALID_GT. return(INVALID_GT); } } if(index < myGTs.size()) { return(myGTs[index]); } // Out of range index. return(INVALID_GT); } void VcfGenotypeSample::setGT(unsigned int index, int newGt) { if(myGTs.empty()) { if(!parseGT()) { // Failed to parse GT, so return INVALID_GT. throw(std::runtime_error("VCF failed to parse GT.")); } } if(index < myGTs.size()) { if(myGTs[index] != newGt) { myNewGT = true; myGTs[index] = newGt; } } else { // Out of range index. throw(std::runtime_error("VCF setGT called with out of range GT index.")); } } int VcfGenotypeSample::getNumGTs() { if(myGTs.empty()) { if(!parseGT()) { return(0); } } return(myGTs.size()); } void VcfGenotypeSample::internal_reset() { myFormatPtr = NULL; myPhased = false; myUnphased = false; myHasAllGenotypeAlleles = false; myGTs.clear(); myNewGT = false; } bool VcfGenotypeSample::parseGT() { // Parse the GT. const std::string* gtStr = getString("GT"); myGTs.clear(); myNewGT = false; if(gtStr == NULL) { // GT field not found. 
return(false); } // Parse til the end of the GT string char* startPos = NULL; char* endPos = (char*)gtStr->c_str(); while((endPos != NULL) && (*endPos != '\0')) { startPos = endPos; if(*startPos == '.') { endPos = startPos + 1; // unknown, so set this index to be MISSING_GT. myGTs.push_back(MISSING_GT); continue; } if(*startPos == '|') { endPos = startPos + 1; continue; } if(*startPos == '/') { endPos = startPos + 1; continue; } // Should be an int, so parse it. unsigned long gtLong = strtoul(startPos, &endPos, 10); myGTs.push_back((int)gtLong); } return(true); } void VcfGenotypeSample::updateGTString() { if(myNewGT) { int index = myFormatPtr->getIndex("GT"); if(index != VcfGenotypeFormat::GENOTYPE_INDEX_NA) { // Check if it is out of range for this sample - means it // is missing for this sample. if(index < myGenotypeSubFields.size()) { std::stringstream gtSS; char phaseChar = '/'; if(myPhased) { phaseChar = '|'; } gtSS << myGTs[0]; for(unsigned int i = 1; i < myGTs.size(); i++) { gtSS << phaseChar << myGTs[i]; } myGenotypeSubFields.get(index) = gtSS.str(); myNewGT = false; } } } } libStatGen-1.0.14/vcf/VcfGenotypeSample.h000066400000000000000000000064431254730101300201560ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __VCF_GENOTYPE_SAMPLE_H__ #define __VCF_GENOTYPE_SAMPLE_H__ #include "VcfGenotypeFormat.h" /// This header file provides interface to read/write VCF files. class VcfGenotypeSample : public VcfGenotypeField { public: static const int INVALID_GT; static const int MISSING_GT; static const std::string MISSING_FIELD; /// Default Constructor, initializes the variables. VcfGenotypeSample(); /// Destructor virtual ~VcfGenotypeSample(); /// Read this sample from the file up until the next \t,\n, or EOF. /// \param filePtr IFILE to read from. /// \param format the VCF Genotype Format field description. /// \return true if a tab ended the field, false if it was \n or EOF. bool read(IFILE filePtr, VcfGenotypeFormat& format); virtual bool write(IFILE filePtr); /// Get a pointer to the string containing the value associated with the /// specified key(the pointer will be invalid if the field is /// changed/reset). /// \param key to find the falue for. /// \return const pointer to the string value for this key, NULL if /// the sample or the key was not found. const std::string* getString(const std::string& key); /// Set the string associated with the specified key, returns true if set, /// false if not. bool setString(const std::string& key, const std::string& value); inline bool isPhased() { return(myPhased); } inline bool isUnphased() { return(myUnphased); } inline bool hasAllGenotypeAlleles() { return(myHasAllGenotypeAlleles); } /// Return the integer allele at the specified index of the GT field. /// For example, a GT of 0/3, getGT(1) returns 3 and getGT(0) returns 0. /// Returns INVALID_GT if there is no GT at the specified index or GT was /// not specified and returns MISSING_GT if it is '.'. int getGT(unsigned int index); /// Set the integer allele at the specified index of the GT field. /// Requires the GT index to already exist. void setGT(unsigned int index, int newGt); /// Return the number of GT fields for this sample. 
int getNumGTs(); protected: /// reset the sample for a new entry. virtual void internal_reset(); bool parseGT(); private: VcfGenotypeSample(const VcfGenotypeSample& vcfGenotypeSample); VcfGenotypeSample& operator=(const VcfGenotypeSample& vcfGenotypeSample); void updateGTString(); VcfGenotypeFormat* myFormatPtr; bool myPhased; bool myUnphased; bool myHasAllGenotypeAlleles; bool myNewGT; std::vector myGTs; }; #endif libStatGen-1.0.14/vcf/VcfHeader.cpp000066400000000000000000000205721254730101300167440ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfHeader.h" VcfHeader::VcfHeader() : myHeaderLines() { reset(); } VcfHeader::~VcfHeader() { } bool VcfHeader::read(IFILE filePtr) { // Reading, so clean out this header. reset(); if(filePtr == NULL) { // No file was passed in. myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Need to pass in an open file ptr to VcfHeader::read."); return(false); } // Read until the header line has been read (after the meta lines). while(!myHasHeaderLine) { // Increase the size of headerlines by 1 to fit the new line. myHeaderLines.resize(myHeaderLines.size() + 1); // Read the next line from the file into the header structure. 
String& newStr = myHeaderLines.back(); if(newStr.ReadLine(filePtr) < 0) { // Error, unable to read an entire header from the file. myStatus.setStatus(StatGenStatus::INVALID, "Error reading VCF Meta/Header, EOF found before the header line."); return(false); } if(newStr.Length() <= 2) { // A header/meta line must have at least 2 characters // ## or # and 8 fields, so if less than 2 characters, // error. myStatus.setStatus(StatGenStatus::INVALID, "Error reading VCF Meta/Header, line without at least 2 characters found before the header line."); return(false); } // Check if it is a header (first char is # and 2nd one is not). if((newStr[0] == '#') && (newStr[1] != '#')) { myHasHeaderLine = true; // Parse the header line to get the sample information. myParsedHeaderLine.ReplaceColumns(newStr, '\t'); } else if((newStr[0] != '#') || (newStr[1] != '#')) { // A meta line must start with "##", we expect meta lines until // the header line is found. myStatus.setStatus(StatGenStatus::INVALID, "Error reading VCF Meta/Header, line not starting with '##' found before the header line."); return(false); } } return(true); } bool VcfHeader::write(IFILE filePtr) { if(filePtr == NULL) { // No file was passed in. myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Need to pass in an open file ptr to VcfHeader::write."); return(false); } // Make sure the last header line is synced with the parsed header line. syncHeaderLine(); int numWritten = 0; int numExpected = 0; for(std::vector::iterator iter = myHeaderLines.begin(); iter != myHeaderLines.end(); iter++) { numWritten += ifprintf(filePtr, "%s\n", iter->c_str()); // expected to write string + new line. numExpected += iter->Length(); numExpected += 1; } if(numWritten != numExpected) { myStatus.setStatus(StatGenStatus::FAIL_IO, "Failed writing VCF Meta/Header."); } return(numWritten == numExpected); } void VcfHeader::reset() { myHasHeaderLine = false; myHeaderLines.clear(); } // Return the error after a failed call. 
const StatGenStatus& VcfHeader::getStatus() { return(myStatus); } int VcfHeader::getNumMetaLines() { int numHeaderLines = myHeaderLines.size(); if((numHeaderLines >= 1) && (myHasHeaderLine)) { // Remove the header line from the count. return(numHeaderLines-1); } return(numHeaderLines); } const char* VcfHeader::getMetaLine(unsigned int index) { if(index >= myHeaderLines.size()) { return(NULL); } else { return(myHeaderLines[index].c_str()); } return(NULL); } const char* VcfHeader::getHeaderLine() { // Make sure the last header line is synced with the parsed header line. syncHeaderLine(); if(myHasHeaderLine) { return(myHeaderLines.back().c_str()); } return(NULL); } int VcfHeader::getNumSamples() const { if(!myHasHeaderLine) { return(0); } int numFields = myParsedHeaderLine.Length(); if(numFields > NUM_NON_SAMPLE_HEADER_COLS) { // There are samples. return(numFields - NUM_NON_SAMPLE_HEADER_COLS); } // No sample fields return(0); } const char* VcfHeader::getSampleName(unsigned int index) const { if(!myHasHeaderLine) { // No header. return(NULL); } int position = index + NUM_NON_SAMPLE_HEADER_COLS; if(position >= myParsedHeaderLine.Length()) { // Out of range. return(NULL); } return(myParsedHeaderLine[position].c_str()); } int VcfHeader::getSampleIndex(const char* sampleName) const { if(!myHasHeaderLine) { // No header. return(-1); } for(int index = NUM_NON_SAMPLE_HEADER_COLS; index < myParsedHeaderLine.Length(); index++) { if(myParsedHeaderLine[index] == sampleName) { // Found. return(index - NUM_NON_SAMPLE_HEADER_COLS); } } // Not found. return(-1); } void VcfHeader::removeSample(unsigned int index) { int position = index + NUM_NON_SAMPLE_HEADER_COLS; if(position >= myParsedHeaderLine.Length()) { // Out of range, so just return, nothing to remove. return; } // Remove it from the parsed header line. myParsedHeaderLine.Delete(position); // Removed a sample, so clear the header line so the next time it is // accessed it will be reset based on the existing samples. 
String& hdrLine = myHeaderLines.back(); hdrLine.Clear(); } bool VcfHeader::appendMetaLine(const char* metaLine) { // Check that the line starts with "##". if(strncmp(metaLine, "##", 2) != 0) { // Does not start with "##" return(false); } if(!myHasHeaderLine) { // No header line, so just add to the end of the vector. myHeaderLines.push_back(metaLine); return(true); } // There is a header line, so insert this just before that line. // The headerLine is one position before "end". std::vector::iterator headerLine = myHeaderLines.end(); --headerLine; // Insert just before the header line. myHeaderLines.insert(headerLine, metaLine); return(true); } bool VcfHeader::addHeaderLine(const char* headerLine) { // Check that the line starts with "#". if(strncmp(headerLine, "#", 1) != 0) { // Does not start with "#" return(false); } if(myHasHeaderLine) { // There is a header line, so replace the current line. myHeaderLines.back() = headerLine; } else { // There is not a header line, so add it myHeaderLines.push_back(headerLine); } myHasHeaderLine = true; // Parse the header line to get the sample information. myParsedHeaderLine.ReplaceColumns(headerLine, '\t'); return(true); } void VcfHeader::syncHeaderLine() { if(!myHasHeaderLine) { // No header line, so nothing to sync. return; } // Get the last header line and see if it is set. String& hdrLine = myHeaderLines.back(); if(hdrLine.IsEmpty()) { // The header line is not set, so set it. 
for(int i = 0; i < myParsedHeaderLine.Length(); i++) { if(i != 0) { hdrLine += '\t'; } hdrLine += myParsedHeaderLine[i]; } } } libStatGen-1.0.14/vcf/VcfHeader.h000066400000000000000000000101161254730101300164020ustar00rootroot00000000000000/* * Copyright (C) 2010-2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_HEADER_H__ #define __VCF_HEADER_H__ #include #include "StringArray.h" #include "StatGenStatus.h" /// This header file provides interface for dealing with VCF Meta/Header lines. class VcfHeader { public: /// Default Constructor, initializes the variables. VcfHeader(); /// Destructor virtual ~VcfHeader(); /// Read the header from the specified file replacing any previous header /// contents. /// \param filePtr IFILE to read from. /// \return true if an entire meta/header was successfully read from /// the specified filePtr, false if not. bool read(IFILE filePtr); /// Write the header to the specified file. /// \param filePtr IFILE to write to. /// \return true if an entire meta/header was successfully written to /// the specified filePtr, false if not. bool write(IFILE filePtr); /// Reset this header, preparing for a new one. void reset(); /// Returns the status associated with the last method that sets the status. /// \return StatGenStatus of the last command that sets status. 
const StatGenStatus& getStatus(); /// Return the number of meta-lines (lines starting with ##) int getNumMetaLines(); /// Return the specified meta-line (index starting at 0) /// or NULL if out of range. /// Will return the headerline if the header line's index is specified. const char* getMetaLine(unsigned int index); /// Return the header line, the line containing #chrom... const char* getHeaderLine(); /// Returns the number of samples in the header line or 0 if the header /// line has not yet been read. int getNumSamples() const; /// Returns the name of the specified sample or NULL if the sample number /// is out of range (first sample is index 0). const char* getSampleName(unsigned int index) const; /// Returns the index of the specified sample or -1 if the sample name /// is not found (first sample is index 0). int getSampleIndex(const char* sampleName) const; /// Remove the sample at the specified index. void removeSample(unsigned int index); ///////////////// /// Add Lines /// Add a Meta Line to the end of the currently specified meta lines. /// Return false if the meta line is invalid (does not start with ##) /// A return of false means the line was not added. bool appendMetaLine(const char* metaLine); /// Replace the header line if one exists or add it if one does not. /// Return false if the header line is invalid (does not start with #). /// A return of false means the line was not added. bool addHeaderLine(const char* headerLine); protected: private: VcfHeader(const VcfHeader& vcfHeader); VcfHeader& operator=(const VcfHeader& vcfHeader); // Make sure the last header line is synced with the parsed header line. // This is used when samples are removed. void syncHeaderLine(); static const int NUM_NON_SAMPLE_HEADER_COLS = 9; // Is set to true once the header line has been set, false until then. bool myHasHeaderLine; std::vector myHeaderLines; StringArray myParsedHeaderLine; // The status of the last failed command. 
StatGenStatus myStatus; }; #endif libStatGen-1.0.14/vcf/VcfHelper.cpp000066400000000000000000000031501254730101300167640ustar00rootroot00000000000000 /* * Copyright (C) 2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfHelper.h" void VcfHelper::parseString(const std::string& inputString, char delim, ReusableVector& outputVector) { if(inputString.empty()) { // Nothing to parse, so just return. return; } std::string* outputStringPtr = &(outputVector.getNextEmpty()); for(unsigned int i = 0; i < inputString.size(); i++) { if(inputString[i] == delim) { // Get a new string to write into and continue // to the next character. outputStringPtr = &(outputVector.getNextEmpty()); } else { // Append the character. outputStringPtr->push_back(inputString[i]); } } } libStatGen-1.0.14/vcf/VcfHelper.h000066400000000000000000000024741254730101300164410ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_HELPER_H__ #define __VCF_HELPER_H__ #include #include "ReusableVector.h" /// This header file provides helper methods for dealing with VCF Files. class VcfHelper { public: /// Parse the string at the specified delimiters into /// the specified reusable vector. static void parseString(const std::string& inputString, char delim, ReusableVector& outputVector); }; #endif libStatGen-1.0.14/vcf/VcfRecord.cpp000066400000000000000000000263631254730101300167760ustar00rootroot00000000000000/* * Copyright (C) 2010-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfRecord.h" VcfRecord::VcfRecord() { reset(); } VcfRecord::~VcfRecord() { } bool VcfRecord::read(IFILE filePtr, bool siteOnly, VcfRecordDiscardRules& discardRules, VcfSubsetSamples* sampleSubset) { // Clear out any previously set values. 
reset(); if(filePtr == NULL) { myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Error reading VCF record before opening the file."); return(false); } if(ifeof(filePtr)) { // End of file, just return false. return(false); } // Read the chromosome. if(!readTilTab(filePtr, myChrom)) { if(myChrom.empty()) { // EOF. return(false); } // Not an empty line. myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record CHROM."); return(false); } // Read the 1-based Position if(!readTilTab(filePtr, my1BasedPos)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record POS."); return(false); } else { // Read the position, so convert to an integer. my1BasedPosNum = atoi(my1BasedPos.c_str()); } // Read the ID. if(!readTilTab(filePtr, myID)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record ID."); return(false); } if(discardRules.discardForID(myID)) { // Do not keep this id, so consume the rest of the record and // return the next record. filePtr->discardLine(); return(read(filePtr, siteOnly, discardRules, sampleSubset)); } // Read the Ref. if(!readTilTab(filePtr, myRef)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record REF."); return(false); } // Read the Alt. myAltArray.clear(); if(!readTilTab(filePtr, myAlt)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record ALT."); return(false); } // Read the Qual. if(!readTilTab(filePtr, myQual)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record QUAL."); return(false); } else { if(myQual != ".") { // Read the quality, so convert to an integer. myQualNum = atof(myQual.c_str()); } else { myQualNum = -1; } } // Read the Filter. if(!myFilter.read(filePtr)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record FILTER."); return(false); } // Read the Info (could be the last word in the line or file). 
if(!myInfo.read(filePtr)) { // Found the end of the line after the info field, so return true, // successfully read the record. return(true); } if(siteOnly) { // Do not store genotypes, so just consume the rest of the line. filePtr->readTilChar("\n"); } else { // Not yet at the end of the line, so read the genotype fields // (format & samples) try { myGenotype.read(filePtr, sampleSubset); } catch(std::exception& e) { myDummyString = "Failed parsing the Genotype Fields of " + myChrom + ":" + my1BasedPos + " (chr:pos) - " + e.what(); myStatus.setStatus(StatGenStatus::FAIL_PARSE, myDummyString.c_str()); return(false); } } // Found the end of the line, return true since all required fields // were read. return(true); } bool VcfRecord::write(IFILE filePtr, bool siteOnly) { if(filePtr == NULL) { myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Error writing VCF record before opening the file."); return(false); } int numWritten = 0; int numExpected = 0; if(myChrom.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } else { numWritten += ifprintf(filePtr, "%s\t", myChrom.c_str()); numExpected += myChrom.length() + 1; } if(my1BasedPos.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } else { numWritten += ifprintf(filePtr, "%s\t", my1BasedPos.c_str()); numExpected += my1BasedPos.length() + 1; } if(myID.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } else { numWritten += ifprintf(filePtr, "%s\t", myID.c_str()); numExpected += myID.length() + 1; } if(myRef.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } else { numWritten += ifprintf(filePtr, "%s\t", myRef.c_str()); numExpected += myRef.length() + 1; } if(myAlt.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } else { numWritten += ifprintf(filePtr, "%s\t", myAlt.c_str()); numExpected += myAlt.length() + 1; } if(myQual.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } 
else { numWritten += ifprintf(filePtr, "%s\t", myQual.c_str()); numExpected += myQual.length() + 1; } const std::string& filterString = myFilter.getString(); if(filterString.length() == 0) { numWritten += ifprintf(filePtr, ".\t"); numExpected += 2; } else { numWritten += ifprintf(filePtr, "%s\t", filterString.c_str()); numExpected += filterString.length() + 1; } // Write the info. bool writeSuccess = myInfo.write(filePtr); // Only write the format & genotype if we are not just writing siteOnly // data and there is at least one sample if((!siteOnly) && (myGenotype.getNumSamples() != 0)) { writeSuccess &= myGenotype.write(filePtr); } // Write the new line. numWritten += ifprintf(filePtr, "\n"); numExpected += 1; return((numWritten == numExpected) && writeSuccess); } void VcfRecord::reset() { myChrom.clear(); my1BasedPosNum = 0; my1BasedPos.clear(); myID.clear(); myRef.clear(); myAlt.clear(); myAltArray.clear(); myQualNum = 0;; myQual.clear(); myFilter.clear(); myInfo.clear(); myGenotype.clear(); myAlleleCount.clear(); myStatus = StatGenStatus::SUCCESS; } // Return the error after a failed call. const StatGenStatus& VcfRecord::getStatus() { return(myStatus); } const char* VcfRecord::getAlleles(unsigned int index) { if(index == 0) { return(myRef.c_str()); } if(index > getNumAlts()) { // Index out of range. // Throw an exception. throw(std::runtime_error("VcfRecord::getAlleles called with an index that is greater than the number of alternates.")); return(NULL); } // Alternate allele, so return the alternate. 
return(myAltArray.get(index-1).c_str()); } int VcfRecord::getIntAllele(unsigned int index) { const char* alleles = getAlleles(index); switch(alleles[0]) { case 'A': return(1); break; case 'C': return(2); break; case 'G': return(3); break; case 'T': return(4); break; default: std::cerr << "VcfRecord::getIntAllele, unknown allele, " << alleles[0] << std::endl; } return(0); } unsigned int VcfRecord::getNumAlts() { int numAlts = myAltArray.size(); if(numAlts != 0) { // Already parsed so just return the number of alternates. return(numAlts); } // Check if it is just '.'. if((myAlt.length() == 1) && (myAlt == ".")) { // No alternates. return(0); } // Parse the alternates by looping looking for commas. std::string* altStr = &(myAltArray.getNextEmpty()); for(std::string::iterator iter = myAlt.begin(); iter != myAlt.end(); iter++) { if(*iter == ',') { altStr = &(myAltArray.getNextEmpty()); } else { altStr->push_back(*iter); } } return(myAltArray.size()); } int VcfRecord::getAlleleCount(unsigned int index, VcfSubsetSamples* sampleSubset) { unsigned int numAlts = getNumAlts(); if(index > numAlts) { // Index out of range. // Throw an exception. throw(std::runtime_error("VcfRecord::getAlleles called with an index that is greater than the number of alternates.")); return(-1); } if(myAlleleCount.size() == 0) { unsigned int gt = 0; myAlleleCount.resize(numAlts+1, 0); // Loop through the samples, counting the number of each allele. for(int sampleNum = 0; sampleNum < getNumSamples(); sampleNum++) { if((sampleSubset != NULL) && !(sampleSubset->keep(sampleNum))) { // Skip this sample. continue; } for(int gtNum = 0; gtNum < getNumGTs(sampleNum); gtNum++) { gt = getGT(sampleNum, gtNum); if((gt < 0) || (gt > numAlts)) { // Out of range GT, so continue to the next gt continue; } // Increment the minor allele count ++myAlleleCount[gt]; } } } // Alternate allele, so return the alternate. 
return(myAlleleCount[index]); } bool VcfRecord::readTilTab(IFILE filePtr, std::string& stringRef) { int charRead = 0; while(1) { charRead = ifgetc(filePtr); if((charRead == '\n') || (charRead == EOF)) { // Didn't find a tab, found a '\n' or eof // It still populated the string with values up // until the tab. return(false); } if(charRead == '\t') { // hit the tab character, so exit the loop. break; } stringRef += charRead; } return(true); } libStatGen-1.0.14/vcf/VcfRecord.h000066400000000000000000000203641254730101300164360ustar00rootroot00000000000000/* * Copyright (C) 2011-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_RECORD_H__ #define __VCF_RECORD_H__ #include #include #include "VcfRecordFilter.h" #include "VcfRecordInfo.h" #include "VcfRecordGenotype.h" #include "StatGenStatus.h" #include "VcfRecordDiscardRules.h" /// This header file provides interface to read/write VCF files. class VcfRecord { public: /// Default Constructor, initializes the variables, but does not open /// any files. VcfRecord(); /// Destructor virtual ~VcfRecord(); /// Read the next Vcf data line from the file. /// \param filePtr IFILE to read from. 
/// \param siteOnly only store the first 8 columns /// \param sampleSubset pointer to sample subset information, /// but NULL if sample subsetting is not to be done. /// \param discardRules pointer to record discard information, /// but NULL if record discarding is not to be done. /// \return true if a line was successfully read from the specified filePtr, /// false if not. bool read(IFILE filePtr, bool siteOnly, VcfRecordDiscardRules& discardRules, VcfSubsetSamples* sampleSubset = NULL); /// Write this data line to the file (including the newline). /// \param filePtr IFILE to write to. /// \param siteOnly only write the first 8 columns /// \return true if a line was successfully written to the specified filePtr, /// false if not. bool write(IFILE filePtr, bool siteOnly); /// Reset this header, preparing for a new one. void reset(); /// Returns the status associated with the last method that sets the status. /// \return StatGenStatus of the last command that sets status. const StatGenStatus& getStatus(); // bool isValid(); /////////////////////// /// @name Get Vcf Fields /// Get methods for record fields (do not set status). //@{ const char* getChromStr() {return(myChrom.c_str());} int get1BasedPosition() {return(my1BasedPosNum);} const char* getIDStr() {return(myID.c_str());} const char* getRefStr() {return(myRef.c_str());} int getNumRefBases() {return(myRef.size());} const char* getAltStr() {return(myAlt.c_str());} /// Return a pointer to the alleles at the specified index with index 0 /// being the reference string for this position and index 1 starting /// the alternate alleles, throwing an exception if the index is out of /// range. 
/// \param index allele index (0 for reference, 1 for first alt, /// 2 for second, etc) /// \return string of the alleles at the specified index const char* getAlleles(unsigned int index); /// Return the int value of the first allele in the string at the /// specified index with index 0 being the reference string for /// this position and index 1 starting the alternate alleles, /// throwing an exception if the index is out of range. /// \param index allele index (0 for reference, 1 for first alt, /// 2 for second, etc) /// \return int allele at the specified index, 1=A, 2=C, 3=G, 4=T int getIntAllele(unsigned int index); /// Return the number of alternates listed in the Alts string. unsigned int getNumAlts(); float getQual() {return(myQualNum);} const char* getQualStr() {return(myQual.c_str());} /// Return a reference to the filter information. VcfRecordFilter& getFilter(){return(myFilter);} /// Return whether or not all filters were passed. int passedAllFilters() { return(myFilter.passedAllFilters()); } /// Get a reference to the information field. VcfRecordInfo& getInfo() {return myInfo;} /// Get a reference to the genotype fields. VcfRecordGenotype& getGenotypeInfo() {return myGenotype;} inline int getNumSamples() { return(myGenotype.getNumSamples()); } inline int getNumGTs(int index) { return(myGenotype.getNumGTs(index)); } inline int getGT(int sampleNum, unsigned int gtIndex) { return(myGenotype.getGT(sampleNum, gtIndex)); } inline void setGT(int sampleNum, unsigned int gtIndex, int newGt) { myGenotype.setGT(sampleNum, gtIndex, newGt); } /// Return true if all of the samples are phased and none are unphased, /// false if any are unphased or not phased. inline bool allPhased() { return(myGenotype.allPhased()); } /// Return true if all of the samples are unphased and none are phased, /// false if any are phased or not unphased. 
inline bool allUnphased() { return(myGenotype.allUnphased()); } /// Return true if all samples of all records have all the genotype alleles /// specified, false if not or if any GT field is missing. bool hasAllGenotypeAlleles() { return(myGenotype.hasAllGenotypeAlleles()); } /// Return the number of occurances of the specified allele index in the /// genotypes for this record. Index 0 for the reference. The alternate /// alleles start with index 1. An exception is thrown if the index is /// out of range. Optionally, the specified subset of samples can be /// skipped when determining allele counts. (If the record is read with /// just a subset of samples, those are automatically excluded here /// regardless of the passed in sampleSubset. /// \param index allele index (0 for reference, 1 for first alt, /// 2 for second, etc) /// \param sampleSubset pointer to sample subset information, /// but NULL if additional sample subsetting is not to be done. /// \return int allele count for the specified ref/alt. int getAlleleCount(unsigned int index, VcfSubsetSamples* sampleSubset = NULL); //@} /////////////////////// /// @name Set Vcf Fields /// Set methods for record fields (do not set status). //@{ void setChrom(const char* chrom) {myChrom = chrom;} void set1BasedPosition(int pos) {my1BasedPosNum = pos;} void setID(const char* id) {myID = id;} void setRef(const char* ref) {myRef = ref;} void setAlt(const char* alt) {myAlt = alt; myAltArray.clear();} // void setQual(float qual) {myQualNum = qual; } void setQual(const char* qual) { myQual = qual; if(myQual != ".") { myQualNum = atof(qual); } else { myQualNum = -1; } } protected: /// Read the specified file until a tab, '\n', or EOF is found /// and appending the read characters to stringRef (except for the /// stopping character). /// \param filePtr open file to be read. /// \param stringRef reference to a string that should be appended to /// with the characters read from the file until a '\t', '\n', or EOF /// is found. 
/// \return true if a '\t' stopped the reading, false if '\n' or /// EOF stopped the reading. bool readTilTab(IFILE filePtr, std::string& stringRef); private: VcfRecord(const VcfRecord& vcfRecord); VcfRecord& operator=(const VcfRecord& vcfRecord); static const char ALT_DELIM = ','; std::string myChrom; int my1BasedPosNum; std::string my1BasedPos; vcfIDtype myID; std::string myRef; std::string myAlt; float myQualNum; std::string myQual; VcfRecordFilter myFilter; VcfRecordInfo myInfo; VcfRecordGenotype myGenotype; ReusableVector myAltArray; std::vector myAlleleCount; // The status of the last failed command. StatGenStatus myStatus; std::string myDummyString; // Set Pointers to each piece. }; #endif libStatGen-1.0.14/vcf/VcfRecordDiscardRules.cpp000066400000000000000000000044101254730101300212700ustar00rootroot00000000000000/* * Copyright (C) 2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "VcfRecordDiscardRules.h" void VcfRecordDiscardRules::reset() { myExcludeIDs.clear(); myIncludeIDs.clear(); myNumDiscarded = 0; } bool VcfRecordDiscardRules::setExcludeIDs(const char* filename) { return(setIDs(myExcludeIDs, filename)); } bool VcfRecordDiscardRules::setIncludeIDs(const char* filename) { return(setIDs(myIncludeIDs, filename)); } bool VcfRecordDiscardRules::discardForID(std::string& myID) { if(!myExcludeIDs.empty()) { if(myExcludeIDs.find(myID) != myExcludeIDs.end()) { // The ID is in the exclude list, // so return true, discard the record. // increment the discard counter. ++myNumDiscarded; return(true); } } else if(!myIncludeIDs.empty()) { if(myIncludeIDs.find(myID) == myIncludeIDs.end()) { // The ID is not in the include list, // so return false, discard the record. // increment the discard counter. ++myNumDiscarded; return(true); } } return(false); } bool VcfRecordDiscardRules::setIDs(IDList& idlist, const char* filename) { // Open the file nad read in all the exclude ids. IFILE idFile = ifopen(filename, "r"); if(idFile == NULL) { return(false); } std::string line; while(idFile->readLine(line) == 0) { idlist.insert(line); line.clear(); } if(!line.empty()) { idlist.insert(line); line.clear(); } return(true); } libStatGen-1.0.14/vcf/VcfRecordDiscardRules.h000066400000000000000000000047751254730101300207530ustar00rootroot00000000000000/* * Copyright (C) 2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __VCF_RECORD_DISCARD_RULES_H__
#define __VCF_RECORD_DISCARD_RULES_H__

// NOTE(review): in the copy under review the operands of the four system
// includes below, and the template argument of the two IDList typedefs
// further down, are missing - restore them from the upstream file.
#include
#ifdef __GXX_EXPERIMENTAL_CXX0X__
#include
#else
#include
#endif
#include
#include "VcfHeader.h"

/// Type used for a VCF record's ID field.
typedef std::string vcfIDtype;

/// Rules for deciding, from a record's ID, whether the record is discarded
/// while reading, plus a count of how many records have been discarded.
class VcfRecordDiscardRules
{
public:
    /// Constructor: both ID lists start empty and the discard count at 0.
    VcfRecordDiscardRules()
        : myExcludeIDs(),
          myIncludeIDs(),
          myNumDiscarded(0)
    {}

    ~VcfRecordDiscardRules()
    {
    }

    /// Clear both ID lists and set the discard count back to 0.
    void reset();

    /// Return the number of records discardForID has reported as discarded
    /// since the count was last cleared.
    int getNumDiscarded() { return(myNumDiscarded); }

    /// Set the discard count back to 0 without changing the rules.
    void clearNumDiscarded() { myNumDiscarded = 0; }

    ///////////////////////
    /// @name  Set the discard rules.
    //@{
    /// When reading records, skip all variants with the ids specified
    /// in the passed in filename.
    /// Returns false, if the file could not be read.
    bool setExcludeIDs(const char* filename);

    /// When reading records, keep only variants with the ids specified
    /// in the passed in filename.
    /// Returns false, if the file could not be read.
    bool setIncludeIDs(const char* filename);
    //@}

    ///////////////////////
    /// @name  Check if a record should be kept.
    //@{
    /// Return whether or not to discard the record based on the id.
    /// If any exclude ids are set, only the exclude list is checked;
    /// the include list is used only when the exclude list is empty.
    /// Returns true if it should be discarded, false if not.
    bool discardForID(std::string& myID);
    //@}

private:
    // Container holding a list of ids.
#ifdef __GXX_EXPERIMENTAL_CXX0X__
    typedef std::unordered_set IDList;
#else
    typedef std::set IDList;
#endif

    // Declared private so that the rules cannot be copied or assigned.
    VcfRecordDiscardRules(const VcfRecordDiscardRules& vcfRecordDiscardRules);
    VcfRecordDiscardRules& operator=(const VcfRecordDiscardRules& vcfRecordDiscardRules);

    // Insert every line of the specified file into idlist.
    // Returns false if the file could not be opened.
    bool setIDs(IDList& idlist, const char* filename);

    // IDs of the records to discard.
    IDList myExcludeIDs;
    // IDs of the records to keep (consulted only if myExcludeIDs is empty).
    IDList myIncludeIDs;

    // Number of records discarded since the count was last cleared.
    int myNumDiscarded;
};

#endif
libStatGen-1.0.14/vcf/VcfRecordField.cpp000066400000000000000000000016021254730101300177270ustar00rootroot00000000000000 /*
 * Copyright (C) 2011 Regents of the University of Michigan,
 * Hyun Min Kang, Matthew Flickenger, Matthew Snyder,
 * and Goncalo Abecasis
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "VcfRecordField.h"
libStatGen-1.0.14/vcf/VcfRecordField.h000066400000000000000000000035061254730101300174010ustar00rootroot00000000000000/*
 * Copyright (C) 2011 Regents of the University of Michigan,
 * Hyun Min Kang, Matthew Flickenger, Matthew Snyder,
 * and Goncalo Abecasis
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_RECORD_FIELD_H__ #define __VCF_RECORD_FIELD_H__ #include "InputFile.h" /// This header file provides interface to read/write VCF files. class VcfRecordField { public: /// Default Constructor, initializes the variables. VcfRecordField() {} /// Destructor virtual ~VcfRecordField() {} /// Read this field from the file up until the next \t,\n, or EOF. /// Reads the \t, \n, or EOF. /// \param filePtr IFILE to read from. /// \return true if the field was successfully read from the specified /// filePtr, false if not. virtual bool read(IFILE filePtr) = 0; /// Write this field to the file, without printing the // starting/trailing '\t'. /// \return true if the field was successfully written to the specified /// filePtr, false if not. virtual bool write(IFILE filePtr) = 0; protected: private: VcfRecordField(const VcfRecordField& field); VcfRecordField& operator=(const VcfRecordField& field); }; #endif libStatGen-1.0.14/vcf/VcfRecordFilter.cpp000077500000000000000000000104151254730101300201360ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "VcfRecordFilter.h" #include "VcfHelper.h" bool VcfRecordFilter::ourParseFilter = false; bool VcfRecordFilter::read(IFILE filePtr) { static const std::string fieldStopCharsNoParse = "\n\t"; static const int tabPos = 1; static std::string fieldStopChars = fieldStopCharsNoParse; fieldStopChars += FILTER_DELIM; // The start of the first character in stopChars that means there is more // filter info in the format field, so continue reading the format field. static const int contPos = fieldStopCharsNoParse.length(); // Clear out any previously set values. reset(); if(ifeof(filePtr)) { // End of file, just return false. return(false); } // Check how much the filter should be parsed. int stopPos = 0; if(!ourParseFilter) { // Do not need to parse the filter, so just read until the tab. stopPos = filePtr->readTilChar(fieldStopCharsNoParse, myFilterString); } else { // Parse the filter as we go. stopPos = contPos; std::string* nextFilter; while(stopPos >= contPos) { nextFilter = &(myFilterVector.getNextEmpty()); stopPos = filePtr->readTilChar(fieldStopChars, *nextFilter); } } return(stopPos == tabPos); } void VcfRecordFilter::reset() { myFilterString.clear(); myFilterVector.reset(); } bool VcfRecordFilter::passedAllFilters() { static std::string pass("PASS"); return(pass.compare(getString()) == 0); } const std::string& VcfRecordFilter::getString() { // Check if the filter string needs to be set. if(myFilterString.size() == 0) { // Filter string is not yet set, so set it. for(int i = 0; i < myFilterVector.size(); i++) { if(i != 0) { myFilterString += ';'; } myFilterString += myFilterVector.get(i); } } return(myFilterString); } int VcfRecordFilter::getNumFilters() { // Check if the filter has been parsed. if(myFilterVector.size() == 0) { // Filter is not parsed, so parse the filter. 
VcfHelper::parseString(myFilterString, FILTER_DELIM, myFilterVector); } return(myFilterVector.size()); } const std::string& VcfRecordFilter::getString(int index) { // Check if the filter has been parsed yet. if(myFilterVector.size() == 0) { // Filter has not yet been parsed, so parse it. VcfHelper::parseString(myFilterString, FILTER_DELIM, myFilterVector); } return(myFilterVector.get(index)); } void VcfRecordFilter::setFilter(const char* filter) { reset(); myFilterString = filter; } void VcfRecordFilter::addFilter(const char* filter) { // If both are empty, add to both. if((myFilterString.size() == 0) && (myFilterVector.size() == 0)) { myFilterString += filter; myFilterVector.getNextEmpty() = filter; } else { // Either filter (or both) have contents, so append // as appropriate. if(myFilterString.size() != 0) { // String is set, so add the filter to the string. myFilterString += ';'; myFilterString += filter; } if(myFilterVector.size() != 0) { // Vector is set, so add the filter to the vector. myFilterVector.getNextEmpty() = filter; } } } libStatGen-1.0.14/vcf/VcfRecordFilter.h000066400000000000000000000050111254730101300175740ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
 */

#ifndef __VCF_RECORD_FILTER_H__
#define __VCF_RECORD_FILTER_H__

#include "InputFile.h"
#include "ReusableVector.h"

/// Stores the FILTER field of a VCF record, either as the raw string or as
/// the individual ';'-separated filters.
class VcfRecordFilter
{
public:
    /// Default Constructor, initializes the variables.
    VcfRecordFilter() {}

    /// Destructor
    virtual ~VcfRecordFilter() {}

    /// Whether or not to initially parse the filter as it is read
    /// from the file.
    static void parseRecordFilter(bool parse) {ourParseFilter = parse;}

    /// Read this field from the file up until the next \t,\n, or EOF.
    /// Reads the \t, \n, or EOF.
    /// \param filePtr IFILE to read from.
    /// \return true if the field was successfully read from the specified
    /// filePtr, false if not.
    bool read(IFILE filePtr);

    /// Clear the stored filter string and parsed filters.
    void reset();

    /// Return true if all filters were passed (contents is PASS).
    bool passedAllFilters();

    /// Get the entire filter string returned as a const reference.
    const std::string& getString();

    /// Get the number of filters in this string.
    int getNumFilters();

    /// Get the filter at the specified index (starting at 0).
    /// If the index is out of range, an empty string is returned.
    const std::string& getString(int index);

    /// Same as reset().
    void clear() {reset();}

    /// Replace the contents with the specified filter string.
    void setFilter(const char* filter);
    /// Append the specified filter to the current contents.
    void addFilter(const char* filter);

protected:

private:
    // Copy construction/assignment are declared private (no definition is
    // visible here), presumably to disallow copying.
    VcfRecordFilter(const VcfRecordFilter& vcfRecordFilter);
    VcfRecordFilter& operator=(const VcfRecordFilter& vcfRecordFilter);

    // Whether read() splits the filter into myFilterVector as it reads.
    static bool ourParseFilter;

    static const char FILTER_DELIM = ';';

    std::string myFilterString;
    // NOTE(review): the template argument is missing in this copy
    // (presumably std::string, since elements are assigned from and
    // returned as strings); restore from the upstream source.
    ReusableVector myFilterVector;
};

#endif
libStatGen-1.0.14/vcf/VcfRecordGenotype.cpp000066400000000000000000000162101254730101300204770ustar00rootroot00000000000000
/*
 * Copyright (C) 2011-2012 Regents of the University of Michigan,
 * Hyun Min Kang, Matthew Flickenger, Matthew Snyder,
 * and Goncalo Abecasis
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "VcfRecordGenotype.h"
// NOTE(review): the header name of the bare #include below and the template
// argument of std::set are missing in this copy (angle-bracket text appears
// to have been stripped); restore from the upstream source.
#include

std::set VcfRecordGenotype::ourStoreFields;

/// Clear the list of fields to store; an empty list means every field is
/// stored (see storeField).
void VcfRecordGenotype::storeAllFields()
{
    ourStoreFields.clear();
}

/// Add a field to the set of fields that are stored when reading.
void VcfRecordGenotype::addStoreField(const char* field)
{
    ourStoreFields.insert(field);
}

/// Return true if the specified field should be stored when reading.
bool VcfRecordGenotype::storeField(std::string& field)
{
    if(ourStoreFields.size() == 0)
    {
        // No fields were set so read all fields.
return(true); } return(ourStoreFields.find(field) != ourStoreFields.end()); } VcfRecordGenotype::VcfRecordGenotype() { reset(); } VcfRecordGenotype::~VcfRecordGenotype() { } bool VcfRecordGenotype::read(IFILE filePtr) { return(read(filePtr, NULL)); } bool VcfRecordGenotype::read(IFILE filePtr, VcfSubsetSamples* subsetInfo) { // Needed for skipping samples. static const std::string fieldEndChars = "\n\t"; static const int tabPos = 1; // Clear out any previously set values. reset(); if(ifeof(filePtr)) { // End of file, just return false. return(false); } // Read the format. if(!myFormat.read(filePtr)) { // No more fields return(false); } // Read all the samples until the end of the line. VcfGenotypeSample* nextSample = NULL; bool moreSamples = true; int sampleIndex = 0; while(moreSamples) { // Done reading the format field, so read the samples. // Check if this sample should be kept. if(subsetInfo != NULL) { // Check if this sample should be kept. if(!subsetInfo->keep(sampleIndex)) { // this sample should not be kept. if(filePtr->readTilChar(fieldEndChars) != tabPos) { // Stopped on new line or end of file instead of // a tab, so no more samples to read. moreSamples = false; } ++sampleIndex; continue; } } // Read this sample. nextSample = &(mySamples.getNextEmpty()); if(nextSample == NULL) { throw(std::runtime_error("VCF failed to get another sample.")); } if(!nextSample->read(filePtr, myFormat)) { // No more fields. moreSamples = false; } ++sampleIndex; } // Return whether or not a tab was found at the end of the field. return(false); } bool VcfRecordGenotype::write(IFILE filePtr) { bool status = true; // Check if there are any fields to write. if(myFormat.getNumFields() == 0) { // Nothing to write. return(true); } // Write the format. status &= myFormat.write(filePtr); // Loop through and write each sample. 
for(int i = 0; i < mySamples.size(); i++) { status &= mySamples.get(i).write(filePtr); } return(status); } void VcfRecordGenotype::reset() { myFormat.reset(); mySamples.reset(); } const std::string* VcfRecordGenotype::getString(const std::string& key, int sampleNum) { if(sampleNum >= mySamples.size()) { // Out of range sample index. return(NULL); } // Get the field from the sample. return(mySamples.get(sampleNum).getString(key)); } bool VcfRecordGenotype::setString(const std::string& key, int sampleNum, const std::string& value) { if(sampleNum >= mySamples.size()) { // Out of range sample index. return(NULL); } // Set the field in the sample. return(mySamples.get(sampleNum).setString(key, value)); } int VcfRecordGenotype::getGT(int sampleNum, unsigned int gtIndex) { if(sampleNum >= mySamples.size()) { // Out of range sample index. return(VcfGenotypeSample::INVALID_GT); } // Get the field from the sample. return(mySamples.get(sampleNum).getGT(gtIndex)); } void VcfRecordGenotype::setGT(int sampleNum, unsigned int gtIndex, int newGt) { if(sampleNum >= mySamples.size()) { // Out of range sample index. throw(std::runtime_error("setGT called with out of range sample.")); } // Set the field for the sample. mySamples.get(sampleNum).setGT(gtIndex, newGt); } int VcfRecordGenotype::getNumGTs(int sampleNum) { if(sampleNum >= mySamples.size()) { // Out of range sample index, no GTs. return(0); } // Get the field from the sample. return(mySamples.get(sampleNum).getNumGTs()); } bool VcfRecordGenotype::allPhased() { for(int i = 0; i < mySamples.size(); i++) { if(!mySamples.get(i).isPhased() || mySamples.get(i).isUnphased()) { // found a sample that is not phased or is unphased, so // return false. return(false); } } // All phased. return(true); } bool VcfRecordGenotype::allUnphased() { for(int i = 0; i < mySamples.size(); i++) { if(!mySamples.get(i).isUnphased() || mySamples.get(i).isPhased()) { // found a sample that is not unphased or is phased, so // return false. 
return(false); } } // All unphased. return(true); } bool VcfRecordGenotype::hasAllGenotypeAlleles() { for(int i = 0; i < mySamples.size(); i++) { if(!mySamples.get(i).hasAllGenotypeAlleles()) { // found a sample that does not have all genotype alleles, so // return false. return(false); } } // All have all genotype alleles. return(true); } bool VcfRecordGenotype::isPhased(int sampleNum) { if(sampleNum >= mySamples.size()) { // Out of range sample index. return(false); } return(mySamples.get(sampleNum).isPhased()); } bool VcfRecordGenotype::isUnphased(int sampleNum) { if(sampleNum >= mySamples.size()) { // Out of range sample index. return(false); } return(mySamples.get(sampleNum).isUnphased()); } bool VcfRecordGenotype::hasAllGenotypeAlleles(int sampleNum) { if(sampleNum >= mySamples.size()) { // Out of range sample index. return(false); } return(mySamples.get(sampleNum).hasAllGenotypeAlleles()); } libStatGen-1.0.14/vcf/VcfRecordGenotype.h000066400000000000000000000126631254730101300201540ustar00rootroot00000000000000/* * Copyright (C) 2011-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
 */

#ifndef __VCF_RECORD_GENOTYPE_H__
#define __VCF_RECORD_GENOTYPE_H__

// NOTE(review): the header name of the bare #include below is missing in
// this copy (angle-bracket text appears to have been stripped); restore it
// from the upstream source.
#include
#include "VcfRecordField.h"
#include "ReusableVector.h"
#include "VcfSubsetSamples.h"
#include "VcfGenotypeFormat.h"
#include "VcfGenotypeSample.h"

/// Stores the genotype portion of a VCF record: the FORMAT field and the
/// per-sample values that follow it.
class VcfRecordGenotype : public VcfRecordField
{
public:
    /// When reading, store all fields.
    static void storeAllFields();

    /// When reading, store the specified field in addition to any others
    /// that have previously been specified.  By default, all fields are stored.
    /// Use this method if you only want to store certain fields.
    static void addStoreField(const char* field);

    /// Return true if the specified field has been set to be stored.
    static bool storeField(std::string& field);

    /// Default Constructor, initializes the variables.
    VcfRecordGenotype();

    /// Destructor
    virtual ~VcfRecordGenotype();

    /// Read this genotype field from the file up until the next \t,\n, or EOF.
    /// \param filePtr IFILE to read from.
    /// \return true if a tab ended the field, false if it was \n or EOF (always
    /// returns false since this is the last field on the line).
    bool read(IFILE filePtr);

    /// Read this genotype field from the file up until the next \t,\n, or EOF.
    /// \param filePtr IFILE to read from.
    /// \param subsetInfo pointer to optional subsetting information.
    /// \return true if a tab ended the field, false if it was \n or EOF (always
    /// returns false since this is the last field on the line).
    bool read(IFILE filePtr, VcfSubsetSamples* subsetInfo);

    /// Write the genotype field to the file, without printing the
    /// starting/trailing '\t'.
    /// \param filePtr IFILE to write to.
    /// \return true if the field was successfully written to the specified
    /// filePtr, false if not.
    bool write(IFILE filePtr);

    /// reset the field for a new entry.
    void reset();
    /// reset the field for a new entry.
    void clear() {reset();}

    /// Get a pointer to the string containing the value associated with the
    /// specified key for the specified sample
    /// (the pointer will be invalid if the field is changed/reset).
    /// \param key to find the value for.
    /// \param sampleNum which sample to get the value for (starts at 0).
    /// \return const pointer to the string value for this key, NULL if
    /// the sample or the key was not found.
    const std::string* getString(const std::string& key,
                                 int sampleNum);

    /// Set the string associated with the specified key for the specified
    /// sample, returns true if set, false if not.
    bool setString(const std::string& key, int sampleNum,
                   const std::string& value);

    /// Get the genotype allele at gtIndex for the specified sample.
    /// Returns VcfGenotypeSample::INVALID_GT if sampleNum is past the last
    /// sample.
    int getGT(int sampleNum, unsigned int gtIndex);
    /// Set the genotype allele at gtIndex for the specified sample.
    /// Throws std::runtime_error if sampleNum is past the last sample.
    void setGT(int sampleNum, unsigned int gtIndex, int newGt);

    /// Get the number of genotype alleles for the specified sample,
    /// 0 if sampleNum is past the last sample.
    int getNumGTs(int sampleNum);

    /// Determine if all samples are phased.  Returns true if all are phased
    /// and false if any are not phased or if any are unphased.
    bool allPhased();

    /// Determine if all samples are unphased.  Returns true if all are unphased
    /// and false if any are not unphased or if any are phased.
    bool allUnphased();

    /// Determine if all samples have all the genotype alleles specified.
    /// Returns true if all genotype alleles are specified and false if
    /// any are missing ('.') or if GT is not specified.
    bool hasAllGenotypeAlleles();

    /// Determine if the specified sample number is phased, returns true
    /// if it is phased and false if it is unphased or the sample number is
    /// invalid.
    bool isPhased(int sampleNum);

    /// Determine if the specified sample number is unphased, returns true
    /// if it is unphased and false if it is phased or the sample number is
    /// invalid.
    bool isUnphased(int sampleNum);

    /// Determine if the specified sample number has all of its genotype
    /// alleles specified.  Returns true if all genotype alleles are specified
    /// and false if any are missing ('.') or if GT is not specified.
bool hasAllGenotypeAlleles(int sampleNum); /// Get the number of samples. inline int getNumSamples() const { return(mySamples.size()); } protected: private: VcfRecordGenotype(const VcfRecordGenotype& gt); VcfRecordGenotype& operator=(const VcfRecordGenotype& gt); // Fields that should be stored when reading for all records. static std::set ourStoreFields; VcfGenotypeFormat myFormat; ReusableVector mySamples; }; #endif libStatGen-1.0.14/vcf/VcfRecordInfo.cpp000077500000000000000000000105701254730101300176060ustar00rootroot00000000000000/* * Copyright (C) 2011-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfRecordInfo.h" #include VcfRecordInfo::VcfRecordInfo() { reset(); } VcfRecordInfo::~VcfRecordInfo() { } bool VcfRecordInfo::read(IFILE filePtr) { // Clear out any previously set values. reset(); if(ifeof(filePtr)) { // End of file, just return false. return(false); } static const std::string keyStopChars = "\n\t;="; static const std::string valueStopChars = "\n\t;"; // The start of the first character in stopChars // that means there is more information for this object, so // continue reading the file. static const int contPos = 2; static const int tabPos = 1; // Keep reading. Loop will be exited // when a \t, \n, or EOF is found. 
int stopPos = contPos; while(stopPos >= contPos) { // Get the next element to write the key into. InfoElement& nextElement = myInfo.getNextEmpty(); // Read the next key. stopPos = filePtr->readTilChar(keyStopChars, nextElement.key); if(keyStopChars[stopPos] == '=') { // Stoped at the value part, so read the value // associated with the key. stopPos = filePtr->readTilChar(valueStopChars, nextElement.value); } } // Return whether or not a tab was found at the end of the field. return(stopPos == tabPos); } bool VcfRecordInfo::write(IFILE filePtr) { // If there are no entries, write '.'. int infoSize = myInfo.size(); if(infoSize <= 0) { return(ifprintf(filePtr, "%c", EMPTY_INFO) == 1); } int numWritten = 0; int numExpected = 0; for(int i = 0; i < infoSize; i++) { if(i != 0) { numWritten += ifprintf(filePtr, ";"); ++numExpected; } InfoElement& info = myInfo.get(i); if(info.value.empty()) { // No value, just a key. numWritten += ifprintf(filePtr, "%s", info.key.c_str()); numExpected += info.key.size(); } else { // write the key & the value. numWritten += ifprintf(filePtr, "%s=%s", info.key.c_str(), info.value.c_str()); numExpected += info.key.size() + info.value.size() + 1; } } return(numWritten == numExpected); } void VcfRecordInfo::reset() { myInfo.reset(); } void VcfRecordInfo::setString(const char* key, const char* stringVal) { // Check if the field is already there. int infoSize = myInfo.size(); for(int i = 0; i < infoSize; i++) { InfoElement& info = myInfo.get(i); if(info.key.compare(key) == 0) { // Set the value and return. info.value = stringVal; return; } } // Not found, so add a new entry. InfoElement& newElement = myInfo.getNextEmpty(); newElement.key = key; newElement.value = stringVal; } const std::string* VcfRecordInfo::getString(const char* key) { // Check if the field is already there. int infoSize = myInfo.size(); for(int i = 0; i < infoSize; i++) { InfoElement& info = myInfo.get(i); if(info.key.compare(key) == 0) { // Found, so return the value. 
return(&(info.value)); } } // Not found, so return NULL.. return(NULL); } const std::string* VcfRecordInfo::getString(int index) { if(index >= myInfo.size()) { // Out of range. return(NULL); } return(&(myInfo.get(index).value)); } libStatGen-1.0.14/vcf/VcfRecordInfo.h000066400000000000000000000071151254730101300172510ustar00rootroot00000000000000/* * Copyright (C) 2011-2012 Regents of the University of Michigan, * Hyun Min Kang, Matthew Flickenger, Matthew Snyder, * and Goncalo Abecasis * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_RECORD_INFO_H__ #define __VCF_RECORD_INFO_H__ #include #include "VcfRecordField.h" #include "ReusableVector.h" /// This header file provides interface to read/write VCF files. class VcfRecordInfo : public VcfRecordField { public: /// Default Constructor, initializes the variables. VcfRecordInfo(); /// Destructor virtual ~VcfRecordInfo(); /// Read this info field from the file up until the next \t,\n, or EOF. /// \param filePtr IFILE to read from. /// \return true if a tab ended the field, false if it was \n or EOF. bool read(IFILE filePtr); /// Write the info field to the file, without printing the // starting/trailing '\t'. /// \param filePtr IFILE to write to. /// \return true if the field was successfully written to the specified /// filePtr, false if not. bool write(IFILE filePtr); /// reset the field for a new entry. void reset(); /// reset the field for a new entry. 
    void clear() {reset();}

    /// Get the number of info entries currently stored.
    int getNumInfoFields() { return(myInfo.size()); }

    /// Set the string value associated with the specified key.
    /// If the key is already present its value is replaced, otherwise a new
    /// entry is added.
    /// \param key to set the value for.
    /// \param stringVal value to associate with the key.
    void setString(const char* key, const char* stringVal);

    /// Get a pointer to the string containing the value associated with the
    /// specified key (the pointer will be invalid if the field is
    /// changed/reset).
    /// \param key to find the value for.
    /// \return const pointer to the string value for this key, NULL if
    /// the key was not found, a pointer to an empty string if the key
    /// was found, but does not have a value.
    const std::string* getString(const char* key);

    /// Get a pointer to the string containing the value associated with the
    /// specified info index (the pointer will be invalid if the field is
    /// changed/reset).
    /// \param index to get the value for.
    /// \return const pointer to the string value for this index, NULL if
    /// the index is out of range, a pointer to an empty string if the index
    /// is in range, but does not have a value.
    const std::string* getString(int index);

protected:

private:
    // Copy construction/assignment are declared private (no definition is
    // visible here), presumably to disallow copying.
    VcfRecordInfo(const VcfRecordInfo& vcfRecordInfo);
    VcfRecordInfo& operator=(const VcfRecordInfo& vcfRecordInfo);

    // Character written for an INFO field that has no entries.
    static const char EMPTY_INFO = '.';

    /// One INFO entry: a key and its (possibly empty) value.
    class InfoElement
    {
    public:
        InfoElement() {key.clear(); value.clear();}
        std::string key;
        std::string value;
        void clear() {key.clear(); value.clear();}
    };

    // NOTE(review): the template argument is missing in this copy
    // (presumably InfoElement, since elements are bound to InfoElement&);
    // restore from the upstream source.
    ReusableVector myInfo;
};

#endif
libStatGen-1.0.14/vcf/VcfSubsetSamples.cpp000066400000000000000000000151321254730101300203420ustar00rootroot00000000000000
/*
 * Copyright (C) 2012 Regents of the University of Michigan
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "VcfSubsetSamples.h"

/// Clear the keep/skip indicators and the stored sample names.
void VcfSubsetSamples::reset()
{
    mySampleSubsetIndicator.clear();
    mySampleNames.clear();
}


/// Size the keep/skip indicators to the header's sample count, mark every
/// sample as kept (include == true) or skipped (include == false), and copy
/// the header's sample names so they can be looked up later.
/// The header itself is not modified.
void VcfSubsetSamples::init(const VcfHeader& header, bool include)
{
    // Get the number of samples from the header.
    unsigned int origNumSamples = header.getNumSamples();

    // Resize the sampleSubsetIndicator to nothing to clear it out.
    mySampleSubsetIndicator.resize(0);

    // Now resize sampleSubsetIndicator to indicate that all of the original
    // samples are to be kept or not kept based on the include parameter.
    // mySampleSubsetIndicator is sized to the original number of samples
    // so it can be used when reading records to determine which ones should
    // be removed/kept.
    mySampleSubsetIndicator.resize(origNumSamples, include);

    // Copy the vector of original sample names.
    mySampleNames.clear();
    mySampleNames.resize(origNumSamples);
    for(unsigned int i = 0; i < origNumSamples; i++)
    {
        mySampleNames[i] = header.getSampleName(i);
    }
}


/// Mark the named sample as kept.
/// \return true if the sample was found and marked, false if the name is
/// not in the stored sample names or the indicators are not set up.
bool VcfSubsetSamples::addIncludeSample(const char* sampleName)
{
    // Look for the sample name.
    for(unsigned int i = 0; i < mySampleNames.size(); i++)
    {
        if(mySampleNames[i] == sampleName)
        {
            // Found the sample index.
            if(mySampleSubsetIndicator.size() <= i)
            {
                // SampleSubsetIndicator not setup properly.
                return(false);
            }
            mySampleSubsetIndicator[i] = true;
            return(true);
        }
    }
    // Did not find the sample, so can't include it.
    return(false);
}


/// Mark the named sample as skipped.
/// \return true if the sample was found and marked, false if the name is
/// not in the stored sample names or the indicators are not set up.
bool VcfSubsetSamples::addExcludeSample(const char* sampleName)
{
    // Look for the sample name.
    for(unsigned int i = 0; i < mySampleNames.size(); i++)
    {
        if(mySampleNames[i] == sampleName)
        {
            // Found the sample index.
            if(mySampleSubsetIndicator.size() <= i)
            {
                // SampleSubsetIndicator not setup properly.
                return(false);
            }
            mySampleSubsetIndicator[i] = false;
            return(true);
        }
    }
    // Did not find the sample, so can't exclude it.
    return(false);
}


/// Build the include/exclude sample lists from the optional files and the
/// optional single excluded sample, then remove the samples that should not
/// be kept from the header and record the keep/skip decision per original
/// sample index.
/// \return false if an include/exclude file was specified but could not be
/// read, true otherwise.
bool VcfSubsetSamples::init(VcfHeader& header,
                            const char* includeFileName,
                            const char* excludeSample,
                            const char* excludeFileName,
                            const char* delims)
{
    // Setup the sample lists to include/exclude.
    // NOTE(review): the template arguments of the two std::set declarations
    // below are missing in this copy (angle-bracket text appears to have
    // been stripped); restore from the upstream source.
    std::set includeList;
    std::set excludeList;
    if(includeFileName != NULL)
    {
        if(!readSamplesFromFile(includeFileName, includeList, delims))
        {
            // Failed, so return.
            return(false);
        }
    }

    if(excludeFileName != NULL)
    {
        if(!readSamplesFromFile(excludeFileName, excludeList, delims))
        {
            // Failed, so return.
            return(false);
        }
    }
    if(excludeSample != NULL)
    {
        excludeList.insert(excludeSample);
    }

    int origNumSamples = header.getNumSamples();

    // Resize the sampleSubsetIndicator to nothing to clear it out.
    mySampleSubsetIndicator.resize(0);

    // Now resize sampleSubsetIndicator to indicate that all of the original
    // samples are to be kept.  The ones that are not to be kept will be
    // modified to be unkept (false).
// mySampleSubsetIndicator is sized to the original number of samples // so it can be used when reading records to determine which ones should // be removed/kept. mySampleSubsetIndicator.resize(origNumSamples, true); // if no samples, return. if(origNumSamples == 0) { return(true); } // Now that the sample lists to include/exclude are setup and the // indicator vector is setup, subset the header removing samples that // should not be kept (not in the include list if set or in the exclude // list). Loop from the back of the samples to the beginning since // removing samples changes the index of all following samples. for(int i = (origNumSamples-1); i >= 0; i--) { // Check if the sample should be kept. const char* sampleName = header.getSampleName(i); // Remove the sample if the includeList was specified and the sample // was not in it or if the excludeList was specified and the sample // was in it. if((!includeList.empty() && (includeList.count(sampleName) == 0)) || (!excludeList.empty() && (excludeList.count(sampleName) != 0))) { // This sample should be removed. header.removeSample(i); mySampleSubsetIndicator[i] = false; } } return(true); } bool VcfSubsetSamples::keep(unsigned int sampleIndex) { if(sampleIndex >= mySampleSubsetIndicator.size()) { // index out of range. return(false); } return(mySampleSubsetIndicator[sampleIndex]); } bool VcfSubsetSamples::readSamplesFromFile(const char* fileName, std::set& sampleList, const char* delims) { // Open the file. IFILE sampleFile = ifopen(fileName, "r"); if(sampleFile == NULL) { // Failed to open. return(false); } // read the file. std::string tempString; std::string delimString = delims; delimString += '\n'; int readResult = 0; while(readResult != -1) { readResult = sampleFile->readTilChar(delimString, tempString); // Check to see if something was read (tempString is not empty). if(!tempString.empty()) { // sample name found, so add it to the container. 
sampleList.insert(tempString); } // Clear the string being read into. tempString.clear(); } return(true); } libStatGen-1.0.14/vcf/VcfSubsetSamples.h000066400000000000000000000074141254730101300200130ustar00rootroot00000000000000/* * Copyright (C) 2012 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __VCF_SUBSET_SAMPLES_H__ #define __VCF_SUBSET_SAMPLES_H__ #include #include #include #include "VcfHeader.h" class VcfSubsetSamples { public: VcfSubsetSamples() : mySampleSubsetIndicator(), mySampleNames() {} ~VcfSubsetSamples() { mySampleSubsetIndicator.clear(); } void reset(); /// Read the samples from the header initiallizing them all to be /// included/excluded based on the include paramater. The header is /// not stored or updated based on any include/excludes. The mapping /// between sampleNames & indexes is stored to be used for addIncludeSample /// and addExcludeSample. void init(const VcfHeader& header, bool include); /// Include the specified sample. initSample must first be called to /// specify the header mapping between index and sample. /// \return true if the sample could be included, false if the sample /// was not found in the sample list so cannot be included. bool addIncludeSample(const char* sampleName); /// Exclude the specified sample. initSample must first be called to /// specify the header mapping between index and sample. 
/// \return true if the sample was found in teh sample list and could be /// excluded, false if the sample was not found in the sample list. bool addExcludeSample(const char* sampleName); /// Initialize this object based on the sample names found in sampleFileName /// delimited by any of the characters in delims or '\n' and update the /// header to only include the specified samples. /// This also initializes this class to identify which samples should /// be kept/removed when reading records. bool init(VcfHeader& header, const char* sampleFileName, const char* excludeSample, const char* excludeFileName, const char* delims = "\n"); /// Return if the specified original sample index should be kept. /// This is only applicable after calling init. /// If the index is out of range, it will return false (do not keep). /// \param sampleIndex index into the original sample set to check if /// it should be kept. /// \return true if the sample index should be kept, false if not or if /// the index is out of range. bool keep(unsigned int sampleIndex); private: VcfSubsetSamples(const VcfSubsetSamples& vcfSubsetSamples); VcfSubsetSamples& operator=(const VcfSubsetSamples& vcfSubsetSamples); // Read a list of samples from the specified file delimited by any of the // characters in delims or '\n' and store them in the specified container. bool readSamplesFromFile(const char* fileName, std::set& sampleList, const char* delims="\n"); std::vector mySampleSubsetIndicator; // Used for initSample & addIncludeSample & addExcludeSample for // mapping between original sample names and indexes in // mySampleSubsetIndicator. 
std::vectormySampleNames; }; #endif libStatGen-1.0.14/vcf/test/000077500000000000000000000000001254730101300153625ustar00rootroot00000000000000libStatGen-1.0.14/vcf/test/.gitignore000066400000000000000000000000101254730101300173410ustar00rootroot00000000000000vcfTest libStatGen-1.0.14/vcf/test/Main.cpp000066400000000000000000000015461254730101300167600ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "VcfFileTest.h" #include "VcfHeaderTest.h" /* Test-driver entry point: calls testVcfHeader() and then testVcfFile(). argc and argv are not used, and no explicit exit status is returned. */ int main(int argc, char ** argv) { testVcfHeader(); testVcfFile(); } libStatGen-1.0.14/vcf/test/Makefile000066400000000000000000000013011254730101300170150ustar00rootroot00000000000000EXE = vcfTest TOOLBASE = VcfFileTest VcfHeaderTest SRCONLY = Main.cpp TEST_COMMAND = ./vcfTest && diff results/vcfHeader.vcf expected/vcfHeader.vcf && diff results/vcfHeaderAddedFirst.vcf expected/vcfHeader.vcf && diff results/vcfHeaderAddedLast.vcf expected/vcfHeader.vcf && diff results/vcfHeaderAddedMiddle.vcf expected/vcfHeader.vcf && diff results/vcfFile.vcf testFiles/vcfFile.vcf && diff results/vcfFileNoInfo.vcf expected/vcfFileNoInfo.vcf && diff results/vcfFileNoInfoBGZF.vcf expected/vcfFileNoInfoBGZF.vcf && diff results/vcfFileNoInfoKeepGT.vcf expected/vcfFileNoInfoKeepGT.vcf && diff results/vcfFileNoInfoKeepGQHQ.vcf expected/vcfFileNoInfoKeepGQHQ.vcf include ../../Makefiles/Makefile.testlibStatGen-1.0.14/vcf/test/VcfFileTest.cpp000066400000000000000000003537031254730101300202570ustar00rootroot00000000000000/* * Copyright (C) 2011-2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "VcfFileTest.h" #include "VcfFileReader.h" #include "VcfFileWriter.h" #include "VcfHeaderTest.h" #include const std::string HEADER_LINE_SUBSET1="#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002"; const std::string HEADER_LINE_SUBSET2="#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00002 NA00003"; const int NUM_SAMPLES_SUBSET1 = 2; const int NUM_SAMPLES_SUBSET2 = 2; const std::string HEADER_LINE_EXCLUDE_SUBSET1="#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00003"; const int NUM_SAMPLES_EXCLUDE2 = 2; const std::string HEADER_LINE_EXCLUDE2="#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00003"; /* Top-level VCF file test: calls testVcfReadFile(), testVcfWriteFile(), testVcfReadSection(), testVcfReadSectionNoIndex() and testVcfReadSectionBadIndex(), in that order. */ void testVcfFile() { testVcfReadFile(); testVcfWriteFile(); testVcfReadSection(); testVcfReadSectionNoIndex(); testVcfReadSectionBadIndex(); } void testVcfReadFile() { // VcfFileHeader header; // Test open for read via the constructor with return. VcfFileReader reader; VcfHeader header; VcfRecord record; // Try reading without opening. bool caughtException = false; try { assert(reader.readRecord(record) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Try opening a file that doesn't exist. caughtException = false; try { assert(reader.open("fileDoesNotExist.txt", header) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // "testFiles/testVcf.vcf"); // assert(vcfInConstructorReadDefault.WriteHeader(header) == false); // assert(vcfInConstructorReadDefault.ReadHeader(header) == true); // // Test open for write via the constructor. 
// VcfFile vcfInConstructorWrite("results/newWrite.vcf", VcfFile::WRITE, // ErrorHandler::RETURN); // assert(vcfInConstructorWrite.ReadHeader(header) == false); // assert(vcfInConstructorWrite.WriteHeader(header) == true); // // Test open for read via the constructor // VcfFile vcfInConstructorRead("testFiles/testVcf.vcf", VcfFile::READ); // bool caughtException = false; // try // { // assert(vcfInConstructorRead.WriteHeader(header) == false); // } // catch (std::exception& e) // { // caughtException = true; // } // assert(caughtException); // assert(vcfInConstructorRead.ReadHeader(header) == true); // // Test open for write via child class. // VcfFileWriter vcfWriteConstructor("results/newWrite1.vcf"); // caughtException = false; // try // { // assert(vcfWriteConstructor.ReadHeader(header) == false); // } // catch (std::exception& e) // { // caughtException = true; // } // assert(caughtException); // assert(vcfWriteConstructor.WriteHeader(header) == true); // // Test open for read via child class. // VcfFileReader vcfReadConstructor("testFiles/testVcf.vcf"); // caughtException = false; // try // { // assert(vcfReadConstructor.WriteHeader(header) == false); // } // catch (std::exception& e) // { // caughtException = true; // } // assert(caughtException); // assert(vcfReadConstructor.ReadHeader(header) == true); //////////////////////////////// // Test the subset logic. VcfRecordGenotype* sampleInfo = NULL; reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset1.txt", NULL, NULL, ";"); assert(header.getHeaderLine() == HEADER_LINE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Read the records to make sure they were subset. 
assert(reader.readRecord(record)); assert(record.getGT(0,0) == 0); assert(record.getGT(1,1) == 0); assert(record.getGT(1,0) == 1); assert(record.getGT(0,1) == 0); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(2,0) == VcfGenotypeSample::INVALID_GT); assert(strcmp(record.getAlleles(0), "G") == 0); assert(strcmp(record.getAlleles(1), "A") == 0); assert(record.getIntAllele(0) == 3); assert(record.getIntAllele(1) == 1); assert(record.getNumAlts() == 1); try { caughtException = false; assert(record.getIntAllele(2) == 0); } catch (std::exception& e) { caughtException = true; } try { caughtException = false; assert(record.getAlleles(2) == NULL); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.getGT(0,0) == 0); assert(record.getGT(1,1) == 1); assert(record.getGT(1,0) == 0); assert(record.getGT(0,1) == 0); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(2,0) == VcfGenotypeSample::INVALID_GT); assert(record.getNumAlts() == 1); assert(strcmp(record.getAlleles(0), "T") == 0); assert(strcmp(record.getAlleles(1), "A") == 0); try { caughtException = false; assert(record.getAlleles(2) == NULL); } catch (std::exception& e) { caughtException = true; } 
assert(caughtException); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(*(sampleInfo->getString("GT", 1)) == "0|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == false); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.getGT(0,0) == 1); assert(record.getGT(1,1) == 1); assert(record.getGT(1,0) == 2); assert(record.getGT(0,1) == 2); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(2,0) == VcfGenotypeSample::INVALID_GT); assert(strcmp(record.getAlleles(0), "A") == 0); assert(strcmp(record.getAlleles(1), "G") == 0); assert(strcmp(record.getAlleles(2), "T") == 0); assert(record.getIntAllele(2) == 4); assert(record.getNumAlts() == 2); try { caughtException = false; assert(record.getAlleles(3) == NULL); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); 
assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(strcmp(record.getAlleles(0), "T") == 0); try { caughtException = false; assert(record.getAlleles(1) == NULL); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(record.getNumAlts() == 0); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record)); assert(strcmp(record.getAlleles(0), "GTC") == 0); assert(record.getIntAllele(0) == 3); assert(strcmp(record.getAlleles(1), "G") == 0); assert(record.getIntAllele(1) == 3); assert(record.getNumAlts() == 2); assert(strcmp(record.getAlleles(2), "GTCT") == 0); assert(record.getIntAllele(2) == 3); try { caughtException = false; assert(record.getAlleles(3) == NULL); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/1"); assert(*(sampleInfo->getString("GT", 1)) == "0/2"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); 
assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(strcmp(record.getAlleles(0), "GTC") == 0); assert(record.getNumAlts() == 2); assert(strcmp(record.getAlleles(1), "G") == 0); assert(strcmp(record.getAlleles(2), "GTCT") == 0); try { caughtException = false; assert(record.getAlleles(3) == NULL); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(record.getGT(0,0) == 0); assert(record.getGT(1,1) == VcfGenotypeSample::MISSING_GT); assert(record.getGT(1,0) == 0); assert(record.getGT(0,1) == 1); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(2,0) == VcfGenotypeSample::INVALID_GT); assert(record.getNumAlts() == 1); assert(strcmp(record.getAlleles(0), "GTC") == 0); assert(strcmp(record.getAlleles(1), "G") == 0); try { caughtException = false; assert(record.getAlleles(2) == NULL); } catch (std::exception& e) { caughtException = true; } assert(caughtException); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = 
&(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 7); assert(reader.getNumRecords() == 7); reader.close(); ////////////////////////// // Subset with a different file. reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset2.txt", NULL, NULL); assert(header.getHeaderLine() == HEADER_LINE_SUBSET2); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET2); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[1]); assert(header.getSampleName(1) == SAMPLES[2]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 1); // Read the records to make sure they were subset. 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|0"); assert(*(sampleInfo->getString("GT", 1)) == "1/1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0/0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "2|1"); assert(*(sampleInfo->getString("GT", 1)) == "2/2"); assert(sampleInfo->getString("GT", 2) == NULL); 
assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0/0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/2"); assert(*(sampleInfo->getString("GT", 1)) == "1/1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == false); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|."); assert(*(sampleInfo->getString("GT", 1)) == "1|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == true); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == false); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 7); assert(reader.getNumRecords() == 7); ////////////////////////// // Subset using an exclude file reader.open("testFiles/vcfFile.vcf", header, NULL, NULL, "testFiles/subset1.txt", ";"); assert(header.getHeaderLine() == HEADER_LINE_EXCLUDE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES - NUM_SAMPLES_SUBSET1); 
assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[2]); assert(header.getSampleName(1) == NULL); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 0); // Read the records to make sure they were subset. assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "1/1"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); 
assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "2/2"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "1/1"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); 
assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == false); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "1|1"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == true); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 7); assert(reader.getNumRecords() == 
7); ////////////////////////// // Subset with a different exclude. reader.open("testFiles/vcfFile.vcf", header, NULL, NULL, "testFiles/exclude2.txt"); assert(header.getHeaderLine() == HEADER_LINE_EXCLUDE2); assert(header.getNumSamples() == NUM_SAMPLES_EXCLUDE2); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[2]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 1); // Read the records to make sure they were subset. assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1/1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(*(sampleInfo->getString("GT", 1)) == "0/0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); 
assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2/2"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0/0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); 
assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/1"); assert(*(sampleInfo->getString("GT", 1)) == "1/1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == false); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "1|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == true); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); 
assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 7); assert(reader.getNumRecords() == 7); ////////////////////////// // Subset with an exclude sample. reader.open("testFiles/vcfFile.vcf", header, NULL, "NA00002", NULL); assert(header.getHeaderLine() == HEADER_LINE_EXCLUDE2); assert(header.getNumSamples() == NUM_SAMPLES_EXCLUDE2); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[2]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 1); // Read the records to make sure they were subset. assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1/1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); 
assert(*(sampleInfo->getString("GT", 1)) == "0/0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2/2"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0/0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); 
assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/1"); assert(*(sampleInfo->getString("GT", 1)) == "1/1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == false); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == 
"1|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == true); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 7); assert(reader.getNumRecords() == 7); ////////////////////////// // Subset using an exclude file and exclude sample. reader.open("testFiles/vcfFile.vcf", header, NULL, "NA00001", "testFiles/exclude2.txt"); assert(header.getHeaderLine() == HEADER_LINE_EXCLUDE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES - NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[2]); assert(header.getSampleName(1) == NULL); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 0); // Read the records to make sure they were subset. 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "1/1"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "2/2"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == 
false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "1/1"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == 
false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == false); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "1|1"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == true); assert(sampleInfo->allUnphased() == false); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 7); assert(reader.getNumRecords() == 7); ////////////////////////// // Subset using an exclude file and exclude sample. 
// Add variant discard reader.open("testFiles/vcfFile.vcf", header, NULL, "NA00001", "testFiles/exclude2.txt"); reader.setExcludeIDs("testFiles/excludeIDs.txt"); assert(header.getHeaderLine() == HEADER_LINE_EXCLUDE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES - NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[2]); assert(header.getSampleName(1) == NULL); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 0); // Read the records to make sure they were subset. assert(reader.readRecord(record)); assert(strcmp(record.getIDStr(), ".") == 0); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(strcmp(record.getIDStr(), "rs6040355") == 0); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "2/2"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == 
true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(strcmp(record.getIDStr(), ".") == 0); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 3); assert(reader.getNumRecords() == 7); ////////////////////////// // Subset using an exclude file and exclude sample. // Add variant discard reader.open("testFiles/vcfFile.vcf", header, NULL, "NA00001", "testFiles/exclude2.txt"); reader.setIncludeIDs("testFiles/includeIDs.txt"); assert(header.getHeaderLine() == HEADER_LINE_EXCLUDE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES - NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[2]); assert(header.getSampleName(1) == NULL); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 0); // Read the records to make sure they were subset. 
assert(reader.readRecord(record)); assert(strcmp(record.getIDStr(), ".") == 0); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(strcmp(record.getIDStr(), "rs6040355") == 0); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == "2/2"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); assert(strcmp(record.getIDStr(), ".") == 0); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 1); assert(*(sampleInfo->getString("GT", 0)) == 
"0/0"); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->allPhased() == false); assert(sampleInfo->allUnphased() == true); assert(sampleInfo->hasAllGenotypeAlleles() == true); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 3); assert(reader.getNumRecords() == 7); ////////////////////////// // Add in discarding non-phased. reader.setDiscardRules(VcfFileReader::DISCARD_NON_PHASED); reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset1.txt", NULL, NULL, ";"); assert(header.getHeaderLine() == HEADER_LINE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Read the records to make sure they were subset. 
assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); 
assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 4); assert(reader.getNumRecords() == 7); reader.close(); ////////////////////////// // Discard missing GTs. reader.setDiscardRules(VcfFileReader::DISCARD_MISSING_GT); reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset1.txt", NULL, NULL, ";"); assert(header.getHeaderLine() == HEADER_LINE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Read the records to make sure they were subset. assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(*(sampleInfo->getString("GT", 1)) == "0|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); 
assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0/1"); assert(*(sampleInfo->getString("GT", 1)) == "0/2"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 5); assert(reader.getNumRecords() == 7); ////////////////////////// // Discard missing GTs & non-Phased. 
reader.setDiscardRules(VcfFileReader::DISCARD_MISSING_GT | VcfFileReader::DISCARD_NON_PHASED); reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset1.txt", NULL, NULL, ";"); assert(header.getHeaderLine() == HEADER_LINE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Read the records to make sure they were subset. assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(sampleInfo->getString("GT", 2) == NULL); 
assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 3); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding filtered without subsetting. reader.open("testFiles/vcfFile.vcf", header); reader.setDiscardRules(VcfFileReader::DISCARD_FILTERED); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2|1"); assert(*(sampleInfo->getString("GT", 2)) == "2/2"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); 
assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0/1"); assert(*(sampleInfo->getString("GT", 1)) == "0/2"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(sampleInfo->getString("GT", 0) == NULL); assert(sampleInfo->getString("GT", 1) == NULL); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == 
"0|."); assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 6); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding multiple Alts without subsetting. reader.open("testFiles/vcfFile.vcf", header); reader.setDiscardRules(VcfFileReader::DISCARD_MULTIPLE_ALTS); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(*(sampleInfo->getString("GT", 1)) == "0|1"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == false); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); 
assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record)); assert(record.getGT(0,0) == 0); assert(record.getGT(1,1) == VcfGenotypeSample::MISSING_GT); assert(record.getGT(1,0) == 0); assert(record.getGT(0,1) == 1); assert(record.getGT(2,0) == 1); assert(record.getGT(2,1) == 1); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(3,0) == VcfGenotypeSample::INVALID_GT); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 4); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test subsetting and discarding multiple Alts, filter failures, // non-phased, and missing genotypes. 
reader.open("testFiles/vcfFile.vcf", header); reader.setDiscardRules(VcfFileReader::DISCARD_MULTIPLE_ALTS| VcfFileReader::DISCARD_FILTERED | VcfFileReader::DISCARD_MISSING_GT | VcfFileReader::DISCARD_NON_PHASED); reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset1.txt", NULL, NULL, ";"); assert(header.getHeaderLine() == HEADER_LINE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Read the records to make sure they were subset. assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) 
== false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 2); assert(reader.getNumRecords() == 7); reader.close(); ////////////////////////// // Discard missing GTs & non-Phased and filtering // AND discard without at least 2 alternates with no additional subsetting. reader.setDiscardRules(VcfFileReader::DISCARD_MISSING_GT | VcfFileReader::DISCARD_NON_PHASED); reader.addDiscardMinAltAlleleCount(2, NULL); reader.open("testFiles/vcfFile.vcf", header, "testFiles/subset1.txt", NULL, NULL, ";"); assert(header.getHeaderLine() == HEADER_LINE_SUBSET1); assert(header.getNumSamples() == NUM_SAMPLES_SUBSET1); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Read the records to make sure they were subset. assert(reader.readRecord(record)); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 2); assert(*(sampleInfo->getString("GT", 0)) == "1|2"); assert(*(sampleInfo->getString("GT", 1)) == "2|1"); assert(sampleInfo->getString("GT", 2) == NULL); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 1); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding multiple Alts without subsetting // and discard any without at least 3 alts. 
reader.open("testFiles/vcfFile.vcf", header); reader.addDiscardMinAltAlleleCount(3, NULL); reader.setDiscardRules(VcfFileReader::DISCARD_MULTIPLE_ALTS); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.getGT(0,0) == 0); assert(record.getGT(1,1) == VcfGenotypeSample::MISSING_GT); assert(record.getGT(1,0) == 0); assert(record.getGT(0,1) == 1); assert(record.getGT(2,0) == 1); assert(record.getGT(2,1) == 1); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(3,0) == VcfGenotypeSample::INVALID_GT); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); 
assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 2); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding multiple Alts without subsetting // and discard any without at least 3 alts and only samples 1 & 2. reader.open("testFiles/vcfFile.vcf", header); VcfSubsetSamples minAltAlleleSubset; minAltAlleleSubset.init(header, true); minAltAlleleSubset.addExcludeSample("NA00002"); reader.addDiscardMinAltAlleleCount(3, &minAltAlleleSubset); reader.setDiscardRules(VcfFileReader::DISCARD_MULTIPLE_ALTS); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == 3); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. 
assert(reader.readRecord(record)); assert(record.getGT(0,0) == 0); assert(record.getGT(1,1) == VcfGenotypeSample::MISSING_GT); assert(record.getGT(1,0) == 0); assert(record.getGT(0,1) == 1); assert(record.getGT(2,0) == 1); assert(record.getGT(2,1) == 1); assert(record.getGT(1,2) == VcfGenotypeSample::INVALID_GT); assert(record.getGT(3,0) == VcfGenotypeSample::INVALID_GT); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 1); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding minor allele count < 1 without subsetting. reader.rmDiscardMinAltAlleleCount(); reader.setDiscardRules(0); reader.addDiscardMinMinorAlleleCount(1, NULL); reader.open("testFiles/vcfFile.vcf", header); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0/0"); assert(*(sampleInfo->getString("GT", 1)) == "0|1"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == false); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); 
assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == true); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0/1"); assert(*(sampleInfo->getString("GT", 1)) == "0/2"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == false); assert(sampleInfo->isPhased(1) == false); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == true); assert(sampleInfo->isUnphased(1) == true); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 2); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 5); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding minor allele count < 2 without subsetting. 
reader.addDiscardMinMinorAlleleCount(2, NULL); reader.open("testFiles/vcfFile.vcf", header); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); 
assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 3); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding minor allele count < 1 with subsetting. VcfSubsetSamples minMinorAlleleSubset; minMinorAlleleSubset.init(header, true); minMinorAlleleSubset.addExcludeSample("NA00002"); reader.addDiscardMinMinorAlleleCount(1, &minMinorAlleleSubset); reader.open("testFiles/vcfFile.vcf", header); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. 
assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record)); assert(record.allPhased() == true); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == false); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|1"); assert(*(sampleInfo->getString("GT", 1)) == "0|."); assert(*(sampleInfo->getString("GT", 2)) == "1|1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == true); 
assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == false); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 3); assert(reader.getNumRecords() == 7); reader.close(); //////////////////////////////// // Test Discarding minor allele count < 2 without subsetting. reader.addDiscardMinMinorAlleleCount(2, &minMinorAlleleSubset); reader.open("testFiles/vcfFile.vcf", header); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Read the records. assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == "0|0"); assert(*(sampleInfo->getString("GT", 1)) == "1|0"); assert(*(sampleInfo->getString("GT", 2)) == "1/1"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 1); assert(reader.readRecord(record)); assert(record.allPhased() == false); assert(record.allUnphased() == false); assert(record.hasAllGenotypeAlleles() == true); sampleInfo = &(record.getGenotypeInfo()); assert(sampleInfo->getNumSamples() == 3); assert(*(sampleInfo->getString("GT", 0)) == 
"0|0"); assert(*(sampleInfo->getString("GT", 1)) == "0|0"); assert(*(sampleInfo->getString("GT", 2)) == "0/0"); assert(sampleInfo->isPhased(0) == true); assert(sampleInfo->isPhased(1) == true); assert(sampleInfo->isPhased(2) == false); assert(sampleInfo->isUnphased(0) == false); assert(sampleInfo->isUnphased(1) == false); assert(sampleInfo->isUnphased(2) == true); assert(record.passedAllFilters() == true); assert(record.getNumAlts() == 0); assert(reader.readRecord(record) == false); assert(reader.getNumKeptRecords() == 2); assert(reader.getNumRecords() == 7); reader.close(); } void testVcfWriteFile() { VcfFileWriter writer; VcfFileReader reader; VcfHeader header; VcfRecord record; assert(reader.open("testFiles/vcfFile.vcf", header) == true); assert(writer.open("results/vcfFile.vcf", header, InputFile::DEFAULT) == true); while(reader.readRecord(record)) { // Write the record. assert(writer.writeRecord(record)); } assert(reader.open("testFiles/vcfFile.vcf", header) == true); assert(writer.open("results/vcfFileNoInfo.vcf", header, InputFile::DEFAULT) == true); while(reader.readRecord(record)) { // Test Clearing the INFO field. record.getInfo().clear(); // Write the record. assert(writer.writeRecord(record)); } assert(reader.open("testFiles/vcfFile.vcf", header) == true); assert(writer.open("results/vcfFileNoInfoBGZF.vcf", header) == true); while(reader.readRecord(record)) { // Test Clearing the INFO field. record.getInfo().clear(); // Write the record. assert(writer.writeRecord(record)); } assert(reader.open("testFiles/vcfFile.vcf", header) == true); VcfRecordGenotype::addStoreField("GT"); assert(writer.open("results/vcfFileNoInfoKeepGT.vcf", header, InputFile::DEFAULT) == true); while(reader.readRecord(record)) { // Test Clearing the INFO field. record.getInfo().clear(); // Write the record. assert(writer.writeRecord(record)); } assert(reader.open("testFiles/vcfFile.vcf", header) == true); // Undo the storing of GT. 
VcfRecordGenotype::storeAllFields(); VcfRecordGenotype::addStoreField("GQ"); VcfRecordGenotype::addStoreField("XX"); VcfRecordGenotype::addStoreField("HQ"); assert(writer.open("results/vcfFileNoInfoKeepGQHQ.vcf", header, InputFile::DEFAULT) == true); while(reader.readRecord(record)) { // Test Clearing the INFO field. record.getInfo().clear(); // Write the record. assert(writer.writeRecord(record)); } } void testVcfReadSection() { // Test open for read via the constructor with return. VcfFileReader reader; VcfHeader header; VcfRecord record; const Tabix* tabixPtr = NULL; tabixPtr = reader.getVcfIndex(); assert(tabixPtr == NULL); //////////////////////////////// // Test the read section logic. reader.open("testFiles/testTabix.vcf.bgzf", header); tabixPtr = reader.getVcfIndex(); assert(tabixPtr == NULL); reader.readVcfIndex(); ////////////////// // Test index accessors. tabixPtr = reader.getVcfIndex(); assert(tabixPtr != NULL); assert(tabixPtr->getFormat() == Tabix::FORMAT_VCF); assert(tabixPtr->getNumRefs() == 2); assert(strcmp(tabixPtr->getRefName(0), "1") == 0); assert(strcmp(tabixPtr->getRefName(1), "3") == 0); bool caughtException = false; try { tabixPtr->getRefName(2); } catch(std::exception& e) { caughtException = true; } assert(caughtException); caughtException = false; try { tabixPtr->getRefName(-1); } catch(std::exception& e) { caughtException = true; } assert(caughtException == true); caughtException = false; reader.set1BasedReadSection("10", 16384, 32767); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 16384, 32767); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 16384, 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 16384, 32769); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 32768, 32769); assert(reader.readRecord(record) == true); 
assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 32769, 32767); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 32769, 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("1", 32769, 65537); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("1", 32768, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("1", 32769, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("1", 0, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); //////////////////////////////////////// // Test selecting whole chroms assert(reader.setReadSection("10")); assert(reader.readRecord(record) == false); assert(reader.setReadSection("1")); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.setReadSection("3")); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); 
assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); //////////////////////////////////////// // Test selecting sections with deletions reader.set1BasedReadSection("3", 16384, 32767); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 16384, 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 16384, 32769); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32768, 32769); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32769, 32767); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32769, 65537); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32769, 65537); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32770, 65537); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32771, 65537); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32780, 65537); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32781, 65537); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32768, 65538)); 
assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32769, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32770, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32771, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 0, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); //////////////////////////////////////// // Test selecting sections with deletions for overlapping reader.set1BasedReadSection("3", 16384, 32767, true); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 16384, 32768, true); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 16384, 32769, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32768, 32769, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32769, 32767, true); 
assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32769, 65537, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32769, 65537, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32770, 65537, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32771, 65537, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32780, 65537, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); reader.set1BasedReadSection("3", 32781, 65537, true); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32768, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32769, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); 
assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32770, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 32771, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); assert(reader.set1BasedReadSection("3", 0, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.close(); } void testVcfReadSectionNoIndex() { // Test open for read via the constructor with return. VcfFileReader reader; VcfHeader header; VcfRecord record; bool caughtException = false; //////////////////////////////// // Test the read section logic. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("10", 16384, 32767); assert(reader.readRecord(record) == false); // Can't call setReadSection after reading a record a second time with no index. try { caughtException = false; assert(reader.set1BasedReadSection("1", 16384, 32769)); assert(reader.readRecord(record) == true); } catch (std::exception& e) { caughtException = true; } assert(caughtException == true); // Reopen to begining to check chrom 1. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("1", 16384, 32768); assert(reader.readRecord(record) == false); // Reopen to begining to check valid range. 
reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("1", 16384, 32769); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("1", 32769, 32767); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("1", 32769, 65537); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.set1BasedReadSection("1", 32768, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.set1BasedReadSection("1", 32769, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. 
reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.set1BasedReadSection("1", 0, 65538)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); //////////////////////////////////////// // Test selecting whole chroms try { caughtException = false; assert(reader.setReadSection("1")); assert(reader.readRecord(record) == true); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.setReadSection("10")); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.setReadSection("1")); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 65537); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.setReadSection("3")); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); //////////////////////////////////////// // Test selecting sections with deletions // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 16384, 32767); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. 
reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 16384, 32768); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 16384, 32769); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 32768, 32769); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); //////////////////////////////////////// // Test selecting sections with deletions for overlapping // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 16384, 32767, true); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 16384, 32768, true); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); reader.set1BasedReadSection("3", 16384, 32769, true); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.set1BasedReadSection("3", 32771, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); // Reopen to begining to check another range. 
reader.open("testFiles/testTabix.vcf.bgzf", header); assert(reader.set1BasedReadSection("3", 0, 65538, true)); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32768); assert(reader.readRecord(record) == true); assert(record.get1BasedPosition() == 32780); assert(reader.readRecord(record) == false); assert(reader.readRecord(record) == false); reader.close(); } void testVcfReadSectionBadIndex() { // Test open for read via the constructor with return. VcfFileReader reader; VcfFileWriter writer; VcfHeader header; VcfRecord record; //////////////////////////////// // Test the read section logic. reader.open("testFiles/testTabixBadIndex.vcf.bgzf", header); bool hitError = false; try { reader.readVcfIndex(); } catch(std::exception& e) { hitError = true; std::string expectedError = "FAIL_PARSE: ERROR: Tabix file not in VCF format: testFiles/testTabixBadIndex.vcf.bgzf.tbi\nFAIL_IO: Failed to read the vcf Index file: testFiles/testTabixBadIndex.bgzf.tbi"; assert(expectedError == e.what()); } assert(hitError); } libStatGen-1.0.14/vcf/test/VcfFileTest.h000066400000000000000000000016151254730101300177140ustar00rootroot00000000000000/* * Copyright (C) 2011-2013 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ void testVcfFile(); void testVcfReadFile(); void testVcfWriteFile(); void testVcfReadSection(); void testVcfReadSectionNoIndex(); void testVcfReadSectionBadIndex(); libStatGen-1.0.14/vcf/test/VcfHeaderTest.cpp000066400000000000000000000275051254730101300205660ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "VcfHeaderTest.h" #include "VcfHeader.h" #include //extern const std::string HEADER_LINE="#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003"; //extern const std::string SAMPLES[NUM_SAMPLES] = {"NA00001","NA00002","NA00003"}; //extern const std::string META_LINES[NUM_META_LINES] void testVcfHeader() { VcfHeader header; // Test accessing the header without having read anything. assert(header.getNumMetaLines() == 0); assert(header.getMetaLine(0) == NULL); assert(header.getMetaLine(2) == NULL); assert(header.getHeaderLine() == NULL); assert(header.getNumSamples() == 0); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == NULL); assert(header.getSampleName(1) == NULL); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); IFILE filePtr = NULL; // Input File IFILE outputFile = NULL; // Output File. // Try reading without opening. 
bool caughtException = false; try { assert(header.read(filePtr) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); // Open the file, then read. filePtr = ifopen("testFiles/vcfFile.vcf", "r"); assert(header.read(filePtr)); assert(header.getNumMetaLines() == NUM_META_LINES); assert(header.getMetaLine(0) == META_LINES[0]); assert(header.getMetaLine(2) == META_LINES[2]); assert(header.getMetaLine(23) == NULL); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Reset and verify it is empty. header.reset(); assert(header.getNumMetaLines() == 0); assert(header.getMetaLine(0) == NULL); assert(header.getMetaLine(2) == NULL); assert(header.getHeaderLine() == NULL); assert(header.getNumSamples() == 0); assert(header.getSampleName(2) == NULL); assert(header.getSampleName(0) == NULL); assert(header.getSampleName(1) == NULL); assert(header.getSampleIndex(SAMPLES[1].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == -1); assert(header.getSampleIndex(SAMPLES[2].c_str()) == -1); // Close the file and read again. 
ifclose(filePtr); filePtr = ifopen("testFiles/vcfFile.vcf", "r"); assert(header.read(filePtr)); assert(header.getNumMetaLines() == NUM_META_LINES); assert(header.getMetaLine(0) == META_LINES[0]); assert(header.getMetaLine(2) == META_LINES[2]); assert(header.getMetaLine(23) == NULL); assert(header.getHeaderLine() == HEADER_LINE); assert(header.getNumSamples() == NUM_SAMPLES); assert(header.getSampleName(2) == SAMPLES[2]); assert(header.getSampleName(0) == SAMPLES[0]); assert(header.getSampleName(1) == SAMPLES[1]); assert(header.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(header.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(header.getSampleIndex(SAMPLES[2].c_str()) == 2); // Try writing without opening. caughtException = false; try { assert(header.write(outputFile) == false); } catch (std::exception& e) { caughtException = true; } assert(caughtException); caughtException = false; // write. outputFile = ifopen("results/vcfHeader.vcf", "w"); assert(header.write(outputFile)); //////////////////////////////// // Test creating a new header starting with the header line. VcfHeader newHeader; // Header starts empty. assert(newHeader.getNumMetaLines() == 0); assert(newHeader.getMetaLine(0) == NULL); assert(newHeader.getHeaderLine() == NULL); assert(newHeader.getNumSamples() == 0); assert(newHeader.getSampleName(0) == NULL); assert(newHeader.getSampleIndex(SAMPLES[0].c_str()) == -1); // Try adding a header line first. 
newHeader.addHeaderLine(HEADER_LINE.c_str()); assert(newHeader.getNumMetaLines() == 0); assert(newHeader.getMetaLine(0) == HEADER_LINE); assert(newHeader.getMetaLine(1) == NULL); assert(newHeader.getHeaderLine() == HEADER_LINE); assert(newHeader.getNumSamples() == NUM_SAMPLES); assert(newHeader.getSampleName(2) == SAMPLES[2]); assert(newHeader.getSampleName(0) == SAMPLES[0]); assert(newHeader.getSampleName(1) == SAMPLES[1]); assert(newHeader.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(newHeader.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(newHeader.getSampleIndex(SAMPLES[2].c_str()) == 2); // Add an invalid meta line. assert(newHeader.appendMetaLine("# bad line") == false); assert(newHeader.getNumMetaLines() == 0); assert(newHeader.getMetaLine(0) == HEADER_LINE); assert(newHeader.getMetaLine(1) == NULL); // Add the meta lines. for(int i = 1; i <= NUM_META_LINES; i++) { assert(newHeader.appendMetaLine(META_LINES[i-1].c_str())); assert(newHeader.getNumMetaLines() == i); for(int j = 0; j < i; j++) { assert(newHeader.getMetaLine(j) == META_LINES[j]); } assert(newHeader.getMetaLine(i) == HEADER_LINE); for(int k = i+1; k <= NUM_META_LINES; k++) { assert(newHeader.getMetaLine(k) == NULL); } } // write. outputFile = ifopen("results/vcfHeaderAddedFirst.vcf", "w"); assert(newHeader.write(outputFile)); //////////////////////////////// // Test creating a new header ending with the header line. VcfHeader newHeader2; // Header starts empty. assert(newHeader2.getNumMetaLines() == 0); assert(newHeader2.getMetaLine(0) == NULL); assert(newHeader2.getHeaderLine() == NULL); assert(newHeader2.getNumSamples() == 0); assert(newHeader2.getSampleName(0) == NULL); assert(newHeader2.getSampleIndex(SAMPLES[0].c_str()) == -1); // Add an invalid meta line. assert(newHeader2.appendMetaLine("# bad line") == false); assert(newHeader2.getNumMetaLines() == 0); assert(newHeader2.getMetaLine(0) == NULL); assert(newHeader2.getMetaLine(1) == NULL); // Add the meta lines. 
for(int i = 1; i <= NUM_META_LINES; i++) { assert(newHeader2.appendMetaLine(META_LINES[i-1].c_str())); assert(newHeader2.getNumMetaLines() == i); for(int j = 0; j < i; j++) { assert(newHeader2.getMetaLine(j) == META_LINES[j]); } assert(newHeader2.getMetaLine(i) == NULL); for(int k = i+1; k <= NUM_META_LINES; k++) { assert(newHeader2.getMetaLine(k) == NULL); } } // Try adding a header line last. newHeader2.addHeaderLine(HEADER_LINE.c_str()); assert(newHeader2.getNumMetaLines() == NUM_META_LINES); for(int i = 0; i < NUM_META_LINES; i++) { assert(newHeader2.getMetaLine(i) == META_LINES[i]); } assert(newHeader2.getMetaLine(NUM_META_LINES) == HEADER_LINE); assert(newHeader2.getHeaderLine() == HEADER_LINE); assert(newHeader2.getNumSamples() == NUM_SAMPLES); assert(newHeader2.getSampleName(2) == SAMPLES[2]); assert(newHeader2.getSampleName(0) == SAMPLES[0]); assert(newHeader2.getSampleName(1) == SAMPLES[1]); assert(newHeader2.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(newHeader2.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(newHeader2.getSampleIndex(SAMPLES[2].c_str()) == 2); // write. outputFile = ifopen("results/vcfHeaderAddedLast.vcf", "w"); assert(newHeader2.write(outputFile)); //////////////////////////////// // Test creating a new header adding the header line in the middle. VcfHeader newHeader3; // Header starts empty. assert(newHeader3.getNumMetaLines() == 0); assert(newHeader3.getMetaLine(0) == NULL); assert(newHeader3.getHeaderLine() == NULL); assert(newHeader3.getNumSamples() == 0); assert(newHeader3.getSampleName(0) == NULL); assert(newHeader3.getSampleIndex(SAMPLES[0].c_str()) == -1); // Add an invalid meta line. assert(newHeader3.appendMetaLine("# bad line") == false); assert(newHeader3.getNumMetaLines() == 0); assert(newHeader3.getMetaLine(0) == NULL); assert(newHeader3.getMetaLine(1) == NULL); // Add the meta lines. 
int subVal = 5; for(int i = 1; i <= NUM_META_LINES-subVal; i++) { assert(newHeader3.appendMetaLine(META_LINES[i-1].c_str())); assert(newHeader3.getNumMetaLines() == i); for(int j = 0; j < i; j++) { assert(newHeader3.getMetaLine(j) == META_LINES[j]); } assert(newHeader3.getMetaLine(i) == NULL); for(int k = i+1; k <= NUM_META_LINES; k++) { assert(newHeader3.getMetaLine(k) == NULL); } } // Try adding a header line. newHeader3.addHeaderLine(HEADER_LINE.c_str()); assert(newHeader3.getNumMetaLines() == NUM_META_LINES - subVal); for(int i = 0; i < NUM_META_LINES-subVal; i++) { assert(newHeader3.getMetaLine(i) == META_LINES[i]); } assert(newHeader3.getMetaLine(NUM_META_LINES-subVal) == HEADER_LINE); assert(newHeader3.getHeaderLine() == HEADER_LINE); assert(newHeader3.getNumSamples() == NUM_SAMPLES); assert(newHeader3.getSampleName(2) == SAMPLES[2]); assert(newHeader3.getSampleName(0) == SAMPLES[0]); assert(newHeader3.getSampleName(1) == SAMPLES[1]); assert(newHeader3.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(newHeader3.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(newHeader3.getSampleIndex(SAMPLES[2].c_str()) == 2); // Add the rest of the meta lines. 
for(int i = NUM_META_LINES - subVal + 1; i <= NUM_META_LINES; i++) { assert(newHeader3.appendMetaLine(META_LINES[i-1].c_str())); assert(newHeader3.getNumMetaLines() == i); for(int j = 0; j < i; j++) { assert(newHeader3.getMetaLine(j) == META_LINES[j]); } assert(newHeader3.getMetaLine(i) == HEADER_LINE); for(int k = i+1; k <= NUM_META_LINES; k++) { assert(newHeader3.getMetaLine(k) == NULL); } } assert(newHeader3.getNumMetaLines() == NUM_META_LINES); for(int i = 0; i < NUM_META_LINES; i++) { assert(newHeader3.getMetaLine(i) == META_LINES[i]); } assert(newHeader3.getMetaLine(NUM_META_LINES) == HEADER_LINE); assert(newHeader3.getHeaderLine() == HEADER_LINE); assert(newHeader3.getNumSamples() == NUM_SAMPLES); assert(newHeader3.getSampleName(2) == SAMPLES[2]); assert(newHeader3.getSampleName(0) == SAMPLES[0]); assert(newHeader3.getSampleName(1) == SAMPLES[1]); assert(newHeader3.getSampleIndex(SAMPLES[1].c_str()) == 1); assert(newHeader3.getSampleIndex(SAMPLES[0].c_str()) == 0); assert(newHeader3.getSampleIndex(SAMPLES[2].c_str()) == 2); // write. outputFile = ifopen("results/vcfHeaderAddedMiddle.vcf", "w"); assert(newHeader3.write(outputFile)); } libStatGen-1.0.14/vcf/test/VcfHeaderTest.h000066400000000000000000000043641254730101300202310ustar00rootroot00000000000000/* * Copyright (C) 2011 Regents of the University of Michigan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include void testVcfHeader(); const std::string HEADER_LINE="#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003"; const int NUM_SAMPLES = 3; const std::string SAMPLES[NUM_SAMPLES] = {"NA00001","NA00002","NA00003"}; const int NUM_META_LINES = 18; const std::string META_LINES[NUM_META_LINES]={ "##fileformat=VCFv4.1", "##fileDate=20090805", "##source=myImputationProgramV3.1", "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta", "##contig=", "##phasing=partial", "##INFO=", "##INFO=", "##INFO=", "##INFO=", "##INFO=", "##INFO=", "##FILTER=", "##FILTER=", "##FORMAT=", "##FORMAT=", "##FORMAT=", "##FORMAT="}; libStatGen-1.0.14/vcf/test/expected/000077500000000000000000000000001254730101300171635ustar00rootroot00000000000000libStatGen-1.0.14/vcf/test/expected/vcfFileNoInfo.vcf000066400000000000000000000032061254730101300223530ustar00rootroot00000000000000##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta ##contig= ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 14370 rs6054257 G A 29 PASS . GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 . GT:GQ:DP:HQ 0/0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 20 1110696 rs6040355 A G,T 67 PASS . GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:65,3 20 1230237 . T . 47 PASS . GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:65,3 20 1234567 microsat1 GTC G,GTCT 50 PASS . GT 0/1 0/2 1/1 20 1234568 microsat1 GTC G,GTCT 50 PASS . GQ:DP 35:4 17:2 40:3 20 1234569 microsat1 GTC G 50 PASS . GT:GQ:DP 0|1:35:4 0|. 
1|1:40:3 libStatGen-1.0.14/vcf/test/expected/vcfFileNoInfoBGZF.vcf000066400000000000000000000014711254730101300230260ustar00rootroot00000000000000BC[o8="jw^!vڌ Cvڀrjlf?~`9Isi¯o~o6GҨ 4D.ViWG#MZ2JQUžqrz>FMTQJ/Ҭ4fmYt5ٺ0SBN|ʆ܇RkoS@9w,L0]2\m`x$oԆ$ԕڦJw2/=-T&N϶hu:-a%k| |TpVW5)E޵G222Rsl|5n..̼V/򨕏R[qe*S޸VGC:֦wnTZX ]L`Y^)}H+mi0޲䷬eEh b<{+FQf9y=:'gÍQxߔqz1?ƾI-s;]JUO]lKQ,sc_oD, K& <> Ӡ E t3_q%ܸ3]+Ϭ<֥[أkc`[}`NN\v :_uaCH\v7%ۦڴl^vr/W^tBClibStatGen-1.0.14/vcf/test/expected/vcfFileNoInfoKeepGQHQ.vcf000066400000000000000000000027721254730101300236500ustar00rootroot00000000000000##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta ##contig= ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 14370 rs6054257 G A 29 PASS . GQ:HQ 48:51,51 48:51,51 43:.,. 20 17330 . T A 3 q10 . GQ:HQ 49:58,50 3:65,3 41:.,. 20 1110696 rs6040355 A G,T 67 PASS . GQ:HQ 21:23,27 2:18,2 35:65,3 20 1230237 . T . 47 PASS . GQ:HQ 54:56,60 48:51,51 61:65,3 20 1234567 microsat1 GTC G,GTCT 50 PASS . 20 1234568 microsat1 GTC G,GTCT 50 PASS . GQ 35 17 40 20 1234569 microsat1 GTC G 50 PASS . GQ 35 . 40 libStatGen-1.0.14/vcf/test/expected/vcfFileNoInfoKeepGT.vcf000066400000000000000000000027031254730101300234140ustar00rootroot00000000000000##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta ##contig= ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 14370 rs6054257 G A 29 PASS . GT 0|0 1|0 1/1 20 17330 . T A 3 q10 . GT 0/0 0|1 0/0 20 1110696 rs6040355 A G,T 67 PASS . GT 1|2 2|1 2/2 20 1230237 . T . 47 PASS . 
GT 0|0 0|0 0/0 20 1234567 microsat1 GTC G,GTCT 50 PASS . GT 0/1 0/2 1/1 20 1234568 microsat1 GTC G,GTCT 50 PASS . 20 1234569 microsat1 GTC G 50 PASS . GT 0|1 0|. 1|1 libStatGen-1.0.14/vcf/test/expected/vcfHeader.vcf000066400000000000000000000021701254730101300215520ustar00rootroot00000000000000##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta ##contig= ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 libStatGen-1.0.14/vcf/test/results/000077500000000000000000000000001254730101300170635ustar00rootroot00000000000000libStatGen-1.0.14/vcf/test/results/.gitignore000066400000000000000000000000051254730101300210460ustar00rootroot00000000000000*vcf libStatGen-1.0.14/vcf/test/testFiles/000077500000000000000000000000001254730101300173245ustar00rootroot00000000000000libStatGen-1.0.14/vcf/test/testFiles/exclude2.txt000066400000000000000000000000101254730101300215670ustar00rootroot00000000000000NA00002 libStatGen-1.0.14/vcf/test/testFiles/excludeIDs.txt000066400000000000000000000000231254730101300221110ustar00rootroot00000000000000rs6054257 microsat1libStatGen-1.0.14/vcf/test/testFiles/includeIDs.txt000066400000000000000000000000141254730101300221030ustar00rootroot00000000000000rs6040355 . 
libStatGen-1.0.14/vcf/test/testFiles/subset1.txt000066400000000000000000000000201254730101300214430ustar00rootroot00000000000000NA00002;NA00001 libStatGen-1.0.14/vcf/test/testFiles/subset2.txt000066400000000000000000000000301254730101300214450ustar00rootroot00000000000000NA00002 NA00003 NA00100 libStatGen-1.0.14/vcf/test/testFiles/testTabix.vcf000066400000000000000000000037241254730101300220010ustar00rootroot00000000000000##fileformat=VCFv4.0 ##filedate=20110211 ##source=glfMultiples ##minDepth=2526 ##maxDepth=2526000 ##minMapQuality=0 ##minPosterior=0.5000 ##contig= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT P1 P2 P3 P4 P5 P6 1 32768 r1 A G 100 PASS . GT:DP:GQ:GL 0/1:0:5:0,0,0 1/0:0:5:0,0,0 0/0:0:5:0,0,0 0/1:1:7:19,3,0 0/0:2:11:0,6,22 0/1:1:5:12,3,0 1 65537 r2 T G 100 PASS . GT:DP:GQ:GL 0/0:0:13:0,0,0 0/0:38:100:0,114,226 0/1:1:16:0,3,20 0/0:39:100:0,117,255 0/0:35:100:0,102,255 0/0:29:100:0,87,255 3 32768 r1 GAA G 100 PASS . GT:DP:GQ:GL 0/1:0:5:0,0,0 1/0:0:5:0,0,0 0/0:0:5:0,0,0 0/1:1:7:19,3,0 0/0:2:11:0,6,22 0/1:1:5:12,3,0 3 32780 r2 T G 100 PASS . GT:DP:GQ:GL 0/0:0:13:0,0,0 0/0:38:100:0,114,226 0/1:1:16:0,3,20 0/0:39:100:0,117,255 0/0:35:100:0,102,255 0/0:29:100:0,87,255 libStatGen-1.0.14/vcf/test/testFiles/testTabix.vcf.bgzf000066400000000000000000000014201254730101300227170ustar00rootroot00000000000000BC͔]o0_ћMvhB$'-Yb|V-.y^ކQ ȶ\?O&-`SLX.v:'XFi nԤXc\I&{HXC!(H'$Yd~MBM'gh1i;h%Dj=Gr 2s3q n~vbu X%=0;H:w-xHuA5kc  `h=,J~^Cjx4+B|@2;TJŚ-&WEK;'Ok/! 
?dk=gc uc;(Cq/6B~oǪr [D'-kK՗E lh0@CUDIKKk \ vXe9huKDCpn Hy[2-u #¯VbE,bu,ӍzZDeL6-BmL蠌"# *@H[qY &L ֢QѩiVAe ЗҿuBClibStatGen-1.0.14/vcf/test/testFiles/testTabix.vcf.bgzf.tbi000066400000000000000000000002101254730101300234700ustar00rootroot00000000000000BCk qddb``aF( P B5*l=P *‡C`CajA{2Ák _dF2iBClibStatGen-1.0.14/vcf/test/testFiles/testTabixBadIndex.vcf.bgzf000066400000000000000000000014201254730101300243160ustar00rootroot00000000000000BC͔]o0_ћMvhB$'-Yb|V-.y^ކQ ȶ\?O&-`SLX.v:'XFi nԤXc\I&{HXC!(H'$Yd~MBM'gh1i;h%Dj=Gr 2s3q n~vbu X%=0;H:w-xHuA5kc  `h=,J~^Cjx4+B|@2;TJŚ-&WEK;'Ok/! ?dk=gc uc;(Cq/6B~oǪr [D'-kK՗E lh0@CUDIKKk \ vXe9huKDCpn Hy[2-u #¯VbE,bu,ӍzZDeL6-BmL蠌"# *@H[qY &L ֢QѩiVAe ЗҿuBClibStatGen-1.0.14/vcf/test/testFiles/testTabixBadIndex.vcf.bgzf.tbi000066400000000000000000000001431254730101300250740ustar00rootroot00000000000000BCF qddbF fbV V `9e68@GhBClibStatGen-1.0.14/vcf/test/testFiles/vcfFile.vcf000066400000000000000000000033661254730101300214120ustar00rootroot00000000000000##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta ##contig= ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0/0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:65,3 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:65,3 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT 0/1 0/2 1/1 20 1234568 microsat1 GTC G,GTCT 50 PASS . GQ:DP 35:4 17:2 40:3 20 1234569 microsat1 GTC G 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0|1:35:4 0|. 1|1:40:3