htsjdk-2.0.1/000077500000000000000000000000001263034757100130115ustar00rootroot00000000000000htsjdk-2.0.1/.classpath000066400000000000000000000007431263034757100150000ustar00rootroot00000000000000 htsjdk-2.0.1/.gitignore000066400000000000000000000003311263034757100147760ustar00rootroot00000000000000htsjdk.iws .command_tmp classes testclasses javadoc dist contracts atlassian-ide-plugin.xml intellij.testclasses intellij.classes /htsjdk.version.properties /bin /test-output target .idea/libraries .idea/workspace.xmlhtsjdk-2.0.1/.idea/000077500000000000000000000000001263034757100137715ustar00rootroot00000000000000htsjdk-2.0.1/.idea/.name000066400000000000000000000000061263034757100147060ustar00rootroot00000000000000htsjdkhtsjdk-2.0.1/.idea/compiler.xml000066400000000000000000000013431263034757100163260ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/copyright/000077500000000000000000000000001263034757100160015ustar00rootroot00000000000000htsjdk-2.0.1/.idea/copyright/profiles_settings.xml000066400000000000000000000001121263034757100222600ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/encodings.xml000066400000000000000000000002441263034757100164640ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/modules.xml000066400000000000000000000006371263034757100161710ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/modules/000077500000000000000000000000001263034757100154415ustar00rootroot00000000000000htsjdk-2.0.1/.idea/modules/htsjdk-build.iml000066400000000000000000000176561263034757100205470ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/modules/htsjdk.iml000066400000000000000000000060251263034757100174360ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/sbt.xml000066400000000000000000000014011263034757100152770ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/scala_compiler.xml000066400000000000000000000002641263034757100174720ustar00rootroot00000000000000 
htsjdk-2.0.1/.idea/scopes/000077500000000000000000000000001263034757100152655ustar00rootroot00000000000000htsjdk-2.0.1/.idea/scopes/scope_settings.xml000066400000000000000000000002131263034757100210340ustar00rootroot00000000000000 htsjdk-2.0.1/.idea/vcs.xml000066400000000000000000000003331263034757100153050ustar00rootroot00000000000000 htsjdk-2.0.1/.project000066400000000000000000000005551263034757100144650ustar00rootroot00000000000000 htsjdk org.eclipse.jdt.core.javabuilder org.eclipse.jdt.core.javanature htsjdk-2.0.1/.travis.yml000066400000000000000000000017031263034757100151230ustar00rootroot00000000000000language: java sudo: true jdk: - oraclejdk8 install: ant script: ant all test after_success: - echo "TRAVIS_BRANCH='$TRAVIS_BRANCH'"; echo "JAVA_HOME='$JAVA_HOME'"; if [ "$TRAVIS_BRANCH" == "master" ] && [ "$JAVA_HOME" == "/usr/lib/jvm/java-8-oracle" ]; then sbt \ 'set buildSnapshot := true' \ 'set javacOptions in (Compile, doc) ++= Seq("-quiet")' \ 'set test in publish := {}' \ 'set resolvers += Resolver.url("bintray-sbt-plugin-releases", url("http://dl.bintray.com/content/sbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)' \ 'set publishTo := Option("artifactory-snapshots-publish" at "https://artifactory.broadinstitute.org/artifactory/libs-snapshot-local;build.timestamp=" + new java.util.Date().getTime)' \ "set credentials += Credentials(\"Artifactory Realm\", \"artifactory.broadinstitute.org\", \"${ARTIFACTORY_USERNAME}\", \"${ARTIFACTORY_PASSWORD}\")" \ publish; fi htsjdk-2.0.1/README.md000066400000000000000000000030671263034757100142760ustar00rootroot00000000000000[![Build Status](https://travis-ci.org/samtools/htsjdk.svg?branch=master)](https://travis-ci.org/samtools/htsjdk) ## A Java API for high-throughput sequencing data (HTS) formats. HTSJDK is an implementation of a unified Java library for accessing common file formats, such as [SAM][1] and [VCF][2], used for high-throughput sequencing data. 
There are also a number of useful utilities for manipulating HTS data. Please see the [HTSJDK Documentation](http://samtools.github.io/htsjdk) for more information. #### Licensing Information Not all sub-packages of htsjdk are subject to the same license, so a license notice is included in each source file or sub-package as appropriate. Please check the relevant license notice whenever you start working with a part of htsjdk that you have not previously worked with to avoid any surprises. #### Java Minimum Version Support Policy > **NOTE: _Effective November 24th 2015, HTSJDK has ended support of Java 7 and previous versions. Java 8 is now required_.** We will support all Java SE versions supported by Oracle until at least six months after Oracle's Public Updates period has ended ([see this link](http://www.oracle.com/technetwork/java/eol-135779.html)). Java SE Major Release | End of Java SE Oracle Public Updates | Proposed End of Support in HTSJDK | Actual End of Support in HTSJDK ---- | ---- | ---- | ---- 6 | Feb 2013 | Aug 2013 | Oct 2015 7 | Apr 2015 | Oct 2015 | Oct 2015 8* | Mar 2017 | Sep 2017 | Sep 2017 * to be finalized [1]: http://samtools.sourceforge.net [2]: http://vcftools.sourceforge.net/specs.html htsjdk-2.0.1/build.sbt000066400000000000000000000100161263034757100146200ustar00rootroot00000000000000import com.typesafe.sbt.SbtGit._ import de.johoop.testngplugin.TestNGPlugin._ import sbt.Package.ManifestAttributes name := "htsjdk" val buildVersion = "2.0.1" organization := "com.github.samtools" libraryDependencies += "gov.nih.nlm.ncbi" % "ngs-java" % "1.2.2" libraryDependencies += "org.apache.commons" % "commons-jexl" % "2.1.1" libraryDependencies += "commons-logging" % "commons-logging" % "1.1.1" libraryDependencies += "org.xerial.snappy" % "snappy-java" % "1.0.3-rc3" libraryDependencies += "org.apache.commons" % "commons-compress" % "1.4.1" libraryDependencies += "org.tukaani" % "xz" % "1.5" libraryDependencies += "org.apache.ant" % "ant" % "1.8.2" 
libraryDependencies += "org.testng" % "testng" % "6.8.8" unmanagedBase := baseDirectory.value javaSource in Compile := baseDirectory.value / "src/java" javaSource in Test := baseDirectory.value / "src/tests" testNGSettings testNGSuites := Seq("src/tests/resources/testng.xml") autoScalaLibrary := false publishMavenStyle := true publishArtifact in Test := false pomIncludeRepository := { _ => false} val gitVersion = settingKey[String]("The head commit git hash.") gitVersion := git.gitHeadCommit.value.get val gitBranch = settingKey[String]("The git branch.") gitBranch := git.gitCurrentBranch.value val buildSnapshot = settingKey[Boolean]("Is this build a snapshot.") buildSnapshot := false version := { if (buildSnapshot.value) { s"$buildVersion-${gitVersion.value.substring(0, 7)}-SNAPSHOT" } else { s"$buildVersion" } } val implementationVersion = settingKey[String]("Implementation version.") implementationVersion := { if (buildSnapshot.value) s"$buildVersion(${gitVersion.value})(SNAPSHOT)" else s"$buildVersion(${gitVersion.value})" } publishTo := { val nexus = "https://oss.sonatype.org/" if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") else Some("releases" at nexus + "service/local/staging/deploy/maven2") } artifactName := { (sv: ScalaVersion, module: ModuleID, artifact: Artifact) => val classifierStr = artifact.classifier match { case None => ""; case Some(c) => "-" + c } artifact.name + "-" + module.revision + classifierStr + "." 
+ artifact.extension } crossPaths := false javacOptions in (Compile,doc) ++= Seq("-Xdoclint:none") packageOptions := Seq(ManifestAttributes( ("Implementation-Version", s"${implementationVersion.value}"), ("Implementation-Vendor", "Broad Institute") )) assemblyJarName := s"${name.value}-${version.value}.jar" assemblyMergeStrategy in assembly := { case x if Assembly.isConfigFile(x) => MergeStrategy.concat case PathList(ps@_*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => MergeStrategy.rename case PathList("META-INF", path@_*) => path map { _.toLowerCase } match { case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) => MergeStrategy.discard case ps@(x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") => MergeStrategy.discard case "plexus" :: xs => MergeStrategy.discard case "spring.tooling" :: xs => MergeStrategy.discard case "services" :: xs => MergeStrategy.filterDistinctLines case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) => MergeStrategy.filterDistinctLines case _ => MergeStrategy.deduplicate } case "asm-license.txt" | "overview.html" => MergeStrategy.discard case _ => MergeStrategy.deduplicate } pomExtra := http://samtools.github.io/htsjdk/ MIT License http://opensource.org/licenses/MIT repo git@github.com:samtools/htsjdk.git scm:git:git@github.com:samtools/htsjdk.git picard Picard Team http://broadinstitute.github.io/picard/ htsjdk-2.0.1/build.xml000077500000000000000000000272321263034757100146430ustar00rootroot00000000000000 htsjdk-2.0.1/etc/000077500000000000000000000000001263034757100135645ustar00rootroot00000000000000htsjdk-2.0.1/etc/test/000077500000000000000000000000001263034757100145435ustar00rootroot00000000000000htsjdk-2.0.1/etc/test/junit-noframes.xsl000066400000000000000000000537051263034757100202460ustar00rootroot00000000000000 Unit Test Results Last Modified:


Packages

Note: package statistics are not computed recursively, they only sum up all of its testsuites numbers. Failure Error

Package

Back to top

TestCase

N/A
N/A

Back to top

Summary

Failure Error
Tests Failures Errors Success rate Time
Note: failures are anticipated and checked for with assertions while errors are unanticipated.
cur = TestCases[' . '] = new Array(); cur[' '] = ' ';

Unit Test Results

Designed for use with JUnit and Ant .

Name Tests Errors Failures Time(s) Name Tests Errors Failures Time(s) Name Status Type Time(s) System.out System.err Failure Error Error . Failure Error Success N/A



htsjdk-2.0.1/etc/test/testng.css000066400000000000000000000012051263034757100165570ustar00rootroot00000000000000nvocation-failed, .test-failed { background-color: #ffc0cb; } .invocation-percent, .test-percent { background-color: #66cdaa; } .invocation-passed, .test-passed { background-color: #98fb98; } .invocation-skipped, .test-skipped { background-color: #f0e68c; } .main-page { font-size: small; font-family: verdana, 'trebuchet ms', sans-serif; } table { font-size: small; font-family: verdana, 'trebuchet ms', sans-serif; } h1 , h2 , h3 , h4 , h5 , h6 { color: #999; font-family : Georgia, "Times New Roman", Times, serif; font-weight : normal; font-variant : small-caps; padding: 0px; margin-bottom:0.2px; margin-top:1px; } htsjdk-2.0.1/htsjdk.iml000066400000000000000000000041061263034757100150040ustar00rootroot00000000000000 htsjdk-2.0.1/htsjdk.ipr000066400000000000000000000436671263034757100150340ustar00rootroot00000000000000 ribosomal bgzipped codecs demultiplex demultiplexed eamss endian gzipped illumina's indexable inferer inferrer parsability phread seekable tabix tokenizes tribble http://www.w3.org/1999/xhtml htsjdk-2.0.1/project/000077500000000000000000000000001263034757100144575ustar00rootroot00000000000000htsjdk-2.0.1/project/plugins.sbt000066400000000000000000000002531263034757100166520ustar00rootroot00000000000000addSbtPlugin("de.johoop" % "sbt-testng-plugin" % "3.0.2") addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.7.1")htsjdk-2.0.1/src/000077500000000000000000000000001263034757100136005ustar00rootroot00000000000000htsjdk-2.0.1/src/c/000077500000000000000000000000001263034757100140225ustar00rootroot00000000000000htsjdk-2.0.1/src/c/inteldeflater/000077500000000000000000000000001263034757100166445ustar00rootroot00000000000000htsjdk-2.0.1/src/c/inteldeflater/IntelDeflater.c000066400000000000000000000211241263034757100215320ustar00rootroot00000000000000/* * Copyright (c) 1997, 2010, Oracle and/or its 
affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* * Native method support for htsjdk.samtools.util.zip.IntelDeflater. * This is copied from OpenJDK native support for java.util.zip.Deflater, with only package and class name changed. 
*/ #include #include #include "jlong.h" #include "jni.h" #include "jni_util.h" #include "zlib.h" #include "htsjdk_samtools_util_zip_IntelDeflater.h" #define DEF_MEM_LEVEL 8 static jfieldID levelID; static jfieldID strategyID; static jfieldID setParamsID; static jfieldID finishID; static jfieldID finishedID; static jfieldID bufID, offID, lenID; JNIEXPORT void JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_initIDs(JNIEnv *env, jclass cls) { levelID = (*env)->GetFieldID(env, cls, "level", "I"); strategyID = (*env)->GetFieldID(env, cls, "strategy", "I"); setParamsID = (*env)->GetFieldID(env, cls, "setParams", "Z"); finishID = (*env)->GetFieldID(env, cls, "finish", "Z"); finishedID = (*env)->GetFieldID(env, cls, "finished", "Z"); bufID = (*env)->GetFieldID(env, cls, "buf", "[B"); offID = (*env)->GetFieldID(env, cls, "off", "I"); lenID = (*env)->GetFieldID(env, cls, "len", "I"); } JNIEXPORT jlong JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_init(JNIEnv *env, jclass cls, jint level, jint strategy, jboolean nowrap) { z_stream *strm = calloc(1, sizeof(z_stream)); if (strm == 0) { JNU_ThrowOutOfMemoryError(env, 0); return jlong_zero; } else { char *msg; switch (deflateInit2(strm, level, Z_DEFLATED, nowrap ? 
-MAX_WBITS : MAX_WBITS, DEF_MEM_LEVEL, strategy)) { case Z_OK: return ptr_to_jlong(strm); case Z_MEM_ERROR: free(strm); JNU_ThrowOutOfMemoryError(env, 0); return jlong_zero; case Z_STREAM_ERROR: free(strm); JNU_ThrowIllegalArgumentException(env, 0); return jlong_zero; default: msg = strm->msg; free(strm); JNU_ThrowInternalError(env, msg); return jlong_zero; } } } JNIEXPORT void JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_setDictionary(JNIEnv *env, jclass cls, jlong addr, jarray b, jint off, jint len) { Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); int res; if (buf == 0) {/* out of memory */ return; } res = deflateSetDictionary((z_stream *)jlong_to_ptr(addr), buf + off, len); (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0); switch (res) { case Z_OK: break; case Z_STREAM_ERROR: JNU_ThrowIllegalArgumentException(env, 0); break; default: JNU_ThrowInternalError(env, ((z_stream *)jlong_to_ptr(addr))->msg); break; } } JNIEXPORT jint JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_deflateBytes(JNIEnv *env, jobject this, jlong addr, jarray b, jint off, jint len, jint flush) { z_stream *strm = jlong_to_ptr(addr); jarray this_buf = (*env)->GetObjectField(env, this, bufID); jint this_off = (*env)->GetIntField(env, this, offID); jint this_len = (*env)->GetIntField(env, this, lenID); jbyte *in_buf; jbyte *out_buf; int res; if ((*env)->GetBooleanField(env, this, setParamsID)) { int level = (*env)->GetIntField(env, this, levelID); int strategy = (*env)->GetIntField(env, this, strategyID); in_buf = (*env)->GetPrimitiveArrayCritical(env, this_buf, 0); if (in_buf == NULL) { // Throw OOME only when length is not zero if (this_len != 0) JNU_ThrowOutOfMemoryError(env, 0); return 0; } out_buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); if (out_buf == NULL) { (*env)->ReleasePrimitiveArrayCritical(env, this_buf, in_buf, 0); if (len != 0) JNU_ThrowOutOfMemoryError(env, 0); return 0; } strm->next_in = (Bytef *) (in_buf + this_off); strm->next_out = (Bytef 
*) (out_buf + off); strm->avail_in = this_len; strm->avail_out = len; res = deflateParams(strm, level, strategy); (*env)->ReleasePrimitiveArrayCritical(env, b, out_buf, 0); (*env)->ReleasePrimitiveArrayCritical(env, this_buf, in_buf, 0); switch (res) { case Z_OK: (*env)->SetBooleanField(env, this, setParamsID, JNI_FALSE); this_off += this_len - strm->avail_in; (*env)->SetIntField(env, this, offID, this_off); (*env)->SetIntField(env, this, lenID, strm->avail_in); return len - strm->avail_out; case Z_BUF_ERROR: (*env)->SetBooleanField(env, this, setParamsID, JNI_FALSE); return 0; default: JNU_ThrowInternalError(env, strm->msg); return 0; } } else { jboolean finish = (*env)->GetBooleanField(env, this, finishID); in_buf = (*env)->GetPrimitiveArrayCritical(env, this_buf, 0); if (in_buf == NULL) { if (this_len != 0) JNU_ThrowOutOfMemoryError(env, 0); return 0; } out_buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); if (out_buf == NULL) { (*env)->ReleasePrimitiveArrayCritical(env, this_buf, in_buf, 0); if (len != 0) JNU_ThrowOutOfMemoryError(env, 0); return 0; } strm->next_in = (Bytef *) (in_buf + this_off); strm->next_out = (Bytef *) (out_buf + off); strm->avail_in = this_len; strm->avail_out = len; res = deflate(strm, finish ? 
Z_FINISH : flush); (*env)->ReleasePrimitiveArrayCritical(env, b, out_buf, 0); (*env)->ReleasePrimitiveArrayCritical(env, this_buf, in_buf, 0); switch (res) { case Z_STREAM_END: (*env)->SetBooleanField(env, this, finishedID, JNI_TRUE); /* fall through */ case Z_OK: this_off += this_len - strm->avail_in; (*env)->SetIntField(env, this, offID, this_off); (*env)->SetIntField(env, this, lenID, strm->avail_in); return len - strm->avail_out; case Z_BUF_ERROR: return 0; default: JNU_ThrowInternalError(env, strm->msg); return 0; } } } JNIEXPORT jint JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_getAdler(JNIEnv *env, jclass cls, jlong addr) { return ((z_stream *)jlong_to_ptr(addr))->adler; } JNIEXPORT jlong JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_getBytesRead(JNIEnv *env, jclass cls, jlong addr) { return ((z_stream *)jlong_to_ptr(addr))->total_in; } JNIEXPORT jlong JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_getBytesWritten(JNIEnv *env, jclass cls, jlong addr) { return ((z_stream *)jlong_to_ptr(addr))->total_out; } JNIEXPORT void JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_reset(JNIEnv *env, jclass cls, jlong addr) { if (deflateReset((z_stream *)jlong_to_ptr(addr)) != Z_OK) { JNU_ThrowInternalError(env, 0); } } JNIEXPORT void JNICALL Java_htsjdk_samtools_util_zip_IntelDeflater_end(JNIEnv *env, jclass cls, jlong addr) { if (deflateEnd((z_stream *)jlong_to_ptr(addr)) == Z_STREAM_ERROR) { JNU_ThrowInternalError(env, 0); } else { free((z_stream *)jlong_to_ptr(addr)); } } htsjdk-2.0.1/src/java/000077500000000000000000000000001263034757100145215ustar00rootroot00000000000000htsjdk-2.0.1/src/java/htsjdk/000077500000000000000000000000001263034757100160105ustar00rootroot00000000000000htsjdk-2.0.1/src/java/htsjdk/samtools/000077500000000000000000000000001263034757100176515ustar00rootroot00000000000000htsjdk-2.0.1/src/java/htsjdk/samtools/AbstractBAMFileIndex.java000066400000000000000000000660231263034757100243760ustar00rootroot00000000000000/* * The MIT 
License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.RuntimeIOException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collections; import java.util.List; /** * Provides basic, generic capabilities to be used reading BAM index files. Users can * subclass this class to create new BAM index functionality for adding querying facilities, * changing caching behavior, etc. 
* * Of particular note: the AbstractBAMFileIndex is, by design, the only class aware of the * details of the BAM index file format (other than the four classes representing the data, * BAMIndexContent, Bin, Chunk, LinearIndex, and the classes for building the BAM index). * Anyone wanting to implement a reader for a differing * or extended BAM index format should implement BAMIndex directly. */ public abstract class AbstractBAMFileIndex implements BAMIndex { /** Buffer over the index contents; every read, skip, seek and position call in this class is delegated to it. */ private final IndexFileBuffer mIndexBuffer; /** Sequence dictionary supplied by the caller; used to look up a reference's length when sizing the bin array for a query. */ private SAMSequenceDictionary mBamDictionary = null; /** Cache of index positions by sequence index, filled lazily by skipToSequence; -1 marks an entry that has not been located yet. Allocated with one slot more than the count read from the header, and getNoCoordinateCount seeks to that extra slot. */ final int [] sequenceIndexes; /** * Opens an index backed by a stream: verifies the magic number, then reads the integer that follows it to size the position cache. * @param stream source of the index bytes; its getSource() value is used in the error message if the magic number does not match. * @param dictionary sequence dictionary to associate with this index. */ protected AbstractBAMFileIndex( final SeekableStream stream, final SAMSequenceDictionary dictionary) { mBamDictionary = dictionary; mIndexBuffer = new IndexStreamBuffer(stream); verifyBAMMagicNumber(stream.getSource()); sequenceIndexes = new int[readInteger() + 1]; Arrays.fill(sequenceIndexes, -1); } /** * Opens an index backed by a file, using memory mapping. * @param file the index file. * @param dictionary sequence dictionary to associate with this index. */ protected AbstractBAMFileIndex(final File file, final SAMSequenceDictionary dictionary) { this(file, dictionary, true); } /** * Opens an index backed by a file: verifies the magic number, then reads the integer that follows it to size the position cache. * @param file the index file; its name is used in the error message if the magic number does not match. * @param dictionary sequence dictionary to associate with this index. * @param useMemoryMapping true to read through a MemoryMappedFileBuffer, false to read through a RandomAccessFileBuffer. */ protected AbstractBAMFileIndex(final File file, final SAMSequenceDictionary dictionary, final boolean useMemoryMapping) { mBamDictionary = dictionary; mIndexBuffer = (useMemoryMapping ? new MemoryMappedFileBuffer(file) : new RandomAccessFileBuffer(file)); verifyBAMMagicNumber(file.getName()); sequenceIndexes = new int[readInteger() + 1]; Arrays.fill(sequenceIndexes, -1); } /** * Close this index and release any associated resources. */ public void close() { mIndexBuffer.close(); } /** * Get the number of levels employed by this index. * @return Number of levels in this index. */ public static int getNumIndexLevels() { return GenomicIndexUtil.LEVEL_STARTS.length; } /** * Gets the first bin in the given level. * @param levelNumber Level number. 0-based. * @return The first bin in this level. */ public static int getFirstBinInLevel(final int levelNumber) { return GenomicIndexUtil.LEVEL_STARTS[levelNumber]; } /** * Gets the number of bins in the given level. 
* @param levelNumber Level number. 0-based. * @return The size (number of possible bins) of the given level. */ public int getLevelSize(final int levelNumber) { if(levelNumber == getNumIndexLevels()) return GenomicIndexUtil.MAX_BINS+1-GenomicIndexUtil.LEVEL_STARTS[levelNumber]; else return GenomicIndexUtil.LEVEL_STARTS[levelNumber+1]-GenomicIndexUtil.LEVEL_STARTS[levelNumber]; } /** * Gets the level associated with the given bin number. * @param bin The bin for which to determine the level. * @return the level associated with the given bin number. */ public int getLevelForBin(final Bin bin) { if(bin.getBinNumber() >= GenomicIndexUtil.MAX_BINS) throw new SAMException("Tried to get level for invalid bin."); for(int i = getNumIndexLevels()-1; i >= 0; i--) { if(bin.getBinNumber() >= GenomicIndexUtil.LEVEL_STARTS[i]) return i; } throw new SAMException("Unable to find correct bin for bin "+bin); } /** * Gets the first locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ public int getFirstLocusInBin(final Bin bin) { final int level = getLevelForBin(bin); final int levelStart = GenomicIndexUtil.LEVEL_STARTS[level]; final int levelSize = ((level==getNumIndexLevels()-1) ? GenomicIndexUtil.MAX_BINS-1 : GenomicIndexUtil.LEVEL_STARTS[level+1]) - levelStart; return (bin.getBinNumber() - levelStart)*(GenomicIndexUtil.BIN_GENOMIC_SPAN /levelSize)+1; } /** * Gets the last locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ public int getLastLocusInBin(final Bin bin) { final int level = getLevelForBin(bin); final int levelStart = GenomicIndexUtil.LEVEL_STARTS[level]; final int levelSize = ((level==getNumIndexLevels()-1) ? 
GenomicIndexUtil.MAX_BINS-1 : GenomicIndexUtil.LEVEL_STARTS[level+1]) - levelStart; return (bin.getBinNumber()-levelStart+1)*(GenomicIndexUtil.BIN_GENOMIC_SPAN /levelSize); } public int getNumberOfReferences() { seek(4); return readInteger(); } /** * Use to get close to the unmapped reads at the end of a BAM file. * @return The file offset of the first record in the last linear bin, or -1 * if there are no elements in linear bins (i.e. no mapped reads). */ public long getStartOfLastLinearBin() { seek(4); final int sequenceCount = readInteger(); // Because no reads may align to the last sequence in the sequence dictionary, // grab the last element of the linear index for each sequence, and return // the last one from the last sequence that has one. long lastLinearIndexPointer = -1; for (int i = 0; i < sequenceCount; i++) { // System.out.println("# Sequence TID: " + i); final int nBins = readInteger(); // System.out.println("# nBins: " + nBins); for (int j1 = 0; j1 < nBins; j1++) { // Skip bin # skipBytes(4); final int nChunks = readInteger(); // Skip chunks skipBytes(16 * nChunks); } final int nLinearBins = readInteger(); if (nLinearBins > 0) { // Skip to last element of list of linear bins skipBytes(8 * (nLinearBins - 1)); lastLinearIndexPointer = readLong(); } } return lastLinearIndexPointer; } /** * Return meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records * * @param reference the reference of interest * @return meta data for the reference */ public BAMIndexMetaData getMetaData(final int reference) { seek(4); final List metaDataChunks = new ArrayList(); final int sequenceCount = readInteger(); if (reference >= sequenceCount) { return null; } skipToSequence(reference); final int binCount = readInteger(); for (int binNumber = 0; binNumber < binCount; binNumber++) { final int indexBin = readInteger(); final int nChunks = readInteger(); if (indexBin == GenomicIndexUtil.MAX_BINS) { for (int ci = 0; ci < 
nChunks; ci++) { final long chunkBegin = readLong(); final long chunkEnd = readLong(); metaDataChunks.add(new Chunk(chunkBegin, chunkEnd)); } } else { skipBytes(16 * nChunks); } } return new BAMIndexMetaData(metaDataChunks); } /** * Returns count of records unassociated with any reference. Call before the index file is closed * * @return meta data at the end of the bam index that indicates count of records holding no coordinates * or null if no meta data (old index format) */ public Long getNoCoordinateCount() { seek(4); final int sequenceCount = readInteger(); skipToSequence(sequenceCount); try { // in case of old index file without meta data return readLong(); } catch (final Exception e) { return null; } } protected BAMIndexContent query(final int referenceSequence, final int startPos, final int endPos) { seek(4); final List metaDataChunks = new ArrayList(); final int sequenceCount = readInteger(); if (referenceSequence >= sequenceCount) { return null; } final BitSet regionBins = GenomicIndexUtil.regionToBins(startPos, endPos); if (regionBins == null) { return null; } skipToSequence(referenceSequence); final int binCount = readInteger(); boolean metaDataSeen = false; final Bin[] bins = new Bin[getMaxBinNumberForReference(referenceSequence) +1]; for (int binNumber = 0; binNumber < binCount; binNumber++) { final int indexBin = readInteger(); final int nChunks = readInteger(); List chunks = null; // System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks); Chunk lastChunk = null; if (regionBins.get(indexBin)) { chunks = new ArrayList(nChunks); for (int ci = 0; ci < nChunks; ci++) { final long chunkBegin = readLong(); final long chunkEnd = readLong(); lastChunk = new Chunk(chunkBegin, chunkEnd); chunks.add(lastChunk); } } else if (indexBin == GenomicIndexUtil.MAX_BINS) { // meta data - build the bin so that the count of bins is correct; // but don't attach meta chunks to the bin, or normal queries will be off for (int ci = 0; ci < nChunks; 
ci++) { final long chunkBegin = readLong(); final long chunkEnd = readLong(); lastChunk = new Chunk(chunkBegin, chunkEnd); metaDataChunks.add(lastChunk); } metaDataSeen = true; continue; // don't create a Bin } else { skipBytes(16 * nChunks); chunks = Collections.emptyList(); } final Bin bin = new Bin(referenceSequence, indexBin); bin.setChunkList(chunks); bin.setLastChunk(lastChunk); bins[indexBin] = bin; } final int nLinearBins = readInteger(); final int regionLinearBinStart = LinearIndex.convertToLinearIndexOffset(startPos); final int regionLinearBinStop = endPos > 0 ? LinearIndex.convertToLinearIndexOffset(endPos) : nLinearBins-1; final int actualStop = Math.min(regionLinearBinStop, nLinearBins -1); long[] linearIndexEntries = new long[0]; if (regionLinearBinStart < nLinearBins) { linearIndexEntries = new long[actualStop-regionLinearBinStart+1]; skipBytes(8 * regionLinearBinStart); for(int linearBin = regionLinearBinStart; linearBin <= actualStop; linearBin++) linearIndexEntries[linearBin-regionLinearBinStart] = readLong(); } final LinearIndex linearIndex = new LinearIndex(referenceSequence,regionLinearBinStart,linearIndexEntries); return new BAMIndexContent(referenceSequence, bins, binCount - (metaDataSeen? 1 : 0), new BAMIndexMetaData(metaDataChunks), linearIndex); } /** * The maximum possible bin number for this reference sequence. 
* This is based on the maximum coordinate position of the reference * which is based on the size of the reference */ private int getMaxBinNumberForReference(final int reference) { try { final int sequenceLength = mBamDictionary.getSequence(reference).getSequenceLength(); return getMaxBinNumberForSequenceLength(sequenceLength); } catch (final Exception e) { return GenomicIndexUtil.MAX_BINS; } } /** * The maxiumum bin number for a reference sequence of a given length */ static int getMaxBinNumberForSequenceLength(final int sequenceLength) { return getFirstBinInLevel(getNumIndexLevels() - 1) + (sequenceLength >> 14); // return 4680 + (sequenceLength >> 14); // note 4680 = getFirstBinInLevel(getNumIndexLevels() - 1) } abstract protected BAMIndexContent getQueryResults(int reference); /** * Gets the possible number of bins for a given reference sequence. * @return How many bins could possibly be used according to this indexing scheme to index a single contig. */ protected int getMaxAddressibleGenomicLocation() { return GenomicIndexUtil.BIN_GENOMIC_SPAN; } /** * Get candidate bins for the specified region * @param startPos 1-based start of target region, inclusive. * @param endPos 1-based end of target region, inclusive. * @return bit set for each bin that may contain SAMRecords in the target region. */ protected BitSet regionToBins(final int startPos, final int endPos) { final int maxPos = 0x1FFFFFFF; final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; final int end = (endPos <= 0) ? 
maxPos : (endPos-1) & maxPos; if (start > end) { return null; } int k; final BitSet bitSet = new BitSet(GenomicIndexUtil.MAX_BINS); bitSet.set(0); for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); return bitSet; } /** * @deprecated Invoke htsjdk.samtools.Chunk#optimizeChunkList(java.util.List, long) directly. */ protected List optimizeChunkList(final List chunks, final long minimumOffset) { return Chunk.optimizeChunkList(chunks, minimumOffset); } private void verifyBAMMagicNumber(final String sourceName) { // Verify the magic number. seek(0); final byte[] buffer = new byte[4]; readBytes(buffer); if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) { throw new RuntimeIOException("Invalid file header in BAM index " + sourceName + ": " + new String(buffer)); } } private void skipToSequence(final int sequenceIndex) { //Use sequence position cache if available if(sequenceIndexes[sequenceIndex] != -1){ seek(sequenceIndexes[sequenceIndex]); return; } for (int i = 0; i < sequenceIndex; i++) { // System.out.println("# Sequence TID: " + i); final int nBins = readInteger(); // System.out.println("# nBins: " + nBins); for (int j = 0; j < nBins; j++) { readInteger(); // bin final int nChunks = readInteger(); // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); skipBytes(16 * nChunks); } final int nLinearBins = readInteger(); // System.out.println("# nLinearBins: " + nLinearBins); skipBytes(8 * nLinearBins); } //Update sequence position cache sequenceIndexes[sequenceIndex] = position(); } private void readBytes(final byte[] bytes) { mIndexBuffer.readBytes(bytes); } private int readInteger() { return mIndexBuffer.readInteger(); } private long readLong() { 
return mIndexBuffer.readLong(); } private void skipBytes(final int count) { mIndexBuffer.skipBytes(count); } private void seek(final int position) { mIndexBuffer.seek(position); } private int position(){ return mIndexBuffer.position(); } private abstract static class IndexFileBuffer { abstract void readBytes(final byte[] bytes); abstract int readInteger(); abstract long readLong(); abstract void skipBytes(final int count); abstract void seek(final int position); abstract int position(); abstract void close(); } /** * Traditional implementation of BAM index file access using memory mapped files. */ private static class MemoryMappedFileBuffer extends IndexFileBuffer { private MappedByteBuffer mFileBuffer; MemoryMappedFileBuffer(final File file) { try { // Open the file stream. final FileInputStream fileStream = new FileInputStream(file); final FileChannel fileChannel = fileStream.getChannel(); mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size()); mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); fileChannel.close(); fileStream.close(); } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } } @Override void readBytes(final byte[] bytes) { mFileBuffer.get(bytes); } @Override int readInteger() { return mFileBuffer.getInt(); } @Override long readLong() { return mFileBuffer.getLong(); } @Override void skipBytes(final int count) { mFileBuffer.position(mFileBuffer.position() + count); } @Override void seek(final int position) { mFileBuffer.position(position); } @Override int position() { return mFileBuffer.position(); } @Override void close() { mFileBuffer = null; } } /** * Alternative implementation of BAM index file access using regular I/O instead of memory mapping. * * This implementation can be more scalable for certain applications that need to access large numbers of BAM files. * Java provides no way to explicitly release a memory mapping. 
Instead, you need to wait for the garbage collector * to finalize the MappedByteBuffer. Because of this, when accessing many BAM files or when querying many BAM files * sequentially, you cannot easily control the physical memory footprint of the java process. * This can limit scalability and can have bad interactions with load management software like LSF, forcing you * to reserve enough physical memory for a worst case scenario. * The use of regular I/O allows you to trade somewhat slower performance for a small, fixed memory footprint * if that is more suitable for your application. */ private static class RandomAccessFileBuffer extends IndexFileBuffer { private static final int PAGE_SIZE = 4 * 1024; private static final int PAGE_OFFSET_MASK = PAGE_SIZE-1; private static final int PAGE_MASK = ~PAGE_OFFSET_MASK; private static final int INVALID_PAGE = 1; private final File mFile; private RandomAccessFile mRandomAccessFile; private final int mFileLength; private int mFilePointer = 0; private int mCurrentPage = INVALID_PAGE; private final byte[] mBuffer = new byte[PAGE_SIZE]; RandomAccessFileBuffer(final File file) { mFile = file; try { mRandomAccessFile = new RandomAccessFile(file, "r"); final long fileLength = mRandomAccessFile.length(); if (fileLength > Integer.MAX_VALUE) { throw new RuntimeIOException("BAM index file " + mFile + " is too large: " + fileLength); } mFileLength = (int) fileLength; } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } } @Override void readBytes(final byte[] bytes) { int resultOffset = 0; int resultLength = bytes.length; if (mFilePointer + resultLength > mFileLength) { throw new RuntimeIOException("Attempt to read past end of BAM index file (file is truncated?): " + mFile); } while (resultLength > 0) { loadPage(mFilePointer); final int pageOffset = mFilePointer & PAGE_OFFSET_MASK; final int copyLength = Math.min(resultLength, PAGE_SIZE - pageOffset); System.arraycopy(mBuffer, pageOffset, bytes, 
resultOffset, copyLength); mFilePointer += copyLength; resultOffset += copyLength; resultLength -= copyLength; } } @Override int readInteger() { // This takes advantage of the fact that integers in BAM index files are always 4-byte aligned. loadPage(mFilePointer); final int pageOffset = mFilePointer & PAGE_OFFSET_MASK; mFilePointer += 4; return((mBuffer[pageOffset + 0] & 0xFF) | ((mBuffer[pageOffset + 1] & 0xFF) << 8) | ((mBuffer[pageOffset + 2] & 0xFF) << 16) | ((mBuffer[pageOffset + 3] & 0xFF) << 24)); } @Override long readLong() { // BAM index files are always 4-byte aligned, but not necessrily 8-byte aligned. // So, rather than fooling with complex page logic we simply read the long in two 4-byte chunks. final long lower = readInteger(); final long upper = readInteger(); return ((upper << 32) | (lower & 0xFFFFFFFFL)); } @Override void skipBytes(final int count) { mFilePointer += count; } @Override void seek(final int position) { mFilePointer = position; } @Override int position() { return mFilePointer; } @Override void close() { mFilePointer = 0; mCurrentPage = INVALID_PAGE; if (mRandomAccessFile != null) { try { mRandomAccessFile.close(); } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } mRandomAccessFile = null; } } private void loadPage(final int filePosition) { final int page = filePosition & PAGE_MASK; if (page == mCurrentPage) { return; } try { mRandomAccessFile.seek(page); final int readLength = Math.min(mFileLength - page, PAGE_SIZE); mRandomAccessFile.readFully(mBuffer, 0, readLength); mCurrentPage = page; } catch (final IOException exc) { throw new RuntimeIOException("Exception reading BAM index file " + mFile + ": " + exc.getMessage(), exc); } } } static class IndexStreamBuffer extends IndexFileBuffer { private final SeekableStream in; private final ByteBuffer tmpBuf; /** Continually reads from the provided {@link SeekableStream} into the buffer until the specified number of bytes are read, or * until the stream 
is exhausted, throwing a {@link RuntimeIOException}. */ private static void readFully(final SeekableStream in, final byte[] buffer, final int offset, final int length) { int read = 0; while (read < length) { final int readThisLoop; try { readThisLoop = in.read(buffer, read, length - read); } catch (final IOException e) { throw new RuntimeIOException(e); } if (readThisLoop == -1) break; read += readThisLoop; } if (read != length) throw new RuntimeIOException("Expected to read " + length + " bytes, but expired stream after " + read + "."); } public IndexStreamBuffer(final SeekableStream s) { in = s; tmpBuf = ByteBuffer.allocate(8); // Enough to fit a long. tmpBuf.order(ByteOrder.LITTLE_ENDIAN); } @Override public void close() { try { in.close(); } catch (final IOException e) { throw new RuntimeIOException(e); } } @Override public void readBytes(final byte[] bytes) { readFully(in, bytes, 0, bytes.length); } @Override public void seek(final int position) { try { in.seek(position); } catch (final IOException e) { throw new RuntimeIOException(e); } } @Override public int readInteger() { readFully(in, tmpBuf.array(), 0, 4); return tmpBuf.getInt(0); } @Override public long readLong() { readFully(in, tmpBuf.array(), 0, 8); return tmpBuf.getLong(0); } @Override public void skipBytes(final int count) { try { for (int s = count; s > 0;) { final int skipped = (int)in.skip(s); if (skipped <= 0) throw new RuntimeIOException("Failed to skip " + s); s -= skipped; } } catch (final IOException e) { throw new RuntimeIOException(e); } } @Override public int position() { try { return (int) in.position(); } catch (final IOException e) { throw new RuntimeIOException(e); } } } } htsjdk-2.0.1/src/java/htsjdk/samtools/AbstractSAMHeaderRecord.java000066400000000000000000000103641263034757100250740ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and 
associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Serializable; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import javax.xml.bind.annotation.XmlTransient; /** * Base class for the various concrete records in a SAM header, providing uniform * access to the attributes. */ @XmlTransient /* don't consider this class for XML-serialization */ public abstract class AbstractSAMHeaderRecord implements Serializable { public static final long serialVersionUID = 1L; private final Map mAttributes = new LinkedHashMap(); public String getAttribute(final String key) { return mAttributes.get(key); } /** * Set the given value for the attribute named 'key'. Replaces an existing value, if any. * If value is null, the attribute is removed. * Otherwise, the value will be converted to a String with toString. * @param key attribute name * @param value attribute value * @deprecated Use the version that takes a String value instead */ public void setAttribute(final String key, final Object value) { setAttribute(key, value == null? 
null: value.toString()); } /** * Set the given value for the attribute named 'key'. Replaces an existing value, if any. * If value is null, the attribute is removed. * Supported types are Character, Integer, Float and String. Byte and Short may also be * passed in but they will be converted to Integer. * @param key attribute name * @param value attribute value */ public void setAttribute(final String key, final String value) { if (value == null) { mAttributes.remove(key); } else { mAttributes.put(key, value); } } /** * Returns the Set of attributes. */ public Set> getAttributes() { return mAttributes.entrySet(); } /** * Returns the ID tag (or equivalent) for this header record. The * default implementation throws a SAMException to indicate "not implemented". */ public String getId() { throw new UnsupportedOperationException("Method not implemented for: " + this.getClass()); } /** * For use in the equals() method of the concrete class. */ protected boolean attributesEqual(final AbstractSAMHeaderRecord that) { return mAttributes.equals(that.mAttributes); } /** * For use in the hashCode() method of the concrete class. */ protected int attributesHashCode() { return (mAttributes != null ? mAttributes.hashCode() : 0); } /** * Standard tags are the tags defined in SAM spec. These do not have type information in the test * representation, because the type information is predefined for each tag. * @return list of predefined tags for the concrete SAMHeader record type. */ abstract Set getStandardTags(); /** Simple to String that outputs the concrete class name and the set of attributes stored. 
*/ @Override public String toString() { return getClass().getSimpleName() + this.mAttributes.toString(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/AlignmentBlock.java000066400000000000000000000045161263034757100234130ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Serializable; /** * Represents the contiguous alignment of a subset of read bases to a reference * sequence. Simply put an alignment block tells you that read bases from * readStart are aligned to the reference (matching or mismatching) from * referenceStart for length bases. * * @author Tim Fennell */ public class AlignmentBlock implements Serializable { public static final long serialVersionUID = 1L; private int readStart; private int referenceStart; private int length; /** Constructs a new alignment block with the supplied read and ref starts and length. 
*/ AlignmentBlock(int readStart, int referenceStart, int length) { this.readStart = readStart; this.referenceStart = referenceStart; this.length = length; } /** The first, 1-based, base in the read that is aligned to the reference reference. */ public int getReadStart() { return readStart; } /** The first, 1-based, position in the reference to which the read is aligned. */ public int getReferenceStart() { return referenceStart; } /** The number of contiguous bases aligned to the reference. */ public int getLength() { return length; } } htsjdk-2.0.1/src/java/htsjdk/samtools/AsyncSAMFileWriter.java000066400000000000000000000040461263034757100241330ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.util.AbstractAsyncWriter; import htsjdk.samtools.util.ProgressLoggerInterface; /** * SAMFileWriter that can be wrapped around an underlying SAMFileWriter to provide asynchronous output. Records * added are placed into a queue, the queue is then drained into the underlying SAMFileWriter by a thread owned * by the instance. * * Exceptions experienced by the writer thread will be emitted back to the caller in subsequent calls to either * addAlignment() or close(). * * @author Tim Fennell */ class AsyncSAMFileWriter extends AbstractAsyncWriter implements SAMFileWriter { private final SAMFileWriter underlyingWriter; /** * Creates a new AsyncSAMFileWriter wrapping the provided SAMFileWriter. */ public AsyncSAMFileWriter(final SAMFileWriter out) { this(out, DEFAULT_QUEUE_SIZE); } /** * Creates an AsyncSAMFileWriter wrapping the provided SAMFileWriter and using the specified * queue size for buffer SAMRecords. 
*/ public AsyncSAMFileWriter(final SAMFileWriter out, final int queueSize) { super(queueSize); this.underlyingWriter = out; } @Override protected void synchronouslyWrite(final SAMRecord item) { this.underlyingWriter.addAlignment(item); } @Override protected void synchronouslyClose() { this.underlyingWriter.close(); } @Override protected final String getThreadNamePrefix() { return "SAMFileWriterThread-"; } @Override public void setProgressLogger(final ProgressLoggerInterface progress) { this.underlyingWriter.setProgressLogger(progress); } /** * Adds an alignment to the queue to be written. Will re-throw any exception that was received when * writing prior record(s) to the underlying SAMFileWriter. */ public void addAlignment(final SAMRecord alignment) { write(alignment); } /** Returns the SAMFileHeader from the underlying SAMFileWriter. */ public SAMFileHeader getFileHeader() { return this.underlyingWriter.getFileHeader(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMFileConstants.java000066400000000000000000000032511263034757100236110ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

/**
 * Constants used in reading and writing BAM files
 */
class BAMFileConstants {
    /**
     * The beginning of a BAMRecord is a fixed-size block of 8 int32s
     */
    static final int FIXED_BLOCK_SIZE = 8 * 4;

    /**
     * BAM file magic number.  This is what is present in the gunzipped version of the file,
     * which never exists on disk.
     * Spelled out byte by byte ("BAM" followed by 0x01) so the value does not depend on the
     * platform default charset, as String.getBytes() would.
     */
    static final byte[] BAM_MAGIC = {'B', 'A', 'M', 1};

    /**
     * BAM index file magic number ("BAI" followed by 0x01), likewise charset-independent.
     */
    static final byte[] BAM_INDEX_MAGIC = {'B', 'A', 'I', 1};
}
htsjdk-2.0.1/src/java/htsjdk/samtools/BAMFileReader.java000066400000000000000000001226061263034757100230450ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.samtools.util.StringLineReader; import java.io.DataInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.NoSuchElementException; /** * Class for reading and querying BAM files. */ class BAMFileReader extends SamReader.ReaderImplementation { // True if reading from a File rather than an InputStream private boolean mIsSeekable = false; // For converting bytes into other primitive types private BinaryCodec mStream = null; // Underlying compressed data stream. private final BlockCompressedInputStream mCompressedInputStream; private SAMFileHeader mFileHeader = null; // One of these is populated if the file is seekable and an index exists private File mIndexFile = null; private SeekableStream mIndexStream = null; private BAMIndex mIndex = null; private long mFirstRecordPointer = 0; // If non-null, there is an unclosed iterator extant. private CloseableIterator mCurrentIterator = null; // If true, all SAMRecords are fully decoded as they are read. private boolean eagerDecode; // For error-checking. private ValidationStringency mValidationStringency; // For creating BAMRecords private SAMRecordFactory samRecordFactory; /** * Use the caching index reader implementation rather than the disk-hit-per-file model. 
*/
    private boolean mEnableIndexCaching = false;

    /**
     * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
     */
    private boolean mEnableIndexMemoryMapping = true;

    /**
     * Add information about the origin (reader and position) to SAM records.
     */
    private SamReader mReader = null;

    /**
     * Prepare to read BAM from a stream (not seekable)
     * @param stream source of bytes.
     * @param indexFile stored as the index file. NOTE(review): hasIndex() also requires a seekable
     *                  source, so the index appears unusable on this path -- confirm.
     * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
     * @param validationStringency Controls how to handle invalid reads or header lines.
     * @param factory used to create the records that are read.
     */
    BAMFileReader(final InputStream stream,
                  final File indexFile,
                  final boolean eagerDecode,
                  final ValidationStringency validationStringency,
                  final SAMRecordFactory factory)
        throws IOException {
        mIndexFile = indexFile;
        mIsSeekable = false;
        mCompressedInputStream = new BlockCompressedInputStream(stream);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        this.eagerDecode = eagerDecode;
        this.mValidationStringency = validationStringency;
        this.samRecordFactory = factory;
        this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, null);
    }

    /**
     * Prepare to read BAM from a file (seekable)
     * @param file source of bytes.
     * @param indexFile index to use; if null, one is looked up with SamFiles.findIndex(file).
     * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
     * @param validationStringency Controls how to handle invalid reads or header lines.
     * @param factory used to create the records that are read.
     */
    BAMFileReader(final File file,
                  final File indexFile,
                  final boolean eagerDecode,
                  final ValidationStringency validationStringency,
                  final SAMRecordFactory factory)
        throws IOException {
        this(new BlockCompressedInputStream(file), indexFile != null ? indexFile : SamFiles.findIndex(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
        // An index older than its BAM is reported on stderr only; reading continues.
        if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
            System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
                    " is older than BAM " + file.getAbsolutePath());
        }
        // Provide better error message when there is an error reading.
        mStream.setInputFileName(file.getAbsolutePath());
    }

    /** Prepare to read BAM from a seekable stream, with an optional index file. */
    BAMFileReader(final SeekableStream strm,
                  final File indexFile,
                  final boolean eagerDecode,
                  final ValidationStringency validationStringency,
                  final SAMRecordFactory factory)
        throws IOException {
        this(new BlockCompressedInputStream(strm), indexFile, eagerDecode, strm.getSource(), validationStringency, factory);
    }

    /** Prepare to read BAM from a seekable stream, with the index supplied as a stream. */
    BAMFileReader(final SeekableStream strm,
                  final SeekableStream indexStream,
                  final boolean eagerDecode,
                  final ValidationStringency validationStringency,
                  final SAMRecordFactory factory)
        throws IOException {
        this(new BlockCompressedInputStream(strm), indexStream, eagerDecode, strm.getSource(), validationStringency, factory);
    }

    /**
     * Shared initialization for seekable sources whose index, if any, is a file.
     * Reads the header and remembers the stream position that follows it.
     */
    private BAMFileReader(final BlockCompressedInputStream compressedInputStream,
                          final File indexFile,
                          final boolean eagerDecode,
                          final String source,
                          final ValidationStringency validationStringency,
                          final SAMRecordFactory factory)
        throws IOException {
        mIndexFile = indexFile;
        mIsSeekable = true;
        mCompressedInputStream = compressedInputStream;
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        this.eagerDecode = eagerDecode;
        this.mValidationStringency = validationStringency;
        this.samRecordFactory = factory;
        this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source);
        mFirstRecordPointer = mCompressedInputStream.getFilePointer();
    }

    /**
     * Shared initialization for seekable sources whose index, if any, is a stream.
     * Reads the header and remembers the stream position that follows it.
     */
    private BAMFileReader(final BlockCompressedInputStream compressedInputStream,
                          final SeekableStream indexStream,
                          final boolean eagerDecode,
                          final String source,
                          final ValidationStringency validationStringency,
                          final SAMRecordFactory factory)
        throws IOException {
        mIndexStream = indexStream;
        mIsSeekable = true;
        mCompressedInputStream = compressedInputStream;
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        this.eagerDecode = eagerDecode;
        this.mValidationStringency = validationStringency;
        this.samRecordFactory = factory;
        this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source);
        mFirstRecordPointer = mCompressedInputStream.getFilePointer();
    }

    /** Reads through the header and sequence records to find the virtual file offset of the first record in the BAM file. */
    static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException {
        final BAMFileReader reader = new BAMFileReader(bam, null, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory());
        final long offset = reader.mFirstRecordPointer;
        reader.close();
        return offset;
    }

    /**
     * If true, writes the source of every read into the source SAMRecords.
     * @param reader the reader to record as the source; ignored when {@code enabled} is false.
     * @param enabled true to write source information into each SAMRecord.
     */
    void enableFileSource(final SamReader reader, final boolean enabled) {
        this.mReader = enabled ? reader : null;
    }

    /**
     * If true, uses the caching version of the index reader.
     * @param enabled true to use the caching index implementation, false to use the disk-based one.
     * @throws SAMException if the index has already been loaded.
     */
    protected void enableIndexCaching(final boolean enabled) {
        if (mIndex != null) {
            throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
        }
        this.mEnableIndexCaching = enabled;
    }

    /**
     * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
     * This is slower but more scalable when accessing large numbers of BAM files sequentially.
     * @param enabled True to use memory mapping, false to use regular I/O.
*/ protected void enableIndexMemoryMapping(final boolean enabled) { if (mIndex != null) { throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); } this.mEnableIndexMemoryMapping = enabled; } @Override void enableCrcChecking(final boolean enabled) { this.mCompressedInputStream.setCheckCrcs(enabled); } @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } @Override public SamReader.Type type() { return SamReader.Type.BAM_TYPE; } /** * @return true if ths is a BAM file, and has an index */ public boolean hasIndex() { return mIsSeekable && ((mIndexFile != null) || (mIndexStream != null)); } /** * Retrieves the index for the given file type. Ensure that the index is of the specified type. * @return An index of the given type. */ public BAMIndex getIndex() { if(!hasIndex()) throw new SAMException("No index is available for this BAM file."); if(mIndex == null) { if (mIndexFile != null) mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); else mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()) : new DiskBasedBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()); } return mIndex; } public void setEagerDecode(final boolean desired) { this.eagerDecode = desired; } public void close() { if (mStream != null) { mStream.close(); } if (mIndex != null) { mIndex.close(); } mStream = null; mFileHeader = null; mIndex = null; } public SAMFileHeader getFileHeader() { return mFileHeader; } /** * Set error-checking level for subsequent SAMRecord reads. 
*/ void setValidationStringency(final ValidationStringency validationStringency) { this.mValidationStringency = validationStringency; } public ValidationStringency getValidationStringency() { return this.mValidationStringency; } /** * Prepare to iterate through the SAMRecords in file order. * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, * that iterator must be closed before getIterator() can be called again. * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to * getIterator() begins its iteration where the last one left off. That is the best that can be * done in that situation. */ public CloseableIterator getIterator() { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (mIsSeekable) { try { mCompressedInputStream.seek(mFirstRecordPointer); } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } } mCurrentIterator = new BAMFileIterator(); return mCurrentIterator; } @Override public CloseableIterator getIterator(final SAMFileSpan chunks) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!(chunks instanceof BAMFileSpan)) { throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); } // Create an iterator over the given chunk boundaries. mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); return mCurrentIterator; } /** * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. 
However,
     * the rightmost bound is guaranteed to be after the last read in the file.
     * @return An unbounded pointer to the first record in the BAM file.
     */
    @Override
    public SAMFileSpan getFilePointerSpanningReads() {
        // One chunk running from the first record to the largest possible offset.
        final Chunk firstRecordOnwards = new Chunk(mFirstRecordPointer, Long.MAX_VALUE);
        return new BAMFileSpan(firstRecordOnwards);
    }

    /**
     * Prepare to iterate through the SAMRecords that match the given interval.
     * Only a single iterator on a BAMFile can be extant at a time.  The previous one must be closed
     * before calling any of the methods that return an iterator.
     *
     * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
     * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
     * matches the specified interval.
     *
     * Note that this method is not necessarily efficient in terms of disk I/O.  The index does not have perfect
     * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
     *
     * @param sequence Reference sequence sought.
     * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
     * A value of zero implies the start of the reference sequence.
     * @param end A value of zero implies the end of the reference sequence.
     * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
     * specified by start and end.  If false, the SAMRecords need only overlap the interval.
* @return Iterator for the matching SAMRecords */ CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } final int referenceIndex = mFileHeader.getSequenceIndex(sequence); if (referenceIndex == -1) { mCurrentIterator = new EmptyBamIterator(); } else { final QueryInterval[] queryIntervals = {new QueryInterval(referenceIndex, start, end)}; mCurrentIterator = createIndexIterator(queryIntervals, contained); } return mCurrentIterator; } /** * Prepare to iterate through the SAMRecords that match any of the given intervals. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate * matches the specified interval. * * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. * * @param intervals list of intervals to be queried. Must be optimized. * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval * specified by start and end. If false, the SAMRecords need only overlap the interval. 
* @return Iterator for the matching SAMRecords * @see QueryInterval#optimizeIntervals(QueryInterval[]) */ public CloseableIterator query(final QueryInterval[] intervals, final boolean contained) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } mCurrentIterator = createIndexIterator(intervals, contained); return mCurrentIterator; } /** * Prepare to iterate through the SAMRecords with the given alignment start. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate * matches the specified interval. * * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. * * @param sequence Reference sequence sought. * @param start Alignment start sought. * @return Iterator for the matching SAMRecords. 
*/ public CloseableIterator queryAlignmentStart(final String sequence, final int start) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } final int referenceIndex = mFileHeader.getSequenceIndex(sequence); if (referenceIndex == -1) { mCurrentIterator = new EmptyBamIterator(); } else { mCurrentIterator = createStartingAtIndexIterator(referenceIndex, start); } return mCurrentIterator; } /** * Prepare to iterate through the SAMRecords that are unmapped and do not have a reference name or alignment start. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * @return Iterator for the matching SAMRecords. */ public CloseableIterator queryUnmapped() { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } try { final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); if (startOfLastLinearBin != -1) { mCompressedInputStream.seek(startOfLastLinearBin); } else { // No mapped reads in file, just start at the first read in file. mCompressedInputStream.seek(mFirstRecordPointer); } mCurrentIterator = new BAMFileIndexUnmappedIterator(); return mCurrentIterator; } catch (final IOException e) { throw new RuntimeIOException("IOException seeking to unmapped reads", e); } } /** * Reads the header of a BAM file from a stream * @param stream A BinaryCodec to read the header from * @param validationStringency Determines how stringent to be when validating the sam * @param source Note that this is used only for reporting errors. 
*/ protected static SAMFileHeader readHeader(final BinaryCodec stream, final ValidationStringency validationStringency, final String source) throws IOException { final byte[] buffer = new byte[4]; stream.readBytes(buffer); if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { throw new IOException("Invalid BAM file header"); } final int headerTextLength = stream.readInt(); final String textHeader = stream.readString(headerTextLength); final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); headerCodec.setValidationStringency(validationStringency); final SAMFileHeader samFileHeader = headerCodec.decode(new StringLineReader(textHeader), source); final int sequenceCount = stream.readInt(); if (samFileHeader.getSequenceDictionary().size() > 0) { // It is allowed to have binary sequences but no text sequences, so only validate if both are present if (sequenceCount != samFileHeader.getSequenceDictionary().size()) { throw new SAMFormatException("Number of sequences in text header (" + samFileHeader.getSequenceDictionary().size() + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); } for (int i = 0; i < sequenceCount; i++) { final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(stream, source); final SAMSequenceRecord sequenceRecord = samFileHeader.getSequence(i); if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + source); } if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + source); } } } else { // If only binary sequences are present, copy them into samFileHeader final List sequences = new ArrayList(sequenceCount); for (int i = 0; i < sequenceCount; i++) { sequences.add(readSequenceRecord(stream, source)); } 
samFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); } return samFileHeader; } /** * Reads a single binary sequence record from the file or stream * @param source Note that this is used only for reporting errors. */ private static SAMSequenceRecord readSequenceRecord(final BinaryCodec stream, final String source) { final int nameLength = stream.readInt(); if (nameLength <= 1) { throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); } final String sequenceName = stream.readString(nameLength - 1); // Skip the null terminator stream.readByte(); final int sequenceLength = stream.readInt(); return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength); } /** * Encapsulates the restriction that only one iterator may be open at a time. */ private abstract class AbstractBamIterator implements CloseableIterator { private boolean isClosed = false; public void close() { if (!isClosed) { if (mCurrentIterator != null && this != mCurrentIterator) { throw new IllegalStateException("Attempt to close non-current iterator"); } mCurrentIterator = null; isClosed = true; } } protected void assertOpen() { if (isClosed) throw new AssertionError("Iterator has been closed"); } public void remove() { throw new UnsupportedOperationException("Not supported: remove"); } } private class EmptyBamIterator extends AbstractBamIterator { @Override public boolean hasNext() { return false; } @Override public SAMRecord next() { throw new NoSuchElementException("next called on empty iterator"); } } /** /** * Iterator for non-indexed sequential iteration through all SAMRecords in file. * Starting point of iteration is wherever current file position is when the iterator is constructed. 
*/ private class BAMFileIterator extends AbstractBamIterator { private SAMRecord mNextRecord = null; private final BAMRecordCodec bamRecordCodec; private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file BAMFileIterator() { this(true); } /** * @param advance Trick to enable subclass to do more setup before advancing */ BAMFileIterator(final boolean advance) { this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory); this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream(), BAMFileReader.this.mStream.getInputFileName()); if (advance) { advance(); } } public boolean hasNext() { assertOpen(); return (mNextRecord != null); } public SAMRecord next() { assertOpen(); final SAMRecord result = mNextRecord; advance(); return result; } void advance() { try { mNextRecord = getNextRecord(); if (mNextRecord != null) { ++this.samRecordIndex; // Because some decoding is done lazily, the record needs to remember the validation stringency. mNextRecord.setValidationStringency(mValidationStringency); if (mValidationStringency != ValidationStringency.SILENT) { final List validationErrors = mNextRecord.isValid(mValidationStringency == ValidationStringency.STRICT); SAMUtils.processValidationErrors(validationErrors, this.samRecordIndex, BAMFileReader.this.getValidationStringency()); } } if (eagerDecode && mNextRecord != null) { mNextRecord.eagerDecode(); } } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } } /** * Read the next record from the input stream. 
*/ SAMRecord getNextRecord() throws IOException { final long startCoordinate = mCompressedInputStream.getFilePointer(); final SAMRecord next = bamRecordCodec.decode(); final long stopCoordinate = mCompressedInputStream.getFilePointer(); if(mReader != null && next != null) next.setFileSource(new SAMFileSource(mReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate)))); return next; } /** * @return The record that will be return by the next call to next() */ protected SAMRecord peek() { return mNextRecord; } } /** * Prepare to iterate through SAMRecords in the given reference that start exactly at the given start coordinate. * @param referenceIndex Desired reference sequence. * @param start 1-based alignment start. */ private CloseableIterator createStartingAtIndexIterator(final int referenceIndex, final int start) { // Hit the index to determine the chunk boundaries for the required data. final BAMIndex fileIndex = getIndex(); final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, 0); final long[] filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; // Create an iterator over the above chunk boundaries. final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); // Add some preprocessing filters for edge-case reads that don't fit into this // query type. 
return new BAMQueryFilteringIterator(iterator,new BAMStartingAtIteratorFilter(referenceIndex,start)); } /** * @throws java.lang.IllegalArgumentException if the intervals are not optimized * @see QueryInterval#optimizeIntervals(QueryInterval[]) */ private void assertIntervalsOptimized(final QueryInterval[] intervals) { if (intervals.length == 0) return; for (int i = 1; i < intervals.length; ++i) { final QueryInterval prev = intervals[i-1]; final QueryInterval thisInterval = intervals[i]; if (prev.compareTo(thisInterval) >= 0) { throw new IllegalArgumentException(String.format("List of intervals is not sorted: %s >= %s", prev, thisInterval)); } if (prev.overlaps(thisInterval)) { throw new IllegalArgumentException(String.format("List of intervals is not optimized: %s intersects %s", prev, thisInterval)); } if (prev.abuts(thisInterval)) { throw new IllegalArgumentException(String.format("List of intervals is not optimized: %s abuts %s", prev, thisInterval)); } } } private CloseableIterator createIndexIterator(final QueryInterval[] intervals, final boolean contained) { assertIntervalsOptimized(intervals); // Hit the index to determine the chunk boundaries for the required data. final BAMFileSpan[] inputSpans = new BAMFileSpan[intervals.length]; final BAMIndex fileIndex = getIndex(); for (int i = 0; i < intervals.length; ++i) { final QueryInterval interval = intervals[i]; final BAMFileSpan span = fileIndex.getSpanOverlapping(interval.referenceIndex, interval.start, interval.end); inputSpans[i] = span; } final long[] filePointers; if (inputSpans.length > 0) { filePointers = BAMFileSpan.merge(inputSpans).toCoordinateArray(); } else { filePointers = null; } // Create an iterator over the above chunk boundaries. final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); // Add some preprocessing filters for edge-case reads that don't fit into this // query type. 
return new BAMQueryFilteringIterator(iterator, new BAMQueryMultipleIntervalsIteratorFilter(intervals, contained)); } /** * Iterate over the SAMRecords defined by the sections of the file described in the ctor argument. */ private class BAMFileIndexIterator extends BAMFileIterator { private long[] mFilePointers = null; private int mFilePointerIndex = 0; private long mFilePointerLimit = -1; /** * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. * @param filePointers the block / offset combination, stored in chunk format. */ BAMFileIndexIterator(final long[] filePointers) { super(false); // delay advance() until after construction mFilePointers = filePointers; advance(); } SAMRecord getNextRecord() throws IOException { // Advance to next file block if necessary while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) { if (mFilePointers == null || mFilePointerIndex >= mFilePointers.length) { return null; } final long startOffset = mFilePointers[mFilePointerIndex++]; final long endOffset = mFilePointers[mFilePointerIndex++]; mCompressedInputStream.seek(startOffset); mFilePointerLimit = endOffset; } // Pull next record from stream return super.getNextRecord(); } } /** * Pull SAMRecords from a coordinate-sorted iterator, and filter out any that do not match the filter. */ public class BAMQueryFilteringIterator extends AbstractBamIterator { /** * The wrapped iterator. */ protected final CloseableIterator wrappedIterator; /** * The next record to be returned. Will be null if no such record exists. */ protected SAMRecord mNextRecord; private final BAMIteratorFilter iteratorFilter; public BAMQueryFilteringIterator(final CloseableIterator iterator, final BAMIteratorFilter iteratorFilter) { this.wrappedIterator = iterator; this.iteratorFilter = iteratorFilter; mNextRecord = advance(); } /** * Returns true if a next element exists; false otherwise. 
*/ public boolean hasNext() { assertOpen(); return mNextRecord != null; } /** * Gets the next record from the given iterator. * @return The next SAM record in the iterator. */ public SAMRecord next() { if(!hasNext()) throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); final SAMRecord currentRead = mNextRecord; mNextRecord = advance(); return currentRead; } SAMRecord advance() { while (true) { // Pull next record from stream if(!wrappedIterator.hasNext()) return null; final SAMRecord record = wrappedIterator.next(); switch (iteratorFilter.compareToFilter(record)) { case MATCHES_FILTER: return record; case STOP_ITERATION: return null; case CONTINUE_ITERATION: break; // keep looping default: throw new SAMException("Unexpected return from compareToFilter"); } } } } interface BAMIteratorFilter { /** * Determine if given record passes the filter, and if it does not, whether iteration should continue * or if this record is beyond the region(s) of interest. */ FilteringIteratorState compareToFilter(final SAMRecord record); } /** * A decorating iterator that filters out records that do not match the given reference and start position. */ private class BAMStartingAtIteratorFilter implements BAMIteratorFilter { private final int mReferenceIndex; private final int mRegionStart; public BAMStartingAtIteratorFilter(final int referenceIndex, final int start) { mReferenceIndex = referenceIndex; mRegionStart = start; } /** * * @return MATCHES_FILTER if this record matches the filter; * CONTINUE_ITERATION if does not match filter but iteration should continue; * STOP_ITERATION if does not match filter and iteration should end. 
*/ @Override public FilteringIteratorState compareToFilter(final SAMRecord record) { // If beyond the end of this reference sequence, end iteration final int referenceIndex = record.getReferenceIndex(); if (referenceIndex < 0 || referenceIndex > mReferenceIndex) { return FilteringIteratorState.STOP_ITERATION; } else if (referenceIndex < mReferenceIndex) { // If before this reference sequence, continue return FilteringIteratorState.CONTINUE_ITERATION; } final int alignmentStart = record.getAlignmentStart(); if (alignmentStart > mRegionStart) { // If scanned beyond target region, end iteration return FilteringIteratorState.STOP_ITERATION; } else if (alignmentStart == mRegionStart) { return FilteringIteratorState.MATCHES_FILTER; } else { return FilteringIteratorState.CONTINUE_ITERATION; } } } private class BAMFileIndexUnmappedIterator extends BAMFileIterator { private BAMFileIndexUnmappedIterator() { while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { advance(); } } } /** * Filters out records that do not match any of the given intervals and query type. */ private class BAMQueryMultipleIntervalsIteratorFilter implements BAMIteratorFilter { final QueryInterval[] intervals; final boolean contained; int intervalIndex = 0; public BAMQueryMultipleIntervalsIteratorFilter(final QueryInterval[] intervals, final boolean contained) { this.contained = contained; this.intervals = intervals; } @Override public FilteringIteratorState compareToFilter(final SAMRecord record) { while (intervalIndex < intervals.length) { final IntervalComparison comparison = compareIntervalToRecord(intervals[intervalIndex], record); switch (comparison) { // Interval is before SAMRecord. Try next interval; case BEFORE: ++intervalIndex; break; // Interval is after SAMRecord. 
Keep scanning forward in SAMRecords case AFTER: return FilteringIteratorState.CONTINUE_ITERATION; // Found a good record case CONTAINED: return FilteringIteratorState.MATCHES_FILTER; // Either found a good record, or else keep scanning SAMRecords case OVERLAPPING: return (contained ? FilteringIteratorState.CONTINUE_ITERATION : FilteringIteratorState.MATCHES_FILTER); } } // Went past the last interval return FilteringIteratorState.STOP_ITERATION; } private IntervalComparison compareIntervalToRecord(final QueryInterval interval, final SAMRecord record) { // interval.end <= 0 implies the end of the reference sequence. final int intervalEnd = (interval.end <= 0? Integer.MAX_VALUE: interval.end); final int alignmentEnd; if (record.getReadUnmappedFlag() && record.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { // Unmapped read with coordinate of mate. alignmentEnd = record.getAlignmentStart(); } else { alignmentEnd = record.getAlignmentEnd(); } if (interval.referenceIndex < record.getReferenceIndex()) return IntervalComparison.BEFORE; else if (interval.referenceIndex > record.getReferenceIndex()) return IntervalComparison.AFTER; else if (intervalEnd < record.getAlignmentStart()) return IntervalComparison.BEFORE; else if (alignmentEnd < interval.start) return IntervalComparison.AFTER; else if (CoordMath.encloses(interval.start, intervalEnd, record.getAlignmentStart(), alignmentEnd)) { return IntervalComparison.CONTAINED; } else return IntervalComparison.OVERLAPPING; } } private enum IntervalComparison { BEFORE, AFTER, OVERLAPPING, CONTAINED } /** * Type returned by BAMIteratorFilter that tell BAMQueryFilteringIterator how to handle each SAMRecord. 
*/ private enum FilteringIteratorState { MATCHES_FILTER, STOP_ITERATION, CONTINUE_ITERATION } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMFileSpan.java000066400000000000000000000216161263034757100225430ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.StringUtil; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * An ordered list of chunks, capable of representing a set of discontiguous * regions in the BAM file. FileSpans are mutable within the package, but perceived * as immutable outside the package. * * Some operations on FileSpans assume that the spans are sorted. In these cases, * sort order will be validated. 
*
 * @author mhanna
 * @version 0.1
 */
public class BAMFileSpan implements SAMFileSpan, Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * The constituent chunks of this list.
     */
    private final List<Chunk> chunks;

    /**
     * Create a new empty list of chunks.
     */
    public BAMFileSpan() {
        this.chunks = new ArrayList<Chunk>();
    }

    /**
     * Convenience constructor to construct a BAM file span from
     * a single chunk.
     * @param chunk Chunk to use as the sole region in this span.
     */
    public BAMFileSpan(final Chunk chunk) {
        this.chunks = new ArrayList<Chunk>();
        chunks.add(chunk);
    }

    /**
     * Create a new chunk list from the given list of chunks. The list is copied, so later changes
     * to the argument do not affect this span.
     * @param chunks Constituent chunks.
     */
    public BAMFileSpan(final List<Chunk> chunks) {
        this.chunks = new ArrayList<Chunk>(chunks);
    }

    /**
     * Does this chunk list map to any position within the BAM file?
     * @return True iff this span holds no chunks at all.
     */
    public boolean isEmpty() {
        return chunks.isEmpty();
    }

    /**
     * Deep clone the given chunk list: every chunk is cloned into a fresh span.
     * @return A copy of the chunk list.
     */
    @Override
    public BAMFileSpan clone() {
        final BAMFileSpan clone = new BAMFileSpan();
        for (final Chunk chunk : chunks) {
            clone.chunks.add(chunk.clone());
        }
        return clone;
    }

    /**
     * Creates a new file span by removing all chunks before the given file span starts.
     * If a chunk in the chunk list starts before and ends after the given
     * chunk, the first portion of the chunk will be deleted.
     * @param fileSpan The filespan before which to eliminate.
     * @return A new BAMFileSpan which contains the portion of the chunk list after the given chunk.
*/ public SAMFileSpan removeContentsBefore(final SAMFileSpan fileSpan) { if(fileSpan == null) return clone(); if(!(fileSpan instanceof BAMFileSpan)) throw new SAMException("Unable to compare "); final BAMFileSpan bamFileSpan = (BAMFileSpan)fileSpan; if(bamFileSpan.isEmpty()) return clone(); validateSorted(); final BAMFileSpan trimmedChunkList = new BAMFileSpan(); for(final Chunk chunkToTrim: chunks) { if(chunkToTrim.getChunkEnd() > chunkToTrim.getChunkStart()) { if(chunkToTrim.getChunkStart() >= bamFileSpan.chunks.get(0).getChunkStart()) { // This chunk from the list is completely beyond the start of the filtering chunk. trimmedChunkList.add(chunkToTrim.clone()); } else { // This chunk from the list partially overlaps the filtering chunk and must be trimmed. trimmedChunkList.add(new Chunk(bamFileSpan.chunks.get(0).getChunkStart(),chunkToTrim.getChunkEnd())); } } } return trimmedChunkList; } /** * Gets a file span over the data immediately following this span. * @return The a pointer to data immediately following this span. */ public SAMFileSpan getContentsFollowing() { if(chunks.isEmpty()) throw new SAMException("Unable to get the file pointer following this one: no data present."); validateSorted(); return new BAMFileSpan(new Chunk(chunks.get(chunks.size()-1).getChunkEnd(),Long.MAX_VALUE)); } /** * Merge one span into another * * @param span - span with chunks to add to this one */ protected void add(final BAMFileSpan span) { for (final Chunk c : span.chunks) { chunks.add(c); } } /** * Adds a new chunk to this list. Visible only within the BAm. * @param chunk Chunk to add. */ protected void add(final Chunk chunk) { chunks.add(chunk); } /** * Convert the chunk list to an array of offsets, paired in [start,end) format. * @return Array of offsets. 
*/
    public long[] toCoordinateArray() {
        // An empty span is reported as null rather than as an empty array.
        if (chunks.size() * 2 == 0) {
            return null;
        }
        final long[] coordinates = new long[chunks.size() * 2];
        int cursor = 0;
        for (final Chunk chunk : chunks) {
            coordinates[cursor] = chunk.getChunkStart();
            coordinates[cursor + 1] = chunk.getChunkEnd();
            cursor += 2;
        }
        return coordinates;
    }

    /**
     * Find the first offset in the chunk list
     * @return The start of the first chunk, or 0 when the span holds no chunks
     */
    public long getFirstOffset() {
        if (chunks == null || chunks.isEmpty()) {
            return 0;
        }
        return chunks.get(0).getChunkStart();
    }

    /**
     * Gets the constituent chunks stored in this span.
     * @return A read-only view of the chunk list.
     */
    public List getChunks() {
        return Collections.unmodifiableList(chunks);
    }

    /**
     * Returns the one chunk of this span, insisting that there is exactly one.
     * @return The single chunk stored in this span
     * @throws SAMException if the span holds zero chunks or more than one
     */
    protected Chunk getSingleChunk() {
        final int chunkCount = chunks.size();
        if (chunkCount != 1) {
            throw new SAMException("Expecting a single chunk for span. Found " + chunkCount);
        }
        return chunks.get(0);
    }

    /**
     * Rebuilds a span from the flat array representation, in which every even-numbered index
     * holds a start coordinate and the odd-numbered index after it holds the matching stop
     * coordinate. The resulting chunk list is checked for sort order.
     * @param coordinateArray start/stop pairs to convert.
     * @return A span holding one chunk per pair.
     * @throws SAMException if the array has an odd number of entries or the chunks are not sorted
     */
    protected static SAMFileSpan toChunkList(final long[] coordinateArray) {
        if (coordinateArray.length % 2 != 0) {
            throw new SAMException("Data supplied does not appear to be in coordinate array format.");
        }

        final BAMFileSpan span = new BAMFileSpan();
        for (int pairStart = 0; pairStart < coordinateArray.length; pairStart += 2) {
            final long chunkStart = coordinateArray[pairStart];
            final long chunkEnd = coordinateArray[pairStart + 1];
            span.add(new Chunk(chunkStart, chunkEnd));
        }

        span.validateSorted();

        return span;
    }

    /**
     * Validates the list of chunks to ensure that they appear in sorted order.
*/ private void validateSorted() { for(int i = 1; i < chunks.size(); i++) { if(chunks.get(i).getChunkStart() < chunks.get(i-1).getChunkEnd()) throw new SAMException(String.format("Chunk list is unsorted; chunk %s is before chunk %s",chunks.get(i-1),chunks.get(i))); } } /** * Creates a string representation of this chunk list. */ @Override public String toString() { return StringUtil.join(";", chunks); } /** * * @return A single BAMFileSpan that is an intelligent merge of the input spans, i.e. contiguous, overlapping * and contained chunks are intelligently merged, and the chunks are sorted. */ public static BAMFileSpan merge(final BAMFileSpan[] spans) { final ArrayList inputChunks = new ArrayList(); for (final BAMFileSpan span : spans) { if(span != null){ inputChunks.addAll(span.chunks); } } return new BAMFileSpan(Chunk.optimizeChunkList(inputChunks, 0)); } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMFileWriter.java000066400000000000000000000211351263034757100231120ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.BlockCompressedOutputStream; import htsjdk.samtools.util.RuntimeIOException; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.StringWriter; import java.io.Writer; /** * Concrete implementation of SAMFileWriter for writing gzipped BAM files. */ class BAMFileWriter extends SAMFileWriterImpl { private final BinaryCodec outputBinaryCodec; private BAMRecordCodec bamRecordCodec = null; private final BlockCompressedOutputStream blockCompressedOutputStream; private BAMIndexer bamIndexer = null; protected BAMFileWriter(final File path) { blockCompressedOutputStream = new BlockCompressedOutputStream(path); outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(path.getAbsolutePath()); } protected BAMFileWriter(final File path, final int compressionLevel) { blockCompressedOutputStream = new BlockCompressedOutputStream(path, compressionLevel); outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(path.getAbsolutePath()); } protected BAMFileWriter(final OutputStream os, final File file) { blockCompressedOutputStream = new BlockCompressedOutputStream(os, file); outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(getPathString(file)); } protected BAMFileWriter(final OutputStream os, final File file, final int compressionLevel) { blockCompressedOutputStream = new BlockCompressedOutputStream(os, file, compressionLevel); outputBinaryCodec = 
new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(getPathString(file)); } private void prepareToWriteAlignments() { if (bamRecordCodec == null) { bamRecordCodec = new BAMRecordCodec(getFileHeader()); bamRecordCodec.setOutputStream(outputBinaryCodec.getOutputStream(), getFilename()); } } /** @return absolute path, or null if arg is null. */ private String getPathString(final File path){ return (path != null) ? path.getAbsolutePath() : null; } // Allow enabling the bam index construction // only enabled by factory method before anything is written void enableBamIndexConstruction () { if (!getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)){ throw new SAMException("Not creating BAM index since not sorted by coordinates: " + getSortOrder()); } if(getFilename() == null){ throw new SAMException("Not creating BAM index since we don't have an output file name"); } bamIndexer = createBamIndex(getFilename()); } private BAMIndexer createBamIndex(final String path) { try { final String indexFileBase = path.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION) ? 
path.substring(0, path.lastIndexOf(".")) : path; final File indexFile = new File(indexFileBase + BAMIndex.BAMIndexSuffix); if (indexFile.exists()) { if (!indexFile.canWrite()) { throw new SAMException("Not creating BAM index since unable to write index file " + indexFile); } } return new BAMIndexer(indexFile, getFileHeader()); } catch (Exception e) { throw new SAMException("Not creating BAM index", e); } } protected void writeAlignment(final SAMRecord alignment) { prepareToWriteAlignments(); if (bamIndexer != null) { try { final long startOffset = blockCompressedOutputStream.getFilePointer(); bamRecordCodec.encode(alignment); final long stopOffset = blockCompressedOutputStream.getFilePointer(); // set the alignment's SourceInfo and then prepare its index information alignment.setFileSource(new SAMFileSource(null, new BAMFileSpan(new Chunk(startOffset, stopOffset)))); bamIndexer.processAlignment(alignment); } catch (Exception e) { bamIndexer = null; throw new SAMException("Exception when processing alignment for BAM index " + alignment, e); } } else { bamRecordCodec.encode(alignment); } } protected void writeHeader(final String textHeader) { writeHeader(outputBinaryCodec, getFileHeader(), textHeader); } protected void finish() { outputBinaryCodec.close(); try { if (bamIndexer != null) { bamIndexer.finish(); } } catch (Exception e) { throw new SAMException("Exception writing BAM index file", e); } } /** @return absolute path, or null if this writer does not correspond to a file. */ protected String getFilename() { return outputBinaryCodec.getOutputFileName(); } /** * Writes a header to a BAM file. 
samFileHeader and headerText are redundant - one can be used to regenerate the other but in * some instances we already have both so this allows us to save some cycles */ protected static void writeHeader(final BinaryCodec outputBinaryCodec, final SAMFileHeader samFileHeader, final String headerText) { outputBinaryCodec.writeBytes(BAMFileConstants.BAM_MAGIC); // calculate and write the length of the SAM file header text and the header text outputBinaryCodec.writeString(headerText, true, false); // write the sequences binarily. This is redundant with the text header outputBinaryCodec.writeInt(samFileHeader.getSequenceDictionary().size()); for (final SAMSequenceRecord sequenceRecord: samFileHeader.getSequenceDictionary().getSequences()) { outputBinaryCodec.writeString(sequenceRecord.getSequenceName(), true, true); outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength()); } } /** * Writes a header to a BAM file. Might need to regenerate the String version of the header, if one already has both the * samFileHeader and the String, use the version of this method which takes both. 
*/ protected static void writeHeader(final BinaryCodec outputBinaryCodec, final SAMFileHeader samFileHeader) { // Do not use SAMFileHeader.getTextHeader() as it is not updated when changes to the underlying object are made final String headerString; final Writer stringWriter = new StringWriter(); new SAMTextHeaderCodec().encode(stringWriter, samFileHeader, true); headerString = stringWriter.toString(); writeHeader(outputBinaryCodec, samFileHeader, headerString); } protected static void writeHeader(final OutputStream outputStream, final SAMFileHeader samFileHeader) { final BlockCompressedOutputStream blockCompressedOutputStream = new BlockCompressedOutputStream(outputStream, null); final BinaryCodec outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); writeHeader(outputBinaryCodec, samFileHeader); try { blockCompressedOutputStream.flush(); } catch (final IOException ioe) { throw new RuntimeIOException(ioe); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMIndex.java000066400000000000000000000047511263034757100221120ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Closeable; /** * A basic interface for querying BAM indices. * * @author mhanna * @version 0.1 */ public interface BAMIndex extends Closeable { public static final String BAMIndexSuffix = ".bai"; /** * Gets the compressed chunks which should be searched for the contents of records contained by the span * referenceIndex:startPos-endPos, inclusive. See the BAM spec for more information on how a chunk is * represented. * * @param referenceIndex The contig. * @param startPos Genomic start of query. * @param endPos Genomic end of query. * @return A file span listing the chunks in the BAM file. */ BAMFileSpan getSpanOverlapping(final int referenceIndex, final int startPos, final int endPos); /** * Gets the start of the last linear bin in the index. * @return The chunk indicating the start of the last bin in the linear index. */ long getStartOfLastLinearBin(); /** * Gets meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records * @param reference the reference of interest * @return meta data for the reference */ public BAMIndexMetaData getMetaData(int reference); /** * Close the index and release any associated resources. 
*/ void close(); } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMIndexContent.java000066400000000000000000000060571263034757100234460ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Represents the contents of a bam index file for one reference. * A BAM index (.bai) file contains information for all references in the bam file. * This class describes the data present in the index file for one of these references; * including the bins, chunks, and linear index. */ class BAMIndexContent extends BinningIndexContent { /** * Chunks containing metaData for the reference, e.g. number of aligned and unaligned records */ private final BAMIndexMetaData mMetaData; /** * @param referenceSequence Content corresponds to this reference. 
* @param binList Array of bins represented by this content, possibly sparse * @param metaData Extra information about the reference in this index * @param linearIndex Additional index used to optimize queries */ BAMIndexContent(final int referenceSequence, final BinList binList, final BAMIndexMetaData metaData, final LinearIndex linearIndex) { super(referenceSequence, binList, linearIndex); this.mMetaData = metaData; } /** * @param referenceSequence Content corresponds to this reference. * @param bins Array of bins represented by this content, possibly sparse * @param numberOfBins Number of non-null bins * @param metaData Extra information about the reference in this index * @param linearIndex Additional index used to optimize queries */ BAMIndexContent(final int referenceSequence, final Bin[] bins, final int numberOfBins, final BAMIndexMetaData metaData, final LinearIndex linearIndex) { this(referenceSequence, new BinList(bins, numberOfBins), metaData, linearIndex); } /** * @return the meta data chunks for this content */ public BAMIndexMetaData getMetaData() { return mMetaData; } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMIndexMetaData.java000066400000000000000000000230561263034757100235120ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.util.BlockCompressedFilePointerUtil; import java.io.File; import java.io.IOException; import java.util.List; /** * Metadata about the bam index contained within the bam index. * One instance created per index file. */ public class BAMIndexMetaData { // information for the entire index. // stored at the end of the index private long noCoordinateRecords = 0; // information for each reference. // stored in two chunks in bin # MAX_BINS private long firstOffset = -1; private long lastOffset = 0; private int alignedRecords = 0; private int unAlignedRecords = 0; // unmapped, but associated with this reference /** * Constructor used when writing an index * construct one instance for each index generated */ BAMIndexMetaData() { noCoordinateRecords = 0; newReference(); } /** * Constructor used when reading an index * construct one instance for each index generated */ BAMIndexMetaData(List chunkList) { noCoordinateRecords = 0; if (chunkList == null || chunkList.size() == 0) { // System.out.println("No metadata chunks"); } else if (chunkList.size() != 2) { throw new SAMException("Unexpected number of metadata chunks " + (chunkList.size())); } // fill in the first/lastOffset un/alignedRecords from this boolean firstChunk = true; if (chunkList != null) { for (Chunk c : chunkList) { long start = c.getChunkStart(); long end = c.getChunkEnd(); if (firstChunk) { firstOffset = start; lastOffset = end; firstChunk = false; 
} else { firstChunk = true; alignedRecords = (int) start; unAlignedRecords = (int) end; } } } } /** * @return the count of aligned records associated with this reference */ public int getAlignedRecordCount() { return alignedRecords; } /** * @return the count of unaligned records associated with this reference */ public int getUnalignedRecordCount() { return unAlignedRecords; } /** * Call for each new reference sequence encountered */ void newReference() { firstOffset = -1; lastOffset = 0; alignedRecords = 0; unAlignedRecords = 0; } /** * Extract relevant metaData from the record and its filePointer * Call only once per record in the file being indexed * * @param rec */ void recordMetaData(final SAMRecord rec) { final int alignmentStart = rec.getAlignmentStart(); if (alignmentStart == SAMRecord.NO_ALIGNMENT_START) { incrementNoCoordinateRecordCount(); return; } if (rec.getFileSource() == null) { throw new SAMException("BAM cannot be indexed without setting a fileSource for record " + rec); } final Chunk newChunk = ((BAMFileSpan) rec.getFileSource().getFilePointer()).getSingleChunk(); final long start = newChunk.getChunkStart(); final long end = newChunk.getChunkEnd(); if (rec.getReadUnmappedFlag()) { unAlignedRecords++; } else { alignedRecords++; } if (BlockCompressedFilePointerUtil.compare(start, firstOffset) < 1 || firstOffset == -1) { this.firstOffset = start; } if (BlockCompressedFilePointerUtil.compare(lastOffset, end) < 1) { this.lastOffset = end; } } /** * @param slice */ void recordMetaData(Slice slice) { final int alignmentStart = slice.alignmentStart; if (alignmentStart == SAMRecord.NO_ALIGNMENT_START) { incrementNoCoordinateRecordCount(); return; } final long start = slice.offset; final long end = slice.offset + 0; if (slice.alignmentSpan < 1) { unAlignedRecords += slice.nofRecords; } else { alignedRecords += slice.nofRecords; } if (BlockCompressedFilePointerUtil.compare(start, firstOffset) < 1 || firstOffset == -1) { this.firstOffset = start; } if 
(BlockCompressedFilePointerUtil.compare(lastOffset, end) < 1) { this.lastOffset = end; } } /** * Call whenever a reference with no coordinate information is encountered in the bam file */ void incrementNoCoordinateRecordCount() { noCoordinateRecords++; } /** * Set local variable. Normally noCoordinateRecord count accessed from AbstractBAMFileIndex when reading */ private void setNoCoordinateRecordCount(long count) { noCoordinateRecords = count; } /** * @return the count of records with no coordinate information in the bam file. * Not public, since only used by BAMIndexer when writing bam index. * Readers of bam index should use AbstractBAMFileIndex.getNoCoordinateRecordCount. */ long getNoCoordinateRecordCount() { return noCoordinateRecords; } /** * @return the first virtual file offset used by this reference */ long getFirstOffset() { return firstOffset; } /** * @return the last virtual file offset used by this reference */ long getLastOffset() { return lastOffset; } /** * Prints meta-data statistics from BAM index (.bai) file * Statistics include count of aligned and unaligned reads for each reference sequence * and a count of all records with no start coordinate */ static public void printIndexStats(final File inputBamFile) { try { final BAMFileReader bam = new BAMFileReader(inputBamFile, null, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory()); if (!bam.hasIndex()) { throw new SAMException("No index for bam file " + inputBamFile); } BAMIndexMetaData[] data = getIndexStats(bam); // read through all the bins of every reference. 
int nRefs = bam.getFileHeader().getSequenceDictionary().size(); for (int i = 0; i < nRefs; i++) { final SAMSequenceRecord seq = bam.getFileHeader().getSequence(i); if (seq == null) continue; final String sequenceName = seq.getSequenceName(); final int sequenceLength = seq.getSequenceLength(); System.out.print(sequenceName + ' ' + "length=\t" + sequenceLength); if (data[i] == null) { System.out.println(); continue; } System.out.println("\tAligned= " + data[i].getAlignedRecordCount() + "\tUnaligned= " + data[i].getUnalignedRecordCount()); } System.out.println("NoCoordinateCount= " + data[0].getNoCoordinateRecordCount()); } catch (IOException e) { throw new SAMException("Exception in getting index statistics", e); } } /** * Prints meta-data statistics from BAM index (.bai) file * Statistics include count of aligned and unaligned reads for each reference sequence * and a count of all records with no start coordinate */ static public BAMIndexMetaData[] getIndexStats(final BAMFileReader bam) { AbstractBAMFileIndex index = (AbstractBAMFileIndex) bam.getIndex(); // read through all the bins of every reference. int nRefs = index.getNumberOfReferences(); BAMIndexMetaData[] result = new BAMIndexMetaData[nRefs == 0 ? 
1 : nRefs]; for (int i = 0; i < nRefs; i++) { result[i] = index.getMetaData(i); } if (result[0] == null) { result[0] = new BAMIndexMetaData(); } final Long noCoordCount = index.getNoCoordinateCount(); if (noCoordCount != null) // null in old index files without metadata result[0].setNoCoordinateRecordCount(noCoordCount); return result; } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMIndexWriter.java000066400000000000000000000035101263034757100232770ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import java.io.Closeable; /** * A basic interface for writing BAM index files * * @author mborkan */ interface BAMIndexWriter extends Closeable { // note - only package visibility /** * Write the data for one alignments to one reference sequence * * @param content BAMIndexContent containing the information for one reference */ public void writeReference(final BAMIndexContent content); /** * Writes out the count of records without coordinates * * @param count */ public void writeNoCoordinateRecordCount(final Long count); /** * Any necessary processing at the end of the file */ public void close(); }htsjdk-2.0.1/src/java/htsjdk/samtools/BAMIndexer.java000066400000000000000000000261201263034757100224330ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sub-license, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.Log; import java.io.File; import java.io.OutputStream; /** * Class for both constructing BAM index content and writing it out. * There are two usage patterns: * 1) Building a bam index from an existing bam file * 2) Building a bam index while building the bam file * In both cases, processAlignment is called for each alignment record and * finish() is called at the end. */ public class BAMIndexer { // The number of references (chromosomes) in the BAM file private final int numReferences; // output written as binary, or (for debugging) as text private final BAMIndexWriter outputWriter; private int currentReference = 0; // content is built up from the input bam file using this private final BAMIndexBuilder indexBuilder; /** * @param output binary BAM Index (.bai) file * @param fileHeader header for the corresponding bam file */ public BAMIndexer(final File output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader.getSequenceDictionary()); outputWriter = new BinaryBAMIndexWriter(numReferences, output); } /** * Prepare to index a BAM. * * @param output Index will be written here. output will be closed when finish() method is called. * @param fileHeader header for the corresponding bam file. */ public BAMIndexer(final OutputStream output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader.getSequenceDictionary()); outputWriter = new BinaryBAMIndexWriter(numReferences, output); } /** * Record any index information for a given BAM record. * If this alignment starts a new reference, write out the old reference. * Requires a non-null value for rec.getFileSource(). 
* * @param rec The BAM record */ public void processAlignment(final SAMRecord rec) { try { final int reference = rec.getReferenceIndex(); if (reference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && reference != currentReference) { // process any completed references advanceToReference(reference); } indexBuilder.processAlignment(rec); } catch (final Exception e) { throw new SAMException("Exception creating BAM index for record " + rec, e); } } /** * After all the alignment records have been processed, finish is called. * Writes any final information and closes the output file. */ public void finish() { // process any remaining references advanceToReference(numReferences); outputWriter.writeNoCoordinateRecordCount(indexBuilder.getNoCoordinateRecordCount()); outputWriter.close(); } /** write out any references between the currentReference and the nextReference */ private void advanceToReference(final int nextReference) { while (currentReference < nextReference) { final BAMIndexContent content = indexBuilder.processReference(currentReference); outputWriter.writeReference(content); currentReference++; if (currentReference < numReferences) { indexBuilder.startNewReference(); } } } /** * Generates a BAM index file, either textual or binary, from an input BAI file. * Only used for testing, but located here for visibility into CachingBAMFileIndex. * * @param output BAM Index (.bai) file (or bai.txt file when text) * @param textOutput Whether to create text output or binary */ static public void createAndWriteIndex(final File input, final File output, final boolean textOutput) { // content is from an existing bai file. 
final CachingBAMFileIndex existingIndex = new CachingBAMFileIndex(input, null); final int n_ref = existingIndex.getNumberOfReferences(); final BAMIndexWriter outputWriter; if (textOutput) { outputWriter = new TextualBAMIndexWriter(n_ref, output); } else { outputWriter = new BinaryBAMIndexWriter(n_ref, output); } // write the content one reference at a time try { for (int i = 0; i < n_ref; i++) { outputWriter.writeReference(existingIndex.getQueryResults(i)); } outputWriter.writeNoCoordinateRecordCount(existingIndex.getNoCoordinateCount()); outputWriter.close(); } catch (final Exception e) { throw new SAMException("Exception creating BAM index", e); } } /** * Class for constructing BAM index files. * One instance is used to construct an entire index. * processAlignment is called for each alignment until a new reference is encountered, then * processReference is called when all records for the reference have been processed. */ private class BAMIndexBuilder { private final SAMSequenceDictionary sequenceDictionary; private BinningIndexBuilder binningIndexBuilder; private int currentReference = -1; // information in meta data private final BAMIndexMetaData indexStats = new BAMIndexMetaData(); BAMIndexBuilder(final SAMSequenceDictionary sequenceDictionary) { this.sequenceDictionary = sequenceDictionary; if (!sequenceDictionary.isEmpty()) startNewReference(); } /** * Record any index information for a given BAM record * * @param rec The BAM record. Requires rec.getFileSource() is non-null. 
*/ public void processAlignment(final SAMRecord rec) { // metadata indexStats.recordMetaData(rec); if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { return; // do nothing for records without coordinates, but count them } // various checks final int reference = rec.getReferenceIndex(); if (reference != currentReference) { throw new SAMException("Unexpected reference " + reference + " when constructing index for " + currentReference + " for record " + rec); } binningIndexBuilder.processFeature(new BinningIndexBuilder.FeatureToBeIndexed() { @Override public int getStart() { return rec.getAlignmentStart(); } @Override public int getEnd() { return rec.getAlignmentEnd(); } @Override public Integer getIndexingBin() { final Integer binNumber = rec.getIndexingBin(); return (binNumber == null ? rec.computeIndexingBin() : binNumber); } @Override public Chunk getChunk() { final SAMFileSource source = rec.getFileSource(); if (source == null) { throw new SAMException("No source (virtual file offsets); needed for indexing on BAM Record " + rec); } return ((BAMFileSpan) source.getFilePointer()).getSingleChunk(); } }); } /** * Creates the BAMIndexContent for this reference. * Requires all alignments of the reference have already been processed. * * @return Null if there are no features for this reference. 
*/ public BAMIndexContent processReference(final int reference) { if (reference != currentReference) { throw new SAMException("Unexpected reference " + reference + " when constructing index for " + currentReference); } final BinningIndexContent indexContent = binningIndexBuilder.generateIndexContent(); if (indexContent == null) return null; return new BAMIndexContent(indexContent.getReferenceSequence(), indexContent.getBins(), indexStats, indexContent.getLinearIndex()); } /** * @return the count of records with no coordinate positions */ public long getNoCoordinateRecordCount() { return indexStats.getNoCoordinateRecordCount(); } /** * reinitialize all data structures when the reference changes */ void startNewReference() { ++currentReference; // I'm not crazy about recycling this object, but that is the way it was originally written and // it helps keep track of no-coordinate read count (which shouldn't be stored in this class anyway). indexStats.newReference(); binningIndexBuilder = new BinningIndexBuilder(currentReference, sequenceDictionary.getSequence(currentReference).getSequenceLength()); } } /** * Generates a BAM index file from an input BAM file * * @param reader SAMFileReader for input BAM file * @param output File for output index file */ public static void createIndex(SamReader reader, File output) { createIndex(reader, output, null); } /** * Generates a BAM index file from an input BAM file * * @param reader SAMFileReader for input BAM file * @param output File for output index file */ public static void createIndex(SamReader reader, File output, Log log) { BAMIndexer indexer = new BAMIndexer(output, reader.getFileHeader()); long totalRecords = 0; // create and write the content for (SAMRecord rec : reader) { if (++totalRecords % 1000000 == 0) { if (null != log) log.info(totalRecords + " reads processed ..."); } indexer.processAlignment(rec); } indexer.finish(); } 
}htsjdk-2.0.1/src/java/htsjdk/samtools/BAMRecord.java000066400000000000000000000302421263034757100222530ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.StringUtil; import java.nio.ByteBuffer; import java.nio.ByteOrder; /** * Wrapper class for binary BAM records. * Delays unpacking all data binary until requested. */ public class BAMRecord extends SAMRecord { /** * Offset of the read name in the variable length section of the disk representation of BAMRecord */ private static final int READ_NAME_OFFSET = 0; /** * Variable-length part of BAMRecord. Lazily decoded. */ private byte[] mRestOfBinaryData = null; // Various lengths are stored, because they are in the fixed-length part of the BAMRecord, and it is // more efficient to remember them than decode the element they store the length of. 
// The length becomes invalid if the element is changed with a set() method. private int mReadLength = 0; private boolean mReadLengthValid = true; private final short mReadNameLength; private boolean mReadNameLengthValid = true; private final int mCigarLength; private boolean mCigarLengthValid = true; // Whether or not the getter needs to decode the corresponding element. // For all the other variable length elements, null == not yet decoded. private boolean mAttributesDecoded = false; private boolean mCigarDecoded = false; /** * If any of the properties set from mRestOfBinaryData have been overridden by calls to setters, * this is set to true, indicating that mRestOfBinaryData cannot be used to write this record to disk. */ private boolean mBinaryDataStale; /** * Create a new BAM Record. If the reference sequence index or mate reference sequence index are any value other * than NO_ALIGNMENT_REFERENCE_INDEX (-1), then the specified index values must exist in the sequence dictionary * in the header argument. */ protected BAMRecord(final SAMFileHeader header, final int referenceID, final int coordinate, final short readNameLength, final short mappingQuality, final int indexingBin, final int cigarLen, final int flags, final int readLen, final int mateReferenceID, final int mateCoordinate, final int insertSize, final byte[] restOfData) { super(header); setReferenceIndex(referenceID); setAlignmentStart(coordinate); mReadNameLength = readNameLength; setMappingQuality(mappingQuality); mCigarLength = cigarLen; setFlags(flags); mReadLength = readLen; setMateReferenceIndex(mateReferenceID); setMateAlignmentStart(mateCoordinate); setInferredInsertSize(insertSize); mRestOfBinaryData = restOfData; // Set these to null in order to mark them as being candidates for lazy initialization. // If this is not done, they will have non-null defaults. 
super.setReadName(null); super.setCigarString(null); super.setReadBases(null); super.setBaseQualities(null); // Do this after the above because setCigarString will clear it. setIndexingBin(indexingBin); // Mark the binary block as being valid for writing back out to disk mBinaryDataStale = false; } /** * Force all the lazily-initialized attributes to be decoded. */ protected void eagerDecode() { getReadName(); getCigar(); getReadBases(); getBaseQualities(); getBinaryAttributes(); super.eagerDecode(); mRestOfBinaryData = null; } /** * If this record has a valid binary representation of the variable-length portion of a binary record stored, * return that byte array, otherwise return null. This will never be true for SAMRecords. It will be true * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length * portion has been changed. */ @Override public byte[] getVariableBinaryRepresentation() { if (mBinaryDataStale) { return null; } // This may have been set to null by eagerDecode() return mRestOfBinaryData; } /** * Depending on the concrete implementation, the binary file size of attributes may be known without * computing them all. * * @return binary file size of attribute, if known, else -1. 
*/ @Override public int getAttributesBinarySize() { if (mBinaryDataStale || mRestOfBinaryData == null) { return -1; } final int tagsOffset = readNameSize() + cigarSize() + basesSize() + qualsSize(); return mRestOfBinaryData.length - tagsOffset; } @Override public void setReadName(final String value) { super.setReadName(value); mBinaryDataStale = true; mReadNameLengthValid = false; } @Override public void setCigar(final Cigar cigar) { super.setCigar(cigar); mBinaryDataStale = true; mCigarLengthValid = false; mCigarDecoded = true; } @Override public void setCigarString(final String value) { super.setCigarString(value); mBinaryDataStale = true; mCigarLengthValid = false; mCigarDecoded = true; } @Override public void setReadBases(final byte[] value) { super.setReadBases(value); mBinaryDataStale = true; mReadLengthValid = false; } @Override public void setBaseQualities(final byte[] value) { super.setBaseQualities(value); mBinaryDataStale = true; } @Override protected void setAttribute(final short tag, final Object value, final boolean isUnsignedArray) { // populate all the attributes from the binary block before overwriting one getBinaryAttributes(); super.setAttribute(tag, value, isUnsignedArray); mBinaryDataStale = true; } /** * Removes all attributes. */ @Override public void clearAttributes() { mAttributesDecoded = true; mBinaryDataStale = true; super.clearAttributes(); } /** * Avoids decoding binary block to get read length. */ @Override public int getReadLength() { if (mReadLengthValid) { return mReadLength; } return super.getReadLength(); } @Override public String getReadName() { String result = super.getReadName(); if (mRestOfBinaryData != null && result == null) { result = decodeReadName(); super.setReadName(result); } return result; } /** * Avoids decoding read name to get read name length. Do not include null terminator. 
*/ @Override public int getReadNameLength() { if (mReadNameLengthValid) { return mReadNameLength - 1; } return super.getReadNameLength(); } @Override public Cigar getCigar() { if (mRestOfBinaryData != null && !mCigarDecoded) { final int cigarOffset = readNameSize(); final ByteBuffer byteBuffer = ByteBuffer.wrap(mRestOfBinaryData, cigarOffset, cigarSize()); byteBuffer.order(ByteOrder.LITTLE_ENDIAN); super.initializeCigar(BinaryCigarCodec.decode(byteBuffer)); mCigarDecoded = true; if (null != getHeader() && getValidationStringency() != ValidationStringency.SILENT && !this.getReadUnmappedFlag()) { // Don't know line number, and don't want to force read name to be decoded. SAMUtils.processValidationErrors(validateCigar(-1L), -1, getValidationStringency()); } } return super.getCigar(); } /** * Avoids decoding CIGAR in order to get length. */ @Override public int getCigarLength() { if (mCigarLengthValid) { return mCigarLength; } else { return super.getCigarLength(); } } @Override public byte[] getReadBases() { byte[] result = super.getReadBases(); if (mRestOfBinaryData != null && result == null) { result = decodeReadBases(); super.setReadBases(result); } return result; } @Override public byte[] getBaseQualities() { byte[] ret = super.getBaseQualities(); if (mRestOfBinaryData != null && ret == null) { ret = decodeBaseQualities(); super.setBaseQualities(ret); } return ret; } @Override public Object getAttribute(final short tag) { if (!mAttributesDecoded) { decodeAttributes(); } return super.getAttribute(tag); } @Override protected SAMBinaryTagAndValue getBinaryAttributes() { if (!mAttributesDecoded) { decodeAttributes(); } return super.getBinaryAttributes(); } private void decodeAttributes() { if (mAttributesDecoded) { return; } mAttributesDecoded = true; final int tagsOffset = readNameSize() + cigarSize() + basesSize() + qualsSize(); final int tagsSize = mRestOfBinaryData.length - tagsOffset; final SAMBinaryTagAndValue attributes = 
BinaryTagCodec.readTags(mRestOfBinaryData, tagsOffset, tagsSize, getValidationStringency()); setAttributes(attributes); } private byte[] decodeBaseQualities() { if (mReadLength == 0) { return SAMRecord.NULL_QUALS; } final int qualsOffset = readNameSize() + cigarSize() + basesSize(); final byte[] ret = new byte[qualsSize()]; System.arraycopy(mRestOfBinaryData, qualsOffset, ret, 0, qualsSize()); if (ret.length > 0 && ret[0] == (byte) 0xFF) { // BAM files store missing qualities as an array of 0xFF bytes. // 0xFF is an illegal quality score value (it cannot be encoded in SAM) // and so the first byte is a suitable marker. // We hide this quirk of the BAM encoding so that the BAM interface looks the same as SAM. return NULL_QUALS; } return ret; } private String decodeReadName() { // Don't include terminating null return StringUtil.bytesToString(mRestOfBinaryData, READ_NAME_OFFSET, mReadNameLength-1); } private byte[] decodeReadBases() { if (mReadLength == 0) { return NULL_SEQUENCE; } final int basesOffset = readNameSize() + cigarSize(); return SAMUtils.compressedBasesToBytes(mReadLength, mRestOfBinaryData, basesOffset); } /* methods for computing disk size of variably-sized elements, in order to locate * elements in mRestOfBinaryData */ private int readNameSize() { return mReadNameLength; } private int cigarSize() { return mCigarLength * 4; } private int basesSize() { return (mReadLength + 1)/2; } private int qualsSize() { return mReadLength; } } htsjdk-2.0.1/src/java/htsjdk/samtools/BAMRecordCodec.java000066400000000000000000000224251263034757100232150ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the 
Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.RuntimeEOFException; import htsjdk.samtools.util.SortingCollection; import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; /** * Class for translating between in-memory and disk representation of BAMRecord. */ public class BAMRecordCodec implements SortingCollection.Codec { private final SAMFileHeader header; private final BinaryCodec binaryCodec = new BinaryCodec(); private final BinaryTagCodec binaryTagCodec = new BinaryTagCodec(binaryCodec); private final SAMRecordFactory samRecordFactory; public BAMRecordCodec(final SAMFileHeader header) { this(header, new DefaultSAMRecordFactory()); } public BAMRecordCodec(final SAMFileHeader header, final SAMRecordFactory factory) { this.header = header; this.samRecordFactory = factory; } public BAMRecordCodec clone() { // Do not clone the references to codecs, as they must be distinct for each instance. return new BAMRecordCodec(this.header, this.samRecordFactory); } /** Sets the output stream that records will be written to. */ public void setOutputStream(final OutputStream os) { this.binaryCodec.setOutputStream(os); } /** Sets the output stream that records will be written to. 
*/ public void setOutputStream(final OutputStream os, final String filename) { this.binaryCodec.setOutputStream(os); this.binaryCodec.setOutputFileName(filename); } /** Sets the input stream that records will be read from. */ public void setInputStream(final InputStream is) { this.binaryCodec.setInputStream(is); } /** Sets the input stream that records will be read from. */ public void setInputStream(final InputStream is, final String filename) { this.binaryCodec.setInputStream(is); this.binaryCodec.setInputFileName(filename); } /** * Write object to OutputStream. * Reference and mate reference indices must be resolvable, which either means that these have been set into the * SAMRecord directly, or the SAMRecord must have a header assigned into it so that reference names can be * resolved into indices. * * @param alignment Record to be written. */ public void encode(final SAMRecord alignment) { // Compute block size, as it is the first element of the file representation of SAMRecord final int readLength = alignment.getReadLength(); final int cigarLength = alignment.getCigarLength(); int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated cigarLength * 4 + (readLength + 1) / 2 + // 2 bases per byte, round up readLength; final int attributesSize = alignment.getAttributesBinarySize(); if (attributesSize != -1) { // binary attribute size already known, don't need to compute. 
blockSize += attributesSize; } else { SAMBinaryTagAndValue attribute = alignment.getBinaryAttributes(); while (attribute != null) { blockSize += (BinaryTagCodec.getTagSize(attribute.value)); attribute = attribute.getNext(); } } int indexBin = 0; if (alignment.getReferenceIndex() >= 0) { if (alignment.getIndexingBin() != null) { indexBin = alignment.getIndexingBin(); } else { indexBin = alignment.computeIndexingBin(); } } // Blurt out the elements this.binaryCodec.writeInt(blockSize); this.binaryCodec.writeInt(alignment.getReferenceIndex()); // 0-based!! this.binaryCodec.writeInt(alignment.getAlignmentStart() - 1); this.binaryCodec.writeUByte((short)(alignment.getReadNameLength() + 1)); this.binaryCodec.writeUByte((short)alignment.getMappingQuality()); this.binaryCodec.writeUShort(indexBin); this.binaryCodec.writeUShort(cigarLength); this.binaryCodec.writeUShort(alignment.getFlags()); this.binaryCodec.writeInt(alignment.getReadLength()); this.binaryCodec.writeInt(alignment.getMateReferenceIndex()); this.binaryCodec.writeInt(alignment.getMateAlignmentStart() - 1); this.binaryCodec.writeInt(alignment.getInferredInsertSize()); final byte[] variableLengthBinaryBlock = alignment.getVariableBinaryRepresentation(); if (variableLengthBinaryBlock != null) { // Don't need to encode variable-length block, because it is unchanged from // when the record was read from a BAM file. 
this.binaryCodec.writeBytes(variableLengthBinaryBlock); } else { if (alignment.getReadLength() != alignment.getBaseQualities().length && alignment.getBaseQualities().length != 0) { throw new RuntimeException("Mismatch between read length and quals length writing read " + alignment.getReadName() + "; read length: " + alignment.getReadLength() + "; quals length: " + alignment.getBaseQualities().length); } this.binaryCodec.writeString(alignment.getReadName(), false, true); final int[] binaryCigar = BinaryCigarCodec.encode(alignment.getCigar()); for (final int cigarElement : binaryCigar) { // Assumption that this will fit into an integer, despite the fact // that it is specced as a uint. this.binaryCodec.writeInt(cigarElement); } this.binaryCodec.writeBytes(SAMUtils.bytesToCompressedBases(alignment.getReadBases())); byte[] qualities = alignment.getBaseQualities(); if (qualities.length == 0) { qualities = new byte[alignment.getReadLength()]; Arrays.fill(qualities, (byte) 0xFF); } this.binaryCodec.writeBytes(qualities); SAMBinaryTagAndValue attribute = alignment.getBinaryAttributes(); while (attribute != null) { this.binaryTagCodec.writeTag(attribute.tag, attribute.value, attribute.isUnsignedArray()); attribute = attribute.getNext(); } } } /** * Read the next record from the input stream and convert into a java object. * * @return null if no more records. Should throw exception if EOF is encountered in the middle of * a record. 
*/
    public SAMRecord decode() {
        int recordLength;
        try {
            recordLength = this.binaryCodec.readInt();
        } catch (RuntimeEOFException e) {
            // Hitting end-of-file while reading the length field is reported as "no more records".
            return null;
        }

        if (recordLength < BAMFileConstants.FIXED_BLOCK_SIZE) {
            throw new SAMFormatException("Invalid record length: " + recordLength);
        }

        // Fixed-length fields, in the order they are read from the stream.
        final int referenceID = this.binaryCodec.readInt();
        final int coordinate = 1 + this.binaryCodec.readInt();
        final short readNameLength = this.binaryCodec.readUByte();
        final short mappingQuality = this.binaryCodec.readUByte();
        final int bin = this.binaryCodec.readUShort();
        final int cigarLen = this.binaryCodec.readUShort();
        final int flags = this.binaryCodec.readUShort();
        final int readLen = this.binaryCodec.readInt();
        final int mateReferenceID = this.binaryCodec.readInt();
        final int mateCoordinate = 1 + this.binaryCodec.readInt();
        final int insertSize = this.binaryCodec.readInt();

        // Everything after the fixed-length fields is handed to the record undecoded.
        final int variableLength = recordLength - BAMFileConstants.FIXED_BLOCK_SIZE;
        final byte[] restOfRecord = new byte[variableLength];
        this.binaryCodec.readBytes(restOfRecord);

        final BAMRecord record = this.samRecordFactory.createBAMRecord(
                header, referenceID, coordinate, readNameLength, mappingQuality,
                bin, cigarLen, flags, readLen, mateReferenceID, mateCoordinate, insertSize, restOfRecord);

        // don't reset a null header as this will clobber the reference and mate reference indices
        if (header != null) {
            record.setHeader(header);
        }
        return record;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/BamFileIoUtils.java000066400000000000000000000214661263034757100233350ustar00rootroot00000000000000package htsjdk.samtools;

import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.Md5CalculatingOutputStream;
import htsjdk.samtools.util.RuntimeIOException;

import java.io.File;
import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.List; public class BamFileIoUtils { private static final Log LOG = Log.getInstance(BamFileIoUtils.class); public static final String BAM_FILE_EXTENSION = ".bam"; public static boolean isBamFile(final File file) { return ((file != null) && file.getName().endsWith(BAM_FILE_EXTENSION)); } public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile) { reheaderBamFile(samFileHeader, inputFile, outputFile, true, true); } /** * Copy a BAM file but replacing the header * * @param samFileHeader The header to use in the new file * @param inputFile The BAM file to copy, sans header * @param outputFile The new BAM file, constructed with the new header and the content from inputFile * @param createMd5 Whether or not to create an MD5 file for the new BAM * @param createIndex Whether or not to create an index file for the new BAM */ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) { IOUtil.assertFileIsReadable(inputFile); IOUtil.assertFileIsWritable(outputFile); try { BlockCompressedInputStream.assertNonDefectiveFile(inputFile); assertSortOrdersAreEqual(samFileHeader, inputFile); final OutputStream outputStream = buildOutputStream(outputFile, createMd5, createIndex); BAMFileWriter.writeHeader(outputStream, samFileHeader); blockCopyBamFile(inputFile, outputStream, true, false); CloserUtil.close(inputFile); outputStream.close(); } catch (final IOException ioe) { throw new RuntimeIOException(ioe); } } /** * Copy data from a BAM file to an OutputStream by directly copying the gzip blocks * * @param inputFile The file to be copied * @param outputStream The stream to write the copied data to * @param skipHeader If true, the header of the input file will not be copied to the output 
stream * @param skipTerminator If true, the terminator block of the input file will not be written to the output stream */ public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) { FileInputStream in = null; try { in = new FileInputStream(inputFile); // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile); if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) throw new SAMException(inputFile.getAbsolutePath() + " does not have a valid GZIP block at the end of the file."); if (skipHeader) { final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile); final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(inputFile); blockIn.seek(vOffsetOfFirstRecord); final long remainingInBlock = blockIn.available(); // If we found the end of the header then write the remainder of this block out as a // new gzip block and then break out of the while loop if (remainingInBlock >= 0) { final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, null); IOUtil.transferByStream(blockIn, blockOut, remainingInBlock); blockOut.flush(); // Don't close blockOut because closing underlying stream would break everything } long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer()); blockIn.close(); while (pos > 0) { pos -= in.skip(pos); } } // Copy remainder of input stream into output stream final long currentPos = in.getChannel().position(); final long length = inputFile.length(); final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ? 
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0; final long bytesToWrite = length - skipLast - currentPos; IOUtil.transferByStream(in, outputStream, bytesToWrite); } catch (final IOException ioe) { throw new RuntimeIOException(ioe); } finally { CloserUtil.close(in); } } /** * Assumes that all inputs and outputs are block compressed VCF files and copies them without decompressing and parsing * most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file * (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent * blocks (excluding a terminator block if present) are copied directly from input to output. */ public static void gatherWithBlockCopying(final List bams, final File output, final boolean createIndex, final boolean createMd5) { try { OutputStream out = new FileOutputStream(output); if (createMd5) out = new Md5CalculatingOutputStream(out, new File(output.getAbsolutePath() + ".md5")); File indexFile = null; if (createIndex) { indexFile = new File(output.getParentFile(), IOUtil.basename(output) + BAMIndex.BAMIndexSuffix); out = new StreamInflatingIndexingOutputStream(out, indexFile); } boolean isFirstFile = true; for (final File f : bams) { LOG.info(String.format("Block copying %s ...", f.getAbsolutePath())); blockCopyBamFile(f, out, !isFirstFile, true); isFirstFile = false; } // And lastly add the Terminator block and close up out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); out.close(); // It is possible that the modified time on the index file is ever so slightly older than the original BAM file // and this makes ValidateSamFile unhappy. 
if (createIndex && (output.lastModified() > indexFile.lastModified())) { final boolean success = indexFile.setLastModified(System.currentTimeMillis()); if (!success) { System.err.print(String.format("Index file is older than BAM file for %s and unable to resolve this", output.getAbsolutePath())); } } } catch (final IOException ioe) { throw new RuntimeIOException(ioe); } } private static OutputStream buildOutputStream(final File outputFile, final boolean createMd5, final boolean createIndex) throws IOException { OutputStream outputStream = new FileOutputStream(outputFile); if (createMd5) { outputStream = new Md5CalculatingOutputStream(outputStream, new File(outputFile.getAbsolutePath() + ".md5")); } if (createIndex) { outputStream = new StreamInflatingIndexingOutputStream(outputStream, new File(outputFile.getParentFile(), IOUtil.basename(outputFile) + BAMIndex.BAMIndexSuffix)); } return outputStream; } private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final File inputFile) throws IOException { final SamReader reader = SamReaderFactory.makeDefault().open(inputFile); final SAMFileHeader origHeader = reader.getFileHeader(); final SAMFileHeader.SortOrder newSortOrder = newHeader.getSortOrder(); if (newSortOrder != SAMFileHeader.SortOrder.unsorted && newSortOrder != origHeader.getSortOrder()) { throw new SAMException("Sort order of new header does not match the original file, needs to be " + origHeader.getSortOrder()); } reader.close(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/BamIndexValidator.java000066400000000000000000000103301263034757100240460ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; /** * One crisp, informative sentence or noun phrase that explains * the concept modeled by the class. *

* This class is [not] thread safe [because it is immutable]. */ public class BamIndexValidator { public static int exhaustivelyTestIndex(final SamReader reader) { // throws Exception { // look at all chunk offsets in a linear index to make sure they are valid if (reader.indexing().hasBrowseableIndex()) { // content is from an existing bai file final CachingBAMFileIndex existingIndex = (CachingBAMFileIndex) reader.indexing().getBrowseableIndex(); // new CachingBAMFileIndex(inputBai, null); final int n_ref = existingIndex.getNumberOfReferences(); int chunkCount = 0; int indexCount = 0; for (int i = 0; i < n_ref; i++) { final BAMIndexContent content = existingIndex.getQueryResults(i); for (final Chunk c : content.getAllChunks()) { final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); chunkCount++; BAMRecord b = null; try { b = (BAMRecord) iter.next(); iter.close(); } catch (final Exception e) { throw new SAMException("Exception in BamIndexValidator. Last good record " + b + " in chunk " + c + " chunkCount=" + chunkCount, e); } } // also seek to every position in the linear index // final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(reader.getFileHeader()); // bamRecordCodec.setInputStream(reader.getInputStream()); LinearIndex linearIndex = content.getLinearIndex(); for (long l : linearIndex.getIndexEntries()) { try { if (l != 0) { final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(new Chunk(l, l + 1))); BAMRecord b = (BAMRecord) iter.next(); // read the first record identified by the linear index indexCount++; iter.close(); } } catch (Exception e) { throw new SAMException("Exception in BamIndexValidator. 
Linear index access failure " + l + " indexCount=" + indexCount, e); } } } return chunkCount; // System.out.println("Found " chunkCount + " chunks in test " + inputBai + // " linearIndex positions = " + indexCount); } // else not a bam file with a browseable index // System.err.println("No browseableIndex for reader"); return 0; } } htsjdk-2.0.1/src/java/htsjdk/samtools/Bin.java000066400000000000000000000117521263034757100212320ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * An individual bin in a BAM file. * * @author mhanna * @version 0.1 */ public class Bin implements Comparable { /** * The reference sequence associated with this bin. */ private final int referenceSequence; /** * The number of this bin within the BAM file. 
*/ private final int binNumber; /** * The chunks associated with this bin. */ private List chunkList; /** * The last chunk in the chunkList. Only maintained during index building, * not when reading existing index */ private Chunk lastChunk; public Bin(final int referenceSequence, final int binNumber) { this.referenceSequence = referenceSequence; this.binNumber = binNumber; } protected int getReferenceSequence() { return referenceSequence; } public int getBinNumber() { return binNumber; } /** * See whether two bins are equal. If the ref seq and the bin number * are equal, assume equality of the chunk list. * @param other The other Bin to which to compare this. * @return True if the two bins are equal. False otherwise. */ @Override public boolean equals(final Object other) { if(other == null) return false; if(!(other instanceof Bin)) return false; final Bin otherBin = (Bin)other; return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber; } /** * Compute a unique hash code for the given reference sequence and bin number. * @return A unique hash code. */ @Override public int hashCode() { return ((Integer)referenceSequence).hashCode() ^ ((Integer)binNumber).hashCode(); } /** * Returns whether the bin currently contains chunks. * @return True if the bin has chunks, false otherwise. */ public boolean containsChunks() { return chunkList != null; } /** * Compare two bins to see what ordering they should appear in. * @param other Other bin to which this bin should be compared. * @return -1 if this < other, 0 if this == other, 1 if this > other. */ public int compareTo(final Bin other) { if(other == null) throw new ClassCastException("Cannot compare to a null object"); // Check the reference sequences first. if(this.referenceSequence != other.referenceSequence) return referenceSequence - other.referenceSequence; // Then check the bin ordering. 
return binNumber - other.binNumber; } /** * Adds the first chunk to the bin */ public void addInitialChunk(final Chunk newChunk){ final List oldChunks = new ArrayList(); setChunkList(oldChunks); setLastChunk(newChunk); oldChunks.add(newChunk); } /** * Sets the chunks associated with this bin */ public void setChunkList(final List list){ chunkList = list; } /** * Gets the list of chunks associated with this bin. * @return the chunks in this bin. If no chunks are associated, an empty list will be returned. */ public List getChunkList(){ if(chunkList == null) return Collections.emptyList(); return chunkList; } /** * Optimization to keep lastChunk instead of iterating over all chunks repeatedly */ public void setLastChunk(final Chunk c){ lastChunk = c; } /** * Warning: Currently only valid during index building, not when reading existing index, * (AbstractBAMFileIndex.optimizeChunkList doesn't maintain this) * @return the last Chunk of the chunkList */ public Chunk getLastChunk(){ return lastChunk; } } htsjdk-2.0.1/src/java/htsjdk/samtools/BinList.java000066400000000000000000000075301263034757100220650ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.BitSet; import java.util.Iterator; import java.util.NoSuchElementException; /** * Provides a list of all bins which could exist in the BAM file. * Allows the user to iterate over all bins, selecting ones of interest * for later inspection. * * @author mhanna * @version 0.1 */ public class BinList implements Iterable { /** * The reference sequence relevant to this bin list. */ private final int referenceSequence; /** * For each sequence, which bins should be included in the BitSet. */ private final BitSet bins; /** * Create a new BinList over sequenceCount sequences, consisting of the given bins. * @param referenceSequence Reference sequence to which these bins are relevant. * @param bins The given bins to include. */ protected BinList(final int referenceSequence, final BitSet bins) { this.referenceSequence = referenceSequence; this.bins = bins; } /** * Gets an iterator over all selected bins. * @return An iterator over all selected bins. */ public Iterator iterator() { return new BinIterator(); } /** * Get the reference sequence to which this bin belongs. * @return Integer representing the reference sequence. */ protected int getReferenceSequence() { return referenceSequence; } /** * Retrieves the bins stored in this list. * @return A bitset where a bin is present in the list if the bit is true. */ protected BitSet getBins() { return bins; } private class BinIterator implements Iterator { /** * Stores the bin currently in use. Will be -1 if no more bins remain in the set. */ private int nextBin; public BinIterator() { // Initialize the bin iterator to just before the first bin. 
nextBin = bins.nextSetBit(0); } /** * Are there more bins in this set, waiting to be returned? * @return True if more bins are remaining. */ public boolean hasNext() { return nextBin >= 0; } /** * Gets the next bin in the provided BinList. * @return the next available bin in the BinList. */ public Bin next() { if(!hasNext()) throw new NoSuchElementException("This BinIterator is currently empty"); int currentBin = nextBin; nextBin = bins.nextSetBit(nextBin+1); return new Bin(referenceSequence,currentBin); } public void remove() { throw new UnsupportedOperationException("Unable to remove from a bin iterator"); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/BinaryBAMIndexWriter.java000066400000000000000000000151251263034757100244510ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.BinaryCodec; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.util.List; /** * Class for writing binary BAM index files */ class BinaryBAMIndexWriter implements BAMIndexWriter { protected final int nRef; private final BinaryCodec codec; private int count = 0; /** * constructor * * @param nRef Number of reference sequences * @param output BAM Index output file */ public BinaryBAMIndexWriter(final int nRef, final File output) { this.nRef = nRef; try { codec = new BinaryCodec(output, true); writeHeader(); } catch (final Exception e) { throw new SAMException("Exception opening output file " + output, e); } } /** * * @param nRef Number of reference sequences. * @param output BAM index output stream. This stream will be closed when BinaryBAMIndexWriter.close() is called. */ public BinaryBAMIndexWriter(final int nRef, final OutputStream output) { this.nRef = nRef; try { codec = new BinaryCodec(output); writeHeader(); } catch (final Exception e) { throw new SAMException("Exception opening output stream", e); } } /** * Write this content as binary output */ public void writeReference(final BAMIndexContent content) { if (content == null) { writeNullContent(); count++; return; } if (content.getReferenceSequence() != count){ throw new SAMException("Unexpectedly writing reference " + content.getReferenceSequence() + ", expecting reference " + count); } count ++; // write bins final BAMIndexContent.BinList bins = content.getBins(); final int size = bins == null ? 0 : content.getNumberOfNonNullBins(); if (size == 0) { writeNullContent(); return; } //final List chunks = content.getMetaData() == null ? null // : content.getMetaData().getMetaDataChunks(); final BAMIndexMetaData metaData = content.getMetaData(); codec.writeInt(size + ((metaData != null)? 
1 : 0 )); // codec.writeInt(size); for (final Bin bin : bins) { // note, bins will always be sorted if (bin.getBinNumber() == GenomicIndexUtil.MAX_BINS) continue; writeBin(bin); } // write metadata "bin" and chunks if (metaData != null) writeChunkMetaData(metaData); // write linear index final LinearIndex linearIndex = content.getLinearIndex(); final long[] entries = linearIndex == null ? null : linearIndex.getIndexEntries(); final int indexStart = linearIndex == null ? 0 : linearIndex.getIndexStart(); final int n_intv = entries == null ? indexStart : entries.length + indexStart; codec.writeInt(n_intv); if (entries == null) { return; } // since indexStart is usually 0, this is usually a no-op for (int i = 0; i < indexStart; i++) { codec.writeLong(0); } for (int k = 0; k < entries.length; k++) { codec.writeLong(entries[k]); } try { codec.getOutputStream().flush(); } catch (final IOException e) { throw new SAMException("IOException in BinaryBAMIndexWriter reference " + content.getReferenceSequence(), e); } } /** * Writes out the count of records without coordinates * * @param count */ public void writeNoCoordinateRecordCount(final Long count) { codec.writeLong(count == null ? 
0 : count); } /** * Any necessary processing at the end of the file */ public void close() { codec.close(); } private void writeBin(final Bin bin) { final int binNumber = bin.getBinNumber(); if (binNumber >= GenomicIndexUtil.MAX_BINS){ throw new SAMException("Unexpected bin number when writing bam index " + binNumber); } codec.writeInt(binNumber); if (bin.getChunkList() == null){ codec.writeInt(0); return; } final List chunkList = bin.getChunkList(); final int n_chunk = chunkList.size(); codec.writeInt(n_chunk); for (final Chunk c : chunkList) { codec.writeLong(c.getChunkStart()); codec.writeLong(c.getChunkEnd()); } } /** * Write the meta data represented by the chunkLists associated with bin MAX_BINS 37450 * * @param metaData information describing numAligned records, numUnAligned, etc */ private void writeChunkMetaData(final BAMIndexMetaData metaData) { codec.writeInt(GenomicIndexUtil.MAX_BINS); final int nChunk = 2; codec.writeInt(nChunk); codec.writeLong(metaData.getFirstOffset()); codec.writeLong(metaData.getLastOffset()); codec.writeLong(metaData.getAlignedRecordCount()); codec.writeLong(metaData.getUnalignedRecordCount()); } private void writeHeader() { // magic string final byte[] magic = BAMFileConstants.BAM_INDEX_MAGIC; codec.writeBytes(magic); codec.writeInt(nRef); } private void writeNullContent() { codec.writeLong(0); // 0 bins , 0 intv } } htsjdk-2.0.1/src/java/htsjdk/samtools/BinaryCigarCodec.java000066400000000000000000000067031263034757100236520ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to 
the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.nio.ByteBuffer; /** * Converter between disk and in-memory (object, not String) CIGAR representation. */ class BinaryCigarCodec { /** * Convert CIGAR from object representation to disk representation. * @return Array of unsigned ints, one for each element of CIGAR. */ static int[] encode(final Cigar cigar) { if (cigar.numCigarElements() == 0) { return new int[0]; } // Binary rep can be no longer than 1/2 of text rep // Although this is documented as uint, I think lengths will never get that long, // and it's a pain in Java. final int[] binaryCigar = new int[cigar.numCigarElements()]; int binaryCigarLength = 0; for (int i = 0; i < cigar.numCigarElements(); ++i) { final CigarElement cigarElement = cigar.getCigarElement(i); final int op = CigarOperator.enumToBinary(cigarElement.getOperator()); binaryCigar[binaryCigarLength++] = cigarElement.getLength() << 4 | op; } return binaryCigar; } /** * Convert CIGAR from disk representation to object. * @param binaryCigar ByteArray that is assumed to have byte order set appropriately for extracting ints. 
*/ static Cigar decode(final ByteBuffer binaryCigar) { final Cigar ret = new Cigar(); while (binaryCigar.hasRemaining()) { final int cigarette = binaryCigar.getInt(); ret.add(binaryCigarToCigarElement(cigarette)); } return ret; } /** * Convert CIGAR from disk representation to object. * @param binaryCigar Array of unsigned ints, one for each CIGAR element. */ static Cigar decode(final int[] binaryCigar) { final Cigar ret = new Cigar(); for (final int cigarette : binaryCigar) { ret.add(binaryCigarToCigarElement(cigarette)); } return ret; } /** * @param cigarette CIGAR element (operator + length) encoded as an unsigned int. * @return Object representation of the CIGAR element. */ private static CigarElement binaryCigarToCigarElement(final int cigarette) { final int binaryOp = cigarette & 0xf; final int length = cigarette >>> 4; return new CigarElement(length, CigarOperator.binaryToEnum(binaryOp)); } } htsjdk-2.0.1/src/java/htsjdk/samtools/BinaryTagCodec.java000066400000000000000000000410031263034757100233300ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.StringUtil; import java.lang.reflect.Array; import java.nio.ByteBuffer; import java.nio.ByteOrder; /** * Converter between disk and in-memory representation of a SAMRecord tag. */ public class BinaryTagCodec { // Size in bytes of the fixed part of the disk representation of a tag, // i.e. the number of bytes occupied by the tag name and tag type fields. private static final int FIXED_TAG_SIZE = 3; // Size in bytes of the fixed part of the value of a binary array, // i.e. the number of bytes occupied by the array type and the array length. private static final int FIXED_BINARY_ARRAY_TAG_SIZE = 5; // Integers are stored in the smallest size that will hold them. private static final long MAX_INT = Integer.MAX_VALUE; private static final long MAX_UINT = MAX_INT * 2 + 1; private static final long MAX_SHORT = Short.MAX_VALUE; private static final long MAX_USHORT = MAX_SHORT * 2 + 1; private static final long MAX_BYTE = Byte.MAX_VALUE; private static final long MAX_UBYTE = MAX_BYTE * 2 + 1; // Source or sink for disk representation. final BinaryCodec binaryCodec; /** * For writing tags. * For reading tags, a BinaryCodec is not used. See readTags() below. * @param binaryCodec where to write the file rep of the tags */ public BinaryTagCodec(final BinaryCodec binaryCodec) { this.binaryCodec = binaryCodec; } /** * @param attributeValue In-memory representation of a tag value. * @return Size in bytes to store the value on disk. 
*/ private static int getBinaryValueSize(final Object attributeValue) { switch (getTagValueType(attributeValue)) { case 'Z': return ((String)attributeValue).length() + 1; case 'A': return 1; case 'I': case 'i': return 4; case 's': case 'S': return 2; case 'c': case 'C': return 1; case 'f': return 4; case 'H': final byte[] byteArray = (byte[])attributeValue; return byteArray.length * 2 + 1; case 'B': final int numElements = Array.getLength(attributeValue); final int elementSize; if(attributeValue instanceof byte[]) { elementSize = 1; } else if(attributeValue instanceof short[]) { elementSize = 2; } else if(attributeValue instanceof int[]) { elementSize = 4; } else if(attributeValue instanceof float[]) { elementSize = 4; } else { throw new IllegalArgumentException("Unsupported array type: " + attributeValue.getClass()); } return numElements * elementSize + FIXED_BINARY_ARRAY_TAG_SIZE; default: throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + attributeValue.getClass().getName()); } } /** * @param value In-memory representation of a tag value. * @return Size in bytes to store the tag name, tag type and tag value on disk. */ static int getTagSize(final Object value) { return FIXED_TAG_SIZE + getBinaryValueSize(value); } /** * @param value In-memory representation of a tag value. * @return One-character disk representation of tag type. */ static char getTagValueType(final Object value) { if (value instanceof String) { return 'Z'; } else if (value instanceof Character) { return 'A'; } else if (value instanceof Float) { return 'f'; } else if (value instanceof Number) { if (!(value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof Long)) { throw new IllegalArgumentException("Unrecognized tag type " + value.getClass().getName()); } return getIntegerType(((Number)value).longValue()); } /* Note that H tag type is never written anymore, because B style is more compact. 
else if (value instanceof byte[]) { return 'H'; } */ else if (value instanceof byte[] || value instanceof short[] || value instanceof int[] || value instanceof float[]) { return 'B'; } else { throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + value.getClass().getName()); } } /** * @param val Integer tag value. * @return Tag type corresponding to the smallest integer type that will hold the given value. */ static private char getIntegerType(final long val) { if (val > MAX_UINT) { throw new IllegalArgumentException("Integer attribute value too large to be encoded in BAM"); } if (val > MAX_INT) { return 'I'; } if (val > MAX_USHORT) { return 'i'; } if (val > MAX_SHORT) { return 'S'; } if (val > MAX_UBYTE) { return 's'; } if (val > MAX_BYTE) { return 'C'; } if (val >= Byte.MIN_VALUE) { return 'c'; } if (val >= Short.MIN_VALUE) { return 's'; } if (val >= Integer.MIN_VALUE) { return 'i'; } throw new IllegalArgumentException("Integer attribute value too negative to be encoded in BAM"); } /** * Write the given tag name and value to disk. 
*/ public void writeTag(final short tag, final Object value, final boolean isUnsignedArray) { binaryCodec.writeShort(tag); final char tagValueType = getTagValueType(value); binaryCodec.writeByte(tagValueType); switch (tagValueType) { case 'Z': binaryCodec.writeString((String)value, false, true); break; case 'A': binaryCodec.writeByte(((Character)value)); break; case 'I': binaryCodec.writeUInt((Long)value); break; case 'i': binaryCodec.writeInt(((Number)value).intValue()); break; case 's': binaryCodec.writeShort(((Number)value).shortValue()); break; case 'S': binaryCodec.writeUShort(((Number)value).intValue()); break; case 'c': binaryCodec.writeByte(((Number)value).byteValue()); break; case 'C': binaryCodec.writeUByte(((Integer)value).shortValue()); break; case 'f': binaryCodec.writeFloat((Float)value); break; /* Writing H is no longer supported case 'H': final byte[] byteArray = (byte[])value; binaryCodec.writeString(StringUtil.bytesToHexString(byteArray), false, true); break; */ case 'B': writeArray(value, isUnsignedArray); break; default: throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + value.getClass().getName()); } } private void writeArray(final Object value, final boolean isUnsignedArray) { if (value instanceof byte[]) { binaryCodec.writeByte(isUnsignedArray? 'C': 'c'); final byte[] array = (byte[]) value; binaryCodec.writeInt(array.length); for (final byte element: array) binaryCodec.writeByte(element); } else if (value instanceof short[]) { binaryCodec.writeByte(isUnsignedArray? 'S': 's'); final short[] array = (short[]) value; binaryCodec.writeInt(array.length); for (final short element: array) binaryCodec.writeShort(element); } else if (value instanceof int[]) { binaryCodec.writeByte(isUnsignedArray? 
'I': 'i'); final int[] array = (int[]) value; binaryCodec.writeInt(array.length); for (final int element: array) binaryCodec.writeInt(element); } else if (value instanceof float[]) { binaryCodec.writeByte('f'); final float[] array = (float[]) value; binaryCodec.writeInt(array.length); for (final float element: array) binaryCodec.writeFloat(element); } else throw new SAMException("Unrecognized array value type: " + value.getClass()); } /** * Convert tags from little-endian disk representation to in-memory representation. * @param binaryRep Byte buffer containing file representation of tags. * @param offset Where in binaryRep tags start. * @param length How many bytes in binaryRep are tag storage. */ public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int offset, final int length, final ValidationStringency validationStringency) { final ByteBuffer byteBuffer = ByteBuffer.wrap(binaryRep, offset, length); byteBuffer.order(ByteOrder.LITTLE_ENDIAN); SAMBinaryTagAndValue head = null; SAMBinaryTagAndValue tail = null; while (byteBuffer.hasRemaining()) { final short tag = byteBuffer.getShort(); final byte tagType = byteBuffer.get(); final SAMBinaryTagAndValue tmp; if (tagType != 'B') { tmp = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, byteBuffer, validationStringency)); } else { final TagValueAndUnsignedArrayFlag valueAndFlag = readArray(byteBuffer, validationStringency); if (valueAndFlag.isUnsignedArray) tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, valueAndFlag.value); else tmp = new SAMBinaryTagAndValue(tag, valueAndFlag.value); } // If samjdk wrote the BAM then the attributes will be in lowest->highest tag order, to inserting at the // head each time will be very inefficient. To fix that we check here to see if the tag should go right on // the tail and if so stick it there, else insert it through the head. 
if (head == null) { head = tmp; tail = tmp; } else if (tmp.tag > tail.tag) { tail.insert(tmp); tail = tmp; } else { head = head.insert(tmp); } } return head; } /** * Read value of specified non-array type. * @param tagType What type to read. * @param byteBuffer Little-ending byte buffer to read value from. * @return Value in in-memory Object form. */ private static Object readSingleValue(final byte tagType, final ByteBuffer byteBuffer, final ValidationStringency validationStringency) { switch (tagType) { case 'Z': return readNullTerminatedString(byteBuffer); case 'A': return (char)byteBuffer.get(); case 'I': final long val = byteBuffer.getInt() & 0xffffffffL; if ( val <= Integer.MAX_VALUE ) { return (int)val; } // If it won't fit into a signed integer, but is within range for an unsigned 32-bit integer, // return it directly as a long if (! SAMUtils.isValidUnsignedIntegerAttribute(val)) { SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.TAG_VALUE_TOO_LARGE, "Unsigned integer is out of range for a 32-bit unsigned value: " + val, null), validationStringency); } return val; case 'i': return byteBuffer.getInt(); case 's': return (int)byteBuffer.getShort(); case 'S': // Convert to unsigned short stored in an int return byteBuffer.getShort() & 0xffff; case 'c': return (int)byteBuffer.get(); case 'C': // Convert to unsigned byte stored in an int return (int)byteBuffer.get() & 0xff; case 'f': return byteBuffer.getFloat(); case 'H': final String hexRep = readNullTerminatedString(byteBuffer); return StringUtil.hexStringToBytes(hexRep); default: throw new SAMFormatException("Unrecognized tag type: " + (char)tagType); } } /** * Read value of specified type. * @param byteBuffer Little-ending byte buffer to read value from. * @return CVO containing the value in in-memory Object form, and a flag indicating whether it is unsigned or not. 
*/ private static TagValueAndUnsignedArrayFlag readArray(final ByteBuffer byteBuffer, final ValidationStringency validationStringency) { final byte arrayType = byteBuffer.get(); final boolean isUnsigned = Character.isUpperCase(arrayType); final int length = byteBuffer.getInt(); final Object value; switch (Character.toLowerCase(arrayType)) { case 'c': { final byte[] array = new byte[length]; value = array; byteBuffer.get(array); break; } case 's': { final short[] array = new short[length]; value = array; for (int i = 0; i < length; ++i) { array[i] = byteBuffer.getShort(); } break; } case 'i': { final int[] array = new int[length]; value = array; for (int i = 0; i < length; ++i) { array[i] = byteBuffer.getInt(); } break; } case 'f': { final float[] array = new float[length]; value = array; for (int i = 0; i < length; ++i) { array[i] = byteBuffer.getFloat(); } break; } default: throw new SAMFormatException("Unrecognized tag array type: " + (char)arrayType); } return new TagValueAndUnsignedArrayFlag(value, isUnsigned); } private static String readNullTerminatedString(final ByteBuffer byteBuffer) { // Count the number of bytes in the string byteBuffer.mark(); final int startPosition = byteBuffer.position(); while (byteBuffer.get() != 0) {} final int endPosition = byteBuffer.position(); // Don't count null terminator final byte[] buf = new byte[endPosition - startPosition - 1]; // Go back to the start of the string and read out the bytes byteBuffer.reset(); byteBuffer.get(buf); // Skip over the null terminator byteBuffer.get(); return StringUtil.bytesToString(buf); } } htsjdk-2.0.1/src/java/htsjdk/samtools/BinningIndexBuilder.java000066400000000000000000000156701263034757100244100ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, 
including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.BlockCompressedFilePointerUtil; import java.util.List; import static htsjdk.samtools.GenomicIndexUtil.MAX_BINS; /** * Builder for a BinningIndexContent object. */ public class BinningIndexBuilder { private final int referenceSequence; // the bins for the current reference private final Bin[] bins; // made only as big as needed for each reference private int binsSeen = 0; // linear index for the current reference private final long[] index = new long[LinearIndex.MAX_LINEAR_INDEX_SIZE]; private int largestIndexSeen = -1; /** * * @param referenceSequence * @param sequenceLength 0 implies unknown length. Known length will reduce memory use. 
*/ public BinningIndexBuilder(final int referenceSequence, final int sequenceLength) { this.referenceSequence = referenceSequence; final int numBins; if (sequenceLength <= 0) numBins = MAX_BINS + 1; else numBins = AbstractBAMFileIndex.getMaxBinNumberForSequenceLength(sequenceLength) + 1; bins = new Bin[numBins]; } public BinningIndexBuilder(final int referenceSequence) { this(referenceSequence, 0); } /** * coordinates are 1-based, inclusive */ public interface FeatureToBeIndexed { public int getStart(); public int getEnd(); public Integer getIndexingBin(); public Chunk getChunk(); } public void processFeature(final FeatureToBeIndexed feature) { // process bins final Integer binNumber = feature.getIndexingBin(); final int binNum = binNumber == null ? computeIndexingBin(feature) : binNumber; // is there a bin already represented for this index? if not, add one final Bin bin; if (bins[binNum] != null) { bin = bins[binNum]; } else { bin = new Bin(referenceSequence, binNum); bins[binNum] = bin; binsSeen++; } // process chunks final Chunk newChunk = feature.getChunk(); final long chunkStart = newChunk.getChunkStart(); final long chunkEnd = newChunk.getChunkEnd(); final List oldChunks = bin.getChunkList(); if (!bin.containsChunks()) { bin.addInitialChunk(newChunk); } else { final Chunk lastChunk = bin.getLastChunk(); // Coalesce chunks that are in the same or adjacent file blocks. 
// Similar to AbstractBAMFileIndex.optimizeChunkList, // but no need to copy the list, no minimumOffset, and maintain bin.lastChunk if (BlockCompressedFilePointerUtil.areInSameOrAdjacentBlocks(lastChunk.getChunkEnd(), chunkStart)) { lastChunk.setChunkEnd(chunkEnd); // coalesced } else { oldChunks.add(newChunk); bin.setLastChunk(newChunk); } } // process linear index // the smallest file offset that appears in the 16k window for this bin final int featureEnd = feature.getEnd(); int startWindow = LinearIndex.convertToLinearIndexOffset(feature.getStart()); // the 16k window final int endWindow; if (featureEnd == GenomicIndexUtil.UNSET_GENOMIC_LOCATION) { // assume feature uses one position // Next line for C (samtools index) compatibility. Differs only when on a window boundary startWindow = LinearIndex.convertToLinearIndexOffset(feature.getStart() - 1); endWindow = startWindow; } else { endWindow = LinearIndex.convertToLinearIndexOffset(featureEnd); } if (endWindow > largestIndexSeen) { largestIndexSeen = endWindow; } // set linear index at every 16K window that this feature overlaps for (int win = startWindow; win <= endWindow; win++) { if (index[win] == 0 || chunkStart < index[win]) { index[win] = chunkStart; } } } /** * Creates the BAMIndexContent for this reference. * Requires all features of the reference have already been processed. */ public BinningIndexContent generateIndexContent() { // process bins if (binsSeen == 0) return null; // no bins for this reference // process chunks // nothing needed // process linear index // linear index will only be as long as the largest index seen final long[] newIndex = new long[largestIndexSeen + 1]; // in java1.6 Arrays.copyOf(index, largestIndexSeen + 1); // C (samtools index) also fills in intermediate 0's with values. 
This seems unnecessary, but safe long lastNonZeroOffset = 0; for (int i = 0; i <= largestIndexSeen; i++) { if (index[i] == 0) { index[i] = lastNonZeroOffset; // not necessary, but C (samtools index) does this // note, if you remove the above line BAMIndexWriterTest.compareTextual and compareBinary will have to change } else { lastNonZeroOffset = index[i]; } newIndex[i] = index[i]; } final LinearIndex linearIndex = new LinearIndex(referenceSequence, 0, newIndex); return new BinningIndexContent(referenceSequence, new BinningIndexContent.BinList(bins, binsSeen), linearIndex); } private int computeIndexingBin(final FeatureToBeIndexed feature) { // reg2bin has zero-based, half-open API final int start = feature.getStart()-1; int end = feature.getEnd(); if (end <= 0) { // If feature end cannot be determined (e.g. because a read is not really aligned), // then treat this as a one base feature for indexing purposes. end = start + 1; } return GenomicIndexUtil.reg2bin(start, end); } } htsjdk-2.0.1/src/java/htsjdk/samtools/BinningIndexContent.java000066400000000000000000000210711263034757100244240ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; /** * In-memory representation of the binning index for a single reference. BAM and Tabix are both binning indices * with slightly different disk formats but identical in-memory representations. */ public class BinningIndexContent { /** * The reference sequence for the data currently loaded. */ private final int mReferenceSequence; /** * A list of all bins in the above reference sequence. */ private final BinList mBinList; /** * The linear index for the reference sequence above. */ private final LinearIndex mLinearIndex; /** * @param referenceSequence Content corresponds to this reference. * @param binList Array of bins represented by this content, possibly sparse * @param linearIndex Additional index used to optimize queries */ public BinningIndexContent(final int referenceSequence, final BinList binList, final LinearIndex linearIndex) { this.mReferenceSequence = referenceSequence; this.mBinList = binList; this.mLinearIndex = linearIndex; } /** * Reference for this Content */ public int getReferenceSequence() { return mReferenceSequence; } /** * Does this content have anything in this bin? 
*/ public boolean containsBin(final Bin bin) { return mBinList.getBin(bin.getBinNumber()) != null; } /** * @return iterable list of bins represented by this content */ public BinList getBins() { return mBinList; } /** * @return the number of non-null bins represented by this content */ int getNumberOfNonNullBins() { return mBinList.getNumberOfNonNullBins(); } /** * @return all chunks associated with all bins in this content */ public List getAllChunks() { final List allChunks = new ArrayList(); for (final Bin b : mBinList) if (b.getChunkList() != null) { allChunks.addAll(b.getChunkList()); } return Collections.unmodifiableList(allChunks); } /** * @return the linear index represented by this content */ public LinearIndex getLinearIndex() { return mLinearIndex; } /** * * @param startPos 1-based, inclusive * @param endPos 1-based, inclusive * @return List of Chunks overlapping the given region. May return null if there are none. */ public List getChunksOverlapping(final int startPos, final int endPos) { final BitSet overlappingBins = GenomicIndexUtil.regionToBins(startPos,endPos); if (overlappingBins == null) return null; // System.out.println("# Sequence target TID: " + referenceIndex); final List chunkList = new ArrayList(); for (int index = overlappingBins.nextSetBit(0); index >= 0; index = overlappingBins.nextSetBit(index + 1)) { final Bin bin = getBins().getBin(index); if (bin != null) { for (final Chunk chunk : bin.getChunkList()) { chunkList.add(chunk.clone()); } } } if (chunkList.isEmpty()) { return null; } return Chunk.optimizeChunkList(chunkList, getLinearIndex().getMinimumOffset(startPos)); } /** * This class is used to encapsulate the list of Bins store in the BAMIndexContent * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure */ public static class BinList implements Iterable { private final Bin[] mBinArray; public final int numberOfNonNullBins; public final int maxBinNumber; // invariant: 
maxBinNumber = mBinArray.length -1 since array is 0 based /** * @param binArray a sparse array representation of the bins. The index into the array is the bin number. * @param numberOfNonNullBins */ public BinList(final Bin[] binArray, final int numberOfNonNullBins) { this.mBinArray = binArray; this.numberOfNonNullBins = numberOfNonNullBins; this.maxBinNumber = mBinArray.length - 1; } Bin getBin(final int binNumber) { if (binNumber > maxBinNumber) return null; return mBinArray[binNumber]; } int getNumberOfNonNullBins() { return numberOfNonNullBins; } /** * @return An iterator over all non-empty bins. */ public Iterator iterator() { return new BinIterator(); } private class BinIterator implements Iterator { /** * Stores the bin # of the Bin currently in use. */ private int nextBin; public BinIterator() { nextBin = 0; } /** * Are there more bins in this set, waiting to be returned? * * @return True if more bins are remaining. */ public boolean hasNext() { while (nextBin <= maxBinNumber) { if (getBin(nextBin) != null) return true; nextBin++; } return false; } /** * Gets the next bin in the provided BinList. * * @return the next available bin in the BinList. 
*/ public Bin next() { if (!hasNext()) throw new NoSuchElementException("This BinIterator is currently empty"); final Bin result = getBin(nextBin); nextBin++; return result; } public void remove() { throw new UnsupportedOperationException("Unable to remove from a bin iterator"); } } @Override public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final BinList bins = (BinList) o; if (maxBinNumber != bins.maxBinNumber) return false; if (numberOfNonNullBins != bins.numberOfNonNullBins) return false; if (!Arrays.equals(mBinArray, bins.mBinArray)) return false; return true; } @Override public int hashCode() { int result = Arrays.hashCode(mBinArray); result = 31 * result + numberOfNonNullBins; result = 31 * result + maxBinNumber; return result; } } @Override public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final BinningIndexContent that = (BinningIndexContent) o; if (mReferenceSequence != that.mReferenceSequence) return false; if (!mBinList.equals(that.mBinList)) return false; if (!mLinearIndex.equals(that.mLinearIndex)) return false; return true; } @Override public int hashCode() { int result = mReferenceSequence; result = 31 * result + mBinList.hashCode(); result = 31 * result + mLinearIndex.hashCode(); return result; } } htsjdk-2.0.1/src/java/htsjdk/samtools/BrowseableBAMIndex.java000066400000000000000000000035341263034757100241160ustar00rootroot00000000000000package htsjdk.samtools; /** * An index interface with additional functionality for querying and inspecting the structure of a BAM index. * * @author mhanna * @version 0.1 */ public interface BrowseableBAMIndex extends BAMIndex { /** * Gets the size (number of bins in) a given level of a BAM index. * @param levelNumber Level for which to inspect the size. * @return Size of the given level. 
*/
    public int getLevelSize(final int levelNumber);

    /**
     * Gets the level associated with the given bin number.
     * @param bin The bin for which to determine the level.
     * @return the level associated with the given bin number.
     */
    public int getLevelForBin(final Bin bin);

    /**
     * Gets the first locus that this bin can index into.
     * @param bin The bin to test.
     * @return The first position that the given bin can represent.
     */
    int getFirstLocusInBin(final Bin bin);

    /**
     * Gets the last locus that this bin can index into.
     * @param bin The bin to test.
     * @return The last position that the given bin can represent.
     */
    int getLastLocusInBin(final Bin bin);

    /**
     * Get a list of bins in the BAM file that may contain SAMRecords for the given range.
     * @param referenceIndex sequence of desired SAMRecords
     * @param startPos 1-based start of the desired interval, inclusive
     * @param endPos 1-based end of the desired interval, inclusive
     * @return a list of bins that contain relevant data.
     */
    BinList getBinsOverlapping(final int referenceIndex, final int startPos, final int endPos);

    /**
     * Perform an overlapping query of all bins bounding the given location.
     * @param bin The bin over which to perform an overlapping query.
     * @return The file pointers
     */
    BAMFileSpan getSpanOverlapping(final Bin bin);
}
htsjdk-2.0.1/src/java/htsjdk/samtools/CRAMFileReader.java000066400000000000000000000511061263034757100231640ustar00rootroot00000000000000/*******************************************************************************
 * Copyright 2013 EMBL-EBI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package htsjdk.samtools; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.SamReader.Type; import htsjdk.samtools.cram.CRAIIndex; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.cram.structure.Container; import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.RuntimeEOFException; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.Iterator; /** * {@link htsjdk.samtools.BAMFileReader BAMFileReader} analogue for CRAM files. * Supports random access using BAI index file formats. * * @author vadim */ @SuppressWarnings("UnusedDeclaration") public class CRAMFileReader extends SamReader.ReaderImplementation implements SamReader.Indexing { private File cramFile; private final ReferenceSource referenceSource; private InputStream inputStream; private CRAMIterator iterator; private BAMIndex mIndex; private File mIndexFile; private boolean mEnableIndexCaching; private boolean mEnableIndexMemoryMapping; private ValidationStringency validationStringency; /** * Open CRAM data for reading using either the file or the input stream * supplied in the arguments. The * {@link htsjdk.samtools.Defaults#REFERENCE_FASTA default} reference fasta * file will be used. 
* * @param cramFile CRAM file to open * @param inputStream CRAM stream to read */ public CRAMFileReader(final File cramFile, final InputStream inputStream) { this(cramFile, inputStream, new ReferenceSource(Defaults.REFERENCE_FASTA)); } /** * Open CRAM data for reading using either the file or the input stream * supplied in the arguments. * * @param cramFile CRAM file to read * @param inputStream index file to be used for random access * @param referenceSource a {@link htsjdk.samtools.cram.ref.ReferenceSource source} of * reference sequences */ public CRAMFileReader(final File cramFile, final InputStream inputStream, final ReferenceSource referenceSource) { if (cramFile == null && inputStream == null) throw new IllegalArgumentException( "Either file or input stream is required."); this.cramFile = cramFile; this.inputStream = inputStream; this.referenceSource = referenceSource; getIterator(); } /** * Open CRAM file for reading. If index file is supplied than random access * will be available. 
* * @param cramFile CRAM file to read * @param indexFile index file to be used for random access * @param referenceSource a {@link htsjdk.samtools.cram.ref.ReferenceSource source} of * reference sequences */ public CRAMFileReader(final File cramFile, final File indexFile, final ReferenceSource referenceSource) { if (cramFile == null) throw new IllegalArgumentException("File is required."); this.cramFile = cramFile; this.mIndexFile = indexFile; this.referenceSource = referenceSource; getIterator(); } public CRAMFileReader(final File cramFile, final ReferenceSource referenceSource) { if (cramFile == null && inputStream == null) throw new IllegalArgumentException( "Either file or input stream is required."); this.cramFile = cramFile; this.referenceSource = referenceSource; getIterator(); } public CRAMFileReader(final InputStream inputStream, final SeekableStream indexInputStream, final ReferenceSource referenceSource, final ValidationStringency validationStringency) throws IOException { this.inputStream = inputStream; this.referenceSource = referenceSource; this.validationStringency = validationStringency; iterator = new CRAMIterator(inputStream, referenceSource, validationStringency); if (indexInputStream != null) { try { mIndex = new CachingBAMFileIndex(indexInputStream, iterator.getSAMFileHeader().getSequenceDictionary()); } catch (Exception e) { // try CRAI instead: indexInputStream.seek(0); final SeekableStream baiStream = CRAIIndex.openCraiFileAsBaiStream(indexInputStream, iterator.getSAMFileHeader().getSequenceDictionary()); mIndex = new CachingBAMFileIndex(baiStream, iterator.getSAMFileHeader().getSequenceDictionary()); } } } public CRAMFileReader(final InputStream stream, final File indexFile, final ReferenceSource referenceSource, final ValidationStringency validationStringency) throws IOException { this(stream, indexFile == null ? 
null: new SeekableFileStream(indexFile), referenceSource, validationStringency); } public CRAMFileReader(final File cramFile, final File indexFile, final ReferenceSource referenceSource, final ValidationStringency validationStringency) throws IOException { this(new FileInputStream(cramFile), indexFile, referenceSource, validationStringency); this.cramFile = cramFile; } @Override void enableIndexCaching(final boolean enabled) { // relevant to BAI only mEnableIndexCaching = enabled; } @Override void enableIndexMemoryMapping(final boolean enabled) { // relevant to BAI only mEnableIndexMemoryMapping = enabled; } @Override void enableCrcChecking(final boolean enabled) { // inapplicable to CRAM: do nothing } @Override void setSAMRecordFactory(final SAMRecordFactory factory) { } @Override public boolean hasIndex() { return mIndex != null || mIndexFile != null; } @Override public BAMIndex getIndex() { if (!hasIndex()) throw new SAMException("No index is available for this BAM file."); if (mIndex == null) { final SAMSequenceDictionary dictionary = getFileHeader() .getSequenceDictionary(); if (mIndexFile.getName().endsWith(BAMIndex.BAMIndexSuffix)) { mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, dictionary, mEnableIndexMemoryMapping) : new DiskBasedBAMFileIndex(mIndexFile, dictionary, mEnableIndexMemoryMapping); return mIndex; } if (!mIndexFile.getName().endsWith(CRAIIndex.CRAI_INDEX_SUFFIX)) return null; // convert CRAI into BAI: final SeekableStream baiStream; try { baiStream = CRAIIndex.openCraiFileAsBaiStream(mIndexFile, iterator.getSAMFileHeader().getSequenceDictionary()); } catch (IOException e) { throw new RuntimeException(e); } mIndex = mEnableIndexCaching ? 
new CachingBAMFileIndex(baiStream, getFileHeader().getSequenceDictionary()) : new DiskBasedBAMFileIndex(baiStream, getFileHeader().getSequenceDictionary()); } return mIndex; } @Override public boolean hasBrowseableIndex() { return false; } @Override public BrowseableBAMIndex getBrowseableIndex() { return null; } @Override public SAMRecordIterator iterator(final SAMFileSpan fileSpan) { // get the file coordinates for the span: final long[] coordinateArray = ((BAMFileSpan) fileSpan).toCoordinateArray(); if (coordinateArray == null || coordinateArray.length == 0) return emptyIterator; try { // create an input stream that reads the source cram stream only within the coordinate pairs: final SeekableStream seekableStream = getSeekableStreamOrFailWithRTE(); return new CRAMIterator(seekableStream, referenceSource, coordinateArray, validationStringency); } catch (final IOException e) { throw new RuntimeException(e); } } @Override public SAMFileHeader getFileHeader() { return iterator.getSAMFileHeader(); } @Override public SAMRecordIterator getIterator() { if (iterator != null && cramFile == null) return iterator; try { final CRAMIterator newIterator; if (cramFile != null) { newIterator = new CRAMIterator(new FileInputStream(cramFile), referenceSource, validationStringency); } else newIterator = new CRAMIterator(inputStream, referenceSource, validationStringency); iterator = newIterator; return iterator; } catch (final Exception e) { throw new RuntimeException(e); } } @Override public CloseableIterator getIterator(final SAMFileSpan fileSpan) { return iterator(fileSpan); } @Override public SAMFileSpan getFilePointerSpanningReads() { return new BAMFileSpan(new Chunk(iterator.firstContainerOffset << 16, Long.MAX_VALUE)); } private static final SAMRecordIterator emptyIterator = new SAMRecordIterator() { @Override public boolean hasNext() { return false; } @Override public SAMRecord next() { throw new RuntimeException("No records."); } @Override public void remove() { throw new 
RuntimeException("Remove not supported."); } @Override public void close() { } @Override public SAMRecordIterator assertSorted(final SortOrder sortOrder) { return this; } }; @Override public CloseableIterator queryAlignmentStart(final String sequence, final int start) { long[] filePointers = null; // Hit the index to determine the chunk boundaries for the required data. final SAMFileHeader fileHeader = getFileHeader(); final int referenceIndex = fileHeader.getSequenceIndex(sequence); if (referenceIndex != -1) { final BAMIndex fileIndex = getIndex(); final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping( referenceIndex, start, -1); filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; } if (filePointers == null || filePointers.length == 0) return emptyIterator; Container container; final SeekableStream seekableStream = getSeekableStreamOrFailWithRTE(); for (int i = 0; i < filePointers.length; i += 2) { final long containerOffset = filePointers[i] >>> 16; try { if (seekableStream.position() != containerOffset || iterator.container == null) { seekableStream.seek(containerOffset); container = ContainerIO.readContainerHeader(iterator.getCramHeader().getVersion().major, seekableStream); if (container.alignmentStart + container.alignmentSpan > start) { seekableStream.seek(containerOffset); iterator.jumpWithinContainerToPos(fileHeader.getSequenceIndex(sequence), start); return new IntervalIterator(iterator, new QueryInterval(referenceIndex, start, -1)); } } else { container = iterator.container; if (container.alignmentStart + container.alignmentSpan > start) { iterator.jumpWithinContainerToPos(fileHeader.getSequenceIndex(sequence), start); return new IntervalIterator(iterator, new QueryInterval(referenceIndex, start, -1)); } } } catch (final IOException e) { throw new RuntimeException(e); } } return iterator; } CloseableIterator query(final int referenceIndex, final int start, final int end, final boolean overlap) throws IOException { long[] 
filePointers = null; // Hit the index to determine the chunk boundaries for the required data. if (referenceIndex != -1) { final BAMIndex fileIndex = getIndex(); final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping( referenceIndex, start, -1); filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; } if (filePointers == null || filePointers.length == 0) return emptyIterator; final CRAMIterator newIterator = new CRAMIterator(getSeekableStreamOrFailWithRTE(), referenceSource, filePointers, validationStringency); return new IntervalIterator(newIterator, new QueryInterval(referenceIndex, start, end), overlap); } @Override public CloseableIterator queryUnmapped() { final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); final SeekableStream seekableStream = getSeekableStreamOrFailWithRTE(); final CRAMIterator newIterator; try { seekableStream.seek(0); newIterator = new CRAMIterator(seekableStream, referenceSource, validationStringency); seekableStream.seek(startOfLastLinearBin >>> 16); final Container container = ContainerIO.readContainerHeader(newIterator.getCramHeader().getVersion().major, seekableStream); seekableStream.seek(seekableStream.position() + container.containerByteSize); iterator = newIterator; iterator.jumpWithinContainerToPos(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START); } catch (final IOException e) { throw new RuntimeEOFException(e); } return iterator; } private SeekableStream getSeekableStreamOrFailWithRTE() { SeekableStream seekableStream = null; if (cramFile != null) { try { seekableStream = new SeekableFileStream(cramFile); } catch (final FileNotFoundException e) { throw new RuntimeException(e); } } else if (inputStream instanceof SeekableStream) seekableStream = (SeekableStream) inputStream; return seekableStream; } @Override public void close() { CloserUtil.close(iterator); CloserUtil.close(inputStream); CloserUtil.close(mIndex); } @Override void setValidationStringency(final 
ValidationStringency validationStringency) { this.validationStringency = validationStringency; if (iterator != null) iterator.setValidationStringency(validationStringency); } @Override public ValidationStringency getValidationStringency() { return validationStringency; } @Override public CloseableIterator query(final QueryInterval[] intervals, final boolean contained) { return new MultiIntervalIterator(Arrays.asList(intervals).iterator(), !contained); } @Override public Type type() { return Type.CRAM_TYPE; } @Override void enableFileSource(final SamReader reader, final boolean enabled) { if (iterator != null) iterator.setFileSource(enabled ? reader : null); } private class MultiIntervalIterator implements SAMRecordIterator { private final Iterator queries; private CloseableIterator iterator; private final boolean overlap; public MultiIntervalIterator(final Iterator queries, final boolean overlap) { this.queries = queries; this.overlap = overlap; } @Override public SAMRecordIterator assertSorted(final SortOrder sortOrder) { return null; } @Override public void close() { } @Override public boolean hasNext() { if (iterator == null || !iterator.hasNext()) { if (!queries.hasNext()) return false; do { final QueryInterval query = queries.next(); try { iterator = query(query.referenceIndex, query.start, query.end, overlap); } catch (final IOException e) { throw new RuntimeException(e); } } while (!iterator.hasNext() && queries.hasNext()); } return iterator.hasNext(); } @Override public SAMRecord next() { return iterator.next(); } @Override public void remove() { iterator.remove(); } } public static class IntervalIterator implements SAMRecordIterator { private final CloseableIterator delegate; private final QueryInterval interval; private SAMRecord next; private boolean noMore = false; private final boolean overlap; public IntervalIterator(final CloseableIterator delegate, final QueryInterval interval) { this(delegate, interval, true); } public IntervalIterator(final 
CloseableIterator delegate, final QueryInterval interval, final boolean overlap) { this.delegate = delegate; this.interval = interval; this.overlap = overlap; } @Override public SAMRecordIterator assertSorted(final SortOrder sortOrder) { return null; } @Override public void close() { delegate.close(); } @Override public boolean hasNext() { if (next != null) return true; if (noMore) return false; while (delegate.hasNext()) { next = delegate.next(); if (isWithinTheInterval(next)) break; if (isBeyondTheInterval(next)) { next = null; noMore = true; return false; } next = null; } return next != null; } boolean isWithinTheInterval(final SAMRecord record) { final boolean refMatch = record.getReferenceIndex() == interval.referenceIndex; if (interval.start == -1) return refMatch; final int start = record.getAlignmentStart(); final int end = record.getAlignmentEnd(); if (overlap) { return CoordMath.overlaps(start, end, interval.start, interval.end < 0 ? Integer.MAX_VALUE : interval.end); } else { // contained: return CoordMath.encloses(interval.start, interval.end < 0 ? Integer.MAX_VALUE : interval.end, start, end); } } boolean isBeyondTheInterval(final SAMRecord record) { if (record.getReadUnmappedFlag()) return false; final boolean refMatch = record.getReferenceIndex() == interval.referenceIndex; return !refMatch || interval.end != -1 && record.getAlignmentStart() > interval.end; } @Override public SAMRecord next() { final SAMRecord result = next; next = null; return result; } @Override public void remove() { throw new RuntimeException("Not available."); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/CRAMFileWriter.java000066400000000000000000000515211263034757100232370ustar00rootroot00000000000000/******************************************************************************* * Copyright 2013 EMBL-EBI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package htsjdk.samtools; import htsjdk.samtools.cram.build.ContainerFactory; import htsjdk.samtools.cram.build.Cram2SamRecordFactory; import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.cram.build.CramNormalizer; import htsjdk.samtools.cram.build.Sam2CramRecordFactory; import htsjdk.samtools.cram.common.CramVersions; import htsjdk.samtools.cram.common.Version; import htsjdk.samtools.cram.lossy.PreservationPolicy; import htsjdk.samtools.cram.lossy.QualityScorePreservation; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.cram.ref.ReferenceTracks; import htsjdk.samtools.cram.structure.Container; import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.samtools.util.StringLineReader; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; @SuppressWarnings("UnusedDeclaration") public class CRAMFileWriter extends SAMFileWriterImpl { private static final int REF_SEQ_INDEX_NOT_INITIALIZED = -2; static int DEFAULT_RECORDS_PER_SLICE = 10000; private static final int DEFAULT_SLICES_PER_CONTAINER = 1; private static final Version cramVersion = CramVersions.CRAM_v2_1; private 
final String fileName;
    // records passed to shouldFlushContainer's size check; compared against containerSize
    private final List samRecords = new ArrayList();
    private ContainerFactory containerFactory;
    protected final int recordsPerSlice = DEFAULT_RECORDS_PER_SLICE;
    protected final int containerSize = recordsPerSlice * DEFAULT_SLICES_PER_CONTAINER;

    private final OutputStream outputStream;
    // reference source; replaced by the default reference in initCRAMWriter when null
    private ReferenceSource source;
    private int refSeqIndex = REF_SEQ_INDEX_NOT_INITIALIZED;

    private static final Log log = Log.getInstance(CRAMFileWriter.class);

    private final SAMFileHeader samFileHeader;
    private boolean preserveReadNames = true;
    private QualityScorePreservation preservation = null;
    private boolean captureAllTags = true;
    private Set captureTags = new TreeSet();
    private Set ignoreTags = new TreeSet();

    // created in initCRAMWriter only when an index output stream is supplied
    private CRAMIndexer indexer;
    private long offset;

    /**
     * Create a CRAMFileWriter on an output stream. Requires input records to be presorted to match the
     * sort order defined by the input {@code samFileHeader}.
     *
     * @param outputStream where to write the output.
     * @param source reference source
     * @param samFileHeader {@link SAMFileHeader} to be used. Sort order is determined by the sortOrder property of this arg.
     * @param fileName used for display in error messages
     */
    public CRAMFileWriter(
            final OutputStream outputStream,
            final ReferenceSource source,
            final SAMFileHeader samFileHeader,
            final String fileName)
    {
        this(outputStream, null, source, samFileHeader, fileName); // defaults to presorted == true
    }

    /**
     * Create a CRAMFileWriter and index on output streams. Requires input records to be presorted to match the
     * sort order defined by the input {@code samFileHeader}.
     *
     * @param outputStream where to write the output.
     * @param indexOS where to write the output index. Can be null if no index is required.
     * @param source reference source
     * @param samFileHeader {@link SAMFileHeader} to be used. Sort order is determined by the sortOrder property of this arg.
     * @param fileName used for display in error messages
     */
    public CRAMFileWriter(
            final OutputStream outputStream,
            final OutputStream indexOS,
            final ReferenceSource source,
            final SAMFileHeader samFileHeader,
            final String fileName)
    {
        this(outputStream, indexOS, true, source, samFileHeader, fileName); // defaults to presorted==true
    }

    /**
     * Create a CRAMFileWriter and index on output streams.
     *
     * @param outputStream where to write the output.
     * @param indexOS where to write the output index. Can be null if no index is required.
     * @param presorted if true records written to this writer must already be sorted in the order specified by the header
     * @param source reference source
     * @param samFileHeader {@link SAMFileHeader} to be used. Sort order is determined by the sortOrder property of this arg.
     * @param fileName used for display in error messages
     */
    public CRAMFileWriter(final OutputStream outputStream, final OutputStream indexOS, final boolean presorted,
                          final ReferenceSource source, final SAMFileHeader samFileHeader, final String fileName) {
        this.outputStream = outputStream;
        this.samFileHeader = samFileHeader;
        this.fileName = fileName;
        initCRAMWriter(indexOS, source, samFileHeader, presorted);
    }

    /**
     * Shared constructor tail: stores the reference source, passes the sort order and header to
     * setSortOrder/setHeader, falls back to the default reference when {@code source} is null,
     * creates the container factory and, when {@code indexOS} is not null, the indexer.
     *
     * @param indexOS       index output stream; may be null, in which case no indexer is created
     * @param source        reference source; may be null
     * @param samFileHeader header whose sort order is passed to setSortOrder
     * @param preSorted     passed to setSortOrder together with the header's sort order
     */
    private void initCRAMWriter(final OutputStream indexOS, final ReferenceSource source, final SAMFileHeader samFileHeader, final boolean preSorted) {
        this.source = source;
        setSortOrder(samFileHeader.getSortOrder(), preSorted);
        setHeader(samFileHeader);

        if (this.source == null) {
            this.source = new ReferenceSource(Defaults.REFERENCE_FASTA);
        }

        containerFactory = new ContainerFactory(samFileHeader, recordsPerSlice);
        if (indexOS != null) {
            indexer = new CRAMIndexer(indexOS, samFileHeader);
        }
    }

    /**
     * Decide if the current container should be completed and flushed. The decision is based on a) number of records and b) if the
     * reference sequence id has changed.
* * @param nextRecord the record to be added into the current or next container * @return true if the current container should be flushed and the following records should go into a new container; false otherwise. */ protected boolean shouldFlushContainer(final SAMRecord nextRecord) { return samRecords.size() >= containerSize || refSeqIndex != REF_SEQ_INDEX_NOT_INITIALIZED && refSeqIndex != nextRecord.getReferenceIndex(); } private static void updateTracks(final List samRecords, final ReferenceTracks tracks) { for (final SAMRecord samRecord : samRecords) { if (samRecord.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { int refPos = samRecord.getAlignmentStart(); int readPos = 0; for (final CigarElement cigarElement : samRecord.getCigar().getCigarElements()) { if (cigarElement.getOperator().consumesReferenceBases()) { for (int elementIndex = 0; elementIndex < cigarElement.getLength(); elementIndex++) tracks.addCoverage(refPos + elementIndex, 1); } switch (cigarElement.getOperator()) { case M: case X: case EQ: for (int pos = readPos; pos < cigarElement.getLength(); pos++) { final byte readBase = samRecord.getReadBases()[readPos + pos]; final byte refBase = tracks.baseAt(refPos + pos); if (readBase != refBase) tracks.addMismatches(refPos + pos, 1); } break; default: break; } readPos += cigarElement.getOperator().consumesReadBases() ? cigarElement.getLength() : 0; refPos += cigarElement.getOperator().consumesReferenceBases() ? cigarElement.getLength() : 0; } } } } /** * Complete the current container and flush it to the output stream. 
* * @throws IllegalArgumentException * @throws IllegalAccessException * @throws IOException */ protected void flushContainer() throws IllegalArgumentException, IllegalAccessException, IOException { final byte[] refs; String refSeqName = null; if (refSeqIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) refs = new byte[0]; else { final SAMSequenceRecord sequence = samFileHeader.getSequence(refSeqIndex); refs = source.getReferenceBases(sequence, true); refSeqName = sequence.getSequenceName(); } int start = SAMRecord.NO_ALIGNMENT_START; int stop = SAMRecord.NO_ALIGNMENT_START; for (final SAMRecord r : samRecords) { if (r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) continue; if (start == SAMRecord.NO_ALIGNMENT_START) start = r.getAlignmentStart(); start = Math.min(r.getAlignmentStart(), start); stop = Math.max(r.getAlignmentEnd(), stop); } ReferenceTracks tracks = null; if (preservation != null && preservation.areReferenceTracksRequired()) { tracks = new ReferenceTracks(refSeqIndex, refSeqName, refs); tracks.ensureRange(start, stop - start + 1); updateTracks(samRecords, tracks); } final List cramRecords = new ArrayList(samRecords.size()); final Sam2CramRecordFactory sam2CramRecordFactory = new Sam2CramRecordFactory(refs, samFileHeader, cramVersion); sam2CramRecordFactory.preserveReadNames = preserveReadNames; sam2CramRecordFactory.captureAllTags = captureAllTags; sam2CramRecordFactory.captureTags.addAll(captureTags); sam2CramRecordFactory.ignoreTags.addAll(ignoreTags); containerFactory.setPreserveReadNames(preserveReadNames); int index = 0; int prevAlStart = start; for (final SAMRecord samRecord : samRecords) { final CramCompressionRecord cramRecord = sam2CramRecordFactory.createCramRecord(samRecord); cramRecord.index = ++index; cramRecord.alignmentDelta = samRecord.getAlignmentStart() - prevAlStart; cramRecord.alignmentStart = samRecord.getAlignmentStart(); prevAlStart = samRecord.getAlignmentStart(); cramRecords.add(cramRecord); if (preservation != null) 
preservation.addQualityScores(samRecord, cramRecord, tracks); else if (cramRecord.qualityScores != SAMRecord.NULL_QUALS) cramRecord.setForcePreserveQualityScores(true); } if (sam2CramRecordFactory.getBaseCount() < 3 * sam2CramRecordFactory.getFeatureCount()) log.warn("Abnormally high number of mismatches, possibly wrong reference."); { if (samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate) { // mating: final Map primaryMateMap = new TreeMap(); final Map secondaryMateMap = new TreeMap(); for (final CramCompressionRecord r : cramRecords) { if (!r.isMultiFragment()) { r.setDetached(true); r.setHasMateDownStream(false); r.recordsToNextFragment = -1; r.next = null; r.previous = null; } else { final String name = r.readName; final Map mateMap = r.isSecondaryAlignment() ? secondaryMateMap : primaryMateMap; final CramCompressionRecord mate = mateMap.get(name); if (mate == null) { mateMap.put(name, r); } else { CramCompressionRecord prev = mate; while (prev.next != null) prev = prev.next; prev.recordsToNextFragment = r.index - prev.index - 1; prev.next = r; r.previous = prev; r.previous.setHasMateDownStream(true); r.setHasMateDownStream(false); r.setDetached(false); r.previous.setDetached(false); } } } // mark unpredictable reads as detached: for (final CramCompressionRecord cramRecord : cramRecords) { if (cramRecord.next == null || cramRecord.previous != null) continue; CramCompressionRecord last = cramRecord; while (last.next != null) last = last.next; if (cramRecord.isFirstSegment() && last.isLastSegment()) { final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last); if (cramRecord.templateSize == templateLength) { last = cramRecord.next; while (last.next != null) { if (last.templateSize != -templateLength) break; last = last.next; } if (last.templateSize != -templateLength) detach(cramRecord); }else detach(cramRecord); } else detach(cramRecord); } for (final CramCompressionRecord cramRecord : primaryMateMap.values()) { if 
(cramRecord.next != null) continue; cramRecord.setDetached(true); cramRecord.setHasMateDownStream(false); cramRecord.recordsToNextFragment = -1; cramRecord.next = null; cramRecord.previous = null; } for (final CramCompressionRecord cramRecord : secondaryMateMap.values()) { if (cramRecord.next != null) continue; cramRecord.setDetached(true); cramRecord.setHasMateDownStream(false); cramRecord.recordsToNextFragment = -1; cramRecord.next = null; cramRecord.previous = null; } } else { for (final CramCompressionRecord cramRecord : cramRecords) { cramRecord.setDetached(true); } } } { /** * The following passage is for paranoid mode only. When java is run with asserts on it will throw an {@link AssertionError} if * read bases or quality scores of a restored SAM record mismatch the original. This is effectively a runtime round trip test. */ @SuppressWarnings("UnusedAssignment") boolean assertsEnabled = false; //noinspection AssertWithSideEffects,ConstantConditions assert assertsEnabled = true; //noinspection ConstantConditions if (assertsEnabled) { final Cram2SamRecordFactory f = new Cram2SamRecordFactory(samFileHeader); for (int i = 0; i < samRecords.size(); i++) { final SAMRecord restoredSamRecord = f.create(cramRecords.get(i)); assert (restoredSamRecord.getAlignmentStart() == samRecords.get(i).getAlignmentStart()); assert (restoredSamRecord.getReferenceName().equals(samRecords.get(i).getReferenceName())); assert (restoredSamRecord.getReadString().equals(samRecords.get(i).getReadString())); assert (restoredSamRecord.getBaseQualityString().equals(samRecords.get(i).getBaseQualityString())); } } } final Container container = containerFactory.buildContainer(cramRecords); for (final Slice slice : container.slices) slice.setRefMD5(refs); container.offset = offset; offset += ContainerIO.writeContainer(cramVersion, container, outputStream); if (indexer != null) { for (final Slice slice : container.slices) { indexer.processAlignment(slice); } } samRecords.clear(); } /** * Traverse 
the graph and mark all segments as detached. * * @param cramRecord the starting point of the graph */ private static void detach(CramCompressionRecord cramRecord) { do { cramRecord.setDetached(true); cramRecord.setHasMateDownStream(false); cramRecord.recordsToNextFragment = -1; } while ((cramRecord = cramRecord.next) != null); } /** * Write an alignment record. The record is buffered in memory; if {@link #shouldFlushContainer(SAMRecord)} * returns true for it, the records buffered so far are flushed as a container first. * @param alignment must not be null and must have a valid SAMFileHeader. */ @Override protected void writeAlignment(final SAMRecord alignment) { if (shouldFlushContainer(alignment)) { try { flushContainer(); } catch (IOException e) { throw new RuntimeIOException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } } updateReferenceContext(alignment.getReferenceIndex()); samRecords.add(alignment); } /** * Remember the reference sequence index of the record that is about to be buffered. * No record factory is created here: both branches only store the given index in refSeqIndex, * which {@link #shouldFlushContainer(SAMRecord)} later compares against the next record. * * @param samRecordReferenceIndex reference sequence index of the record being added */ private void updateReferenceContext(final int samRecordReferenceIndex) { if (refSeqIndex == REF_SEQ_INDEX_NOT_INITIALIZED) { refSeqIndex = samRecordReferenceIndex; } else if (refSeqIndex != samRecordReferenceIndex) refSeqIndex = samRecordReferenceIndex; } @Override protected void writeHeader(final String textHeader) { // TODO: header must be written exactly once per writer life cycle. final SAMFileHeader header = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), (fileName != null ?
fileName : null)); containerFactory = new ContainerFactory(header, recordsPerSlice); final CramHeader cramHeader = new CramHeader(cramVersion, fileName, header); try { offset = CramIO.writeCramHeader(cramHeader, outputStream); } catch (final IOException e) { throw new RuntimeException(e); } } @Override protected void finish() { try { if (!samRecords.isEmpty()) { flushContainer(); } CramIO.issueEOF(cramVersion, outputStream); outputStream.flush(); if (indexer != null) { indexer.finish(); } outputStream.close(); } catch (final IOException e) { throw new RuntimeIOException(e); } catch (final IllegalAccessException e) { throw new RuntimeException(e); } } @Override protected String getFilename() { return fileName; } public boolean isPreserveReadNames() { return preserveReadNames; } public void setPreserveReadNames(final boolean preserveReadNames) { this.preserveReadNames = preserveReadNames; } public List getPreservationPolicies() { if (preservation == null) { // set up greedy policy by default: preservation = new QualityScorePreservation("*8"); } return preservation.getPreservationPolicies(); } public boolean isCaptureAllTags() { return captureAllTags; } public void setCaptureAllTags(final boolean captureAllTags) { this.captureAllTags = captureAllTags; } public Set getCaptureTags() { return captureTags; } public void setCaptureTags(final Set captureTags) { this.captureTags = captureTags; } public Set getIgnoreTags() { return ignoreTags; } public void setIgnoreTags(final Set ignoreTags) { this.ignoreTags = ignoreTags; } } htsjdk-2.0.1/src/java/htsjdk/samtools/CRAMIndexer.java000077500000000000000000000364211263034757100225660ustar00rootroot00000000000000/******************************************************************************* * Copyright 2013 EMBL-EBI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ /* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sub-license, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.cram.structure.Container; import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BlockCompressedFilePointerUtil; import htsjdk.samtools.util.Log; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.util.Arrays; import java.util.List; /** * Class for both constructing BAM index content and writing it out. * There are two usage patterns: * 1) Building a bam index from an existing cram file * 2) Building a bam index while building the cram file * In both cases, processAlignment is called for each cram slice and * finish() is called at the end. */ public class CRAMIndexer { // The number of references (chromosomes) in the BAM file private final int numReferences; // output written as binary, or (for debugging) as text private final BAMIndexWriter outputWriter; private int currentReference = 0; // content is built up from the input bam file using this private final BAMIndexBuilder indexBuilder; /** * Create a CRAM indexer that writes BAI to a file. * * @param output binary BAM Index (.bai) file * @param fileHeader header for the corresponding bam file */ public CRAMIndexer(final File output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader); outputWriter = new BinaryBAMIndexWriter(numReferences, output); } /** * Create a CRAM indexer that writes BAI to a stream. * * @param output Index will be written here. output will be closed when finish() method is called. * @param fileHeader header for the corresponding bam file. 
*/ public CRAMIndexer(final OutputStream output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader); outputWriter = new BinaryBAMIndexWriter(numReferences, output); } /** * Record any index information for a given CRAM slice. * If this slice starts a new reference, write out the preceding references first. * Requires slice.containerOffset and slice.index to be set; the index builder derives the chunk offsets from them. * * @param slice The CRAM slice * @throws SAMException wrapping any exception raised while indexing the slice */ public void processAlignment(final Slice slice) { try { final int reference = slice.sequenceId; if (reference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && reference != currentReference) { // process any completed references advanceToReference(reference); } indexBuilder.processAlignment(slice); } catch (final Exception e) { throw new SAMException("Exception creating BAM index for slice " + slice, e); } } /** * After all the slices have been processed, finish is called. * Writes any final information and closes the output file. */ public void finish() { // process any remaining references advanceToReference(numReferences); outputWriter.writeNoCoordinateRecordCount(indexBuilder.getNoCoordinateRecordCount()); outputWriter.close(); } /** * Write out every reference from currentReference (inclusive) up to nextReference (exclusive). */ private void advanceToReference(final int nextReference) { while (currentReference < nextReference) { final BAMIndexContent content = indexBuilder.processReference(currentReference); outputWriter.writeReference(content); currentReference++; indexBuilder.startNewReference(); } } /** * Class for constructing BAM index files. * One instance is used to construct an entire index. * processAlignment is called for each slice until a new reference is encountered, then * processReference is called when all records for the reference have been processed.
*/ private class BAMIndexBuilder { private final SAMFileHeader bamHeader; // the bins for the current reference private Bin[] bins; // made only as big as needed for each reference private int binsSeen = 0; // linear index for the current reference private final long[] index = new long[LinearIndex.MAX_LINEAR_INDEX_SIZE]; private int largestIndexSeen = -1; // information in meta data private final BAMIndexMetaData indexStats = new BAMIndexMetaData(); /** * @param header SAMFileHeader used for reference name (in index stats) and for max bin number */ BAMIndexBuilder(final SAMFileHeader header) { this.bamHeader = header; } private int computeIndexingBin(final Slice slice) { // reg2bin has zero-based, half-open API final int alignmentStart = slice.alignmentStart - 1; int alignmentEnd = slice.alignmentStart + slice.alignmentSpan - 1; if (alignmentEnd <= alignmentStart) { // If alignment end cannot be determined (e.g. because this read is not really aligned), // then treat this as a one base alignment for indexing purposes. alignmentEnd = alignmentStart + 1; } return GenomicIndexUtil.reg2bin(alignmentStart, alignmentEnd); } /** * Record any index information for a given BAM record * * @param slice The BAM record. Requires rec.getFileSource() is non-null. */ public void processAlignment(final Slice slice) { // metadata indexStats.recordMetaData(slice); final int alignmentStart = slice.alignmentStart; if (alignmentStart == SAMRecord.NO_ALIGNMENT_START) { return; // do nothing for records without coordinates, but count them } // various checks final int reference = slice.sequenceId; if (reference != currentReference) { throw new SAMException("Unexpected reference " + reference + " when constructing index for " + currentReference + " for record " + slice); } // process bins final int binNum = computeIndexingBin(slice); // has the bins array been allocated? 
If not, do so if (bins == null) { final SAMSequenceRecord seq = bamHeader.getSequence(reference); if (seq == null) { bins = new Bin[GenomicIndexUtil.MAX_BINS + 1]; } else { bins = new Bin[AbstractBAMFileIndex.getMaxBinNumberForSequenceLength(seq.getSequenceLength()) + 1]; } } // is there a bin already represented for this index? if not, add one final Bin bin; if (bins[binNum] != null) { bin = bins[binNum]; } else { bin = new Bin(reference, binNum); bins[binNum] = bin; binsSeen++; } // process chunks final long chunkStart = (slice.containerOffset << 16) | slice.index; final long chunkEnd = ((slice.containerOffset << 16) | slice.index) + 1; final Chunk newChunk = new Chunk(chunkStart, chunkEnd); final List oldChunks = bin.getChunkList(); if (!bin.containsChunks()) { bin.addInitialChunk(newChunk); } else { final Chunk lastChunk = bin.getLastChunk(); // Coalesce chunks that are in the same or adjacent file blocks. // Similar to AbstractBAMFileIndex.optimizeChunkList, // but no need to copy the list, no minimumOffset, and maintain bin.lastChunk if (BlockCompressedFilePointerUtil.areInSameOrAdjacentBlocks(lastChunk.getChunkEnd(), chunkStart)) { lastChunk.setChunkEnd(chunkEnd); // coalesced } else { oldChunks.add(newChunk); bin.setLastChunk(newChunk); } } // process linear index // the smallest file offset that appears in the 16k window for this bin final int alignmentEnd = slice.alignmentStart + slice.alignmentSpan; int startWindow = LinearIndex.convertToLinearIndexOffset(alignmentStart); // the 16k window final int endWindow; if (alignmentEnd == SAMRecord.NO_ALIGNMENT_START) { // assume alignment uses one position // Next line for C (samtools index) compatibility. 
Differs only when on a window boundary startWindow = LinearIndex.convertToLinearIndexOffset(alignmentStart - 1); endWindow = startWindow; } else { endWindow = LinearIndex.convertToLinearIndexOffset(alignmentEnd); } if (endWindow > largestIndexSeen) { largestIndexSeen = endWindow; } // set linear index at every 16K window that this alignment overlaps for (int win = startWindow; win <= endWindow; win++) { if (index[win] == 0 || chunkStart < index[win]) { index[win] = chunkStart; } } } /** * Creates the BAMIndexContent for this reference. * Requires all alignments of the reference have already been processed. */ public BAMIndexContent processReference(final int reference) { if (reference != currentReference) { throw new SAMException("Unexpected reference " + reference + " when constructing index for " + currentReference); } // process bins if (binsSeen == 0) return null; // no bins for this reference // process chunks // nothing needed // process linear index // linear index will only be as long as the largest index seen final long[] newIndex = new long[largestIndexSeen + 1]; // in java1.6 Arrays.copyOf(index, largestIndexSeen + 1); // C (samtools index) also fills in intermediate 0's with values. 
This seems unnecessary, but safe long lastNonZeroOffset = 0; for (int i = 0; i <= largestIndexSeen; i++) { if (index[i] == 0) { index[i] = lastNonZeroOffset; // not necessary, but C (samtools index) does this // note, if you remove the above line BAMIndexWriterTest.compareTextual and compareBinary will have to change } else { lastNonZeroOffset = index[i]; } newIndex[i] = index[i]; } final LinearIndex linearIndex = new LinearIndex(reference, 0, newIndex); return new BAMIndexContent(reference, bins, binsSeen, indexStats, linearIndex); } /** * @return the count of records with no coordinate positions */ public long getNoCoordinateRecordCount() { return indexStats.getNoCoordinateRecordCount(); } /** * reinitialize all data structures when the reference changes */ void startNewReference() { bins = null; if (binsSeen > 0) { Arrays.fill(index, 0); } binsSeen = 0; largestIndexSeen = -1; indexStats.newReference(); } } /** * Generates a BAI index file from an input CRAM stream * * @param stream CRAM stream to index * @param output File for output index file * @param log optional {@link htsjdk.samtools.util.Log} to output progress */ public static void createIndex(final SeekableStream stream, final File output, final Log log) throws IOException { final CramHeader cramHeader = CramIO.readCramHeader(stream); final CRAMIndexer indexer = new CRAMIndexer(output, cramHeader.getSamFileHeader()); int totalRecords = 0; Container container = null; do { if (++totalRecords % 10 == 0) if (null != log) log.info(totalRecords + " slices processed ..."); try { final long offset = stream.position(); container = ContainerIO.readContainer(cramHeader.getVersion(), stream); if (container == null || container.isEOF()) break; container.offset = offset; int i = 0; for (final Slice slice : container.slices) { slice.containerOffset = offset; slice.index = i++; indexer.processAlignment(slice); } } catch (final IOException e) { throw new RuntimeException("Failed to read cram container", e); } } while 
(!container.isEOF()); indexer.finish(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/CRAMIterator.java000066400000000000000000000301451263034757100227530ustar00rootroot00000000000000/******************************************************************************* * Copyright 2013 EMBL-EBI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package htsjdk.samtools; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.cram.build.ContainerParser; import htsjdk.samtools.cram.build.Cram2SamRecordFactory; import htsjdk.samtools.cram.build.CramContainerIterator; import htsjdk.samtools.cram.build.CramNormalizer; import htsjdk.samtools.cram.build.CramSpanContainerIterator; import htsjdk.samtools.cram.io.CountingInputStream; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.cram.structure.Container; import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeEOFException; import htsjdk.samtools.util.SequenceUtil; import java.io.IOException; import java.io.InputStream; import java.math.BigInteger; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import
java.util.List; import htsjdk.samtools.cram.CRAMException; public class CRAMIterator implements SAMRecordIterator { private static final Log log = Log.getInstance(CRAMIterator.class); private final CountingInputStream countingInputStream; private CramHeader cramHeader; private ArrayList records; private SAMRecord nextRecord = null; @SuppressWarnings({"CanBeFinal", "FieldCanBeLocal"}) private boolean restoreNMTag = true; @SuppressWarnings({"CanBeFinal", "FieldCanBeLocal"}) private boolean restoreMDTag = false; private CramNormalizer normalizer; private byte[] refs; private int prevSeqId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; public Container container; private SamReader mReader; long firstContainerOffset = 0; private Iterator containerIterator; private ContainerParser parser; private final ReferenceSource referenceSource; private Iterator iterator = Collections.emptyList().iterator(); private ValidationStringency validationStringency = ValidationStringency.DEFAULT_STRINGENCY; public ValidationStringency getValidationStringency() { return validationStringency; } public void setValidationStringency( final ValidationStringency validationStringency) { this.validationStringency = validationStringency; } private long samRecordIndex; private ArrayList cramRecords; public CRAMIterator(final InputStream inputStream, final ReferenceSource referenceSource, final ValidationStringency validationStringency) throws IOException { if (null == referenceSource) { throw new CRAMException("A reference source is required for CRAM files"); } this.countingInputStream = new CountingInputStream(inputStream); this.referenceSource = referenceSource; this.validationStringency = validationStringency; final CramContainerIterator containerIterator = new CramContainerIterator(this.countingInputStream); cramHeader = containerIterator.getCramHeader(); this.containerIterator = containerIterator; firstContainerOffset = this.countingInputStream.getCount(); records = new ArrayList(10000); normalizer = 
new CramNormalizer(cramHeader.getSamFileHeader(), referenceSource); parser = new ContainerParser(cramHeader.getSamFileHeader()); } public CRAMIterator(final SeekableStream seekableStream, final ReferenceSource referenceSource, final long[] coordinates, final ValidationStringency validationStringency) throws IOException { if (null == referenceSource) { throw new CRAMException("A reference source is required for CRAM files"); } this.countingInputStream = new CountingInputStream(seekableStream); this.referenceSource = referenceSource; this.validationStringency = validationStringency; final CramSpanContainerIterator containerIterator = CramSpanContainerIterator.fromFileSpan(seekableStream, coordinates); cramHeader = containerIterator.getCramHeader(); this.containerIterator = containerIterator; firstContainerOffset = containerIterator.getFirstContainerOffset(); records = new ArrayList(10000); normalizer = new CramNormalizer(cramHeader.getSamFileHeader(), referenceSource); parser = new ContainerParser(cramHeader.getSamFileHeader()); } @Deprecated public CRAMIterator(final SeekableStream seekableStream, final ReferenceSource referenceSource, final long[] coordinates) throws IOException { this(seekableStream, referenceSource, coordinates, ValidationStringency.DEFAULT_STRINGENCY); } public CramHeader getCramHeader() { return cramHeader; } private void nextContainer() throws IOException, IllegalArgumentException, IllegalAccessException, CRAMException { if (containerIterator != null) { if (!containerIterator.hasNext()) { records.clear(); nextRecord = null; return; } container = containerIterator.next(); if (container.isEOF()) { records.clear(); nextRecord = null; return; } } else { container = ContainerIO.readContainer(cramHeader.getVersion(), countingInputStream); if (container.isEOF()) { records.clear(); nextRecord = null; return; } } if (records == null) records = new ArrayList(container.nofRecords); else records.clear(); if (cramRecords == null) cramRecords = new 
ArrayList(container.nofRecords); else cramRecords.clear(); parser.getRecords(container, cramRecords, validationStringency); if (container.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { refs = new byte[]{}; } else if (container.sequenceId == -2) { refs = null; prevSeqId = -2; } else if (prevSeqId < 0 || prevSeqId != container.sequenceId) { final SAMSequenceRecord sequence = cramHeader.getSamFileHeader() .getSequence(container.sequenceId); refs = referenceSource.getReferenceBases(sequence, true); if (refs == null) { throw new CRAMException(String.format("Contig %s not found in the reference file.", sequence.getSequenceName())); } prevSeqId = container.sequenceId; } for (int i = 0; i < container.slices.length; i++) { final Slice slice = container.slices[i]; if (slice.sequenceId < 0) continue; if (validationStringency != ValidationStringency.SILENT && !slice.validateRefMD5(refs)) { log.error(String .format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", slice.sequenceId, slice.alignmentStart, slice.alignmentSpan, String.format("%032x", new BigInteger(1, slice.refMD5)))); } } normalizer.normalize(cramRecords, refs, 0, container.header.substitutionMatrix); final Cram2SamRecordFactory cramToSamRecordFactory = new Cram2SamRecordFactory( cramHeader.getSamFileHeader()); for (final CramCompressionRecord cramRecord : cramRecords) { final SAMRecord samRecord = cramToSamRecordFactory.create(cramRecord); if (!cramRecord.isSegmentUnmapped()) { final SAMSequenceRecord sequence = cramHeader.getSamFileHeader() .getSequence(cramRecord.sequenceId); refs = referenceSource.getReferenceBases(sequence, true); if (samRecord.getReadBases() != SAMRecord.NULL_SEQUENCE) SequenceUtil.calculateMdAndNmTags(samRecord, refs, restoreMDTag, restoreNMTag); } samRecord.setValidationStringency(validationStringency); if (validationStringency != ValidationStringency.SILENT) { final List validationErrors = samRecord.isValid(); 
SAMUtils.processValidationErrors(validationErrors, samRecordIndex, validationStringency); } if (mReader != null) { final long chunkStart = (container.offset << 16) | cramRecord.sliceIndex; final long chunkEnd = ((container.offset << 16) | cramRecord.sliceIndex) + 1; nextRecord.setFileSource(new SAMFileSource(mReader, new BAMFileSpan(new Chunk(chunkStart, chunkEnd)))); } records.add(samRecord); samRecordIndex++; } cramRecords.clear(); iterator = records.iterator(); } /** * Skip cached records until given alignment start position. * * @param refIndex reference sequence index * @param pos alignment start to skip to */ public void jumpWithinContainerToPos(final int refIndex, final int pos) { if (!hasNext()) return; int i = 0; for (final SAMRecord record : records) { if (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && record.getReferenceIndex() != refIndex) continue; if (pos <= 0) { if (record.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { iterator = records.listIterator(i); return; } } else { if (record.getAlignmentStart() >= pos) { iterator = records.listIterator(i); return; } } i++; } iterator = Collections.emptyList().iterator(); } @Override public boolean hasNext() { if (container != null && container.isEOF()) return false; if (!iterator.hasNext()) { try { nextContainer(); } catch (CRAMException ce) { throw ce; } catch (SAMFormatException se) { throw se; } catch (final Exception e) { throw new RuntimeEOFException(e); } } return !records.isEmpty(); } @Override public SAMRecord next() { return iterator.next(); } @Override public void remove() { throw new RuntimeException("Removal of records not implemented."); } @Override public void close() { records.clear(); //noinspection EmptyCatchBlock try { if (countingInputStream != null) countingInputStream.close(); } catch (final IOException e) { } } @Override public SAMRecordIterator assertSorted(final SortOrder sortOrder) { throw new RuntimeException("Not implemented."); } public SamReader getFileSource() { 
return mReader; } public void setFileSource(final SamReader mReader) { this.mReader = mReader; } public SAMFileHeader getSAMFileHeader() { return cramHeader.getSamFileHeader(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/CachingBAMFileIndex.java000066400000000000000000000163171263034757100241700ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableStream; import java.io.File; import java.util.ArrayList; import java.util.BitSet; import java.util.List; import java.util.WeakHashMap; /** * Class for reading BAM file indices, caching each contig as it's loaded and * dropping values when the next contig is loaded. 
*/ class CachingBAMFileIndex extends AbstractBAMFileIndex implements BrowseableBAMIndex { private Integer mLastReferenceRetrieved = null; private final WeakHashMap mQueriesByReference = new WeakHashMap(); public CachingBAMFileIndex(final File file, final SAMSequenceDictionary dictionary) { super(file, dictionary); } public CachingBAMFileIndex(final SeekableStream stream, final SAMSequenceDictionary dictionary) { super(stream, dictionary); } public CachingBAMFileIndex(final File file, final SAMSequenceDictionary dictionary, final boolean useMemoryMapping) { super(file, dictionary, useMemoryMapping); } /** * Get list of regions of BAM file that may contain SAMRecords for the given range * @param referenceIndex sequence of desired SAMRecords * @param startPos 1-based start of the desired interval, inclusive * @param endPos 1-based end of the desired interval, inclusive * @return the virtual file position. Each pair is the first and last virtual file position * in a range that can be scanned to find SAMRecords that overlap the given positions. * May return null if there is no content overlapping the region. */ public BAMFileSpan getSpanOverlapping(final int referenceIndex, final int startPos, final int endPos) { final BAMIndexContent queryResults = getQueryResults(referenceIndex); if(queryResults == null) return null; final List chunkList = queryResults.getChunksOverlapping(startPos, endPos); if (chunkList == null) return null; return new BAMFileSpan(chunkList); } /** * Get a list of bins in the BAM file that may contain SAMRecords for the given range. * @param referenceIndex sequence of desired SAMRecords * @param startPos 1-based start of the desired interval, inclusive * @param endPos 1-based end of the desired interval, inclusive * @return a list of bins that contain relevant data. 
*/ public BinList getBinsOverlapping(final int referenceIndex, final int startPos, final int endPos) { final BitSet regionBins = GenomicIndexUtil.regionToBins(startPos, endPos); if (regionBins == null) { return null; } return new BinList(referenceIndex,regionBins); } /** * Perform an overlapping query of all bins bounding the given location. * @param bin The bin over which to perform an overlapping query. * @return The file pointers */ public BAMFileSpan getSpanOverlapping(final Bin bin) { if(bin == null) return null; final int referenceSequence = bin.getReferenceSequence(); final BAMIndexContent indexQuery = getQueryResults(referenceSequence); if(indexQuery == null) return null; final int binLevel = getLevelForBin(bin); final int firstLocusInBin = getFirstLocusInBin(bin); // Add the specified bin to the tree if it exists. final List binTree = new ArrayList(); if(indexQuery.containsBin(bin)) binTree.add(indexQuery.getBins().getBin(bin.getBinNumber())); int currentBinLevel = binLevel; while(--currentBinLevel >= 0) { final int binStart = getFirstBinInLevel(currentBinLevel); final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel); final int binNumber = firstLocusInBin/binWidth + binStart; final Bin parentBin = indexQuery.getBins().getBin(binNumber); if(parentBin != null && indexQuery.containsBin(parentBin)) binTree.add(parentBin); } List chunkList = new ArrayList(); for(final Bin coveringBin: binTree) { for(final Chunk chunk: coveringBin.getChunkList()) chunkList.add(chunk.clone()); } final int start = getFirstLocusInBin(bin); chunkList = Chunk.optimizeChunkList(chunkList,indexQuery.getLinearIndex().getMinimumOffset(start)); return new BAMFileSpan(chunkList); } /** * Looks up the cached BAM query results if they're still in the cache and not expired. Otherwise, * retrieves the cache results from disk. * @param referenceIndex The reference to load. CachingBAMFileIndex only stores index data for entire references. 
* @return The index information for this reference. */ protected BAMIndexContent getQueryResults(final int referenceIndex) { // WeakHashMap is a bit weird in that its lookups are done via equals() equality, but expirations must be // handled via == equality. This implementation jumps through a few hoops to make sure that == equality still // holds even in the context of boxing/unboxing. // If this query is for the same reference index as the last query, return it. if(mLastReferenceRetrieved!=null && mLastReferenceRetrieved == referenceIndex) return mQueriesByReference.get(referenceIndex); // If not, check to see whether it's available in the cache. BAMIndexContent queryResults = mQueriesByReference.get(referenceIndex); if(queryResults != null) { mLastReferenceRetrieved = referenceIndex; mQueriesByReference.put(referenceIndex,queryResults); return queryResults; } // If not in the cache, attempt to load it from disk. queryResults = query(referenceIndex,1,-1); if(queryResults != null) { mLastReferenceRetrieved = referenceIndex; mQueriesByReference.put(referenceIndex,queryResults); return queryResults; } // Not even available on disk. return null; } } htsjdk-2.0.1/src/java/htsjdk/samtools/ChainedDownsamplingIterator.java000066400000000000000000000103451263034757100261470ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 Tim Fennell * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.Iterator; import java.util.List; import java.util.Set; /** * A DownsamplingIterator that combines the ConstantMemory and HighAccuracy downsampling techniques to provide an * iterator that has accuracy approaching that of HighAccuracy, but with more limited memory usage. Instead of * requiring memory proportional to number of read names in the incoming stream of reads, requires memory * approximately proportional to the number of output reads. * * @author Tim Fennell */ class ChainedDownsamplingIterator extends HighAccuracyDownsamplingIterator { public static final int MIN_ACCURATE_INPUT_READS = 50000; /** * Constructs a chained iterator that will read from the provided iterator and attempt to downsampling to the provided proportion. */ ChainedDownsamplingIterator(final Iterator iterator, final double proportion, final int seed) { super(new ConstantMemoryDownsamplingIterator(iterator, adjustProportion(proportion), seed), proportion, seed); // Deal with the fact that the iterator will advance and discard some reads at construction final long discarded = ((ConstantMemoryDownsamplingIterator) getUnderlyingIterator()).getDiscardedCount(); recordDiscardRecords(discarded); } /** * Calculates the upper bound of 99.9% CI given the proportion, that is used to "buffer" the proportion on * the constant memory downsampler, to make sure it leaves enough reads for us to downsample. 
* * Uses an assumed number of reads tested as this is often not known until after the fact. */ private static double adjustProportion(final double p) { final double ciAdjustment99_9 = 3.3 * Math.sqrt(p/MIN_ACCURATE_INPUT_READS); return Math.min(1, p + ciAdjustment99_9); } /** * Resets statistics before reading from the underlying iterator. */ @Override protected void readFromUnderlyingIterator(final List recs, final Set names, final int templatesToRead) { // Reset the stats on the underlying iterator ((ConstantMemoryDownsamplingIterator) getUnderlyingIterator()).resetStatistics(); // Read from the underlying iterator super.readFromUnderlyingIterator(recs, names, templatesToRead); } @Override protected int calculateTemplatesToKeep(final int templatesRead, final double overallProportion) { // Calculate an adjusted proportion to keep, knowing what proportion the underlying iterator discarded final ConstantMemoryDownsamplingIterator iter = (ConstantMemoryDownsamplingIterator) getUnderlyingIterator(); final double priorProportion = iter.getAcceptedFraction(); final double p = Math.max(0, Math.min(1, overallProportion / priorProportion)); final int retval = super.calculateTemplatesToKeep(templatesRead, p); // Record all the discarded records to keep the overall statistics accurate, but do it after // the call to super() so it doesn't affect the proportion calculation. recordDiscardRecords(iter.getDiscardedCount()); return retval; } } htsjdk-2.0.1/src/java/htsjdk/samtools/Chunk.java000066400000000000000000000145001263034757100215640ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.util.BlockCompressedFilePointerUtil; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * A [start,stop) file pointer pairing into the BAM file, stored * as a BAM file index. 
A chunk is represented as a single 64-bit * value where the high-order 48 bits point to the location of the * start of a compressed BGZF block within a BGZF file and the * low-order 16 bits point to a position within the decompressed * data in the BGZF block. * * See the SAM/BAM spec for more details. */ public class Chunk implements Cloneable, Serializable,Comparable { private static final long serialVersionUID = 1L; /** * A pointer to the start of a region in a SAM/BAM file. The * start is inclusive: start reading from this point. */ private long mChunkStart; /** * A pointer to the end of a region in a SAM/BAM file. The end * is exclusive: this pointer points to one byte past the end * of the region of interest inside the file. */ private long mChunkEnd; public Chunk(final long start, final long end) { mChunkStart = start; mChunkEnd = end; } public Chunk clone() { return new Chunk(mChunkStart,mChunkEnd); } public long getChunkStart() { return mChunkStart; } protected void setChunkStart(final long value) { mChunkStart = value; } public long getChunkEnd() { return mChunkEnd; } protected void setChunkEnd(final long value) { mChunkEnd = value; } public int compareTo(final Chunk chunk) { int result = Long.signum(mChunkStart - chunk.mChunkStart); if (result == 0) { result = Long.signum(mChunkEnd - chunk.mChunkEnd); } return result; } @Override public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final Chunk chunk = (Chunk) o; if (mChunkEnd != chunk.mChunkEnd) return false; if (mChunkStart != chunk.mChunkStart) return false; return true; } /** * Returns whether two chunks overlap. * @param other Chunk to which this should be compared. * @return True if the chunks overlap. Returns false if the two chunks abut or are disjoint. */ public boolean overlaps(final Chunk other) { final int comparison = this.compareTo(other); if(comparison == 0) return true; // "sort" the two chunks using the comparator. 
final Chunk leftMost = comparison==-1 ? this : other; final Chunk rightMost = comparison==1 ? this : other; final long leftMostBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(leftMost.getChunkEnd()); final long rightMostBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(rightMost.getChunkStart()); // If the left block's address is after the right block's address, compare the two blocks. // If the two blocks are identical, compare the block offsets. // If the right block is after the left block, no overlap is possible. if(leftMostBlockAddress > rightMostBlockAddress) return true; else if(leftMostBlockAddress == rightMostBlockAddress) { final int leftMostOffset = BlockCompressedFilePointerUtil.getBlockOffset(leftMost.getChunkEnd()); final int rightMostOffset = BlockCompressedFilePointerUtil.getBlockOffset(rightMost.getChunkStart()); return leftMostOffset > rightMostOffset; } else return false; } /** * Returns whether two chunks overlap. * @param other Chunk to which this should be compared. * @return True if the two chunks are adjacent. Returns false if the chunks overlap or are discontinuous. */ public boolean isAdjacentTo(final Chunk other) { // Simpler implementation would be to == the chunk end of one to the chunk start of the other. 
Chose this implementation to ensure that all chunk // comparisons point directly to the return (BlockCompressedFilePointerUtil.getBlockAddress(this.getChunkEnd()) == BlockCompressedFilePointerUtil.getBlockAddress(other.getChunkStart()) && BlockCompressedFilePointerUtil.getBlockOffset(this.getChunkEnd()) == BlockCompressedFilePointerUtil.getBlockOffset(other.getChunkStart())) || (BlockCompressedFilePointerUtil.getBlockAddress(this.getChunkStart()) == BlockCompressedFilePointerUtil.getBlockAddress(other.getChunkEnd()) && BlockCompressedFilePointerUtil.getBlockOffset(this.getChunkStart()) == BlockCompressedFilePointerUtil.getBlockOffset(other.getChunkEnd())); } @Override public int hashCode() { int result = (int) (mChunkStart ^ (mChunkStart >>> 32)); result = 31 * result + (int) (mChunkEnd ^ (mChunkEnd >>> 32)); return result; } @Override public String toString() { return String.format("%d:%d-%d:%d",mChunkStart >> 16,mChunkStart & 0xFFFF,mChunkEnd >> 16,mChunkEnd & 0xFFFF); } /** * @param minimumOffset Discard chunks that end before this file offset. * @return sorted list of chunks in which adjacent chunks are coalesced. */ public static List optimizeChunkList(final List chunks, final long minimumOffset) { Chunk lastChunk = null; Collections.sort(chunks); final List result = new ArrayList(); for (final Chunk chunk : chunks) { if (chunk.getChunkEnd() <= minimumOffset) { continue; // linear index optimization } if (result.isEmpty()) { result.add(chunk); lastChunk = chunk; continue; } // Coalesce chunks that are in adjacent file blocks. // This is a performance optimization. 
if (!lastChunk.overlaps(chunk) && !lastChunk.isAdjacentTo(chunk)) { result.add(chunk); lastChunk = chunk; } else { if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) { lastChunk.setChunkEnd(chunk.getChunkEnd()); } } } return result; } } htsjdk-2.0.1/src/java/htsjdk/samtools/Cigar.java000066400000000000000000000267031263034757100215510ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * A list of CigarElements, which describes how a read aligns with the reference. * E.g. the Cigar string 10M1D25M means * * match or mismatch for 10 bases * * deletion of 1 base * * match or mismatch for 25 bases * * c.f. http://samtools.sourceforge.net/SAM1.pdf for complete CIGAR specification. 
*/ public class Cigar implements Serializable { public static final long serialVersionUID = 1L; private final List cigarElements = new ArrayList(); public Cigar() { } public Cigar(final List cigarElements) { this.cigarElements.addAll(cigarElements); } public List getCigarElements() { return Collections.unmodifiableList(cigarElements); } public CigarElement getCigarElement(final int i) { return cigarElements.get(i); } public void add(final CigarElement cigarElement) { cigarElements.add(cigarElement); } public int numCigarElements() { return cigarElements.size(); } public boolean isEmpty() { return cigarElements.isEmpty(); } /** * @return The number of reference bases that the read covers, excluding padding. */ public int getReferenceLength() { int length = 0; for (final CigarElement element : cigarElements) { switch (element.getOperator()) { case M: case D: case N: case EQ: case X: length += element.getLength(); } } return length; } /** * @return The number of reference bases that the read covers, including padding. */ public int getPaddedReferenceLength() { int length = 0; for (final CigarElement element : cigarElements) { switch (element.getOperator()) { case M: case D: case N: case EQ: case X: case P: length += element.getLength(); } } return length; } /** * @return The number of read bases that the read covers. */ public int getReadLength() { return getReadLength(cigarElements); } /** * @return The number of read bases that the read covers. */ public static int getReadLength(final List cigarElements) { int length = 0; for (final CigarElement element : cigarElements) { if (element.getOperator().consumesReadBases()){ length += element.getLength(); } } return length; } /** * Exhaustive validation of CIGAR. * Note that this method deliberately returns null rather than Collections.emptyList() if there * are no validation errors, because callers tend to assume that if a non-null list is returned, it is modifiable. * @param readName For error reporting only. 
May be null if not known. * @param recordNumber For error reporting only. May be -1 if not known. * @return List of validation errors, or null if no errors. */ public List isValid(final String readName, final long recordNumber) { if (this.isEmpty()) { return null; } List ret = null; boolean seenRealOperator = false; for (int i = 0; i < cigarElements.size(); ++i) { final CigarElement element = cigarElements.get(i); if (element.getLength() == 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR element with zero length", readName, recordNumber)); } // clipping operator can only be at start or end of CIGAR final CigarOperator op = element.getOperator(); if (isClippingOperator(op)) { if (op == CigarOperator.H) { if (i != 0 && i != cigarElements.size() - 1) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "Hard clipping operator not at start or end of CIGAR", readName, recordNumber)); } } else { if (op != CigarOperator.S) throw new IllegalStateException("Should never happen: " + op.name()); if (i == 0 || i == cigarElements.size() - 1) { // Soft clip at either end is fine } else if (i == 1) { if (cigarElements.size() == 3 && cigarElements.get(2).getOperator() == CigarOperator.H) { // Handle funky special case in which S operator is both one from the beginning and one // from the end. 
} else if (cigarElements.get(0).getOperator() != CigarOperator.H) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "Soft clipping CIGAR operator can only be inside of hard clipping operator", readName, recordNumber)); } } else if (i == cigarElements.size() - 2) { if (cigarElements.get(cigarElements.size() - 1).getOperator() != CigarOperator.H) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "Soft clipping CIGAR operator can only be inside of hard clipping operator", readName, recordNumber)); } } else { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "Soft clipping CIGAR operator can at start or end of read, or be inside of hard clipping operator", readName, recordNumber)); } } } else if (isRealOperator(op)) { // Must be at least one real operator (MIDN) seenRealOperator = true; // There should be an M or P operator between any pair of IDN operators if (isInDelOperator(op)) { for (int j = i+1; j < cigarElements.size(); ++j) { final CigarOperator nextOperator = cigarElements.get(j).getOperator(); // Allow if ((isRealOperator(nextOperator) && !isInDelOperator(nextOperator)) || isPaddingOperator(nextOperator)) { break; } if (isInDelOperator(nextOperator) && op == nextOperator) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.ADJACENT_INDEL_IN_CIGAR, "No M or N operator between pair of " + op.name() + " operators in CIGAR", readName, recordNumber)); } } } } else if (isPaddingOperator(op)) { if (i == 0) { /* * Removed restriction that padding not be the first operator because if a read starts in the middle of a pad * in a padded reference, it is necessary to precede the read with padding so that alignment start refers to a * position on the unpadded reference. 
*/ } else if (i == cigarElements.size() - 1) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "Padding operator not valid at end of CIGAR", readName, recordNumber)); } else if (!isRealOperator(cigarElements.get(i-1).getOperator()) || !isRealOperator(cigarElements.get(i+1).getOperator())) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "Padding operator not between real operators in CIGAR", readName, recordNumber)); } } } if (!seenRealOperator) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "No real operator (M|I|D|N) in CIGAR", readName, recordNumber)); } return ret; } private static boolean isRealOperator(final CigarOperator op) { return op == CigarOperator.M || op == CigarOperator.EQ || op == CigarOperator.X || op == CigarOperator.I || op == CigarOperator.D || op == CigarOperator.N; } private static boolean isInDelOperator(final CigarOperator op) { return op == CigarOperator.I || op == CigarOperator.D; } private static boolean isClippingOperator(final CigarOperator op) { return op == CigarOperator.S || op == CigarOperator.H; } private static boolean isPaddingOperator(final CigarOperator op) { return op == CigarOperator.P; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof Cigar)) return false; final Cigar cigar = (Cigar) o; if (cigarElements != null ? !cigarElements.equals(cigar.cigarElements) : cigar.cigarElements != null) return false; return true; } @Override public int hashCode() { return cigarElements != null ? 
cigarElements.hashCode() : 0; } public String toString() { return TextCigarCodec.encode(this); } } htsjdk-2.0.1/src/java/htsjdk/samtools/CigarElement.java000066400000000000000000000044111263034757100230530ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Serializable; /** * One component of a cigar string. The component comprises the operator, and the number of bases to which * the operator applies. 
*/ public class CigarElement implements Serializable { public static final long serialVersionUID = 1L; private final int length; private final CigarOperator operator; public CigarElement(final int length, final CigarOperator operator) { this.length = length; this.operator = operator; } public int getLength() { return length; } public CigarOperator getOperator() { return operator; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof CigarElement)) return false; final CigarElement that = (CigarElement) o; if (length != that.length) return false; if (operator != that.operator) return false; return true; } @Override public int hashCode() { int result = length; result = 31 * result + (operator != null ? operator.hashCode() : 0); return result; } } htsjdk-2.0.1/src/java/htsjdk/samtools/CigarOperator.java000066400000000000000000000141401263034757100232550ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

/**
 * The operators that can appear in a cigar string, and information about their disk representations.
 */
public enum CigarOperator {
    /** Match or mismatch */
    M(true, true, 'M'),
    /** Insertion vs. the reference. */
    I(true, false, 'I'),
    /** Deletion vs. the reference. */
    D(false, true, 'D'),
    /** Skipped region from the reference. */
    N(false, true, 'N'),
    /** Soft clip. */
    S(true, false, 'S'),
    /** Hard clip. */
    H(false, false, 'H'),
    /** Padding. */
    P(false, false, 'P'),
    /** Matches the reference. */
    EQ(true, true, '='),
    /** Mismatches the reference. */
    X(true, true, 'X');

    // Readable synonyms of the above enums
    public static final CigarOperator MATCH_OR_MISMATCH = M;
    public static final CigarOperator INSERTION = I;
    public static final CigarOperator DELETION = D;
    public static final CigarOperator SKIPPED_REGION = N;
    public static final CigarOperator SOFT_CLIP = S;
    public static final CigarOperator HARD_CLIP = H;
    public static final CigarOperator PADDING = P;

    // Representation of CigarOperator in BAM file
    private static final byte OP_M = 0;
    private static final byte OP_I = 1;
    private static final byte OP_D = 2;
    private static final byte OP_N = 3;
    private static final byte OP_S = 4;
    private static final byte OP_H = 5;
    private static final byte OP_P = 6;
    private static final byte OP_EQ = 7;
    private static final byte OP_X = 8;

    private final boolean consumesReadBases;
    private final boolean consumesReferenceBases;
    private final byte character;
    private final String string;

    /**
     * @param consumesReadBases      whether the operator consumes bases from the read
     * @param consumesReferenceBases whether the operator consumes bases from the reference
     * @param character              the operator as written in a text CIGAR string
     */
    CigarOperator(boolean consumesReadBases, boolean consumesReferenceBases, char character) {
        this.consumesReadBases = consumesReadBases;
        this.consumesReferenceBases = consumesReferenceBases;
        this.character = (byte) character;
        // Single-character string form, interned.
        this.string = String.valueOf(character).intern();
    }

    /** If true, represents that this cigar operator "consumes" bases from the read bases. */
    public boolean consumesReadBases() {
        return consumesReadBases;
    }

    /** If true, represents that this cigar operator "consumes" bases from the reference sequence. */
    public boolean consumesReferenceBases() {
        return consumesReferenceBases;
    }

    /**
     * @param b CIGAR operator in character form as appears in a text CIGAR string
     * @return CigarOperator enum value corresponding to the given character.
     * @throws IllegalArgumentException if the character is not a CIGAR operator
     */
    public static CigarOperator characterToEnum(final int b) {
        switch (b) {
            case 'M': return M;
            case 'I': return I;
            case 'D': return D;
            case 'N': return N;
            case 'S': return S;
            case 'H': return H;
            case 'P': return P;
            case '=': return EQ;
            case 'X': return X;
            default:
                throw new IllegalArgumentException("Unrecognized CigarOperator: " + b);
        }
    }

    /**
     * @param i CIGAR operator in binary form as appears in a BAMRecord.
     * @return CigarOperator enum value corresponding to the given int value.
     * @throws IllegalArgumentException if the value is not a known binary code
     */
    public static CigarOperator binaryToEnum(final int i) {
        switch (i) {
            case OP_M:  return M;
            case OP_I:  return I;
            case OP_D:  return D;
            case OP_N:  return N;
            case OP_S:  return S;
            case OP_H:  return H;
            case OP_P:  return P;
            case OP_EQ: return EQ;
            case OP_X:  return X;
            default:
                throw new IllegalArgumentException("Unrecognized CigarOperator: " + i);
        }
    }

    /**
     * @param e CigarOperator enum value.
     * @return CIGAR operator corresponding to the enum value in binary form as appears in a BAMRecord.
     */
    public static int enumToBinary(final CigarOperator e) {
        switch (e) {
            case M:  return OP_M;
            case I:  return OP_I;
            case D:  return OP_D;
            case N:  return OP_N;
            case S:  return OP_S;
            case H:  return OP_H;
            case P:  return OP_P;
            case EQ: return OP_EQ;
            case X:  return OP_X;
            default:
                throw new IllegalArgumentException("Unrecognized CigarOperator: " + e);
        }
    }

    /** Returns the character that should be used within a SAM file. */
    public static byte enumToCharacter(final CigarOperator e) {
        return e.character;
    }

    /** Returns the cigar operator as it would be seen in a SAM file. */
    @Override
    public String toString() {
        return this.string;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/ComparableSamRecordIterator.java000066400000000000000000000072461263034757100261040ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
*/ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.PeekableIterator; import java.util.Comparator; /** * Iterator for SAM records that implements comparable to enable sorting of iterators. * The comparison is performed by comparing the next record in the iterator to the next * record in another iterator and returning the ordering between those SAM records. */ class ComparableSamRecordIterator extends PeekableIterator implements Comparable { private final Comparator comparator; private final SamReader reader; /** * Constructs a wrapping iterator around the given iterator that will be able * to compare itself to other ComparableSamRecordIterators using the given comparator. * * @param iterator the wrapped iterator. * @param comparator the Comparator to use to provide ordering fo SAMRecords */ public ComparableSamRecordIterator(final SamReader sam, final CloseableIterator iterator, final Comparator comparator) { super(iterator); this.reader = sam; this.comparator = comparator; } /** Returns the reader from which this iterator was constructed. */ public SamReader getReader() { return reader; } /** * Compares this iterator to another comparable iterator based on the next record * available in each iterator. If the two comparable iterators have different * comparator types internally an exception is thrown. 
* * @param that another iterator to compare to * @return a negative, 0 or positive number as described in the Comparator interface */ public int compareTo(final ComparableSamRecordIterator that) { if (this.comparator.getClass() != that.comparator.getClass()) { throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " + "have different orderings internally"); } final SAMRecord record = this.peek(); final SAMRecord record2 = that.peek(); return comparator.compare(record, record2); } @Override public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; return compareTo((ComparableSamRecordIterator) o) == 0; } @Override public int hashCode() { throw new UnsupportedOperationException("ComparableSamRecordIterator should not be hashed because it can change value"); } } htsjdk-2.0.1/src/java/htsjdk/samtools/ConstantMemoryDownsamplingIterator.java000066400000000000000000000077141263034757100276040ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 Tim Fennell * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.Murmur3; import htsjdk.samtools.util.PeekableIterator; import java.util.Iterator; /** * A DownsamplingIterator that runs in constant (and very small) memory. For each read the read name is hashed * using the Murmur3_32 hash algorithm to obtain an integer value that is, enough for our purposes, uniformly * distributed between the min and max int values even for highly similar inputs. The proportion is used to * calculate a maximum acceptable hash value within the range. Records whose hash value is below the limit * are emitted, records whose hash value is above the limit are discarded. * * Does not make any attempt to be accurate (have actual proportion == requested proportion) beyond what would * be expected for a random process and so may become quite inaccurate when downsampling to small numbers of * reads. * * @author Tim Fennell */ class ConstantMemoryDownsamplingIterator extends DownsamplingIterator { private final PeekableIterator underlyingIterator; private final int maxHashValue; private final Murmur3 hasher; /** Constructs a downsampling iterator upon the supplied iterator, using the Random as the source of randomness. */ ConstantMemoryDownsamplingIterator(final Iterator iterator, final double proportion, final int seed) { super(proportion); this.hasher = new Murmur3(seed); this.underlyingIterator = new PeekableIterator(iterator); final long range = (long) Integer.MAX_VALUE - (long) Integer.MIN_VALUE; this.maxHashValue = Integer.MIN_VALUE + (int) Math.round(range * proportion); advanceToNextAcceptedRead(); } /** Returns true if there is another record available post-downsampling, false otherwise. 
*/ @Override public boolean hasNext() { // The underlying iterator is always left at the next return-able read, so if it has a next read, so do we return this.underlyingIterator.hasNext(); } /** * Advances the underlying, peekable, iterator until the next records is one that is to be emitted. * @return true if there is at least one emittable record ready for emission, false otherwise */ private boolean advanceToNextAcceptedRead() { while (this.underlyingIterator.hasNext() && this.hasher.hashUnencodedChars(this.underlyingIterator.peek().getReadName()) > this.maxHashValue) { this.underlyingIterator.next(); recordDiscardedRecord(); } return this.underlyingIterator.hasNext(); } /** Returns the next record from the iterator, or throws an exception if there is no next record. */ @Override public SAMRecord next() { final SAMRecord rec = this.underlyingIterator.next(); recordAcceptedRecord(); advanceToNextAcceptedRead(); return rec; } } htsjdk-2.0.1/src/java/htsjdk/samtools/CoordinateSortedPairInfoMap.java000066400000000000000000000306311263034757100260550ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.FileAppendStreamLRUCache; import htsjdk.samtools.util.IOUtil; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; /** * Holds info about a mate pair for use when processing a coordinate sorted file. When one read of a pair is encountered, * the caller should add a record to this map. When the other read of a pair is encountered, the record should be removed. * This class assumes that reads will be processed in order of reference sequence index. When the map is queried for * a record for a given reference sequence index, all the records for that sequence are loaded from temp file into RAM, so there * must be sufficient RAM to hold all the records for one reference sequence. If the records are not processed in * reference sequence order, loading and unloading of records will cause performance to be terrible. * @param KEY + reference sequence index are used to identify the record being stored or retrieved. * @param The type of record being retrieved. 
*/ public class CoordinateSortedPairInfoMap implements Iterable> { // -1 is a valid sequence index in this case private final int INVALID_SEQUENCE_INDEX = -2; /** * directory where files will go */ private final File workDir = IOUtil.createTempDir("CSPI.", null); private int sequenceIndexOfMapInRam = INVALID_SEQUENCE_INDEX; private Map mapInRam = null; private final FileAppendStreamLRUCache outputStreams; private final Codec elementCodec; // Key is reference index (which is in the range [-1 .. max sequence index]. // Value is the number of records on disk for this index. private final Map sizeOfMapOnDisk = new HashMap(); // No other methods may be called when iteration is in progress, because iteration depends on and changes // internal state. private boolean iterationInProgress = false; public CoordinateSortedPairInfoMap(final int maxOpenFiles, final Codec elementCodec) { this.elementCodec = elementCodec; workDir.deleteOnExit(); outputStreams = new FileAppendStreamLRUCache(maxOpenFiles); } /** * * @param sequenceIndex * @param key * @return The record corresponding to the given sequenceIndex and key, or null if it is not present. */ public REC remove(final int sequenceIndex, final KEY key) { if (iterationInProgress) throw new IllegalStateException("Cannot be called when iteration is in progress"); ensureSequenceLoaded(sequenceIndex); return mapInRam.remove(key); } private void ensureSequenceLoaded(final int sequenceIndex) { try { if (sequenceIndexOfMapInRam == sequenceIndex) { return; } // Spill map in RAM to disk if (mapInRam != null) { final File spillFile = makeFileForSequence(sequenceIndexOfMapInRam); if (spillFile.exists()) throw new IllegalStateException(spillFile + " should not exist."); if (!mapInRam.isEmpty()) { // Do not create file or entry in sizeOfMapOnDisk if there is nothing to write. 
final OutputStream os = getOutputStreamForSequence(sequenceIndexOfMapInRam); elementCodec.setOutputStream(os); for (final Map.Entry entry : mapInRam.entrySet()) { elementCodec.encode(entry.getKey(), entry.getValue()); } sizeOfMapOnDisk.put(sequenceIndexOfMapInRam, mapInRam.size()); mapInRam.clear(); } } else { mapInRam = new HashMap(); } sequenceIndexOfMapInRam = sequenceIndex; // Load map from disk if it existed File mapOnDisk = makeFileForSequence(sequenceIndex); if (outputStreams.containsKey(mapOnDisk)) { outputStreams.remove(mapOnDisk).close(); } final Integer numRecords = sizeOfMapOnDisk.remove(sequenceIndex); if (mapOnDisk.exists()) { if (numRecords == null) throw new IllegalStateException("null numRecords for " + mapOnDisk); FileInputStream is = null; try { is = new FileInputStream(mapOnDisk); elementCodec.setInputStream(is); for (int i = 0; i < numRecords; ++i) { final Map.Entry keyAndRecord = elementCodec.decode(); if (mapInRam.containsKey(keyAndRecord.getKey())) throw new SAMException("Value was put into PairInfoMap more than once. " + sequenceIndex + ": " + keyAndRecord.getKey()); mapInRam.put(keyAndRecord.getKey(), keyAndRecord.getValue()); } } finally { CloserUtil.close(is); } htsjdk.samtools.util.IOUtil.deleteFiles(mapOnDisk); } else if (numRecords != null && numRecords > 0) throw new IllegalStateException("Non-zero numRecords but " + mapOnDisk + " does not exist"); } catch (IOException e) { throw new SAMException("Error loading new map from disk.", e); } } /** * Store the record with the given sequence index and key. It is assumed that value did not previously exist * in the map, and an exception is thrown (possibly at a later time) if that is not the case. 
* @param sequenceIndex * @param key * @param record */ public void put(final int sequenceIndex, final KEY key, final REC record) { if (iterationInProgress) throw new IllegalStateException("Cannot be called when iteration is in progress"); if (sequenceIndex == sequenceIndexOfMapInRam) { // Store in RAM map if (mapInRam.containsKey(key)) throw new IllegalArgumentException("Putting value into PairInfoMap that already existed. " + sequenceIndex + ": " + key); mapInRam.put(key, record); } else { // Append to file final OutputStream os = getOutputStreamForSequence(sequenceIndex); elementCodec.setOutputStream(os); elementCodec.encode(key, record); Integer prevCount = sizeOfMapOnDisk.get(sequenceIndex); if (prevCount == null) prevCount = 0; sizeOfMapOnDisk.put(sequenceIndex, prevCount + 1); } } private File makeFileForSequence(final int index) { final File file = new File(workDir, index + ".tmp"); file.deleteOnExit(); return file; } private OutputStream getOutputStreamForSequence(final int mateSequenceIndex) { return outputStreams.get(makeFileForSequence(mateSequenceIndex)); } public int size() { int total = sizeInRam(); for (final Integer mapSize : sizeOfMapOnDisk.values()) { if (mapSize != null) { total += mapSize; } } return total; } /** * @return number of elements stored in RAM. Always <= size() */ public int sizeInRam() { return mapInRam != null? mapInRam.size(): 0; } /** * Creates an iterator over all elements in map, in arbitrary order. Elements may not be added * or removed from map when iteration is in progress, nor may a second iteration be started. * Iterator must be closed in order to allow normal access to the map. 
*/ public CloseableIterator> iterator() { if (iterationInProgress) throw new IllegalStateException("Cannot be called when iteration is in progress"); iterationInProgress = true; return new MapIterator(); } private class MapIterator implements CloseableIterator> { private boolean closed = false; private Set referenceIndices = new HashSet(sizeOfMapOnDisk.keySet()); private final Iterator referenceIndexIterator; private Iterator> currentReferenceIterator = null; private MapIterator() { if (sequenceIndexOfMapInRam != INVALID_SEQUENCE_INDEX) referenceIndices.add(sequenceIndexOfMapInRam); referenceIndexIterator = referenceIndices.iterator(); advanceToNextNonEmptyReferenceIndex(); } private void advanceToNextNonEmptyReferenceIndex() { while (referenceIndexIterator.hasNext()) { int nextReferenceIndex = referenceIndexIterator.next(); ensureSequenceLoaded(nextReferenceIndex); if (!mapInRam.isEmpty()) { createIteratorForMapInRam(); return; } } // no more. currentReferenceIterator = null; } private void createIteratorForMapInRam() { currentReferenceIterator = mapInRam.entrySet().iterator(); } public void close() { closed = true; iterationInProgress = false; } public boolean hasNext() { if (closed) throw new IllegalStateException("Iterator has been closed"); if (currentReferenceIterator != null && !currentReferenceIterator.hasNext()) throw new IllegalStateException("Should not happen"); return currentReferenceIterator != null; } public Map.Entry next() { if (closed) throw new IllegalStateException("Iterator has been closed"); if (!hasNext()) throw new NoSuchElementException(); final Map.Entry ret = currentReferenceIterator.next(); if (!currentReferenceIterator.hasNext()) advanceToNextNonEmptyReferenceIndex(); return ret; } public void remove() { throw new UnsupportedOperationException(); } } /** * Client must implement this class, which defines the way in which records are written to and * read from file. 
*/ public interface Codec { /** * Where to write encoded output * @param os */ void setOutputStream(OutputStream os); /** * Where to read encoded input from * @param is */ void setInputStream(InputStream is); /** * Write object to output stream. If the key is part of the record, then there is no need to write * it separately. */ void encode(KEY key, REC record); /** * Read the next key and record from the input stream and convert into a java object. * @return null if no more records. Should throw exception if EOF is encountered in the middle of * a record. */ Map.Entry decode(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/CustomReaderFactory.java000066400000000000000000000112561263034757100244460ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2013 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.Log; import java.net.URL; import java.net.URLClassLoader; /** * Factory for creating custom readers for accessing API based resources, * e.g. ga4gh. * The configuration is controlled via custom_reader property (@see Defaults). * This allows injection of such readers from code bases outside HTSJDK. */ public class CustomReaderFactory { private final static Log LOG = Log.getInstance(CustomReaderFactory.class); /** * Interface to be implemented by custom factory classes that register * themselves with this factory and are loaded dynamically. */ public interface ICustomReaderFactory { SamReader open(URL url); } private static final CustomReaderFactory DEFAULT_FACTORY; private static CustomReaderFactory currentFactory; private String urlPrefix = ""; private String factoryClassName = ""; private String jarFile = ""; private ICustomReaderFactory factory; static { DEFAULT_FACTORY = new CustomReaderFactory(); currentFactory = DEFAULT_FACTORY; } public static void setInstance(final CustomReaderFactory factory){ currentFactory = factory; } public static void resetToDefaultInstance() { setInstance(DEFAULT_FACTORY); } public static CustomReaderFactory getInstance(){ return currentFactory; } /** * Initializes factory based on the custom_reader property specification. */ private CustomReaderFactory() { this(Defaults.CUSTOM_READER_FACTORY); } CustomReaderFactory(String cfg) { final String[] cfgComponents = cfg.split(","); if (cfgComponents.length < 2) { return; } urlPrefix = cfgComponents[0].toLowerCase(); factoryClassName = cfgComponents[1]; if (cfgComponents.length > 2) { jarFile = cfgComponents[2]; } } /** * Lazily creates factory based on the configuration. * @return null if creation fails, factory instance otherwise. 
*/ private synchronized ICustomReaderFactory getFactory() { if (factory == null) { try { Class clazz = null; if (!jarFile.isEmpty()) { LOG.info("Attempting to load factory class " + factoryClassName + " from " + jarFile); final URL jarURL = new URL("file:///"+jarFile); clazz = Class.forName(factoryClassName, true, new URLClassLoader (new URL[] { jarURL }, this.getClass().getClassLoader())); } else { LOG.info("Attempting to load factory class " + factoryClassName); clazz = Class.forName(factoryClassName); } factory = (ICustomReaderFactory)clazz.newInstance(); LOG.info("Created custom factory for " + urlPrefix + " from " + factoryClassName + " loaded from " + (jarFile.isEmpty() ? " this jar" : jarFile)); } catch (Exception e) { LOG.error(e); return null; } } return factory; } /** * Check if the url is supposed to be handled by the custom factory and if so * attempt to create reader via an instance of this custom factory. * * @return null if the url is not handled by this factory, SamReader otherwise. */ public SamReader maybeOpen(URL url) { if (urlPrefix.isEmpty() || !url.toString().toLowerCase().startsWith(urlPrefix)) { return null; } LOG.info("Attempting to open " + url + " with custom factory"); final ICustomReaderFactory factory = getFactory(); if (factory == null) { return null; } return factory.open(url); } } htsjdk-2.0.1/src/java/htsjdk/samtools/DefaultSAMRecordFactory.java000066400000000000000000000043301263034757100251300ustar00rootroot00000000000000package htsjdk.samtools; /** * Default factory for creating SAM and BAM records used by the SAMFileReader classes. 
* * @author Tim Fennell */ public class DefaultSAMRecordFactory implements SAMRecordFactory { private static final DefaultSAMRecordFactory INSTANCE = new DefaultSAMRecordFactory(); public static DefaultSAMRecordFactory getInstance() { return INSTANCE; } /** Create a new SAMRecord to be filled in */ public SAMRecord createSAMRecord(final SAMFileHeader header) { return new SAMRecord(header); } /** * Create a new BAM Record. If the reference sequence index or mate reference sequence index are * any value other than NO_ALIGNMENT_REFERENCE_INDEX, the values must be resolvable against the sequence * dictionary in the header argument. */ public BAMRecord createBAMRecord (final SAMFileHeader header, final int referenceSequenceIndex, final int alignmentStart, final short readNameLength, final short mappingQuality, final int indexingBin, final int cigarLen, final int flags, final int readLen, final int mateReferenceSequenceIndex, final int mateAlignmentStart, final int insertSize, final byte[] variableLengthBlock) { return new BAMRecord(header, referenceSequenceIndex, alignmentStart, readNameLength, mappingQuality, indexingBin, cigarLen, flags, readLen, mateReferenceSequenceIndex, mateAlignmentStart, insertSize, variableLengthBlock); } } htsjdk-2.0.1/src/java/htsjdk/samtools/Defaults.java000066400000000000000000000116601263034757100222670ustar00rootroot00000000000000package htsjdk.samtools; import java.io.File; /** * Embodies defaults for global values that affect how the SAM JDK operates. Defaults are encoded in the class * and are also overridable using system properties. * * @author Tim Fennell */ public class Defaults { /** Should BAM index files be created when writing out coordinate sorted BAM files? Default = false. */ public static final boolean CREATE_INDEX; /** Should MD5 files be created when writing out SAM and BAM files? Default = false. 
*/ public static final boolean CREATE_MD5; /** Should asynchronous I/O be used when writing out SAM and BAM files (one thread per file). Default = false. */ public static final boolean USE_ASYNC_IO; /** Compresion level to be used for writing BAM and other block-compressed outputs. Default = 5. */ public static final int COMPRESSION_LEVEL; /** Buffer size, in bytes, used whenever reading/writing files or streams. Default = 128k. */ public static final int BUFFER_SIZE; /** * Even if BUFFER_SIZE is 0, this is guaranteed to be non-zero. If BUFFER_SIZE is non-zero, * this == BUFFER_SIZE */ public static final int NON_ZERO_BUFFER_SIZE; /** Should BlockCompressedOutputStream attempt to load libIntelDeflater? */ public static final boolean TRY_USE_INTEL_DEFLATER; /** * Path to libIntelDeflater.so. If this is not set, the library is looked for in the directory * where the executable jar lives. */ public static final String INTEL_DEFLATER_SHARED_LIBRARY_PATH; /** * The reference FASTA file. If this is not set, the file is null. This file may be required for reading * writing SAM files (ex. CRAM). */ public static final File REFERENCE_FASTA; /** Custom reader factory able to handle URL based resources like ga4gh. * Expected format: ,[,] * E.g. https://www.googleapis.com/genomics/v1beta/reads/,com.google.genomics.ReaderFactory * OR https://www.googleapis.com/genomics/v1beta/reads/,com.google.genomics.ReaderFactory,/tmp/genomics.jar */ public static final String CUSTOM_READER_FACTORY; /** * Boolean describing whether downloading a reference file is allowed (for CRAM files), * in case the reference file is not specified by the user * Enabling this is not necessarily a good idea, since this process often fails */ public static final boolean USE_CRAM_REF_DOWNLOAD; /** * A mask (pattern) to use when building EBI reference service URL for a * given MD5 checksum. Must contain one and only one string placeholder. 
*/ public static final String EBI_REFERENCE_SEVICE_URL_MASK; static { CREATE_INDEX = getBooleanProperty("create_index", false); CREATE_MD5 = getBooleanProperty("create_md5", false); USE_ASYNC_IO = getBooleanProperty("use_async_io", false); COMPRESSION_LEVEL = getIntProperty("compression_level", 5); BUFFER_SIZE = getIntProperty("buffer_size", 1024 * 128); TRY_USE_INTEL_DEFLATER = getBooleanProperty("try_use_intel_deflater", true); INTEL_DEFLATER_SHARED_LIBRARY_PATH = getStringProperty("intel_deflater_so_path", null); if (BUFFER_SIZE == 0) { NON_ZERO_BUFFER_SIZE = 1024 * 128; } else { NON_ZERO_BUFFER_SIZE = BUFFER_SIZE; } REFERENCE_FASTA = getFileProperty("reference_fasta", null); USE_CRAM_REF_DOWNLOAD = getBooleanProperty("use_cram_ref_download", false); EBI_REFERENCE_SEVICE_URL_MASK = "http://www.ebi.ac.uk/ena/cram/md5/%s"; CUSTOM_READER_FACTORY = getStringProperty("custom_reader", ""); } /** Gets a string system property, prefixed with "samjdk." using the default if the property does not exist. */ private static String getStringProperty(final String name, final String def) { return System.getProperty("samjdk." + name, def); } /** Gets a boolean system property, prefixed with "samjdk." using the default if the property does not exist. */ private static boolean getBooleanProperty(final String name, final boolean def) { final String value = getStringProperty(name, new Boolean(def).toString()); return Boolean.parseBoolean(value); } /** Gets an int system property, prefixed with "samjdk." using the default if the property does not exist. */ private static int getIntProperty(final String name, final int def) { final String value = getStringProperty(name, new Integer(def).toString()); return Integer.parseInt(value); } /** Gets a File system property, prefixed with "samdjk." using the default if the property does not exist. 
*/ private static File getFileProperty(final String name, final String def) { final String value = getStringProperty(name, def); // TODO: assert that it is readable return (null == value) ? null : new File(value); } } htsjdk-2.0.1/src/java/htsjdk/samtools/DiskBasedBAMFileIndex.java000066400000000000000000000067111263034757100244620ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableStream; import java.io.File; import java.util.ArrayList; import java.util.List; /** * A class for reading BAM file indices, hitting the disk once per query. 
*/ class DiskBasedBAMFileIndex extends AbstractBAMFileIndex { DiskBasedBAMFileIndex(final File file, final SAMSequenceDictionary dictionary) { super(file, dictionary); } DiskBasedBAMFileIndex(final SeekableStream stream, final SAMSequenceDictionary dictionary) { super(stream, dictionary); } DiskBasedBAMFileIndex(final File file, final SAMSequenceDictionary dictionary, final boolean useMemoryMapping) { super(file, dictionary, useMemoryMapping); } /** * Get list of regions of BAM file that may contain SAMRecords for the given range * @param referenceIndex sequence of desired SAMRecords * @param startPos 1-based start of the desired interval, inclusive * @param endPos 1-based end of the desired interval, inclusive * @return array of pairs of virtual file positions. Each pair is the first and last * virtual file position in a range that can be scanned to find SAMRecords that overlap the given * positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond * the range that may contain the indicated SAMRecords. 
*/ public BAMFileSpan getSpanOverlapping(final int referenceIndex, final int startPos, final int endPos) { final BAMIndexContent queryResults = query(referenceIndex,startPos,endPos); if(queryResults == null) return null; List chunkList = new ArrayList(); for(final Chunk chunk: queryResults.getAllChunks()) chunkList.add(chunk.clone()); chunkList = Chunk.optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos)); return new BAMFileSpan(chunkList); } protected BAMIndexContent getQueryResults(final int reference){ throw new UnsupportedOperationException(); // todo: there ought to be a way to support this using the first startPos for the reference and the last // return query(reference, 1, -1); // If this were implemented, BAMIndexer.createAndWriteIndex could extend DiskBasedBAMFileIndex -or- CachingBAMFileIndex } } htsjdk-2.0.1/src/java/htsjdk/samtools/DownsamplingIterator.java000066400000000000000000000120641263034757100246730ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; /** * Abstract base class for all DownsamplingIterators that provides a uniform interface for recording * and reporting statistics bout how many records have been kept and discarded. * * A DownsamplingIterator is an iterator that takes another iterator of SAMRecords and filters out a * subset of those records in a random way, while ensuring that all records for a template (i.e. record name) * are either retained or discarded. Strictly speaking the proportion parameter applies to templates, * though in most instances it is safe to think about it being applied to records. * * @author Tim Fennell */ public abstract class DownsamplingIterator implements CloseableIterator { private long recordsSeen; private long recordsAccepted; private double targetProportion; /** Constructs a downsampling iterator that aims to retain the targetProportion of reads. */ public DownsamplingIterator(final double targetProportion) { if (targetProportion < 0) throw new IllegalArgumentException("targetProportion must be >= 0"); if (targetProportion > 1) throw new IllegalArgumentException("targetProportion must be <= 1"); this.targetProportion = targetProportion; } /** Does nothing. */ @Override public void close() { /** No Op. */ } /** Returns the number of records seen, including accepted and discarded, since creation of the last call to resetStatistics. */ public long getSeenCount() { return this.recordsSeen; } /** Returns the number of records returned since creation of the last call to resetStatistics. 
*/ public long getAcceptedCount() { return this.recordsAccepted; } /** Returns the number of records discarded since creation of the last call to resetStatistics. */ public long getDiscardedCount() { return this.recordsSeen - this.recordsAccepted; } /** Gets the fraction of records discarded since creation or the last call to resetStatistics(). */ public double getDiscardedFraction() { return getDiscardedCount() / (double) getSeenCount(); } /** Gets the fraction of records accepted since creation or the last call to resetStatistics(). */ public double getAcceptedFraction() { return getAcceptedCount() / (double) getSeenCount(); } /** Resets the statistics for records seen/accepted/discarded. */ public void resetStatistics() { this.recordsSeen = 0; this.recordsAccepted = 0; } /** Gets the target proportion of records that should be retained during downsampling. */ public double getTargetProportion() { return targetProportion; } /** Method for subclasses to record a record as being discarded. */ protected final void recordDiscardedRecord() { this.recordsSeen++; } /** * Method for subclasses to record a specific record as being accepted. Null may be passed if a record * was discarded but access to the object is no longer available. */ protected final void recordAcceptedRecord() { this.recordsSeen++; this.recordsAccepted++; } /** Record one or more records as having been discarded. */ protected final void recordDiscardRecords(final long n) { this.recordsSeen += n; } /** Record one or more records as having been discarded. */ protected final void recordAcceptedRecords(final long n) { this.recordsSeen += n; this.recordsAccepted += n; } /** * Indicates whether or not the strategy implemented by this DownsamplingIterator makes any effort to * increase accuracy beyond random sampling (i.e. to reduce the delta between the requested proportion * of reads and the actually emitted proportion of reads). */ public boolean isHigherAccuracy() { return false; } /** Not supported. 
*/ @Override public void remove() { throw new UnsupportedOperationException("remove() not supported in DownsamplingIterators"); } } htsjdk-2.0.1/src/java/htsjdk/samtools/DownsamplingIteratorFactory.java000066400000000000000000000156141263034757100262270ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 Tim Fennell * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.IOUtil; import java.io.File; import java.util.Iterator; /** * A factory for creating DownsamplingIterators that uses a number of different strategies to achieve downsampling while * meeting various criteria. * * @author Tim Fennell */ public class DownsamplingIteratorFactory { public static final String HIGH_ACCURACY_DESCRIPTION = "Attempts (but does not guarantee) to provide accuracy up to a specified limit. Accuracy is defined as emitting " + "a proportion of reads as close to the requested proportion as possible. 
In order to do so this strategy requires " + "memory that is proportional to the number of template names in the incoming stream of reads, and will thus require " + "large amounts of memory when running on large input files."; public static final String CONSTANT_MEMORY_DESCRPTION = "Downsamples a stream or file of SAMRecords using a hash-projection strategy such that it can run in constant memory. " + "The downsampling is stochastic, and therefore the actual retained proportion will vary around the requested proportion. Due " + "to working in fixed memory this strategy is good for large inputs, and due to the stochastic nature the accuracy of this strategy " + "is highest with a high number of output records, and diminishes at low output volumes."; public static final String CHAINED_DESCRIPTION = "Attempts to provide a compromise strategy that offers some of the advantages of both the ConstantMemory and HighAccuracy strategies. " + "Uses a ConstantMemory strategy to downsample the incoming stream to approximately the desired proportion, and then a HighAccuracy " + "strategy to finish. Works in a single pass, and will provide accuracy close to (but often not as good as) HighAccuracy while requiring " + "memory proportional to the set of reads emitted from the ConstantMemory strategy to the HighAccuracy strategy. Works well when downsampling " + "large inputs to small proportions (e.g. downsampling hundreds of millions of reads and retaining only 2%. Should be accurate 99.9% of the time " + "when the input contains >= 50,000 templates (read names). For smaller inputs, HighAccuracy is recommended instead."; /** Describes the available downsampling strategies. */ public enum Strategy { HighAccuracy(HIGH_ACCURACY_DESCRIPTION), ConstantMemory(CONSTANT_MEMORY_DESCRPTION), Chained(CHAINED_DESCRIPTION); public final String description; Strategy(final String description) { this.description = description; } /** Gets the description of the strategy. 
*/ public String getDescription() { return description; } } /** * Creates a new DownsamplingIterator using the supplied Strategy that attempts to read from the provided iterator and return * approximately proportion of the records read. * * @param iterator The iterator from which to consume SAMRecords * @param strategy The downsampling strategy to use * @param proportion The proportion of records the downsampling strategy should attempt to emit * @param accuracy If supported by the downsampling strategy, the accuracy goal for the downsampler. Higher accuracy will generally * require higher memory usage. An accuracy value of 0.0001 tells the strategy to try and ensure the emitted proportion * is within proportion +/0 0.0001. * @param seed The seed value to use for any random process used in down-sampling. */ public static DownsamplingIterator make(final Iterator iterator, final Strategy strategy, final double proportion, final double accuracy, final int seed) { if (strategy == null) throw new IllegalArgumentException("strategy may not be null"); if (iterator == null) throw new IllegalArgumentException("iterator may not be null"); if (proportion < 0) throw new IllegalArgumentException("proportion must be greater than 0"); if (proportion > 1) throw new IllegalArgumentException("proportion must be less than 1"); switch (strategy) { case HighAccuracy: return new HighAccuracyDownsamplingIterator(iterator, proportion, seed).setTargetAccuracy(accuracy); case ConstantMemory: return new ConstantMemoryDownsamplingIterator(iterator, proportion, seed); case Chained: return new ChainedDownsamplingIterator(iterator, proportion, seed).setTargetAccuracy(accuracy); default: throw new IllegalStateException("Unexpected value for Strategy enum in switch statement. Bug!!"); } } /** * Convenience method that constructs a downsampling iterator for all the reads in a SAM file. 
* See {@link DownsamplingIteratorFactory#make(Iterator, Strategy, double, double, int)} for detailed parameter information. */ public static DownsamplingIterator make(final File samFile, final Strategy strategy, final double proportion, final double accuracy, final int seed) { IOUtil.assertFileIsReadable(samFile); return make(SamReaderFactory.makeDefault().open(samFile), strategy, proportion, accuracy, seed); } /** * Convenience method that constructs a downsampling iterator for all the reads available from a SamReader. * See {@link DownsamplingIteratorFactory#make(Iterator, Strategy, double, double, int)} for detailed parameter information. */ public static DownsamplingIterator make(final SamReader reader, final Strategy strategy, final double proportion, final double accuracy, final int seed) { return make(reader.iterator(), strategy, proportion, accuracy, seed); } } htsjdk-2.0.1/src/java/htsjdk/samtools/DuplicateScoringStrategy.java000066400000000000000000000127551263034757100255100ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * This class helps us compute and compare duplicate scores, which are used for selecting the non-duplicate * during duplicate marking (see MarkDuplicates). * @author nhomer */ public class DuplicateScoringStrategy { public enum ScoringStrategy { SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH } /** An enum to use for storing temporary attributes on SAMRecords. */ private static enum Attr { DuplicateScore } /** Calculates a score for the read which is the sum of scores over Q15. */ private static short getSumOfBaseQualities(final SAMRecord rec) { short score = 0; for (final byte b : rec.getBaseQualities()) { if (b >= 15) score += b; } return score; } /** * Returns the duplicate score computed from the given fragment. */ public static short computeDuplicateScore(final SAMRecord record, final ScoringStrategy scoringStrategy) { return computeDuplicateScore(record, scoringStrategy, false); } /** * Returns the duplicate score computed from the given fragment. * * If true is given to assumeMateCigar, then any score that can use the mate cigar to compute the mate's score will return the score * computed on both ends. 
*/ public static short computeDuplicateScore(final SAMRecord record, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore); if (storedScore == null) { short score = 0; switch (scoringStrategy) { case SUM_OF_BASE_QUALITIES: score += getSumOfBaseQualities(record); break; case TOTAL_MAPPED_REFERENCE_LENGTH: if (!record.getReadUnmappedFlag()) { score += record.getCigar().getReferenceLength(); } if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { score += SAMUtils.getMateCigar(record).getReferenceLength(); } break; } storedScore = score; record.setTransientAttribute(Attr.DuplicateScore, storedScore); } return storedScore; } /** * Compare two records based on their duplicate scores. If the scores are equal, we break ties based on mapping quality * (added to the mate's mapping quality if paired and mapped), then library/read name. * * If true is given to assumeMateCigar, then any score that can use the mate cigar to to compute the mate's score will return the score * computed on both ends. * * We allow different scoring strategies. We return <0 if rec1 has a better strategy than rec2. */ public static int compare(final SAMRecord rec1, final SAMRecord rec2, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { int cmp; // always prefer paired over non-paired if (rec1.getReadPairedFlag() != rec2.getReadPairedFlag()) return rec1.getReadPairedFlag() ? 1 : -1; cmp = computeDuplicateScore(rec2, scoringStrategy, assumeMateCigar) - computeDuplicateScore(rec1, scoringStrategy, assumeMateCigar); /** * Finally, use library ID and read name * This is important because we cannot control the order in which reads appear for reads that are comparable up to now (i.e. cmp == 0). We want to deterministically * choose them, and so we need this. 
*/ if (0 == cmp) cmp = SAMUtils.getCanonicalRecordName(rec1).compareTo(SAMUtils.getCanonicalRecordName(rec2)); return cmp; } /** * Compare two records based on their duplicate scores. The duplicate scores for each record is assume to be * pre-computed by computeDuplicateScore and stored in the "DS" tag. If the scores are equal, we break * ties based on mapping quality (added to the mate's mapping quality if paired and mapped), then library/read name. * * We allow different scoring strategies. We return <0 if rec1 has a better strategy than rec2. */ public static int compare(final SAMRecord rec1, final SAMRecord rec2, final ScoringStrategy scoringStrategy) { return compare(rec1, rec2, scoringStrategy, false); } } htsjdk-2.0.1/src/java/htsjdk/samtools/DuplicateSet.java000066400000000000000000000142661263034757100231130ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * Stores a set of records that are duplicates of each other. The first records in the list of records is * considered the representative of the duplicate, and typically does not have it's duplicate flag set. * The records' duplicate flag will be set appropriately as records are added. This behavior can be * turned off. * * At this time, this set does not track optical duplicates. * * @author nhomer */ public class DuplicateSet { private final List records; private static final SAMRecordDuplicateComparator defaultComparator = new SAMRecordDuplicateComparator(); private final SAMRecordDuplicateComparator comparator; private SAMRecord representative = null; private boolean needsSorting = false; private boolean setDuplicateFlag = false; /** Sets the duplicate flag by default */ public DuplicateSet() { this(true); } public DuplicateSet(final boolean setDuplicateFlag) { this(setDuplicateFlag, defaultComparator); } public DuplicateSet(final SAMRecordDuplicateComparator comparator) { this(true, comparator); } public DuplicateSet(final boolean setDuplicateFlag, final SAMRecordDuplicateComparator comparator) { records = new ArrayList(10); this.setDuplicateFlag = setDuplicateFlag; this.comparator = comparator; } /** * Adds a record to the set and returns zero if either the set is empty, or it is a duplicate of the records already in the set. Otherwise, * it does not add the record and returns non-zero. * @param record the record to add. 
* @return zero if the record belongs in this set, -1 in a previous set, or 1 in a subsequent set, according to the comparison order */ public int add(final SAMRecord record) { if (!this.records.isEmpty()) { final int cmp = this.comparator.duplicateSetCompare(this.representative, record); if (0 != cmp) { return cmp; } // update representative if (0 < this.comparator.compare(this.representative, record)) { this.representative = record; } } else { this.representative = record; } this.records.add(record); needsSorting = true; return 0; } private void sort() { if (!records.isEmpty()) { if (1 < records.size()) { Collections.sort(records, this.comparator); } if (setDuplicateFlag) { // reset duplicate flags for (final SAMRecord record : records) { if (!record.getReadUnmappedFlag() && !record.isSecondaryOrSupplementary() && !record.getReadName().equals(representative.getReadName())) { record.setDuplicateReadFlag(true); } } records.get(0).setDuplicateReadFlag(false); } if (!records.get(0).equals(this.representative)) { throw new SAMException("BUG: the representative was not the first record after sorting." + "\nFIRST: " + records.get(0).getSAMString() + "\nSECOND: " + this.representative.getSAMString()); } } needsSorting = false; // this could be in the if above if you think hard about it } /** * Gets the list of records from this set. * * Setting sort to false likely will not yield records in duplicate order within the set. * * @param sort true if we want the records in the duplicate set sorted by duplicate order, false if we do not care about the order. */ public List getRecords(final boolean sort) { if (sort && needsSorting) { sort(); } return this.records; } /** * Gets the list of records from this set. */ public List getRecords() { return getRecords(true); } /** * Gets the representative record according to the duplicate comparator. */ public SAMRecord getRepresentative() { return this.representative; } /** * Returns the number of records in this set. 
*/ public int size() { return this.records.size(); } /** * Returns the number of duplicates in this set, including the representative record. Does not include records that are unmapped, * secondary, or supplementary. */ public int numDuplicates() { int n = 0; for (final SAMRecord record : records) { if (!record.getReadUnmappedFlag() && !record.isSecondaryOrSupplementary()) { n++; } } return n; } public boolean isEmpty() { return this.records.isEmpty(); } /** * Controls if we should update the duplicate flag of the records in this set. */ public void setDuplicateFlag(final boolean setDuplicateFlag) { this.setDuplicateFlag = setDuplicateFlag; } }htsjdk-2.0.1/src/java/htsjdk/samtools/DuplicateSetIterator.java000066400000000000000000000130431263034757100246150ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.SortingCollection; import java.io.File; import java.util.Collections; /** * An iterator of sets of duplicates. Duplicates are defined currently by the ordering in * SAMRecordDuplicateComparator. *

* If the input records are not pre-sorted according to the duplicate ordering, the records * will be sorted on-the-fly. This may require extra memory or disk to buffer records, and * also computational time to perform the sorting. * * @author nhomer */ public class DuplicateSetIterator implements CloseableIterator { private final CloseableIterator wrappedIterator; private DuplicateSet duplicateSet = null; private final SAMRecordDuplicateComparator comparator; public DuplicateSetIterator(final CloseableIterator iterator, final SAMFileHeader header) { this(iterator, header, false); } /** * Allows the user of this iterator to skip the sorting of the input if the input is already sorted. If the records are said to be * sorted but not actually sorted in the correct order, an exception during iteration will be thrown. */ public DuplicateSetIterator(final CloseableIterator iterator, final SAMFileHeader header, final boolean preSorted) { this.comparator = new SAMRecordDuplicateComparator(Collections.singletonList(header)); if (preSorted) { this.wrappedIterator = iterator; } else { // Sort it! final int maxRecordsInRam = SAMFileWriterImpl.getDefaultMaxRecordsInRam(); final File tmpDir = new File(System.getProperty("java.io.tmpdir")); final SortingCollection alignmentSorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), comparator, maxRecordsInRam, tmpDir); while (iterator.hasNext()) { final SAMRecord record = iterator.next(); alignmentSorter.add(record); } iterator.close(); this.wrappedIterator = alignmentSorter.iterator(); } this.duplicateSet = new DuplicateSet(this.comparator); if (hasNext()) { this.duplicateSet.add(this.wrappedIterator.next()); } } public void setScoringStrategy(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy) { this.comparator.setScoringStrategy(scoringStrategy); } public DuplicateSet next() { DuplicateSet duplicateSet = null; int cmp = 0; while (0 == cmp) { if (!wrappedIterator.hasNext()) { // no more! 
duplicateSet = this.duplicateSet; this.duplicateSet = new DuplicateSet(this.comparator); break; } else { // get another one final SAMRecord record = this.wrappedIterator.next(); // assumes that the duplicate set always has at least one record inside! final SAMRecord representative = this.duplicateSet.getRepresentative(); if (representative.getReadUnmappedFlag() || representative.isSecondaryOrSupplementary()) { duplicateSet = this.duplicateSet; this.duplicateSet = new DuplicateSet(this.comparator); this.duplicateSet.add(record); break; // exits the 0 == cmp loop } else { // compare against the representative for set membership, not ordering cmp = this.duplicateSet.add(record); if (0 < cmp) { throw new SAMException("The input records were not sorted in duplicate order:\n" + representative.getSAMString() + record.getSAMString()); } else if (cmp < 0) { duplicateSet = this.duplicateSet; this.duplicateSet = new DuplicateSet(this.comparator); this.duplicateSet.add(record); } // otherwise it was already added } } } return duplicateSet; } public void close() { wrappedIterator.close(); } public boolean hasNext() { return (!duplicateSet.isEmpty() || wrappedIterator.hasNext()); } // Does nothing! 
public void remove() { } } htsjdk-2.0.1/src/java/htsjdk/samtools/FileTruncatedException.java000066400000000000000000000032311263034757100251230ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Thrown when it is possible to detect that a SAM or BAM file is truncated. 
*
 * @author alecw@broadinstitute.org
 */
public class FileTruncatedException extends SAMException {

    /** Creates an exception with neither a detail message nor a cause. */
    public FileTruncatedException() {
        super();
    }

    /** Creates an exception that wraps the given cause. */
    public FileTruncatedException(final Throwable cause) {
        super(cause);
    }

    /** Creates an exception carrying the given detail message. */
    public FileTruncatedException(final String message) {
        super(message);
    }

    /** Creates an exception carrying the given detail message and wrapping the given cause. */
    public FileTruncatedException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/FixBAMFile.java000077500000000000000000000036361263034757100223750ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN */ package htsjdk.samtools; import htsjdk.samtools.util.CloserUtil; import java.io.File; public class FixBAMFile { public static void main(String[] args) { File inputFile = new File(args[0]); File outputFile = new File(args[1]); SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(inputFile); SAMFileHeader header = reader.getFileHeader(); SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, true, outputFile); for (SAMRecord record : reader) { if (record.getIndexingBin() != null) { record.setIndexingBin(record.computeIndexingBin()); } writer.addAlignment(record); } writer.close(); CloserUtil.close(reader); } } htsjdk-2.0.1/src/java/htsjdk/samtools/GenomicIndexUtil.java000066400000000000000000000076211263034757100237310ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.BitSet; /** * Constants and methods used by BAM and Tribble indices */ public class GenomicIndexUtil { /** * Reports the total amount of genomic data that any bin can index. */ public static final int BIN_GENOMIC_SPAN = 512*1024*1024; /** * What is the starting bin for each level? */ public static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; /** * Reports the maximum number of bins that can appear in a binning index. */ public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 public static final int MAX_LINEAR_INDEX_SIZE = MAX_BINS+1-LEVEL_STARTS[LEVEL_STARTS.length-1]; /** * E.g. for a SAMRecord with no genomic coordinate. */ public static final int UNSET_GENOMIC_LOCATION = 0; /** * calculate the bin given an alignment in [beg,end) * Copied from SAM spec. * @param beg 0-based start of read (inclusive) * @param end 0-based end of read (exclusive) */ static int reg2bin(final int beg, int end) { --end; if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); return 0; } // TODO: It is disturbing that reg2bin is 0-based, but regionToBins is 1-based. // TODO: It is also suspicious that regionToBins decrements endPos. Test it! // TODO: However end is decremented in reg2bin so perhaps there is no conflict. /** * Get candidate bins for the specified region * @param startPos 1-based start of target region, inclusive. * @param endPos 1-based end of target region, inclusive. 
* @return bit set for each bin that may contain SAMRecords in the target region. */ public static BitSet regionToBins(final int startPos, final int endPos) { final int maxPos = 0x1FFFFFFF; final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; if (start > end) { return null; } int k; final BitSet bitSet = new BitSet(GenomicIndexUtil.MAX_BINS); bitSet.set(0); for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); return bitSet; } } htsjdk-2.0.1/src/java/htsjdk/samtools/HighAccuracyDownsamplingIterator.java000066400000000000000000000210651263034757100271470ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 Tim Fennell * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.Random; import java.util.Set; /** * A DownsamplingIterator that attempts to provide very high accuracy (minimizing the difference between emitted proportion * and requested proportion) at the expense of using memory proportional to the number of reads in the incoming stream. * * @author Tim Fennell */ class HighAccuracyDownsamplingIterator extends DownsamplingIterator { private final Iterator underlyingIterator; private final Random random; private SAMRecord nextRecord; private final Map decisions = new HashMap(); private double targetAccuracy = 0.0001; private long totalTemplates, keptTemplates; private Iterator bufferedRecords = new ArrayList().iterator(); private Set bufferedRecordsToKeep; /** Override method to make it clear that this iterator attempts to provide a higher accuracy of downsampling. */ @Override public boolean isHigherAccuracy() { return true; } /** Constructs a downsampling iterator upon the supplied iterator, using the Random as the source of randomness. */ HighAccuracyDownsamplingIterator(final Iterator iterator, final double proportion, final int seed) { super(proportion); this.underlyingIterator = iterator; this.random = new Random(seed); } /** * Sets the target accuracy of the downsampling iterator. The value should be thought of as * probability +/- accuracy. So a value of 0.001 would instruct the downsampling iterator to * attempt to guarantee at accuracy to within 0.1%. 
The downsampler will need to buffer reads * for 1/accuracy templates, so setting this to extremely small numbers is not advisable. */ public DownsamplingIterator setTargetAccuracy(final double accuracy) { if (accuracy >= 1 || accuracy <= 1d/Integer.MAX_VALUE) throw new IllegalArgumentException("Illegal value. Must be 1/MAX_INT < accuracy < 1"); this.targetAccuracy = accuracy; return this; } /** Returns true if there is another record available post-downsampling, false otherwise. */ @Override public boolean hasNext() { return this.nextRecord != null || advance(); } /** Returns the next record from the iterator, or throws an exception if there is no next record. */ @Override public SAMRecord next() { if (this.nextRecord == null) { throw new NoSuchElementException("Call to next() when hasNext() == false"); } else { final SAMRecord retval = this.nextRecord; advance(); return retval; } } /** Returns the underlying iterator so that subclasses may manipulate it. */ protected Iterator getUnderlyingIterator() { return this.underlyingIterator; } /** * Clears the current record and attempts to advance through the underlying iterator until a * record is kept during downsampling. If no more records are kept and the end of the input * is reached this.nextRecord will be null. 
* * @return true if a record is available after advancing, false otherwise */ protected boolean advance() { this.nextRecord = null; while (this.nextRecord == null && (this.bufferedRecords.hasNext() || bufferNextChunkOfRecords(getTargetProportion(), this.targetAccuracy))) { final SAMRecord rec = this.bufferedRecords.next(); final String key = rec.getReadName(); final Boolean previous = decisions.get(key); final boolean keepThisRecord; if (previous == null) { keepThisRecord = this.bufferedRecordsToKeep.contains(rec.getReadName()); decisions.put(key, keepThisRecord); } else { keepThisRecord = previous; } if (keepThisRecord) { this.nextRecord = rec; recordAcceptedRecord(); } else { recordDiscardedRecord(); } } return this.nextRecord != null; } /** * Buffers reads until either the end of the file is reached or enough reads have been buffered such * that downsampling can be performed to the desired target accuracy. Once reads have been buffered, * template names are randomly sampled out for discarding until the desired number of reads have * been discarded. 
* * @return True if one or more reads have been buffered, false otherwise */ protected boolean bufferNextChunkOfRecords(final double proportion, final double accuracy) { final int templatesToRead = (int) Math.ceil(1 / accuracy); final Set names = new HashSet(); final List recs = new ArrayList(templatesToRead); readFromUnderlyingIterator(recs, names, templatesToRead); // Determine how many templates to keep/discard final int templatesRead = names.size(); final int templatesToKeep = calculateTemplatesToKeep(templatesRead, proportion); // Randomly shuffle a list of all the template names, and then remove some from the set final int templatesToDiscard = templatesRead - templatesToKeep; final List tmp = new ArrayList(names); Collections.shuffle(tmp, this.random); for (int i = 0; i < templatesToDiscard; ++i) names.remove(tmp.get(i)); // Set all the instance state so that advance()/next() get what they need this.bufferedRecordsToKeep = names; this.bufferedRecords = recs.iterator(); this.totalTemplates += templatesRead; this.keptTemplates += names.size(); return recs.size() > 0; } /** * Calculates the number of templates to keep in a specific batch of reads having just read templatesRead reads * and wanting to keep proportion of them. Rounds the final number up or down based on whether, to this point, * the iterator is under or over it's goal proportion. * * Implemented as second method to allow ChainedDownsamplingIterator to tamper with the strategy! */ protected int calculateTemplatesToKeep(final int templatesRead, final double proportion) { final double rawTemplatesToKeep = templatesRead * proportion; return (keptTemplates / (double) totalTemplates < proportion) ? (int) Math.ceil(rawTemplatesToKeep) : (int) Math.floor(rawTemplatesToKeep); } /** * Reads from the underlying iterator until it has observed templatesToRead templates (i.e. read names) that it has not yet * observed, so that templatesToRead new keep/reject decisions can be made. 
The records that are read are placed into recs * and _novel_ template names are placed into names. */ protected void readFromUnderlyingIterator(final List recs, final Set names, final int templatesToRead) { while (this.underlyingIterator.hasNext() && names.size() < templatesToRead) { final SAMRecord rec = this.underlyingIterator.next(); recs.add(rec); if (this.decisions.containsKey(rec.getReadName())) continue; names.add(rec.getReadName()); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/LinearIndex.java000066400000000000000000000100661263034757100227210ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.Arrays; /** * The linear index associated with a given reference in a BAM index. 
* * @author mhanna * @version 0.1 */ public class LinearIndex { public static final int MAX_LINEAR_INDEX_SIZE = GenomicIndexUtil.MAX_LINEAR_INDEX_SIZE; public static final int BAM_LIDX_SHIFT = 14; /** * The reference sequence number for this linear index. */ private final int mReferenceSequence; /** * Dictates the first stored element of the index. */ private final int mIndexStart; /** * The linear index entries within this bin. */ private final long[] mIndexEntries; public LinearIndex(final int referenceSequence, final int indexStart, final long[] indexEntries) { this.mReferenceSequence = referenceSequence; this.mIndexStart = indexStart; this.mIndexEntries = indexEntries; } public int getReferenceSequence() { return mReferenceSequence; } public int size() { return mIndexEntries.length; } public long get(final int index) { return mIndexEntries[index-mIndexStart]; } public static int convertToLinearIndexOffset(final int contigPos) { final int indexPos = (contigPos <= 0) ? 0 : contigPos-1; return indexPos >> BAM_LIDX_SHIFT; } /** * Gets the minimum offset of any alignment start appearing in this index, according to the linear index. * @param startPos Starting position for this query. * @return The minimum offset, in chunk format, of any read appearing in this position. */ public long getMinimumOffset(final int startPos) { final int start = (startPos <= 0) ? 0 : startPos-1; final int regionLinearBin = start >> BAM_LIDX_SHIFT; // System.out.println("# regionLinearBin: " + regionLinearBin); long minimumOffset = 0; if (regionLinearBin-mIndexStart < mIndexEntries.length) minimumOffset = mIndexEntries[regionLinearBin-mIndexStart]; return minimumOffset; } /** * Direct access to the array. Be careful! * @return The elements of the linear index. 
*/ public long[] getIndexEntries() { return mIndexEntries; } public int getIndexStart() { return mIndexStart; } @Override public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final LinearIndex that = (LinearIndex) o; if (mIndexStart != that.mIndexStart) return false; if (mReferenceSequence != that.mReferenceSequence) return false; if (!Arrays.equals(mIndexEntries, that.mIndexEntries)) return false; return true; } @Override public int hashCode() { int result = mReferenceSequence; result = 31 * result + mIndexStart; result = 31 * result + Arrays.hashCode(mIndexEntries); return result; } } htsjdk-2.0.1/src/java/htsjdk/samtools/MergingSamRecordIterator.java000066400000000000000000000247411263034757100254260ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import java.util.Collection; import java.util.Map; import java.util.PriorityQueue; /** * Provides an iterator interface for merging multiple underlying iterators into a single * iterable stream. The underlying iterators/files must all have the same sort order unless * the requested output format is unsorted, in which case any combination is valid. */ public class MergingSamRecordIterator implements CloseableIterator { private final PriorityQueue pq; private final SamFileHeaderMerger samHeaderMerger; private final Collection readers; private final SAMFileHeader.SortOrder sortOrder; private final SAMRecordComparator comparator; private boolean initialized = false; /** * Constructs a new merging iterator with the same set of readers and sort order as * provided by the header merger parameter. * * @param headerMerger The merged header and contents of readers. * @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order. * @deprecated replaced by (SamFileHeaderMerger, Collection, boolean) */ public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) { this(headerMerger, headerMerger.getReaders(), forcePresorted); } /** * Constructs a new merging iterator with the same set of readers and sort order as * provided by the header merger parameter. * * @param headerMerger The merged header and contents of readers. * @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order. 
*/ public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection readers, final boolean assumeSorted) { this.samHeaderMerger = headerMerger; this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); this.comparator = getComparator(); this.readers = readers; this.pq = new PriorityQueue(readers.size()); for (final SamReader reader : readers) { if (!samHeaderMerger.getHeaders().contains(reader.getFileHeader())) throw new SAMException("All iterators to be merged must be accounted for in the SAM header merger"); if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) { throw new SAMException("Files are not compatible with sort order"); } } } /** * Add a set of SAM file iterators to the merging iterator. Use this to restrict the merged iteration to a given genomic interval, * rather than iterating over every read in the backing file or stream. * * @param headerMerger The merged header and contents of readers. * @param iterators Iterator traversing over reader contents. */ public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final Map> iterators, final boolean assumeSorted) { this(headerMerger, iterators.keySet(), assumeSorted); for (final Map.Entry> mapping : iterators.entrySet()) addIfNotEmpty(new ComparableSamRecordIterator(mapping.getKey(), mapping.getValue(), comparator)); initialized = true; } private void startIterationIfRequired() { if (initialized) return; for (final SamReader reader : readers) addIfNotEmpty(new ComparableSamRecordIterator(reader, reader.iterator(), comparator)); initialized = true; } /** * Close down all open iterators. */ public void close() { // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue. for (CloseableIterator iterator : pq) iterator.close(); } /** Returns true if any of the underlying iterators has more records, otherwise false. 
*/ public boolean hasNext() { startIterationIfRequired(); return !this.pq.isEmpty(); } /** Returns the next record from the top most iterator during merging. */ public SAMRecord next() { startIterationIfRequired(); final ComparableSamRecordIterator iterator = this.pq.poll(); final SAMRecord record = iterator.next(); addIfNotEmpty(iterator); // this will resolve the reference indices against the new, merged header record.setHeader(this.samHeaderMerger.getMergedHeader()); // Fix the read group if needs be if (this.samHeaderMerger.hasReadGroupCollisions()) { final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); if (oldGroupId != null) { final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(), oldGroupId); record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId); } } // Fix the program group if needs be if (this.samHeaderMerger.hasProgramGroupCollisions()) { final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID); if (oldGroupId != null) { final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(), oldGroupId); record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId); } } return record; } /** * Adds iterator to priority queue. If the iterator has more records it is added * otherwise it is closed and not added. */ private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { if (iterator.hasNext()) { pq.offer(iterator); } else { iterator.close(); } } /** Unsupported operation. */ public void remove() { throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); } /** * Get the right comparator for a given sort order (coordinate, alphabetic). In the * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive * ordering. 
*/ private SAMRecordComparator getComparator() { // For unsorted build a fake comparator that compares based on object ID if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { return new SAMRecordComparator() { public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { return System.identityHashCode(lhs) - System.identityHashCode(rhs); } public int compare(final SAMRecord lhs, final SAMRecord rhs) { return fileOrderCompare(lhs, rhs); } }; } if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { return new MergedSequenceDictionaryCoordinateOrderComparator(); } // Otherwise try and figure out what kind of comparator to return and build it return this.sortOrder.getComparatorInstance(); } /** Returns the merged header that the merging iterator is working from. */ public SAMFileHeader getMergedHeader() { return this.samHeaderMerger.getMergedHeader(); } /** * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged * sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids * more copy & paste. */ private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator { public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { final int referenceIndex1 = getReferenceIndex(samRecord1); final int referenceIndex2 = getReferenceIndex(samRecord2); if (referenceIndex1 != referenceIndex2) { if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { return 1; } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { return -1; } else { return referenceIndex1 - referenceIndex2; } } if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { // Both are unmapped. 
return 0; } return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); } private int getReferenceIndex(final SAMRecord samRecord) { if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex()); } if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex()); } return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; } } } htsjdk-2.0.1/src/java/htsjdk/samtools/NotPrimarySkippingIterator.java000066400000000000000000000042121263034757100260360ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.PeekIterator; /** * Wrapper around SAMRecord iterator that skips over non-primary elements. * This iterator conflates a filtering iterator and a peekable iterator. It would be cleaner to * handle those concerns separately. */ public class NotPrimarySkippingIterator { private final PeekIterator it; public NotPrimarySkippingIterator(final CloseableIterator underlyingIt) { it = new PeekIterator(underlyingIt); skipAnyNotprimary(); } public boolean hasCurrent() { return it.hasNext(); } public SAMRecord getCurrent() { assert(hasCurrent()); return it.peek(); } public boolean advance() { it.next(); skipAnyNotprimary(); return hasCurrent(); } private void skipAnyNotprimary() { while (it.hasNext() && it.peek().getNotPrimaryAlignmentFlag()) { it.next(); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/QueryInterval.java000066400000000000000000000066151263034757100233360ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.util.CoordMath; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Interval relative to a reference, for querying a BAM file. */ public class QueryInterval implements Comparable { /** Index of reference sequence, based on the sequence dictionary of the BAM file being queried. */ public final int referenceIndex; /** 1-based, inclusive */ public final int start; /** 1-based, inclusive. 
If <= 0, implies that the interval goes to the end of the reference sequence */ public final int end; public QueryInterval(final int referenceIndex, final int start, final int end) { if (referenceIndex < 0) { throw new IllegalArgumentException("Invalid reference index " + referenceIndex); } this.referenceIndex = referenceIndex; this.start = start; this.end = end; } public int compareTo(final QueryInterval other) { int comp = this.referenceIndex - other.referenceIndex; if (comp != 0) return comp; comp = this.start - other.start; if (comp != 0) return comp; else if (this.end == other.end) return 0; else if (this.end == 0) return 1; else if (other.end == 0) return -1; else return this.end - other.end; } /** * @return true if both are on same reference, and other starts exactly where this ends. */ public boolean abuts(final QueryInterval other) { return this.referenceIndex == other.referenceIndex && this.end == other.start; } /** * @return true if both are on same reference, and the overlap. */ public boolean overlaps(final QueryInterval other) { if (this.referenceIndex != other.referenceIndex) { return false; } final int thisEnd = (this.end == 0 ? Integer.MAX_VALUE : this.end); final int otherEnd = (other.end == 0 ? Integer.MAX_VALUE : other.end); return CoordMath.overlaps(this.start, thisEnd, other.start, otherEnd); } @Override public String toString() { return String.format("%d:%d-%d", referenceIndex, start, end); } private static final QueryInterval[] EMPTY_QUERY_INTERVAL_ARRAY = new QueryInterval[0]; /** * @param inputIntervals WARNING: This list is modified (sorted) by this method. * @return Ordered list of intervals in which abutting and overlapping intervals are merged. 
*/ public static QueryInterval[] optimizeIntervals(final QueryInterval[] inputIntervals) { if (inputIntervals.length == 0) return EMPTY_QUERY_INTERVAL_ARRAY; Arrays.sort(inputIntervals); final List unique = new ArrayList(); QueryInterval previous = inputIntervals[0]; for (int i = 1; i < inputIntervals.length; ++i) { final QueryInterval next = inputIntervals[i]; if (previous.abuts(next) || previous.overlaps(next)) { final int newEnd = ((previous.end == 0 || next.end == 0) ? 0 : Math.max(previous.end, next.end)); previous = new QueryInterval(previous.referenceIndex, previous.start, newEnd); } else { unique.add(previous); previous = next; } } if (previous != null) unique.add(previous); return unique.toArray(EMPTY_QUERY_INTERVAL_ARRAY); } } htsjdk-2.0.1/src/java/htsjdk/samtools/ReservedTagConstants.java000066400000000000000000000064101263034757100246250ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Constants for tags used in our SAM/BAM files */ public class ReservedTagConstants { public static final String READ_GROUP_ID = SAMTag.RG.name(); // Specified in the SAM spec doc public static final String PROGRAM_GROUP_ID = SAMTag.PG.name(); // Specified in the SAM spec doc /** Present and set to 1 if a read is a noise read. */ public static final String XN = "XN"; /** Number of nucleotide differences (Specified in the SAM spec doc) */ public static final String NM = SAMTag.NM.name(); /** The sum of the mismatched qualities. */ public static final String XQ = "XQ"; /** * The name of an attribute which stores the 1-based index of the start of * sequence within a read (in original orientation) that should be clipped * or trimmed before alignment and downstream use. * The region to be clipped extends from this position to the end of the read. */ public static final String XT = "XT"; /** The original sequence before 454 cafie and homopolymer correction */ public static final String XS = "XS"; /** The Four54 edit string of 454 cafie and homopolymer corrections *

     *   editString ::= {base operator position [- position]}* ;  // Cafie needs 2 positions
     *   base ::= A | T | G | C | N ;   // N only for undercall
     *   operator ::= o | u | c ;       // o = Overcall, u = Undercall, c = Cafie.
     *   position is 0 based position of the correction (assuming forward strand) .  Cafie positions are to-from.
     *   For example: XF :Z:Gc4-6Nu11Co15 means a cafie correction moved a G from position 6 to 4,
     *   an N was inserted for an undercall at position 11, and a C was removed as an overcall at position 15
     */
    public static final String XF = "XF";

    /** The original pred quality scores before modifications such as 454 cafie and homopolymer correction */
    public static final String OQ = SAMTag.OQ.name();

    /** The original cigar before indel cleaning, or 454 cafie and homopolymer correction */
    public static final String OC = "OC";

}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMBinaryTagAndUnsignedArrayValue.java000066400000000000000000000044221263034757100270530ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

/**
 * A SAMBinaryTagAndValue whose array value is to be treated as unsigned.  The subclass exists
 * only to carry that fact, because signedness cannot be determined by introspection of value.
 *
 * @author alecw@broadinstitute.org
 */
public class SAMBinaryTagAndUnsignedArrayValue extends SAMBinaryTagAndValue {
    /**
     * @param tag tagname (in binary form) for this attribute
     * @param value value for this attribute; passed straight to the superclass constructor
     */
    public SAMBinaryTagAndUnsignedArrayValue(final short tag, final Object value) {
        super(tag, value);
    }

    /**
     * Shallow copy of this element and of everything linked after it: the value reference is shared,
     * the list nodes are new.
     */
    @Override
    public SAMBinaryTagAndValue copy() {
        final SAMBinaryTagAndValue duplicate = new SAMBinaryTagAndUnsignedArrayValue(tag, value);
        // A freshly built node already has a null successor, so null maps to null here.
        duplicate.next = (next == null) ? null : next.copy();
        return duplicate;
    }

    /**
     * Deep copy of this element and of everything linked after it: the value is obtained from
     * cloneValue() and the rest of the list is deep-copied in turn.
     */
    @Override
    public SAMBinaryTagAndValue deepCopy() {
        final SAMBinaryTagAndValue duplicate = new SAMBinaryTagAndUnsignedArrayValue(tag, cloneValue());
        duplicate.next = (next == null) ? null : next.deepCopy();
        return duplicate;
    }

    /** Always true for this subclass. */
    @Override
    public boolean isUnsignedArray() {
        return true;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMBinaryTagAndValue.java000066400000000000000000000144141263034757100243610ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import java.io.Serializable;

/**
 * Holds a SAMRecord attribute and the tagname (in binary form) for that attribute.
 * SAMRecord stores tag name and value in this form, because much String creation is avoided this way.
 * See SAMTagUtil to convert the tag to String form.
 *
 * Values associated with attribute tags must be of a type that implements {@link Serializable} or else
 * serialization will fail.
 *
 * @author alecw@broadinstitute.org
 */
public class SAMBinaryTagAndValue implements Serializable {
    public static final long serialVersionUID = 1L;

    public final short tag;
    public final Object value;
    protected SAMBinaryTagAndValue next = null;

    /**
     * @param tag tagname (in binary form) for this attribute
     * @param value value for this attribute (must be of a type that implements {@link Serializable} or else serialization will fail)
     *              Cannot be null.
     */
    public SAMBinaryTagAndValue(final short tag, final Object value) {
        if (null == value) {
            throw new IllegalArgumentException("SAMBinaryTagAndValue value may not be null");
        }
        this.tag = tag;
        this.value = value;
    }

    @Override public boolean equals(final Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        return typeSafeEquals((SAMBinaryTagAndValue) o);
    }

    /** Type safe equals method that recurses down the list looking for equality. */
    private boolean typeSafeEquals(final SAMBinaryTagAndValue that) {
        if (this.tag != that.tag) return false;
        if ((this.value == null) ? that.value == null : this.value.equals(that.value)) {
            if (this.next == null) return that.next == null;
            else return this.next.equals(that.next);
        }
        else {
            return false;
        }
    }

    @Override
    public int hashCode() {
        int result = (int) tag;
        result = 31 * result + value.hashCode();
        return result;
    }

    /** Creates and returns a shallow copy of the list of tag/values. */
    public SAMBinaryTagAndValue copy() {
        final SAMBinaryTagAndValue retval = new SAMBinaryTagAndValue(this.tag, this.value);
        if (next != null) {
            retval.next = next.copy();
        }
        return retval;
    }

    /** Creates and returns a deep copy of the list of tag/values. */
    public SAMBinaryTagAndValue deepCopy() {
        final SAMBinaryTagAndValue retval = new SAMBinaryTagAndValue(this.tag, cloneValue());
        if (next != null) {
            retval.next = next.deepCopy();
        }
        return retval;
    }

    /* Create and return a clone of value object */
    protected Object cloneValue() {
        Object valueClone;

        if (value instanceof byte[]) {
            valueClone = ((byte[]) value).clone();
        }
        else if (value instanceof short[]) {
            valueClone = ((short[]) value).clone();
        }
        else if (value instanceof int[]) {
            valueClone = ((int[]) value).clone();
        }
        else if (value instanceof float[]) {
            valueClone = ((float[]) value).clone();
        }
        else {
            // otherwise, the api limits the remaining possible value types to
            // immutable (String or boxed primitive) types
            valueClone = value;
        }
        return valueClone;
    }

    // The methods below are for implementing a light-weight, single-direction linked list

    public SAMBinaryTagAndValue getNext() { return this.next; }

    /** Inserts at item into the ordered list of attributes and returns the head of the list/sub-list */
    public SAMBinaryTagAndValue insert(final SAMBinaryTagAndValue attr) {
        if (attr == null) return this;
        if (attr.next != null) throw new IllegalStateException("Can only insert single tag/value combinations.");

        if (attr.tag < this.tag) {
            // attr joins the list ahead of this element
            attr.next = this;
            return attr;
        }
        else if (this.tag == attr.tag) {
            // attr replaces this in the list
            attr.next = this.next;
            return attr;
        }
        else if (this.next == null) {
            // attr gets stuck on the end
            this.next = attr;
            return this;
        }
        else {
            // attr gets inserted somewhere in the tail
            this.next = this.next.insert(attr);
            return this;
        }
    }

    /** Removes a tag from the list and returns the new head of the list/sub-list. */
    public SAMBinaryTagAndValue remove(final short tag) {
        if (this.tag == tag) return this.next;
        else {
            if (this.next != null) this.next = this.next.remove(tag);
            return this;
        }
    }

    /** Returns the SAMBinaryTagAndValue that contains the required tag, or null if not contained. */
    public SAMBinaryTagAndValue find(final short tag) {
        if (this.tag == tag) return this;
        else if (this.tag > tag || this.next == null) return null;
        else return this.next.find(tag); 
    }

    public boolean isUnsignedArray() {
        return false;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMException.java000066400000000000000000000030331263034757100230120ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

/**
 * Unchecked exception type declared in the htsjdk.samtools package.  Each constructor forwards
 * its arguments to the matching {@link RuntimeException} constructor.
 *
 * @author alecw@broadinstitute.org
 */
public class SAMException extends RuntimeException {
    /** Creates an exception with neither a message nor a cause. */
    public SAMException() {
        super();
    }

    /**
     * @param s the detail message
     */
    public SAMException(final String s) {
        super(s);
    }

    /**
     * @param s the detail message
     * @param throwable the underlying cause
     */
    public SAMException(final String s, final Throwable throwable) {
        super(s, throwable);
    }

    /**
     * @param throwable the underlying cause
     */
    public SAMException(final Throwable throwable) {
        super(throwable);
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileHeader.java000066400000000000000000000343641263034757100230570ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;


import htsjdk.samtools.util.StringLineReader;

import java.io.StringWriter;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Header information from a SAM or BAM file.
 */
public class SAMFileHeader extends AbstractSAMHeaderRecord
{
    public static final String VERSION_TAG = "VN";
    public static final String SORT_ORDER_TAG = "SO";
    public static final String GROUP_ORDER_TAG = "GO";
    public static final String CURRENT_VERSION = "1.5";
    public static final Set ACCEPTABLE_VERSIONS =
            new HashSet(Arrays.asList("1.0", "1.3", "1.4", "1.5"));

    /**
     * These tags are of known type, so don't need a type field in the text representation.
     */
    public static final Set STANDARD_TAGS =
            new HashSet(Arrays.asList(VERSION_TAG, SORT_ORDER_TAG, GROUP_ORDER_TAG));

    Set getStandardTags() {
        return STANDARD_TAGS;
    }

    /**
     * Ways in which a SAM or BAM may be sorted.
     */
    public enum SortOrder {

        unsorted(null),
        queryname(SAMRecordQueryNameComparator.class),
        coordinate(SAMRecordCoordinateComparator.class),
        duplicate(SAMRecordDuplicateComparator.class); // NB: this is not in the SAM spec!

        private final Class comparator;

        SortOrder(final Class comparatorClass) {
            this.comparator = comparatorClass;
        }

        /**
         * @return Comparator class to sort in the specified order, or null if unsorted.
         */
        public Class getComparator() {
            return comparator;
        }

        /**
         * @return Comparator to sort in the specified order, or null if unsorted.
         */
        public SAMRecordComparator getComparatorInstance() {
            if (comparator != null) {
                try {
                    final Constructor ctor = comparator.getConstructor();
                    return ctor.newInstance();
                }
                catch (Exception e) {
                    throw new IllegalStateException("Could not instantiate a comparator for sort order: " +
                            this.name(), e);
                }
            }
            return null;
        }
    }

    public enum GroupOrder {
        none, query, reference
    }

    private List mReadGroups =
        new ArrayList();
    private List mProgramRecords = new ArrayList();
    private final Map mReadGroupMap =
        new HashMap();
    private final Map mProgramRecordMap = new HashMap();
    private SAMSequenceDictionary mSequenceDictionary = new SAMSequenceDictionary();
    final private List mComments = new ArrayList();
    private String textHeader;
    private final List mValidationErrors = new ArrayList();

    public SAMFileHeader() {
        setAttribute(VERSION_TAG, CURRENT_VERSION);
    }

    public String getVersion() {
        return (String) getAttribute("VN");
    }

    public String getCreator() {
        return (String) getAttribute("CR");
    }

    public SAMSequenceDictionary getSequenceDictionary() {
        return mSequenceDictionary;
    }

    public List getReadGroups() {
        return Collections.unmodifiableList(mReadGroups);
    }

    /**
     * Look up sequence record by name.
     */
    public SAMSequenceRecord getSequence(final String name) {
        return mSequenceDictionary.getSequence(name);
    }

    /**
     * Look up read group record by name.
     */
    public SAMReadGroupRecord getReadGroup(final String name) {
        return mReadGroupMap.get(name);
    }

    /**
     * Replace entire sequence dictionary.  The given sequence dictionary is stored, not copied.
     */
    public void setSequenceDictionary(final SAMSequenceDictionary sequenceDictionary) {
        mSequenceDictionary = sequenceDictionary;
    }

    public void addSequence(final SAMSequenceRecord sequenceRecord) {
        mSequenceDictionary.addSequence(sequenceRecord);
    }

    /**
     * Look up a sequence record by index.  First sequence in the header is the 0th.
     * @return The corresponding sequence record, or null if the index is out of range.
     */
    public SAMSequenceRecord getSequence(final int sequenceIndex) {
        return mSequenceDictionary.getSequence(sequenceIndex);
    }

    /**
     *
     * @return Sequence index for the given sequence name, or -1 if the name is not found.
     */
    public int getSequenceIndex(final String sequenceName) {
        return mSequenceDictionary.getSequenceIndex(sequenceName);
    }

    /**
     * Replace entire list of read groups.  The given list is stored, not copied.
     */
    public void setReadGroups(final List readGroups) {
        mReadGroups = readGroups;
        mReadGroupMap.clear();
        for (final SAMReadGroupRecord readGroupRecord : readGroups) {
            mReadGroupMap.put(readGroupRecord.getReadGroupId(), readGroupRecord);
        }
    }

    public void addReadGroup(final SAMReadGroupRecord readGroup) {
        if (mReadGroupMap.containsKey(readGroup.getReadGroupId())) {
            throw new IllegalArgumentException("Read group with group id " +
                readGroup.getReadGroupId() + " already exists in SAMFileHeader!");
        }
        mReadGroups.add(readGroup);
        mReadGroupMap.put(readGroup.getReadGroupId(), readGroup);
    }

    public List getProgramRecords() {
        return Collections.unmodifiableList(mProgramRecords);
    }

    public void addProgramRecord(final SAMProgramRecord programRecord) {
        if (mProgramRecordMap.containsKey(programRecord.getProgramGroupId())) {
            throw new IllegalArgumentException("Program record with group id " +
                programRecord.getProgramGroupId() + " already exists in SAMFileHeader!");
        }
        this.mProgramRecords.add(programRecord);
        this.mProgramRecordMap.put(programRecord.getProgramGroupId(), programRecord);
    }

    public SAMProgramRecord getProgramRecord(final String pgId) {
        return this.mProgramRecordMap.get(pgId);
    }

    /**
     * Replace entire list of program records
     * @param programRecords This list is used directly, not copied.
     */
    public void setProgramRecords(final List programRecords) {
        this.mProgramRecords = programRecords;
        this.mProgramRecordMap.clear();
        for (final SAMProgramRecord programRecord : this.mProgramRecords) {
            this.mProgramRecordMap.put(programRecord.getProgramGroupId(), programRecord);
        }
    }

    /**
     * @return a new SAMProgramRecord with an ID guaranteed to not exist in this SAMFileHeader
     */
    public SAMProgramRecord createProgramRecord() {
        for (int i = 0; i < Integer.MAX_VALUE; ++i) {
            final String s = Integer.toString(i);
            if (!this.mProgramRecordMap.containsKey(s)) {
                final SAMProgramRecord ret = new SAMProgramRecord(s);
                addProgramRecord(ret);
                return ret;
            }
        }
        throw new IllegalStateException("Surprising number of SAMProgramRecords");
    }

    public SortOrder getSortOrder() {
        final String so = getAttribute("SO");
        if (so == null || so.equals("unknown")) {
            return SortOrder.unsorted;
        }
        return SortOrder.valueOf((String) so);
    }

    public void setSortOrder(final SortOrder so) {
        setAttribute("SO", so.name());
    }

    public GroupOrder getGroupOrder() {
        if (getAttribute("GO") == null) {
            return GroupOrder.none;
        }
        return GroupOrder.valueOf((String)getAttribute("GO"));
    }

    public void setGroupOrder(final GroupOrder go) {
        setAttribute("GO", go.name());
    }

    /**
     * If this SAMHeader was read from a file, this property contains the header
     * as it appeared in the file, otherwise it is null.  Note that this is not a toString()
     * operation.  Changes to the SAMFileHeader object after reading from the file are not reflected in this value.
     *
     * In addition this value is only set if one of the following is true:
     *   - The size of the header is < 1,048,576 characters (1MB ascii, 2MB unicode)
     *   - There are either validation or parsing errors associated with the header
     *
     * Invalid header lines may appear in value but are not stored in the SAMFileHeader object.
     */
    public String getTextHeader() {
        return textHeader;
    }

    public void setTextHeader(final String textHeader) {
        this.textHeader = textHeader;
    }

    public List getComments() {
        return Collections.unmodifiableList(mComments);
    }

    public void addComment(String comment) {
        if (!comment.startsWith(SAMTextHeaderCodec.COMMENT_PREFIX)) {
            comment = SAMTextHeaderCodec.COMMENT_PREFIX + comment;
        }
        mComments.add(comment);
    }


    /**
     * Replace existing comments with the contents of the given collection.
     */
    public void setComments(final Collection comments) {
        mComments.clear();
        for (final String comment : comments) {
            addComment(comment);
        }
    }

    public List getValidationErrors() {
        return Collections.unmodifiableList(mValidationErrors);
    }

    public void addValidationError(final SAMValidationError error) {
        mValidationErrors.add(error);
    }

    /**
     * Replace list of validation errors with the elements of the given list.
     */
    public void setValidationErrors(final Collection errors) {
        mValidationErrors.clear();
        mValidationErrors.addAll(errors);
    }

    @Override
    public boolean equals(final Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        final SAMFileHeader that = (SAMFileHeader) o;

        if (!attributesEqual(that)) return false;
        if (mProgramRecords != null ? !mProgramRecords.equals(that.mProgramRecords) : that.mProgramRecords != null)
            return false;
        if (mReadGroups != null ? !mReadGroups.equals(that.mReadGroups) : that.mReadGroups != null) return false;
        if (mSequenceDictionary != null ? !mSequenceDictionary.equals(that.mSequenceDictionary) : that.mSequenceDictionary != null)
            return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = attributesHashCode();
        result = 31 * result + (mSequenceDictionary != null ? mSequenceDictionary.hashCode() : 0);
        result = 31 * result + (mReadGroups != null ? mReadGroups.hashCode() : 0);
        result = 31 * result + (mProgramRecords != null ? mProgramRecords.hashCode() : 0);
        return result;
    }

    public final SAMFileHeader clone() {
        final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
        codec.setValidationStringency(ValidationStringency.SILENT);
        final StringWriter stringWriter = new StringWriter();
        codec.encode(stringWriter, this);
        return codec.decode(new StringLineReader(stringWriter.toString()), "SAMFileHeader.clone");
    }

    /** Little class to generate program group IDs */
    public static class PgIdGenerator {
        private int recordCounter;

        private final Set idsThatAreAlreadyTaken = new HashSet();

        public PgIdGenerator(final SAMFileHeader header) {
            for (final SAMProgramRecord pgRecord : header.getProgramRecords()) {
                idsThatAreAlreadyTaken.add(pgRecord.getProgramGroupId());
            }
            recordCounter = idsThatAreAlreadyTaken.size();
        }

        public String getNonCollidingId(final String recordId) {
            if (!idsThatAreAlreadyTaken.contains(recordId)) {
                // don't remap 1st record. If there are more records
                // with this id, they will be remapped in the 'else'.
                idsThatAreAlreadyTaken.add(recordId);
                ++recordCounter;
                return recordId;
            } else {
                String newId;
                // Below we tack on one of roughly 1.7 million possible 4 digit base36 at random. We do this because
                // our old process of just counting from 0 upward and adding that to the previous id led to 1000s of
                // calls idsThatAreAlreadyTaken.contains() just to resolve 1 collision when merging 1000s of similarly
                // processed bams.
                while (idsThatAreAlreadyTaken.contains(newId = recordId + "." + SamFileHeaderMerger.positiveFourDigitBase36Str(recordCounter++)))
                    ;

                idsThatAreAlreadyTaken.add(newId);
                return newId;
            }

        }
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileReader.java000066400000000000000000001050541263034757100230640ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;


import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableHTTPStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.RuntimeIOException;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;

/**
 * Class for reading and querying SAM/BAM files.  Delegates to appropriate concrete implementation.
 *
 * @see SamReaderFactory
 */
@Deprecated
public class SAMFileReader implements SamReader, SamReader.Indexing {

    private static ValidationStringency defaultValidationStringency = ValidationStringency.DEFAULT_STRINGENCY;

    /** @return the validation stringency currently applied to newly created SAMFileReaders. */
    public static ValidationStringency getDefaultValidationStringency() {
        return SAMFileReader.defaultValidationStringency;
    }

    /**
     * Changes the validation stringency used by every SAMFileReader created from now on.  There is
     * no other way to change the validation stringency applied to the SAM header.
     * NOTE: A program that calls this should wrap its work in try/finally and restore the previous
     * stringency when done, so that it can be invoked from another program without disturbing
     * that program's validation stringency.
     *
     * @param defaultValidationStringency the stringency to apply to subsequently created readers
     */
    public static void setDefaultValidationStringency(final ValidationStringency defaultValidationStringency) {
        // Qualified with the class name because the parameter shadows the static field.
        SAMFileReader.defaultValidationStringency = defaultValidationStringency;
    }

    /**
     * Returns the SAMSequenceDictionary found in the header of the provided file.
     * <p>
     * The file is opened with {@link #SAMFileReader(File)}, so it must be in a format that constructor
     * accepts.  The reader is closed before this method returns, including when reading the header fails.
     *
     * @param dictionaryFile file whose header contains the sequence dictionary.
     * @return the sequence dictionary reported by the file's header.
     */
    public static SAMSequenceDictionary getSequenceDictionary(final File dictionaryFile) {
        final SAMFileReader samFileReader = new SAMFileReader(dictionaryFile);
        try {
            return samFileReader.getFileHeader().getSequenceDictionary();
        } finally {
            // Close the reader itself; passing the File here (as was done previously) released nothing.
            CloserUtil.close(samFileReader);
        }
    }

    // Set to true by the init(...) methods when the input is recognised as BAM.
    private boolean mIsBinary = false;
    // Reset to null in close(); the index-related enable* setters refuse to run while this is non-null.
    private BAMIndex mIndex = null;
    // Factory passed to the underlying reader for creating records; replaceable via setSAMRecordFactory.
    private SAMRecordFactory samRecordFactory = new DefaultSAMRecordFactory();
    // Concrete reader that the public methods delegate to; null until init(...) runs and again after close().
    private ReaderImplementation mReader = null;

    // File handed to init(InputStream, File, ...); remains null when that method is given a stream instead.
    private File samFile = null;

    /**
     * Iterator that never yields a record.  Used by {@code query(String, int, int, boolean)} when the
     * requested sequence is not present in the header.
     */
    private static class EmptySamIterator implements CloseableIterator<SAMRecord> {
        /** @return always false. */
        @Override
        public boolean hasNext() {
            return false;
        }

        /**
         * @return never returns normally.
         * @throws NoSuchElementException always, because there is nothing to iterate over.
         */
        @Override
        public SAMRecord next() {
            throw new NoSuchElementException("next called on empty iterator");
        }

        /** @throws UnsupportedOperationException always. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("Not supported: remove");
        }

        /** Nothing to release. */
        @Override
        public void close() {
            //no-op
        }
    }


    /**
     * Prepare to read a SAM or BAM file.  Indexed lookup not allowed because reading from InputStream.
     * Delegates to {@link #SAMFileReader(InputStream, boolean)} with eagerDecode == false.
     *
     * @param stream input SAM or BAM.
     */
    public SAMFileReader(final InputStream stream) {
        this(stream, false);
    }

    /**
     * Prepare to read a SAM or BAM file.  If the given file is a BAM, and has a companion BAI index file
     * that is named according to the convention, it will be found and opened, and indexed query will be allowed.
     * Delegates to {@link #SAMFileReader(File, File, boolean)} with a null index file and eagerDecode == false.
     *
     * @param file SAM or BAM to read.
     */
    public SAMFileReader(final File file) {
        this(file, null, false);
    }

    /**
     * Prepare to read a SAM or BAM file.  If the given file is a BAM, and an index is present, indexed query
     * will be allowed.  Delegates to {@link #SAMFileReader(File, File, boolean)} with eagerDecode == false.
     *
     * @param file      SAM or BAM to read.
     * @param indexFile Index file that is companion to BAM, or null if no index file, or if index file
     *                  should be found automatically.
     */
    public SAMFileReader(final File file, final File indexFile) {
        this(file, indexFile, false);
    }

    /**
     * Read a SAM or BAM file.  Indexed lookup not allowed because reading from InputStream.
     *
     * @param stream      input SAM or BAM.  This is buffered internally so caller need not buffer.
     * @param eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader(final InputStream stream, final boolean eagerDecode) {
        init(stream, null, null, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a SAM or BAM file, possibly with an index file if present.
     * If the given file is a BAM, and an index is present, indexed query will be allowed.
     * Delegates to {@link #SAMFileReader(File, File, boolean)} with a null index file.
     *
     * @param file        SAM or BAM.
     * @param eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader(final File file, final boolean eagerDecode) {
        this(file, null, eagerDecode);
    }

    /**
     * Read a SAM or BAM file, possibly with an index file. If the given file is a BAM, and an index is present,
     * indexed query will be allowed.
     *
     * @param file        SAM or BAM.
     * @param indexFile   Location of index file, or null in order to use the default index file (if present).
     * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader(final File file, final File indexFile, final boolean eagerDecode) {
        init(null, file, indexFile, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a BAM file by http
     * indexed query will be allowed.
     *
     * @param url         BAM.
     * @param indexFile   Location of index file, or null if indexed access not required.
     * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader(final URL url, final File indexFile, final boolean eagerDecode) {
        init(new SeekableBufferedStream(new SeekableHTTPStream(url)),
                indexFile, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a BAM file via caller-supplied mechanism.  Indexed query will be allowed, but
     * index file must be provided in that case.  The current default validation stringency is used.
     *
     * @param strm        BAM -- If the stream is not buffered, caller should wrap in SeekableBufferedStream for
     *                    better performance.
     * @param indexFile   Location of index file, or null if indexed access not required.
     * @param eagerDecode if true, decode SAM record entirely when reading it.
     */
    public SAMFileReader(final SeekableStream strm, final File indexFile, final boolean eagerDecode) {
        init(strm, indexFile, eagerDecode, defaultValidationStringency);
    }

    /**
     * Read a BAM file from a seekable stream, with the index also supplied as a seekable stream.
     * The current default validation stringency is used.
     *
     * @param strm        BAM -- If the stream is not buffered, caller should wrap in SeekableBufferedStream for
     *                    better performance.
     * @param indexStream stream passed on to the BAM reader as the index source.
     * @param eagerDecode passed on to the BAM reader; the sibling constructors document it as
     *                    "if true, decode SAM record entirely when reading it".
     */
    public SAMFileReader(final SeekableStream strm, final SeekableStream indexStream, final boolean eagerDecode) {
        init(strm, indexStream, eagerDecode, defaultValidationStringency);
    }

    /**
     * Closes the underlying reader, if there is one, and drops the references to it and to the index.
     * Calling this again afterwards does nothing further.
     */
    public void close() {
        if (mReader != null) {
            mReader.close();
            mReader = null;
        }
        mIndex = null;
    }

    /**
     * If true, writes the source of every read into the source SAMRecords.
     * The call is forwarded to the underlying reader, with this SAMFileReader passed as the source.
     *
     * @param enabled true to write source information into each SAMRecord.
     */
    public void enableFileSource(final boolean enabled) {
        mReader.enableFileSource(this, enabled);
    }

    /**
     * If true, uses the caching version of the index reader.
     *
     * @param enabled true to use the caching index reader.
     * @throws SAMException if the index has already been loaded.
     */
    public void enableIndexCaching(final boolean enabled) {
        if (mIndex != null) {
            throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
        }
        mReader.enableIndexCaching(enabled);
    }

    /**
     * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
     * This is slower but more scalable when accessing large numbers of BAM files sequentially.
     *
     * @param enabled True to use memory mapping, false to use regular I/O.
     * @throws SAMException if the index has already been loaded.
     */
    public void enableIndexMemoryMapping(final boolean enabled) {
        final boolean indexAlreadyLoaded = (mIndex != null);
        if (indexAlreadyLoaded) {
            throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
        }
        mReader.enableIndexMemoryMapping(enabled);
    }

    /**
     * Only meaningful for BAM file readers - enables or disables checking of checksums on uncompressed
     * data during decompression.  Enabling this will increase decompression time by 15-30%.
     *
     * @param enabled forwarded unchanged to the underlying reader.
     */
    public void enableCrcChecking(final boolean enabled) {
        mReader.enableCrcChecking(enabled);
    }

    /**
     * Override the default SAMRecordFactory class used to instantiate instances of SAMRecord and BAMRecord.
     * The factory is remembered by this object and also handed to the underlying reader.
     *
     * @param factory factory to use from now on.
     */
    public void setSAMRecordFactory(final SAMRecordFactory factory) {
        samRecordFactory = factory;
        mReader.setSAMRecordFactory(factory);
    }

    /**
     * Reports the flag that the init(...) methods set when the input is recognised as BAM.
     *
     * @return True if this is a BAM reader.
     */
    public boolean isBinary() {
        return mIsBinary;
    }

    /**
     * Asks the underlying reader whether an index is available.
     *
     * @return true if this is a BAM file, and has an index
     */
    public boolean hasIndex() {
        return mReader.hasIndex();
    }

    /**
     * @return this object, which itself implements {@code SamReader.Indexing}.
     */
    @Override
    public Indexing indexing() {
        return this;
    }

    /**
     * Retrieves the index from the underlying reader.
     *
     * @return the index reported by the underlying reader.
     */
    public BAMIndex getIndex() {
        return mReader.getIndex();
    }

    /**
     * Returns true if the supported index is browseable, meaning the bins in it can be traversed
     * and chunk data inspected and retrieved.
     *
     * @return True if there is an index and it implements the BrowseableBAMIndex interface.  False otherwise.
     */
    public boolean hasBrowseableIndex() {
        // Only ask for the index once we know one exists.
        if (!hasIndex()) {
            return false;
        }
        return getIndex() instanceof BrowseableBAMIndex;
    }

    /**
     * Gets an index tagged with the BrowseableBAMIndex interface.  Throws an exception if no such
     * index is available.
     *
     * @return the reader's index, viewed through its browseable interface.
     * @throws SAMException if the index is not a BrowseableBAMIndex.
     */
    public BrowseableBAMIndex getBrowseableIndex() {
        final BAMIndex candidate = getIndex();
        if (candidate instanceof BrowseableBAMIndex) {
            return (BrowseableBAMIndex) candidate;
        }
        throw new SAMException("Cannot return index: index created by BAM is not browseable.");
    }

    /**
     * @return the file header reported by the underlying reader.
     */
    public SAMFileHeader getFileHeader() {
        return mReader.getFileHeader();
    }

    /**
     * @return the type reported by the underlying reader.
     */
    @Override
    public Type type() {
        return mReader.type();
    }

    /**
     * @return this reader's {@code toString()} value, used as the description of the resource being read.
     */
    @Override
    public String getResourceDescription() {
        return toString();
    }

    /**
     * Control validation of SAMRecords as they are read from file.
     * In order to control validation stringency for SAM Header, call SAMFileReader.setDefaultValidationStringency
     * before constructing a SAMFileReader.
     *
     * @param validationStringency stringency forwarded to the underlying reader.
     */
    public void setValidationStringency(final ValidationStringency validationStringency) {
        mReader.setValidationStringency(validationStringency);
    }

    /**
     * Iterate through file in order.  For a SAMFileReader constructed from an InputStream, and for any SAM file,
     * a 2nd iteration starts where the 1st one left off.  For a BAM constructed from a File, each new iteration
     * starts at the first record.
     * <p>
     * Only a single open iterator on a SAM or BAM file may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     *
     * @return the underlying reader's iterator, wrapped in an AssertingIterator.
     */
    public SAMRecordIterator iterator() {
        return new AssertingIterator(mReader.getIterator());
    }

    /**
     * Iterate through the given chunks in the file.
     *
     * @param chunks List of chunks for which to retrieve data.
     * @return An iterator over the given chunks.
     */
    public SAMRecordIterator iterator(final SAMFileSpan chunks) {
        return new AssertingIterator(mReader.getIterator(chunks));
    }

    /**
     * Gets a pointer spanning all reads in the BAM file.
     *
     * @return Unbounded pointer to the first record, in chunk format.
     */
    public SAMFileSpan getFilePointerSpanningReads() {
        return mReader.getFilePointerSpanningReads();
    }

    /**
     * Iterate over records that match the given interval.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.  You can use a second SAMFileReader to iterate
     * in parallel over the same underlying file.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence  Reference sequence of interest.  If the header does not know this sequence, an empty
     *                  iterator is returned.
     * @param start     1-based, inclusive start of interval of interest.  Zero implies start of the reference sequence.
     * @param end       1-based, inclusive end of interval of interest.  Zero implies end of the reference sequence.
     * @param contained If true, each SAMRecord returned will have its alignment completely contained in the
     *                  interval of interest.  If false, the alignment of the returned SAMRecords need only overlap
     *                  the interval of interest.
     * @return Iterator over the SAMRecords matching the interval.
     */
    public SAMRecordIterator query(final String sequence, final int start, final int end, final boolean contained) {
        final int referenceIndex = getFileHeader().getSequenceIndex(sequence);
        final CloseableIterator<SAMRecord> currentIterator;
        if (referenceIndex == -1) {
            // Unknown sequence: nothing can match
            currentIterator = new EmptySamIterator();
        } else {
            final QueryInterval[] queryIntervals = {new QueryInterval(referenceIndex, start, end)};
            currentIterator = mReader.query(queryIntervals, contained);
        }
        return new AssertingIterator(currentIterator);
    }

    /**
     * Iterate over records that overlap the given interval.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start    1-based, inclusive start of interval of interest.  Zero implies start of the reference sequence.
     * @param end      1-based, inclusive end of interval of interest.  Zero implies end of the reference sequence.
     * @return Iterator over the SAMRecords overlapping the interval.
     */
    public SAMRecordIterator queryOverlapping(final String sequence, final int start, final int end) {
        return query(sequence, start, end, false);
    }

    /**
     * Iterate over records that are contained in the given interval.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start    1-based, inclusive start of interval of interest.  Zero implies start of the reference sequence.
     * @param end      1-based, inclusive end of interval of interest.  Zero implies end of the reference sequence.
     * @return Iterator over the SAMRecords contained in the interval.
     */
    public SAMRecordIterator queryContained(final String sequence, final int start, final int end) {
        return query(sequence, start, end, true);
    }

    /**
     * Iterate over records that match one of the given intervals.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     * <p>
     * Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.  You can use a second SAMFileReader to iterate
     * in parallel over the same underlying file.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match an interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @param contained If true, each SAMRecord returned will have its alignment completely contained in one of the
     *                  intervals of interest.  If false, the alignment of the returned SAMRecords need only overlap one of
     *                  the intervals of interest.
     * @return Iterator over the SAMRecords matching the interval.
     */
    public SAMRecordIterator query(final QueryInterval[] intervals, final boolean contained) {
        return new AssertingIterator(mReader.query(intervals, contained));
    }

    /**
     * Iterate over records that overlap any of the given intervals.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     * <p>
     * Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @return Iterator over the SAMRecords overlapping any of the intervals.
     */
    public SAMRecordIterator queryOverlapping(final QueryInterval[] intervals) {
        return query(intervals, false);
    }

    /**
     * Iterate over records that are contained in the given interval.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     * <p>
     * Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @return Iterator over the SAMRecords contained in any of the intervals.
     */
    public SAMRecordIterator queryContained(final QueryInterval[] intervals) {
        return query(intervals, true);
    }

    /**
     * Returns the result of the underlying reader's {@code queryUnmapped()}, wrapped in an AssertingIterator.
     * {@link #queryMate(SAMRecord)} uses this when the mate has no reference index.
     *
     * @return Iterator over the records supplied by the underlying reader's unmapped query.
     */
    public SAMRecordIterator queryUnmapped() {
        return new AssertingIterator(mReader.queryUnmapped());
    }

    /**
     * Iterate over records that map to the given sequence and start at the given position.  Only valid to call
     * this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SAMFileReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * matches the arguments.
     *
     * @param sequence Reference sequence of interest.
     * @param start    Alignment start of interest.
     * @return Iterator over the SAMRecords with the given alignment start.
     */
    public SAMRecordIterator queryAlignmentStart(final String sequence, final int start) {
        return new AssertingIterator(mReader.queryAlignmentStart(sequence, start));
    }

    /**
     * Fetch the mate for the given read.  Only valid to call this if hasIndex() == true.
     * This will work whether the mate has a coordinate or not, so long as the given read has correct
     * mate information.  This method iterates over the SAM file, so there may not be an unclosed
     * iterator on the SAM file when this method is called.
     * <p>
     * Note that it is not possible to call queryMate when iterating over the SAMFileReader, because queryMate
     * requires its own iteration, and there cannot be two simultaneous iterations on the same SAMFileReader.  The
     * work-around is to open a second SAMFileReader on the same input file, and call queryMate on the second
     * reader.
     *
     * @param rec Record for which mate is sought.  Must be a paired read.
     * @return rec's mate, or null if it cannot be found.
     * @throws IllegalArgumentException if rec is unpaired, or its first-of-pair and second-of-pair flags are equal.
     * @throws SAMFormatException       if an unpaired read with the same name is encountered, or more than one
     *                                  candidate mate is found.
     */
    public SAMRecord queryMate(final SAMRecord rec) {
        if (!rec.getReadPairedFlag()) {
            throw new IllegalArgumentException("queryMate called for unpaired read.");
        }
        if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) {
            throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both.");
        }
        final boolean firstOfPair = rec.getFirstOfPairFlag();
        final CloseableIterator<SAMRecord> it;
        if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
            it = queryUnmapped();
        } else {
            it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart());
        }
        try {
            SAMRecord mateRec = null;
            while (it.hasNext()) {
                final SAMRecord next = it.next();
                if (!next.getReadPairedFlag()) {
                    if (rec.getReadName().equals(next.getReadName())) {
                        throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName());
                    }
                    continue;
                }
                // Skip records that are the same end of the pair as rec
                if (firstOfPair) {
                    if (next.getFirstOfPairFlag()) continue;
                } else {
                    if (next.getSecondOfPairFlag()) continue;
                }
                if (rec.getReadName().equals(next.getReadName())) {
                    if (mateRec != null) {
                        throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() +
                                " for " + (firstOfPair ? "second" : "first") + " end.");
                    }
                    // Keep scanning so that a duplicate mate is detected
                    mateRec = next;
                }
            }
            return mateRec;
        } finally {
            it.close();
        }
    }

    /**
     * Sets up a BAM reader over a seekable stream, with the index (if any) given as a file.
     *
     * @param strm                 stream to read; must look like a BAM according to streamLooksLikeBam.
     * @param indexFile            index file handed to the BAM reader.
     * @param eagerDecode          handed to the BAM reader.
     * @param validationStringency handed to the BAM reader and then applied via setValidationStringency.
     * @throws SAMFormatException if the stream does not look like a BAM.
     * @throws RuntimeIOException wrapping any IOException raised during setup.
     */
    private void init(final SeekableStream strm, final File indexFile, final boolean eagerDecode,
                      final ValidationStringency validationStringency) {
        try {
            if (streamLooksLikeBam(strm)) {
                mIsBinary = true;
                mReader = new BAMFileReader(strm, indexFile, eagerDecode, validationStringency, this.samRecordFactory);
            } else {
                throw new SAMFormatException("Unrecognized file format: " + strm);
            }
            setValidationStringency(validationStringency);
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Sets up a BAM reader over a seekable stream, with the index given as a seekable stream.
     *
     * @param strm                 stream to read; must look like a BAM according to streamLooksLikeBam.
     * @param indexStream          index stream handed to the BAM reader.
     * @param eagerDecode          handed to the BAM reader.
     * @param validationStringency handed to the BAM reader and then applied via setValidationStringency.
     * @throws SAMFormatException if the stream does not look like a BAM.
     * @throws RuntimeIOException wrapping any IOException raised during setup.
     */
    private void init(final SeekableStream strm, final SeekableStream indexStream, final boolean eagerDecode,
                      final ValidationStringency validationStringency) {
        try {
            if (streamLooksLikeBam(strm)) {
                mIsBinary = true;
                mReader = new BAMFileReader(strm, indexStream, eagerDecode, validationStringency, this.samRecordFactory);
            } else {
                throw new SAMFormatException("Unrecognized file format: " + strm);
            }
            setValidationStringency(validationStringency);
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    // It's too expensive to examine the remote file to determine type.
    // Rely on file extension.
private boolean streamLooksLikeBam(final SeekableStream strm) { String source = strm.getSource(); if (source == null) return true; source = source.toLowerCase(); //Source will typically be a file path or URL //If it's a URL we require one of the query parameters to be bam file return source.endsWith(".bam") || source.contains(".bam?") || source.contains(".bam&") || source.contains(".bam%26"); } private void init(final InputStream stream, File file, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency) { if (stream != null && file != null) throw new IllegalArgumentException("stream and file are mutually exclusive"); this.samFile = file; try { BufferedInputStream bufferedStream; // Buffering is required because mark() and reset() are called on the input stream. final int bufferSize = Math.max(Defaults.BUFFER_SIZE, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); if (file != null) bufferedStream = new BufferedInputStream(new FileInputStream(file), bufferSize); else bufferedStream = IOUtil.toBufferedStream(stream); if (isBAMFile(bufferedStream)) { mIsBinary = true; if (file == null || !file.isFile()) { // Handle case in which file is a named pipe, e.g. 
/dev/stdin or created by mkfifo mReader = new BAMFileReader(bufferedStream, indexFile, eagerDecode, validationStringency, this.samRecordFactory); } else { bufferedStream.close(); mReader = new BAMFileReader(file, indexFile, eagerDecode, validationStringency, this.samRecordFactory); } } else if (BlockCompressedInputStream.isValidFile(bufferedStream)) { mIsBinary = false; mReader = new SAMTextReader(new BlockCompressedInputStream(bufferedStream), validationStringency, this.samRecordFactory); } else if (isGzippedSAMFile(bufferedStream)) { mIsBinary = false; mReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency, this.samRecordFactory); } else if (SamStreams.isCRAMFile(bufferedStream)) { if (file == null || !file.isFile()) { file = null; } else { bufferedStream.close(); bufferedStream = null; } mReader = new CRAMFileReader(file, bufferedStream); } else if (isSAMFile(bufferedStream)) { if (indexFile != null) { bufferedStream.close(); throw new RuntimeException("Cannot use index file with textual SAM file"); } mIsBinary = false; mReader = new SAMTextReader(bufferedStream, file, validationStringency, this.samRecordFactory); } else { bufferedStream.close(); throw new SAMFormatException("Unrecognized file format"); } setValidationStringency(validationStringency); mReader.setSAMRecordFactory(this.samRecordFactory); } catch (final IOException e) { throw new RuntimeIOException(e); } } /** * @param stream stream.markSupported() must be true * @return true if this looks like a BAM file. 
*/ private boolean isBAMFile(final InputStream stream) throws IOException { if (!BlockCompressedInputStream.isValidFile(stream)) { return false; } final int buffSize = BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE; stream.mark(buffSize); final byte[] buffer = new byte[buffSize]; readBytes(stream, buffer, 0, buffSize); stream.reset(); final byte[] magicBuf = new byte[4]; final int magicLength = readBytes(new BlockCompressedInputStream(new ByteArrayInputStream(buffer)), magicBuf, 0, 4); return magicLength == BAMFileConstants.BAM_MAGIC.length && Arrays.equals(BAMFileConstants.BAM_MAGIC, magicBuf); } private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException { int bytesRead = 0; while (bytesRead < length) { final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); if (count <= 0) { break; } bytesRead += count; } return bytesRead; } /** * Attempts to check whether the file is a gzipped sam file. Returns true if it * is and false otherwise. */ private boolean isGzippedSAMFile(final BufferedInputStream stream) { if (!stream.markSupported()) { throw new IllegalArgumentException("Cannot test a stream that doesn't support marking."); } stream.mark(8000); try { final GZIPInputStream gunzip = new GZIPInputStream(stream); final int ch = gunzip.read(); return true; } catch (final IOException ioe) { return false; } finally { try { stream.reset(); } catch (final IOException ioe) { throw new IllegalStateException("Could not reset stream."); } } } private boolean isSAMFile(final InputStream stream) { // For now, assume every non-binary file is a SAM text file. 
return true; } @Override public String toString() { if (this.samFile == null) { return getClass().getSimpleName() + "{initialized with stream}"; } else { return getClass().getSimpleName() + "{" + this.samFile.getAbsolutePath() + "}"; } } /** * Convenience method to create a QueryInterval * * @param sequence sequence of interest, must exist in sequence dictionary * @param start 1-based start position, must be >= 1 * @param end 1-based end position. * @throws java.lang.IllegalArgumentException if sequence not found in sequence dictionary, or start position < 1 */ public QueryInterval makeQueryInterval(final String sequence, int start, int end) { int referenceIndex = getFileHeader().getSequenceIndex(sequence); if (referenceIndex < 0) { throw new IllegalArgumentException(String.format("Sequence '%s' not found in sequence dictionary", sequence)); } if (start < 1) { throw new IllegalArgumentException("Start position must be >= 1"); } return new QueryInterval(referenceIndex, start, end); } /** * Convenience method to create a QueryInterval that goes from start to end of given sequence. 
* * @param sequence sequence of interest, must exist in sequence dictionary * @param start 1-based start position, must be >= 1 * @throws java.lang.IllegalArgumentException if sequence not found in sequence dictionary, or start position < 1 */ public QueryInterval makeQueryInterval(final String sequence, int start) { return makeQueryInterval(sequence, start, 0); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileSource.java000066400000000000000000000042361263034757100231220ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Represents the origin of a SAM record. * * @author mhanna * @version 0.1 */ public class SAMFileSource { /** * The reader originating this SAM record. */ private SamReader mReader; /** * The point on disk from which a record originates. 
*/ private SAMFileSpan mFilePointer; /** * Create a new SAMFileSource with the given reader and file pointer. * @param reader reader. * @param filePointer File pointer. */ public SAMFileSource(final SamReader reader, final SAMFileSpan filePointer) { this.mReader = reader; this.mFilePointer = filePointer; } /** * Retrieves the reader from which this read was initially retrieved. * @return The reader. */ public SamReader getReader() { return mReader; } /** * A pointer to the region on disk from which the read originated. * @return A pointer within the file. */ public SAMFileSpan getFilePointer() { return mFilePointer; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileSpan.java000066400000000000000000000040251263034757100225570ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; /** * A interface representing a collection of (possibly) discontinuous segments in the * BAM file, possibly representing the results of an index query. */ public interface SAMFileSpan extends Cloneable { /** * Gets a pointer over the data immediately following this span. * @return The a pointer to data immediately following this span. */ public SAMFileSpan getContentsFollowing(); /** * Remove all pointers in this file span before the given file span starts. * @param fileSpan The filespan before which to eliminate. * @return The portion of the chunk list after the given chunk. */ public SAMFileSpan removeContentsBefore(final SAMFileSpan fileSpan); /** * Does this file span point to any data, or is it completely empty? * @return True if the file span is empty, false otherwise. */ public boolean isEmpty(); } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileTruncatedReader.java000066400000000000000000000035001263034757100247270ustar00rootroot00000000000000package htsjdk.samtools; import java.io.File; import java.util.NoSuchElementException; /** * A truncated form of a SAMFileReader that iterates over a limited number of records. 
* * @author mccowan@broadinstitute.org */ @Deprecated public class SAMFileTruncatedReader extends SAMFileReader { private class TruncatedIterator implements SAMRecordIterator { final SAMRecordIterator i; final long max; long currentRecord = 0; TruncatedIterator(final SAMRecordIterator i, final long max) { this.i = i; this.max = max; } public boolean hasNext() { return i.hasNext() && max != currentRecord; } public SAMRecord next() { if (this.hasNext()) { currentRecord += 1; return i.next(); } else { throw new NoSuchElementException(); } } public void remove() { i.remove(); } public void close() { i.close(); } public SAMRecordIterator assertSorted(final SAMFileHeader.SortOrder sortOrder) { return i.assertSorted(sortOrder); } } private final long maxRecordsToIterate; /** * @param input The SAM file * @param max The maximum number of records to read from the file via iterator() methods */ public SAMFileTruncatedReader(final File input, final long max) { super(input); this.maxRecordsToIterate = max; } @Override public SAMRecordIterator iterator() { return new TruncatedIterator(super.iterator(), maxRecordsToIterate); } @Override public SAMRecordIterator iterator(final SAMFileSpan chunks) { return new TruncatedIterator(super.iterator(chunks), maxRecordsToIterate); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileWriter.java000066400000000000000000000034221263034757100231320ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be 
included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Closeable; import htsjdk.samtools.util.ProgressLoggerInterface; /** * Interface for SAMText and BAM file writers. Clients need not care which they write to, * once the object is constructed. */ public interface SAMFileWriter extends Closeable { void addAlignment(SAMRecord alignment); SAMFileHeader getFileHeader(); /** * Sets a ProgressLogger on this writer. This is useful when pulling, for instance, from a * SortingCollection. */ void setProgressLogger(final ProgressLoggerInterface progress); /** * Must be called to flush or file will likely be defective. */ void close(); } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileWriterFactory.java000066400000000000000000000473251263034757100244740ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.util.BlockCompressedOutputStream; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.Md5CalculatingOutputStream; import htsjdk.samtools.util.RuntimeIOException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; /** * Create a writer for writing SAM, BAM, or CRAM files. */ public class SAMFileWriterFactory { private final static Log log = Log.getInstance(SAMFileWriterFactory.class); private static boolean defaultCreateIndexWhileWriting = Defaults.CREATE_INDEX; private boolean createIndex = defaultCreateIndexWhileWriting; private static boolean defaultCreateMd5File = Defaults.CREATE_MD5; private boolean createMd5File = defaultCreateMd5File; private boolean useAsyncIo = Defaults.USE_ASYNC_IO; private int asyncOutputBufferSize = AsyncSAMFileWriter.DEFAULT_QUEUE_SIZE; private int bufferSize = Defaults.BUFFER_SIZE; private File tmpDir; private Integer maxRecordsInRam; /** * Sets the default for whether to create md5Files for BAM files this factory. */ public static void setDefaultCreateMd5File(final boolean createMd5File) { defaultCreateMd5File = createMd5File; } /** * Sets whether to create md5Files for BAMs from this factory. 
*/
    public SAMFileWriterFactory setCreateMd5File(final boolean createMd5File) {
        this.createMd5File = createMd5File;
        return this;
    }

    /**
     * Sets the default for subsequent SAMFileWriterFactories
     * that do not specify whether to create an index.
     * If a BAM (not SAM) file is created, the setting is true, and the file header specifies coordinate order,
     * then a BAM index file will be written along with the BAM file.
     *
     * @param setting whether to attempt to create a BAM index while creating the BAM file
     */
    public static void setDefaultCreateIndexWhileWriting(final boolean setting) {
        defaultCreateIndexWhileWriting = setting;
    }

    /**
     * Convenience method allowing {@code new SAMFileWriterFactory().setCreateIndex(true);}
     * Unlike {@link #setDefaultCreateIndexWhileWriting(boolean)}, this affects only this factory.
     * If a BAM or CRAM (not SAM) file is created, the setting is true, and the file header specifies coordinate order,
     * then a BAM index file will be written along with the BAM file.
     *
     * @param setting whether to attempt to create a BAM index while creating the BAM file.
     * @return this factory object
     */
    public SAMFileWriterFactory setCreateIndex(final boolean setting) {
        this.createIndex = setting;
        return this;
    }

    /**
     * Before creating a writer that is not presorted, this method may be called in order to override
     * the default number of SAMRecords stored in RAM before spilling to disk
     * (c.f. SAMFileWriterImpl.MAX_RECORDS_IN_RAM). When writing very large sorted SAM files, you may need
     * to call this method in order to avoid running out of file handles. The RAM available to the JVM may need
     * to be increased in order to hold the specified number of records in RAM. This value affects the number
     * of records stored in subsequent calls to one of the make...() methods.
     *
     * @param maxRecordsInRam Number of records to store in RAM before spilling to temporary file when
     *                        creating a sorted SAM or BAM file.
*/ public SAMFileWriterFactory setMaxRecordsInRam(final int maxRecordsInRam) { this.maxRecordsInRam = maxRecordsInRam; return this; } /** * Turn on or off the use of asynchronous IO for writing output SAM and BAM files. If true then * each SAMFileWriter creates a dedicated thread which is used for compression and IO activities. */ public SAMFileWriterFactory setUseAsyncIo(final boolean useAsyncIo) { this.useAsyncIo = useAsyncIo; return this; } /** * If and only if using asynchronous IO then sets the maximum number of records that can be buffered per * SAMFileWriter before producers will block when trying to write another SAMRecord. */ public SAMFileWriterFactory setAsyncOutputBufferSize(final int asyncOutputBufferSize) { this.asyncOutputBufferSize = asyncOutputBufferSize; return this; } /** * Controls size of write buffer. * Default value: [[htsjdk.samtools.Defaults#BUFFER_SIZE]] */ public SAMFileWriterFactory setBufferSize(final int bufferSize) { this.bufferSize = bufferSize; return this; } /** * Set the temporary directory to use when sort data. * * @param tmpDir Path to the temporary directory */ public SAMFileWriterFactory setTempDirectory(final File tmpDir) { this.tmpDir = tmpDir; return this; } /** * Create a BAMFileWriter that is ready to receive SAMRecords. Uses default compression level. * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param outputFile where to write the output. */ public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { return makeBAMWriter(header, presorted, outputFile, BlockCompressedOutputStream.getDefaultCompressionLevel()); } /** * Create a BAMFileWriter that is ready to receive SAMRecords. * * @param header entire header. Sort order is determined by the sortOrder property of this arg. 
* @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param outputFile where to write the output. * @param compressionLevel Override default compression level with the given value, between 0 (fastest) and 9 (smallest). */ public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile, final int compressionLevel) { try { final boolean createMd5File = this.createMd5File && IOUtil.isRegularPath(outputFile); if (this.createMd5File && !createMd5File) { log.warn("Cannot create MD5 file for BAM because output file is not a regular file: " + outputFile.getAbsolutePath()); } OutputStream os = IOUtil.maybeBufferOutputStream(new FileOutputStream(outputFile, false), bufferSize); if (createMd5File) os = new Md5CalculatingOutputStream(os, new File(outputFile.getAbsolutePath() + ".md5")); final BAMFileWriter ret = new BAMFileWriter(os, outputFile, compressionLevel); final boolean createIndex = this.createIndex && IOUtil.isRegularPath(outputFile); if (this.createIndex && !createIndex) { log.warn("Cannot create index for BAM because output file is not a regular file: " + outputFile.getAbsolutePath()); } if (this.tmpDir != null) ret.setTempDirectory(this.tmpDir); initializeBAMWriter(ret, header, presorted, createIndex); if (this.useAsyncIo) return new AsyncSAMFileWriter(ret, this.asyncOutputBufferSize); else return ret; } catch (final IOException ioe) { throw new RuntimeIOException("Error opening file: " + outputFile.getAbsolutePath()); } } private void initializeBAMWriter(final BAMFileWriter writer, final SAMFileHeader header, final boolean presorted, final boolean createIndex) { writer.setSortOrder(header.getSortOrder(), presorted); if (maxRecordsInRam != null) { writer.setMaxRecordsInRam(maxRecordsInRam); } writer.setHeader(header); if (createIndex && writer.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) { writer.enableBamIndexConstruction(); } } /** * 
Create a SAMTextWriter that is ready to receive SAMRecords. * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param outputFile where to write the output. */ public SAMFileWriter makeSAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { try { final SAMTextWriter ret = this.createMd5File ? new SAMTextWriter(new Md5CalculatingOutputStream(new FileOutputStream(outputFile, false), new File(outputFile.getAbsolutePath() + ".md5"))) : new SAMTextWriter(outputFile); ret.setSortOrder(header.getSortOrder(), presorted); if (maxRecordsInRam != null) { ret.setMaxRecordsInRam(maxRecordsInRam); } ret.setHeader(header); if (this.useAsyncIo) return new AsyncSAMFileWriter(ret, this.asyncOutputBufferSize); else return ret; } catch (final IOException ioe) { throw new RuntimeIOException("Error opening file: " + outputFile.getAbsolutePath()); } } /** * Create a SAMTextWriter for writing to a stream that is ready to receive SAMRecords. * This method does not support the creation of an MD5 file * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param stream the stream to write records to. Note that this method does not buffer the stream, so the * caller must buffer if desired. Note that PrintStream is buffered. */ public SAMFileWriter makeSAMWriter(final SAMFileHeader header, final boolean presorted, final OutputStream stream) { return initWriter(header, presorted, false, new SAMTextWriter(stream)); } /** * Create a BAMFileWriter for writing to a stream that is ready to receive SAMRecords. * This method does not support the creation of an MD5 file * * @param header entire header. 
Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param stream the stream to write records to. Note that this method does not buffer the stream, so the * caller must buffer if desired. Note that PrintStream is buffered. */ public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean presorted, final OutputStream stream) { return initWriter(header, presorted, true, new BAMFileWriter(stream, null)); } /** * Initialize SAMTextWriter or a BAMFileWriter and possibly wrap in AsyncSAMFileWriter * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param binary do we want to generate a BAM or a SAM * @param writer SAM or BAM writer to initialize and maybe wrap. */ private SAMFileWriter initWriter(final SAMFileHeader header, final boolean presorted, final boolean binary, final SAMFileWriterImpl writer) { writer.setSortOrder(header.getSortOrder(), presorted); if (maxRecordsInRam != null) { writer.setMaxRecordsInRam(maxRecordsInRam); } writer.setHeader(header); if (this.useAsyncIo) return new AsyncSAMFileWriter(writer, this.asyncOutputBufferSize); else return writer; } /** * Create either a SAM or a BAM writer based on examination of the outputFile extension. * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param presorted presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param outputFile where to write the output. Must end with .sam or .bam. * @return SAM or BAM writer based on file extension of outputFile. 
*/
    public SAMFileWriter makeSAMOrBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) {
        final String filename = outputFile.getName();
        final boolean hasBamExtension = filename.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION);
        if (!hasBamExtension && filename.endsWith(".sam")) {
            return makeSAMWriter(header, presorted, outputFile);
        }
        // The BAM extension, and any extension that is not recognised, produce a BAM writer.
        return makeBAMWriter(header, presorted, outputFile);
    }

    /**
     *
     * Create a SAM, BAM or CRAM writer based on examination of the outputFile extension.
     *
     * @param header         header. Sort order is determined by the sortOrder property of this arg.
     * @param presorted      if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder.
     * @param outputFile     where to write the output.  Must end with .sam, .bam or .cram.
     * @param referenceFasta reference sequence file
     * @return SAMFileWriter appropriate for the file type specified in outputFile
     *
     */
    public SAMFileWriter makeWriter(final SAMFileHeader header, final boolean presorted, final File outputFile,
                                    final File referenceFasta) {
        final boolean isCram = outputFile.getName().endsWith(SamReader.Type.CRAM_TYPE.fileExtension());
        if (!isCram) {
            return makeSAMOrBAMWriter(header, presorted, outputFile);
        }
        return makeCRAMWriter(header, presorted, outputFile, referenceFasta);
    }

    /**
     * Create a CRAMFileWriter on an output stream. Requires the input to be presorted to match the sort order defined
     * by the input header.
     *
     * Note: does not honor factory settings for CREATE_MD5, CREATE_INDEX, USE_ASYNC_IO.
     *
     * @param header entire header. Sort order is determined by the sortOrder property of this arg.
     * @param stream where to write the output.
* @param referenceFasta reference sequence file * @return CRAMFileWriter */ public CRAMFileWriter makeCRAMWriter(final SAMFileHeader header, final OutputStream stream, final File referenceFasta) { // create the CRAMFileWriter directly without propagating factory settings final CRAMFileWriter writer = new CRAMFileWriter(stream, new ReferenceSource(referenceFasta), header, null); setCRAMWriterDefaults(writer); return writer; } /** * Create a CRAMFileWriter on an output file. Requires input record to be presorted to match the * sort order defined by the input header. * * Note: does not honor factory settings for USE_ASYNC_IO. * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param outputFile where to write the output. Must end with .sam, .bam or .cram. * @param referenceFasta reference sequence file * @return CRAMFileWriter * */ public CRAMFileWriter makeCRAMWriter(final SAMFileHeader header, final File outputFile, final File referenceFasta) { return createCRAMWriterWithSettings(header, true, outputFile, referenceFasta); } /** * Create a CRAMFileWriter on an output file. * * Note: does not honor factory setting for USE_ASYNC_IO. * * @param header entire header. Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param outputFile where to write the output. Must end with .sam, .bam or .cram. * @param referenceFasta reference sequence file * @return CRAMFileWriter * */ public CRAMFileWriter makeCRAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile, final File referenceFasta) { return createCRAMWriterWithSettings(header, presorted, outputFile, referenceFasta); } /** * Create a CRAMFileWriter on an output file based on factory settings. * * Note: does not honor the factory setting for USE_ASYNC_IO. * * @param header entire header. 
Sort order is determined by the sortOrder property of this arg. * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder. * @param outputFile where to write the output. Must end with .sam, .bam or .cram. * @param referenceFasta reference sequence file * @return CRAMFileWriter */ private CRAMFileWriter createCRAMWriterWithSettings( final SAMFileHeader header, final boolean presorted, final File outputFile, final File referenceFasta) { OutputStream cramOS = null; OutputStream indexOS = null ; if (createIndex) { if (!IOUtil.isRegularPath(outputFile)) { log.warn("Cannot create index for CRAM because output file is not a regular file: " + outputFile.getAbsolutePath()); } else { try { final File indexFile = new File(outputFile.getAbsolutePath() + BAMIndex.BAMIndexSuffix) ; indexOS = new FileOutputStream(indexFile) ; } catch (final IOException ioe) { throw new RuntimeIOException("Error creating index file for: " + outputFile.getAbsolutePath()+ BAMIndex.BAMIndexSuffix); } } } try { cramOS = IOUtil.maybeBufferOutputStream(new FileOutputStream(outputFile, false), bufferSize); } catch (final IOException ioe) { throw new RuntimeIOException("Error creating CRAM file: " + outputFile.getAbsolutePath()); } CRAMFileWriter writer = new CRAMFileWriter( createMd5File ? 
new Md5CalculatingOutputStream(cramOS, new File(outputFile.getAbsolutePath() + ".md5")) : cramOS, indexOS, presorted, new ReferenceSource(referenceFasta), header, outputFile.getAbsolutePath()); setCRAMWriterDefaults(writer); return writer; } // Set the default CRAM writer preservation parameters private void setCRAMWriterDefaults(CRAMFileWriter writer) { writer.setPreserveReadNames(true); writer.setCaptureAllTags(true); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFileWriterImpl.java000066400000000000000000000224161263034757100237600ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.ProgressLoggerInterface; import htsjdk.samtools.util.SortingCollection; import java.io.File; import java.io.StringWriter; /** * Base class for implementing SAM writer with any underlying format. 
* Mostly this manages accumulation & sorting of SAMRecords when appropriate, * and produces the text version of the header, since that seems to be a popular item * in both text and binary file formats. */ public abstract class SAMFileWriterImpl implements SAMFileWriter { private static int DEAFULT_MAX_RECORDS_IN_RAM = 500000; private int maxRecordsInRam = DEAFULT_MAX_RECORDS_IN_RAM; private SAMFileHeader.SortOrder sortOrder; private SAMFileHeader header; private SortingCollection alignmentSorter; private File tmpDir = new File(System.getProperty("java.io.tmpdir")); private ProgressLoggerInterface progressLogger = null; private boolean isClosed = false; // If true, records passed to addAlignment are already in the order specified by sortOrder private boolean presorted; // For validating presorted records. private SAMSortOrderChecker sortOrderChecker; /** * When writing records that are not presorted, specify the number of records stored in RAM * before spilling to disk. This method sets the default value for all SamFileWriterImpl * instances. Must be called before the constructor is called. * @param maxRecordsInRam */ public static void setDefaultMaxRecordsInRam(final int maxRecordsInRam) { DEAFULT_MAX_RECORDS_IN_RAM = maxRecordsInRam; } /** * When writing records that are not presorted, this number determines the * number of records stored in RAM before spilling to disk. * @return DEAFULT_MAX_RECORDS_IN_RAM */ public static int getDefaultMaxRecordsInRam() { return DEAFULT_MAX_RECORDS_IN_RAM; } /** * Sets the progress logger used by this implementation. Setting this lets this writer emit log * messages as SAM records in a SortingCollection are being written to disk. */ public void setProgressLogger(final ProgressLoggerInterface progress) { this.progressLogger = progress; } /** * Must be called before calling setHeader(). SortOrder value in the header passed * to setHeader() is ignored. If setSortOrder is not called, default is SortOrder.unsorted. 
*/
    public void setSortOrder(final SAMFileHeader.SortOrder sortOrder, final boolean presorted) {
        // The sort order is frozen once a header has been installed.
        if (this.header == null) {
            this.sortOrder = sortOrder;
            this.presorted = presorted;
            return;
        }
        throw new IllegalStateException("Cannot call SAMFileWriterImpl.setSortOrder after setHeader for " +
                getFilename());
    }

    /**
     * Returns the sort order this writer is using.  Must be called after calling setHeader();
     * before that the value may still be null.
     *
     * @return the current sort order
     */
    protected SAMFileHeader.SortOrder getSortOrder() {
        return sortOrder;
    }

    /**
     * When writing records that are not presorted, specify the number of records stored in RAM
     * before spilling to disk.  Must be called before setHeader().
     *
     * @param maxRecordsInRam in-RAM record limit to use for this writer
     * @throws IllegalStateException if a header has already been set
     */
    void setMaxRecordsInRam(final int maxRecordsInRam) {
        final boolean headerAlreadySet = header != null;
        if (headerAlreadySet) {
            throw new IllegalStateException("setMaxRecordsInRam must be called before setHeader()");
        }
        this.maxRecordsInRam = maxRecordsInRam;
    }

    /**
     * When writing records that are not presorted, specify the path of the temporary directory
     * for spilling to disk.  Must be called before setHeader().  Unlike setMaxRecordsInRam(),
     * this ordering is not checked here.
     *
     * @param tmpDir path to the temporary directory; a null value is ignored and the current
     *               directory is kept
     */
    void setTempDirectory(final File tmpDir) {
        if (tmpDir == null) {
            return;
        }
        this.tmpDir = tmpDir;
    }

    /**
     * Must be called before addAlignment. Header cannot be null.
*/
    public void setHeader(final SAMFileHeader header) {
        if (header == null) {
            throw new IllegalArgumentException("A non-null SAMFileHeader is required for a writer");
        }
        this.header = header;

        // The writer's sort order wins over whatever the caller's header says.
        if (sortOrder == null) {
            sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
        header.setSortOrder(sortOrder);

        // Render the header as text and hand it to the concrete writer.
        final StringWriter textSink = new StringWriter();
        new SAMTextHeaderCodec().encode(textSink, header);
        writeHeader(textSink.toString());

        // Decide how incoming records will be handled:
        //   unsorted output          -> written straight through, no checking
        //   sorted and presorted     -> written straight through, order verified
        //   sorted and not presorted -> accumulated in a SortingCollection until close()
        if (sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) {
            presorted = false;
        } else if (presorted) {
            sortOrderChecker = new SAMSortOrderChecker(sortOrder);
        } else {
            alignmentSorter = SortingCollection.newInstance(SAMRecord.class,
                    new BAMRecordCodec(header), makeComparator(), maxRecordsInRam, tmpDir);
        }
    }

    /**
     * @return the header given to setHeader(), or null if none has been set yet
     */
    public SAMFileHeader getFileHeader() {
        return header;
    }

    /**
     * Picks the record comparator that matches the current sort order.
     *
     * @return the comparator for coordinate, queryname or duplicate order; null for unsorted
     */
    private SAMRecordComparator makeComparator() {
        switch (sortOrder) {
            case coordinate: {
                return new SAMRecordCoordinateComparator();
            }
            case queryname: {
                return new SAMRecordQueryNameComparator();
            }
            case duplicate: {
                return new SAMRecordDuplicateComparator();
            }
            case unsorted: {
                return null;
            }
        }
        throw new IllegalStateException("sortOrder should not be null");
    }

    /**
     * Add an alignment record to be emitted by the writer.
     *
     * @param alignment Must not be null. If the alignment record's SAMFileHeader is null, the record will be
     *                  updated to the header used by this writer, which will in turn cause any unresolved reference and
     *                  mate reference indices to be resolved against the new header's sequence dictionary.
*/ public void addAlignment(final SAMRecord alignment) { if (null == alignment.getHeader()) { alignment.setHeader(header); // re-establish the record header and attempt to resolve reference index values } if (sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { writeAlignment(alignment); } else if (presorted) { assertPresorted(alignment); writeAlignment(alignment); } else { alignmentSorter.add(alignment); } } private void assertPresorted(final SAMRecord alignment) { final SAMRecord prev = sortOrderChecker.getPreviousRecord(); if (!sortOrderChecker.isSorted(alignment)) { throw new IllegalArgumentException("Alignments added out of order in SAMFileWriterImpl.addAlignment for " + getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at [" + sortOrderChecker.getSortKey(prev) + "] and [" + sortOrderChecker.getSortKey(alignment) + "]"); } } /** * Must be called or else file will likely be defective. */ public final void close() { if (!isClosed) { if (alignmentSorter != null) { for (final SAMRecord alignment : alignmentSorter) { writeAlignment(alignment); if (progressLogger != null) progressLogger.record(alignment); } alignmentSorter.cleanup(); } finish(); } isClosed = true; } /** * Writes the record to disk. Sort order has been taken care of by the time * this method is called. The record must hava a non-null SAMFileHeader. * @param alignment */ abstract protected void writeAlignment(SAMRecord alignment); /** * Write the header to disk. Header object is available via getHeader(). * @param textHeader for convenience if the implementation needs it. */ abstract protected void writeHeader(String textHeader); /** * Do any required flushing here. */ abstract protected void finish(); /** * For producing error messages. * @return Output filename, or null if there isn't one. 
*/ abstract protected String getFilename(); } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFlag.java000066400000000000000000000077451263034757100217430ustar00rootroot00000000000000/* * The MIT License * * Author: Pierre Lindenbaum PhD @yokofakun * Institut du Thorax - Nantes - France * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.HashSet; import java.util.Set; /** * SAM flags as enum, to be used in GUI, menu, etc... 
*/
public enum SAMFlag {
    READ_PAIRED(                    0x1,    "Template having multiple segments in sequencing"),
    PROPER_PAIR(                    0x2,    "Each segment properly aligned according to the aligner"),
    READ_UNMAPPED(                  0x4,    "Segment unmapped"),
    MATE_UNMAPPED(                  0x8,    "Next segment in the template unmapped"),
    READ_REVERSE_STRAND(            0x10,   "SEQ being reverse complemented"),
    MATE_REVERSE_STRAND(            0x20,   "SEQ of the next segment in the template being reverse complemented"),
    FIRST_OF_PAIR(                  0x40,   "The first segment in the template"),
    SECOND_OF_PAIR(                 0x80,   "The last segment in the template"),
    NOT_PRIMARY_ALIGNMENT(          0x100,  "Secondary alignment"),
    READ_FAILS_VENDOR_QUALITY_CHECK(0x200,  "Not passing quality controls"),
    DUPLICATE_READ(                 0x400,  "PCR or optical duplicate"),
    SUPPLEMENTARY_ALIGNMENT(        0x800,  "Supplementary alignment")
    ;

    /* Bit mask for this flag. Package-visible so that SAMRecord can use it directly. */
    final int flag;
    /* Human-readable explanation of the flag, returned by getDescription(). */
    private final String description;

    /**
     * @param flag        bit mask of this flag
     * @param description human-readable description of this flag
     */
    SAMFlag(int flag,String description) {
        this.flag = flag;
        this.description = description;
    }

    /** @return this flag as an int */
    public int intValue() {
        return flag;
    }

    /** @return a human label for this SAMFlag: the constant name in lower case with '_' replaced by spaces */
    public String getLabel() {
        return name().toLowerCase().replace('_', ' ');
    }

    /** @return a human description for this SAMFlag */
    public String getDescription() {
        return this.description;
    }

    /**
     * Looks up a flag by its exact bit mask; a combination of several bits matches nothing.
     *
     * @return the SAMFlag for the value 'flag' or null if it was not found
     */
    public static SAMFlag valueOf(int flag) {
        for (SAMFlag f : values()) {
            if (flag == f.flag)
                return f;
        }
        return null;
    }

    /**
     * Looks up a flag by its constant name (exact, case-sensitive match).
     *
     * @return the SAMFlag named 'flag', or null if it was not found
     */
    public static SAMFlag findByName(String flag) {
        for (SAMFlag f : values()) {
            if (f.name().equals(flag))
                return f;
        }
        return null;
    }

    /** @return true if the bit for this SAMFlag is set in 'flag' */
    public boolean isSet(int flag) {
        return (this.flag & flag) != 0;
    }

    /** @return true if the bit for this SAMFlag is not set in 'flag' */
    public boolean isUnset(int flag) {
        return !isSet(flag);
    }

    /** @return the java.util.Set of SAMFlag for 'flag' */
    public static
Set getFlags(int flag) { Set set = new HashSet(); for (SAMFlag f : values()) { if (f.isSet(flag)) set.add(f); } return set; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMFormatException.java000066400000000000000000000031351263034757100241660ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Thrown when a SAM file being read or decoded (text or binary) looks bad. 
*/ public class SAMFormatException extends SAMException { public SAMFormatException() { } public SAMFormatException(final String s) { super(s); } public SAMFormatException(final String s, final Throwable throwable) { super(s, throwable); } public SAMFormatException(final Throwable throwable) { super(throwable); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMHeaderRecordComparator.java000066400000000000000000000045651263034757100254460ustar00rootroot00000000000000package htsjdk.samtools; /** * The MIT License *

* Copyright (c) 2014 The Broad Institute *

* Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: *

* The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. *

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ import java.util.Comparator; /** * Provides ordering based on SAM header records' attribute values. Provide the list of attributes to use * in the comparison to the constructor. Null attribute values (i.e., those attributes not present in the * record) sort behind those that have values. */ public class SAMHeaderRecordComparator implements Comparator { private final String[] attributes; public SAMHeaderRecordComparator(final String... attributes) { this.attributes = attributes; } @Override public int compare(final T left, final T right) { for (final String attribute : attributes) { final String leftValue = left.getAttribute(attribute); final String rightValue = right.getAttribute(attribute); if (leftValue == null) { // Fastest comparison possible; two empty values are // equivalent, so move along to the next attribute if (rightValue == null) continue; // Otherwise left < right, since right has a value else return -1; } // left is not null; if right is, left > right if (rightValue == null) return 1; final int compare = leftValue.compareTo(rightValue); if (compare != 0) return compare; } return 0; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMLineParser.java000066400000000000000000000412171263034757100231260ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, 
including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.StringUtil; import java.io.File; import java.util.List; import java.util.Map; import java.util.regex.Pattern; /** * this class enables creation of a SAMRecord object from a String in SAM text format. */ public class SAMLineParser { // From SAM specification private static final int QNAME_COL = 0; private static final int FLAG_COL = 1; private static final int RNAME_COL = 2; private static final int POS_COL = 3; private static final int MAPQ_COL = 4; private static final int CIGAR_COL = 5; private static final int MRNM_COL = 6; private static final int MPOS_COL = 7; private static final int ISIZE_COL = 8; private static final int SEQ_COL = 9; private static final int QUAL_COL = 10; private static final int NUM_REQUIRED_FIELDS = 11; // Read string must contain only these characters private static final Pattern VALID_BASES = Pattern .compile("^[acmgrsvtwyhkdbnACMGRSVTWYHKDBN.=]+$"); /** * Allocate this once rather than for every line as a performance * optimization. 
The size is arbitrary -- merely large enough to handle the
     * maximum number of fields we might expect from a reasonable SAM file.
     * The same array is reused by every parseLine call on this instance.
     */
    private final String[] mFields = new String[10000];

    /**
     * Add information about the origin (reader and position) to SAM records.
     * May be null, in which case parsed records get no file source.
     */
    private final SamReader mParentReader;
    /** Factory used to create each SAMRecord that parseLine returns. */
    private final SAMRecordFactory samRecordFactory;
    /** Controls whether a non-fatal problem throws (STRICT), is printed (LENIENT) or is ignored. */
    private final ValidationStringency validationStringency;
    /** Header attached to every parsed record and used to validate reference names. */
    private final SAMFileHeader mFileHeader;
    /** File being read; only used in error messages, may be null. */
    private final File mFile;
    /** Decoder for the optional tag fields that follow the required columns. */
    private final TextTagCodec tagCodec = new TextTagCodec();

    /** Line number of the line being parsed, as passed to parseLine; used in error messages. */
    private int currentLineNumber;
    /** Text of the line being parsed; used in error messages. */
    private String currentLine;

    //
    // Constructors
    //

    /**
     * Public constructor. Use the default SAMRecordFactory and stringency.
     * No reader or file is associated with the parser.
     *
     * @param samFileHeader SAM file header
     */
    public SAMLineParser(final SAMFileHeader samFileHeader) {

        this(new DefaultSAMRecordFactory(),
                ValidationStringency.DEFAULT_STRINGENCY, samFileHeader,
                null, null);
    }

    /**
     * Public constructor. Use the default SAMRecordFactory and stringency.
     *
     * @param samFileHeader SAM file header
     * @param samFileReader SAM file reader For passing to SAMRecord.setFileSource, may be null.
     * @param samFile SAM file being read (for error message only, may be null)
     */
    public SAMLineParser(final SAMFileHeader samFileHeader,
                         final SamReader samFileReader, final File samFile) {

        this(new DefaultSAMRecordFactory(),
                ValidationStringency.DEFAULT_STRINGENCY, samFileHeader,
                samFileReader, samFile);
    }

    /**
     * Public constructor.
     *
     * @param samRecordFactory SamRecord Factory
     * @param validationStringency validation stringency
     * @param samFileHeader SAM file header
     * @param samFileReader SAM file reader For passing to SAMRecord.setFileSource, may be null.
* @param samFile SAM file being read (for error message only, may be null) */ public SAMLineParser(final SAMRecordFactory samRecordFactory, final ValidationStringency validationStringency, final SAMFileHeader samFileHeader, final SamReader samFileReader, final File samFile) { if (samRecordFactory == null) throw new NullPointerException("The SamRecordFactory must be set"); if (validationStringency == null) throw new NullPointerException("The validationStringency must be set"); if (samFileHeader == null) throw new NullPointerException("The mFileHeader must be set"); this.samRecordFactory = samRecordFactory; this.validationStringency = validationStringency; this.mFileHeader = samFileHeader; // Can be null this.mParentReader = samFileReader; // Can be null this.mFile = samFile; } /** * Get the File header. * * @return the SAM file header */ public SAMFileHeader getFileHeader() { return this.mFileHeader; } /** * Get validation stringency. * * @return validation stringency */ public ValidationStringency getValidationStringency() { return this.validationStringency; } private int parseInt(final String s, final String fieldName) { final int ret; try { ret = Integer.parseInt(s); } catch (NumberFormatException e) { throw reportFatalErrorParsingLine("Non-numeric value in " + fieldName + " column"); } return ret; } private void validateReferenceName(final String rname, final String fieldName) { if (rname.equals("=")) { if (fieldName.equals("MRNM")) { return; } reportErrorParsingLine("= is not a valid value for " + fieldName + " field."); } if (this.mFileHeader.getSequenceDictionary().size() != 0) { if (this.mFileHeader.getSequence(rname) == null) { reportErrorParsingLine(fieldName + " '" + rname + "' not found in any SQ record"); } } } /** * Parse a SAM line. * * @param line line to parse * @return a new SAMRecord object */ public SAMRecord parseLine(final String line) { return parseLine(line, -1); } /** * Parse a SAM line. 
*
     * Splits the line on tabs, fills a new record from the eleven required columns, decodes any
     * trailing tags and, unless stringency is SILENT, reports the record's own validation errors.
     * A line with too few columns or a non-numeric integer column always throws; other problems
     * go through reportErrorParsingLine() and so depend on the validation stringency.
     *
     * @param line line to parse
     * @param lineNumber line number in the file. If the line number is not known
     *        can be <=0.
     * @return a new SAMRecord object
     */
    public SAMRecord parseLine(final String line, final int lineNumber) {

        final String mCurrentLine = line;
        // Remembered so that error messages can quote the offending line.
        this.currentLineNumber = lineNumber;
        this.currentLine = line;

        final int numFields = StringUtil.split(mCurrentLine, mFields, '\t');
        if (numFields < NUM_REQUIRED_FIELDS) {
            throw reportFatalErrorParsingLine("Not enough fields");
        }
        // A completely filled buffer is treated as "too many fields".
        if (numFields == mFields.length) {
            reportErrorParsingLine("Too many fields in SAM text record.");
        }
        for (int i = 0; i < numFields; ++i) {
            if (mFields[i].length() == 0) {
                reportErrorParsingLine("Empty field at position " + i + " (zero-based)");
            }
        }
        final SAMRecord samRecord =
                samRecordFactory.createSAMRecord(this.mFileHeader);
        samRecord.setValidationStringency(this.validationStringency);
        if (mParentReader != null)
            samRecord.setFileSource(new SAMFileSource(mParentReader, null));
        samRecord.setHeader(this.mFileHeader);
        samRecord.setReadName(mFields[QNAME_COL]);

        // Flags are set first because the checks below read them back from the record.
        final int flags = parseInt(mFields[FLAG_COL], "FLAG");
        samRecord.setFlags(flags);

        // RNAME: "*" means no reference; anything else is truncated, validated and stored.
        String rname = mFields[RNAME_COL];
        if (!rname.equals("*")) {
            rname = SAMSequenceRecord.truncateSequenceName(rname);
            validateReferenceName(rname, "RNAME");
            samRecord.setReferenceName(rname);
        } else if (!samRecord.getReadUnmappedFlag()) {
            reportErrorParsingLine("RNAME is not specified but flags indicate mapped");
        }

        // POS, MAPQ and CIGAR must be consistent with whether a reference name is present.
        final int pos = parseInt(mFields[POS_COL], "POS");
        final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ");
        final String cigar = mFields[CIGAR_COL];
        if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(samRecord
                .getReferenceName())) {
            if (pos == 0) {
                reportErrorParsingLine("POS must be non-zero if RNAME is specified");
            }
            if (!samRecord.getReadUnmappedFlag() && cigar.equals("*")) {
                reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified");
            }
        } else {
            if (pos != 0) {
                reportErrorParsingLine("POS must be zero if RNAME is not specified");
            }
            if (mapq != 0) {
                reportErrorParsingLine("MAPQ must be zero if RNAME is not specified");
            }
            if (!cigar.equals("*")) {
                reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified");
            }
        }
        samRecord.setAlignmentStart(pos);
        samRecord.setMappingQuality(mapq);
        samRecord.setCigarString(cigar);

        // MRNM: "*" means no mate reference, "=" means "same as RNAME".
        String mateRName = mFields[MRNM_COL];
        if (mateRName.equals("*")) {
            if (samRecord.getReadPairedFlag() && !samRecord.getMateUnmappedFlag()) {
                reportErrorParsingLine("MRNM not specified but flags indicate mate mapped");
            }
        } else {
            if (!samRecord.getReadPairedFlag()) {
                reportErrorParsingLine("MRNM specified but flags indicate unpaired");
            }
            if (!"=".equals(mateRName)) {
                mateRName = SAMSequenceRecord.truncateSequenceName(mateRName);
            }
            validateReferenceName(mateRName, "MRNM");
            if (mateRName.equals("=")) {
                if (samRecord.getReferenceName() == null) {
                    reportErrorParsingLine("MRNM is '=', but RNAME is not set");
                }
                samRecord.setMateReferenceName(samRecord.getReferenceName());
            } else {
                samRecord.setMateReferenceName(mateRName);
            }
        }

        // MPOS and ISIZE must be consistent with whether a mate reference name is present.
        final int matePos = parseInt(mFields[MPOS_COL], "MPOS");
        final int isize = parseInt(mFields[ISIZE_COL], "ISIZE");
        if (!samRecord.getMateReferenceName().equals(
                SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
            if (matePos == 0) {
                reportErrorParsingLine("MPOS must be non-zero if MRNM is specified");
            }
        } else {
            if (matePos != 0) {
                reportErrorParsingLine("MPOS must be zero if MRNM is not specified");
            }
            if (isize != 0) {
                reportErrorParsingLine("ISIZE must be zero if MRNM is not specified");
            }
        }
        samRecord.setMateAlignmentStart(matePos);
        samRecord.setInferredInsertSize(isize);

        // SEQ: "*" means the bases are absent.
        if (!mFields[SEQ_COL].equals("*")) {
            validateReadBases(mFields[SEQ_COL]);
            samRecord.setReadString(mFields[SEQ_COL]);
        } else {
            samRecord.setReadBases(SAMRecord.NULL_SEQUENCE);
        }

        // QUAL: "*" means the qualities are absent; otherwise it must match SEQ in length.
        if (!mFields[QUAL_COL].equals("*")) {
            // Reference comparison against the NULL_SEQUENCE constant stored just above.
            if (samRecord.getReadBases() == SAMRecord.NULL_SEQUENCE) {
                reportErrorParsingLine("QUAL should not be specified if SEQ is not specified");
            }
            if (samRecord.getReadString().length() != mFields[QUAL_COL].length()) {
                reportErrorParsingLine("length(QUAL) != length(SEQ)");
            }
            samRecord.setBaseQualityString(mFields[QUAL_COL]);
        } else {
            samRecord.setBaseQualities(SAMRecord.NULL_QUALS);
        }

        // Everything after the required columns is an optional tag.
        for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) {
            parseTag(samRecord, mFields[i]);
        }

        // Only call samRecord.isValid() if errors would be reported since the validation
        // is quite expensive in and of itself.
        if (this.validationStringency != ValidationStringency.SILENT) {
            final List validationErrors = samRecord.isValid();
            if (validationErrors != null) {
                for (final SAMValidationError errorMessage : validationErrors) {
                    reportErrorParsingLine(errorMessage.getMessage());
                }
            }
        }

        return samRecord;
    }

    /**
     * Reports an error (once) if the read bases contain a character that isValidReadBase() rejects.
     *
     * @param bases text of the SEQ column
     */
    private void validateReadBases(final String bases) {
        /*
         * Using regex is slow, so check for invalid characters via
         * isValidReadBase(), which hopefully the JIT will optimize.
         * The regex form of the same check would be:
         * if (!VALID_BASES.matcher(bases).matches()) {
         * reportErrorParsingLine("Invalid character in read bases"); }
         */
        for (int i = 0; i < bases.length(); ++i) {
            if (!isValidReadBase(bases.charAt(i))) {
                reportErrorParsingLine("Invalid character in read bases");
                // One report per line is enough.
                return;
            }
        }
    }

    /**
     * @param base one character of the SEQ column
     * @return true for the base codes listed below in either case, and for '.' and '='
     */
    private boolean isValidReadBase(final char base) {
        switch (base) {
            case 'a':
            case 'c':
            case 'm':
            case 'g':
            case 'r':
            case 's':
            case 'v':
            case 't':
            case 'w':
            case 'y':
            case 'h':
            case 'k':
            case 'd':
            case 'b':
            case 'n':
            case 'A':
            case 'C':
            case 'M':
            case 'G':
            case 'R':
            case 'S':
            case 'V':
            case 'T':
            case 'W':
            case 'Y':
            case 'H':
            case 'K':
            case 'D':
            case 'B':
            case 'N':
            case '.':
            case '=':
                return true;
            default:
                return false;
        }
    }

    /**
     * Decodes one optional tag field and stores it on the record. A tag that fails to decode is
     * reported through reportErrorParsingLine() and, if that does not throw, skipped.
     *
     * @param samRecord record being populated
     * @param tag       text of one optional field
     */
    private void parseTag(final SAMRecord samRecord, final String tag) {
        Map.Entry entry = null;
        try {
            entry = tagCodec.decode(tag);
        } catch (SAMFormatException e) {
            reportErrorParsingLine(e);
        }
        if (entry != null) {
            // Array values arrive wrapped together with a flag saying whether they are unsigned.
            if (entry.getValue() instanceof TagValueAndUnsignedArrayFlag) {
                final TagValueAndUnsignedArrayFlag valueAndFlag =
                        (TagValueAndUnsignedArrayFlag) entry.getValue();
                if (valueAndFlag.isUnsignedArray) {
samRecord.setUnsignedArrayAttribute(entry.getKey(), valueAndFlag.value); } else { samRecord.setAttribute(entry.getKey(), valueAndFlag.value); } } else { samRecord.setAttribute(entry.getKey(), entry.getValue()); } } } // // Error methods // private RuntimeException reportFatalErrorParsingLine(final String reason) { return new SAMFormatException(makeErrorString(reason)); } private void reportErrorParsingLine(final String reason) { final String errorMessage = makeErrorString(reason); if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException(errorMessage); } else if (validationStringency == ValidationStringency.LENIENT) { System.err .println("Ignoring SAM validation error due to lenient parsing:"); System.err.println(errorMessage); } } private void reportErrorParsingLine(final Exception e) { final String errorMessage = makeErrorString(e.getMessage()); if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException(errorMessage); } else if (validationStringency == ValidationStringency.LENIENT) { System.err .println("Ignoring SAM validation error due to lenient parsing:"); System.err.println(errorMessage); } } private String makeErrorString(final String reason) { String fileMessage = ""; if (mFile != null) { fileMessage = "File " + mFile + "; "; } return "Error parsing text SAM file. " + reason + "; " + fileMessage + "Line " + (this.currentLineNumber <= 0 ? 
"unknown" : this.currentLineNumber) + "\nLine: " + this.currentLine; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMProgramRecord.java000066400000000000000000000105251263034757100236260ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * In-memory representation of @PG SAM header record. 
*/
public class SAMProgramRecord extends AbstractSAMHeaderRecord {
    public static final String PROGRAM_GROUP_ID_TAG = "ID";
    public static final String PROGRAM_NAME_TAG = "PN";
    public static final String PROGRAM_VERSION_TAG = "VN";
    public static final String COMMAND_LINE_TAG = "CL";
    public static final String PREVIOUS_PROGRAM_GROUP_ID_TAG = "PP";

    /** Value of the ID tag; kept in a field rather than in the attribute collection. */
    private String mProgramGroupId;

    /** The tags listed above, as an unmodifiable set. */
    public static final Set<String> STANDARD_TAGS = Collections.unmodifiableSet(
            new HashSet<String>(Arrays.asList(PROGRAM_GROUP_ID_TAG,
                    PROGRAM_NAME_TAG,
                    PROGRAM_VERSION_TAG,
                    COMMAND_LINE_TAG,
                    PREVIOUS_PROGRAM_GROUP_ID_TAG)));

    /**
     * Creates a record with the given ID and no attributes.
     *
     * @param programGroupId program group ID
     */
    public SAMProgramRecord(final String programGroupId) {
        this.mProgramGroupId = programGroupId;
    }

    /**
     * Creates a record with the given ID and a copy of another record's attributes.
     *
     * @param id               program group ID of the new record
     * @param srcProgramRecord record whose attributes are copied; its ID is not used
     */
    public SAMProgramRecord(final String id, SAMProgramRecord srcProgramRecord) {
        mProgramGroupId = id;
        // NOTE(review): assumes getAttributes() yields String-to-String entries -- confirm in AbstractSAMHeaderRecord.
        for (final Map.Entry<String, String> entry : srcProgramRecord.getAttributes()) {
            setAttribute(entry.getKey(), entry.getValue());
        }
    }

    /** @return the program group ID; same as {@link #getProgramGroupId()} */
    public String getId() {
        return getProgramGroupId();
    }

    /** @return the program group ID */
    public String getProgramGroupId() {
        return mProgramGroupId;
    }

    /** @return the value of the PN attribute */
    public String getProgramName() {
        return getAttribute(PROGRAM_NAME_TAG);
    }

    /** @param name value to store in the PN attribute */
    public void setProgramName(final String name) {
        setAttribute(PROGRAM_NAME_TAG, name);
    }

    /** @return the value of the VN attribute */
    public String getProgramVersion() {
        return getAttribute(PROGRAM_VERSION_TAG);
    }

    /** @param version value to store in the VN attribute */
    public void setProgramVersion(final String version) {
        setAttribute(PROGRAM_VERSION_TAG, version);
    }

    /** @return the value of the CL attribute */
    public String getCommandLine() {
        return getAttribute(COMMAND_LINE_TAG);
    }

    /** @param commandLine value to store in the CL attribute */
    public void setCommandLine(final String commandLine) {
        setAttribute(COMMAND_LINE_TAG, commandLine);
    }

    /** @return the value of the PP attribute */
    public String getPreviousProgramGroupId() {
        return getAttribute(PREVIOUS_PROGRAM_GROUP_ID_TAG);
    }

    /** @param id value to store in the PP attribute */
    public void setPreviousProgramGroupId(final String id) {
        setAttribute(PREVIOUS_PROGRAM_GROUP_ID_TAG, id);
    }

    /**
     * @return true if this == that except for the program group ID, which is arbitrary
     */
    public boolean equivalent(final SAMProgramRecord that) {
        return attributesEqual(that);
    }

    @Override
    public boolean equals(final Object o)
{ if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final SAMProgramRecord that = (SAMProgramRecord) o; if (!attributesEqual(that)) return false; if (mProgramGroupId != null ? !mProgramGroupId.equals(that.mProgramGroupId) : that.mProgramGroupId != null) return false; return true; } @Override public int hashCode() { int result = mProgramGroupId != null ? mProgramGroupId.hashCode() : 0; result = 31 * result + attributesHashCode(); return result; } Set getStandardTags() { return STANDARD_TAGS; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMReadGroupRecord.java000066400000000000000000000153441263034757100241130ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/
package htsjdk.samtools;

import htsjdk.samtools.util.Iso8601Date;

import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Header information about a read group.
 */
public class SAMReadGroupRecord extends AbstractSAMHeaderRecord {
    private String mReadGroupId = null;

    public static final String READ_GROUP_ID_TAG = "ID";
    public static final String SEQUENCING_CENTER_TAG = "CN";
    public static final String DESCRIPTION_TAG = "DS";
    public static final String DATE_RUN_PRODUCED_TAG = "DT";
    public static final String FLOW_ORDER_TAG = "FO";
    public static final String KEY_SEQUENCE_TAG = "KS";
    public static final String LIBRARY_TAG = "LB";
    public static final String PROGRAM_GROUP_TAG = "PG";
    public static final String PREDICTED_MEDIAN_INSERT_SIZE_TAG = "PI";
    public static final String PLATFORM_TAG = "PL";
    public static final String PLATFORM_MODEL_TAG = "PM";
    public static final String PLATFORM_UNIT_TAG = "PU";
    public static final String READ_GROUP_SAMPLE_TAG = "SM";

    /* Platform values for the @RG-PL tag */
    public enum PlatformValue {
        CAPILLARY, LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT, ONT, PACBIO
    }

    /**
     * The thirteen tag constants declared above.
     * NOTE(review): this is a plain mutable HashSet (unlike SAMProgramRecord.STANDARD_TAGS, which is wrapped
     * as unmodifiable); left as is so existing callers are unaffected.
     */
    public static final Set<String> STANDARD_TAGS =
            new HashSet<String>(Arrays.asList(READ_GROUP_ID_TAG, SEQUENCING_CENTER_TAG, DESCRIPTION_TAG,
                    DATE_RUN_PRODUCED_TAG, FLOW_ORDER_TAG, KEY_SEQUENCE_TAG, LIBRARY_TAG,
                    PROGRAM_GROUP_TAG, PREDICTED_MEDIAN_INSERT_SIZE_TAG, PLATFORM_TAG, PLATFORM_MODEL_TAG,
                    PLATFORM_UNIT_TAG, READ_GROUP_SAMPLE_TAG));

    /**
     * Creates a record with the given read group ID and no attributes.
     *
     * @param id read group ID; stored as given.
     */
    public SAMReadGroupRecord(final String id) {
        mReadGroupId = id;
    }

    /**
     * Creates a record with a new ID, copying every attribute of an existing record.
     *
     * @param id ID for the new record.
     * @param srcProgramRecord record whose attributes are copied; must not be null.
     */
    public SAMReadGroupRecord(final String id, final SAMReadGroupRecord srcProgramRecord) {
        mReadGroupId = id;
        for (final Map.Entry<String, String> entry : srcProgramRecord.getAttributes()) {
            setAttribute(entry.getKey(), entry.getValue());
        }
    }

    /** @return the read group ID (same value as {@link #getReadGroupId()}). */
    public String getId() {
        return getReadGroupId();
    }

    public String getReadGroupId() {
        return mReadGroupId;
    }

    public String getSample() {
        return getAttribute(READ_GROUP_SAMPLE_TAG);
    }

    public void setSample(final String value) {
        setAttribute(READ_GROUP_SAMPLE_TAG, value);
    }

    public String getLibrary() {
        return getAttribute(LIBRARY_TAG);
    }

    public void setLibrary(final String value) {
        setAttribute(LIBRARY_TAG, value);
    }

    public String getPlatformUnit() {
        return getAttribute(PLATFORM_UNIT_TAG);
    }

    public void setPlatformUnit(final String pu) {
        setAttribute(PLATFORM_UNIT_TAG, pu);
    }

    public String getPlatform() {
        return getAttribute(PLATFORM_TAG);
    }

    public void setPlatform(final String platform) {
        setAttribute(PLATFORM_TAG, platform);
    }

    /**
     * @return the DT attribute wrapped in an Iso8601Date, or null if the attribute is not set.
     */
    public Date getRunDate() {
        final String dt = getAttribute(DATE_RUN_PRODUCED_TAG);
        if (dt == null) return null;
        else return new Iso8601Date(dt);
    }

    public String getFlowOrder() {
        return getAttribute(FLOW_ORDER_TAG);
    }

    public void setFlowOrder(final String flowOrder) {
        setAttribute(FLOW_ORDER_TAG, flowOrder);
    }

    public String getKeySequence() {
        return getAttribute(KEY_SEQUENCE_TAG);
    }

    public void setKeySequence(final String keySequence) {
        setAttribute(KEY_SEQUENCE_TAG, keySequence);
    }

    /**
     * Converts to Iso8601Date if not already in that form.
     * A null date is passed on to setAttribute() as a null DT value.
     */
    public void setRunDate(Date runDate) {
        if (runDate != null && !(runDate instanceof Iso8601Date)) {
            runDate = new Iso8601Date(runDate);
        }
        setAttribute(DATE_RUN_PRODUCED_TAG, runDate != null ? runDate.toString() : null);
    }

    public String getSequencingCenter() {
        return getAttribute(SEQUENCING_CENTER_TAG);
    }

    public void setSequencingCenter(final String center) {
        setAttribute(SEQUENCING_CENTER_TAG, center);
    }

    public String getDescription() {
        return getAttribute(DESCRIPTION_TAG);
    }

    public void setDescription(final String description) {
        setAttribute(DESCRIPTION_TAG, description);
    }

    /**
     * @return the PI attribute parsed as an integer, or null if the attribute is not set.
     * @throws NumberFormatException if the stored value is not a parsable integer.
     */
    public Integer getPredictedMedianInsertSize() {
        final String stringRep = getAttribute(PREDICTED_MEDIAN_INSERT_SIZE_TAG);
        if (stringRep == null) return null;
        return Integer.parseInt(stringRep);
    }

    public void setPredictedMedianInsertSize(final Integer predictedMedianInsertSize) {
        setAttribute(PREDICTED_MEDIAN_INSERT_SIZE_TAG,
                (predictedMedianInsertSize == null ? null : predictedMedianInsertSize.toString()));
    }

    public String getProgramGroup() {
        return getAttribute(PROGRAM_GROUP_TAG);
    }

    public void setProgramGroup(final String programGroup) {
        setAttribute(PROGRAM_GROUP_TAG, programGroup);
    }

    public String getPlatformModel() {
        return getAttribute(PLATFORM_MODEL_TAG);
    }

    public void setPlatformModel(final String platformModel) {
        setAttribute(PLATFORM_MODEL_TAG, platformModel);
    }

    /**
     * @return true if this == that except for the read group ID, which is arbitrary
     */
    public boolean equivalent(final SAMReadGroupRecord that) {
        return attributesEqual(that);
    }

    /**
     * Two records are equal when they have the same runtime class, equal attributes and
     * equal read group IDs (two null IDs count as equal).
     */
    @Override
    public boolean equals(final Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        final SAMReadGroupRecord that = (SAMReadGroupRecord) o;
        if (!attributesEqual(that)) return false;

        return mReadGroupId == null
                ? that.mReadGroupId == null
                : mReadGroupId.equals(that.mReadGroupId);
    }

    /**
     * Hashes on the read group ID only. A null ID hashes to 0 instead of throwing, so that
     * hashCode() accepts every record that equals() accepts.
     */
    @Override
    public int hashCode() {
        return mReadGroupId != null ? mReadGroupId.hashCode() : 0;
    }

    Set<String> getStandardTags() {
        return STANDARD_TAGS;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecord.java000066400000000000002743661263034757100223150ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.Locatable; import htsjdk.samtools.util.StringUtil; import java.io.Serializable; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; /** * Java binding for a SAM file record. c.f. http://samtools.sourceforge.net/SAM1.pdf *

* The presence of reference name/reference index and alignment start * do not necessarily mean that a read is aligned. Those values may merely be set to force a SAMRecord * to appear in a certain place in the sort order. The readUnmappedFlag must be checked to determine whether * or not a read is mapped. Only if the readUnmappedFlag is false can the reference name/index and alignment start * be interpreted as indicating an actual alignment position. *

* Likewise, presence of mate reference name/index and mate alignment start do not necessarily mean that the * mate is aligned. These may be set for an unaligned mate if the mate has been forced into a particular place * in the sort order per the above paragraph. Only if the mateUnmappedFlag is false can the mate reference name/index * and mate alignment start be interpreted as indicating the actual alignment position of the mate. *

* Note also that there are a number of getters & setters that are linked, i.e. they present different representations * of the same underlying data. In these cases there is typically a representation that is preferred because it * ought to be faster than some other representation. The following are the preferred representations: *

 * <ul>
 *   <li>getReadNameLength() is preferred to getReadName().length()</li>
 *   <li>get/setReadBases() is preferred to get/setReadString()</li>
 *   <li>get/setBaseQualities() is preferred to get/setBaseQualityString()</li>
 *   <li>get/setReferenceIndex() is preferred to get/setReferenceName() for records with valid SAMFileHeaders</li>
 *   <li>get/setMateReferenceIndex() is preferred to get/setMateReferenceName() for records with valid SAMFileHeaders</li>
 *   <li>getCigarLength() is preferred to getCigar().getNumElements()</li>
 *   <li>get/setCigar() is preferred to get/setCigarString()</li>
 * </ul>

* setHeader() is called by the SAM reading code, so the get/setReferenceIndex() and get/setMateReferenceIndex() * methods will have access to the sequence dictionary to resolve reference and mate reference names to dictionary * indices. *

* setHeader() need not be called explicitly when writing SAMRecords; however, the writers require a header * in order to call get/setReferenceIndex() and get/setMateReferenceIndex(). Therefore adding records to a writer * has a side effect: any record that does not have an assigned header at the time it is added to a writer will be * updated and assigned the header associated with the writer. *

* Some of the get() methods return values that are mutable, due to the limitations of Java. A caller should * never change the value returned by a get() method. If you want to change the value of some attribute of a * SAMRecord, create a new value object and call the appropriate set() method. *

* Note that setIndexingBin() need not be called when writing SAMRecords. It will be computed as necessary. It is only * present as an optimization in the event that the value is already known and need not be computed. *

* By default, extensive validation of SAMRecords is done when they are read. Very limited validation is done when * values are set onto SAMRecords. *

*

Notes on Headerless SAMRecords

*

* If the header is null, the following SAMRecord methods may throw exceptions: *

 * <ul>
 *   <li>getReferenceIndex</li>
 *   <li>setReferenceIndex</li>
 *   <li>getMateReferenceIndex</li>
 *   <li>setMateReferenceIndex</li>
 * </ul>
 *

* Record comparators (i.e. SAMRecordCoordinateComparator and SAMRecordDuplicateComparator) require records with * non-null header values. *

* A record with a null header may be validated by the isValid method, but the reference and mate reference indices, * read group, sequence dictionary, and alignment start will not be fully validated unless a header is present. *

* Also, SAMTextWriter, BAMFileWriter, and CRAMFileWriter all require records to have a valid header in order to be * written. Any record that does not have a header at the time it is added to the writer will be updated to use the * header associated with the writer. *

* @author alecw@broadinstitute.org * @author mishali.naik@intel.com */ public class SAMRecord implements Cloneable, Locatable, Serializable { public static final long serialVersionUID = 1L; /** * Alignment score for a good alignment, but where computing a Phred-score is not feasible. */ public static final int UNKNOWN_MAPPING_QUALITY = 255; /** * Alignment score for an unaligned read. */ public static final int NO_MAPPING_QUALITY = 0; /** * If a read has this reference name, it is unaligned, but not all unaligned reads have * this reference name (see above). */ public static final String NO_ALIGNMENT_REFERENCE_NAME = "*"; /** * If a read has this reference index, it is unaligned, but not all unaligned reads have * this reference index (see above). */ public static final int NO_ALIGNMENT_REFERENCE_INDEX = -1; /** * Cigar string for an unaligned read. */ public static final String NO_ALIGNMENT_CIGAR = "*"; /** * If a read has reference name "*", it will have this value for position. */ public static final int NO_ALIGNMENT_START = GenomicIndexUtil.UNSET_GENOMIC_LOCATION; /** * This should rarely be used, since a read with no sequence doesn't make much sense. */ public static final byte[] NULL_SEQUENCE = new byte[0]; public static final String NULL_SEQUENCE_STRING = "*"; /** * This should rarely be used, since all reads should have quality scores. 
*/ public static final byte[] NULL_QUALS = new byte[0]; public static final String NULL_QUALS_STRING = "*"; /** * abs(insertSize) must be <= this */ public static final int MAX_INSERT_SIZE = 1<<29; private String mReadName = null; private byte[] mReadBases = NULL_SEQUENCE; private byte[] mBaseQualities = NULL_QUALS; private String mReferenceName = NO_ALIGNMENT_REFERENCE_NAME; private int mAlignmentStart = NO_ALIGNMENT_START; private transient int mAlignmentEnd = NO_ALIGNMENT_START; private int mMappingQuality = NO_MAPPING_QUALITY; private String mCigarString = NO_ALIGNMENT_CIGAR; private Cigar mCigar = null; private List mAlignmentBlocks = null; private int mFlags = 0; private String mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; private int mMateAlignmentStart = 0; private int mInferredInsertSize = 0; private SAMBinaryTagAndValue mAttributes = null; protected Integer mReferenceIndex = null; protected Integer mMateReferenceIndex = null; private Integer mIndexingBin = null; /** * Some attributes (e.g. CIGAR) are not decoded immediately. Use this to decide how to validate when decoded. */ private ValidationStringency mValidationStringency = ValidationStringency.SILENT; /** * File source of this record. May be null. Note that this field is not serializable (and therefore marked * as transient) due to encapsulated stream objects within it -- so serializing a SAMRecord will cause its * file source to be lost (if it had one). */ private transient SAMFileSource mFileSource; private SAMFileHeader mHeader = null; /** Transient Map of attributes for use by anyone. */ private transient Map transientAttributes; public SAMRecord(final SAMFileHeader header) { mHeader = header; } public String getReadName() { return mReadName; } /** * This method is preferred over getReadName().length(), because for BAMRecord * it may be faster. * @return length not including a null terminator. 
*/
    public int getReadNameLength() {
        return mReadName.length();
    }

    public void setReadName(final String value) {
        mReadName = value;
    }

    /**
     * @return read sequence as a string of ACGTN=.
     */
    public String getReadString() {
        final byte[] bases = getReadBases();
        return bases.length == 0
                ? NULL_SEQUENCE_STRING
                : StringUtil.bytesToString(bases);
    }

    /**
     * Sets the read bases from their text form. The value "*" stores NULL_SEQUENCE; any other value is
     * converted to bytes, normalized with SAMUtils.normalizeBases() and handed to setReadBases().
     */
    public void setReadString(final String value) {
        if (NULL_SEQUENCE_STRING.equals(value)) {
            // Assigned directly to the field; setReadBases() is not invoked on this path.
            mReadBases = NULL_SEQUENCE;
            return;
        }
        final byte[] normalized = StringUtil.stringToBytes(value);
        SAMUtils.normalizeBases(normalized);
        setReadBases(normalized);
    }

    /**
     * Do not modify the value returned by this method. If you want to change the bases, create a new
     * byte[] and call setReadBases() or call setReadString().
     * @return read sequence as ASCII bytes ACGTN=.
     */
    public byte[] getReadBases() {
        return mReadBases;
    }

    public void setReadBases(final byte[] value) {
        mReadBases = value;
    }

    /**
     * This method is preferred over getReadBases().length, because for BAMRecord it may be faster.
     * @return number of bases in the read.
     */
    public int getReadLength() {
        return getReadBases().length;
    }

    /**
     * @return Base qualities, encoded as a FASTQ string.
     */
    public String getBaseQualityString() {
        final boolean noQualities = Arrays.equals(NULL_QUALS, getBaseQualities());
        return noQualities
                ? NULL_QUALS_STRING
                : SAMUtils.phredToFastq(getBaseQualities());
    }

    /**
     * Sets the base qualities from a FASTQ-encoded string; "*" stores NULL_QUALS.
     */
    public void setBaseQualityString(final String value) {
        if (NULL_QUALS_STRING.equals(value)) {
            setBaseQualities(NULL_QUALS);
            return;
        }
        setBaseQualities(SAMUtils.fastqToPhred(value));
    }

    /**
     * Do not modify the value returned by this method. If you want to change the qualities, create a new
     * byte[] and call setBaseQualities() or call setBaseQualityString().
     * @return Base qualities, as binary phred scores (not ASCII).
*/
    public byte[] getBaseQualities() {
        return mBaseQualities;
    }

    public void setBaseQualities(final byte[] value) {
        mBaseQualities = value;
    }

    /**
     * If the original base quality scores have been stored in the "OQ" tag, returns the numeric
     * scores as a byte[]; returns null when the tag is absent or empty.
     */
    public byte[] getOriginalBaseQualities() {
        final String oqString = (String) getAttribute("OQ");
        if (oqString == null || oqString.length() == 0) {
            return null;
        }
        return SAMUtils.fastqToPhred(oqString);
    }

    /**
     * Sets the original base quality scores into the "OQ" tag as a String. Supplied value should be
     * as phred-scaled numeric qualities.
     */
    public void setOriginalBaseQualities(final byte[] oq) {
        final String encoded = SAMUtils.phredToFastq(oq);
        setAttribute("OQ", encoded);
    }

    /**
     * @return true if the index is non-null and not NO_ALIGNMENT_REFERENCE_INDEX, or the name is
     * anything other than NO_ALIGNMENT_REFERENCE_NAME.
     */
    private static boolean hasReferenceName(final Integer referenceIndex, final String referenceName) {
        final boolean indexPresent =
                referenceIndex != null && !referenceIndex.equals(NO_ALIGNMENT_REFERENCE_INDEX);
        final boolean namePresent = !NO_ALIGNMENT_REFERENCE_NAME.equals(referenceName);
        return indexPresent || namePresent;
    }

    /**
     * @return true if this SAMRecord has a reference, either as a String or index (or both).
     */
    private boolean hasReferenceName() {
        return hasReferenceName(mReferenceIndex, mReferenceName);
    }

    /**
     * @return true if this SAMRecord has a mate reference, either as a String or index (or both).
     */
    private boolean hasMateReferenceName() {
        return hasReferenceName(mMateReferenceIndex, mMateReferenceName);
    }

    /**
     * @return Reference name, or NO_ALIGNMENT_REFERENCE_NAME (*) if the record has no reference name
     */
    public String getReferenceName() {
        return mReferenceName;
    }

    /**
     * Sets the reference name for this record. If the record has a valid SAMFileHeader and the reference
     * name is present in the associated sequence dictionary, the record's reference index will also be
     * updated with the corresponding sequence index. If referenceName is NO_ALIGNMENT_REFERENCE_NAME, sets
     * the reference index to NO_ALIGNMENT_REFERENCE_INDEX.
* * @param referenceName - must not be null */ public void setReferenceName(final String referenceName) { if (null == referenceName) { throw new IllegalArgumentException( "Reference name must not be null. Use SAMRecord.NO_ALIGNMENT_REFERENCE_NAME to reset the reference name."); } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(referenceName)) { mReferenceName = NO_ALIGNMENT_REFERENCE_NAME; mReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; } else if (null != mHeader) { // String.intern() is surprisingly expensive, so avoid it by looking up in sequence dictionary if possible final int referenceIndex = mHeader.getSequenceIndex(referenceName); if (-1 != referenceIndex) { setReferenceIndex(referenceIndex); // sets reference name and index } else { mReferenceName = referenceName.intern(); mReferenceIndex = null; } } else { mReferenceName = referenceName.intern(); mReferenceIndex = null; } } /** * Returns the reference index for this record. * * If the reference name for this record has previously been resolved against the sequence dictionary, the corresponding * index is returned directly. Otherwise, the record must have a non-null SAMFileHeader that can be used to * resolve the index for the record's current reference name, unless the reference name is NO_ALIGNMENT_REFERENCE_NAME. * If the record has a header, and the name does not appear in the header's sequence dictionary, the value * NO_ALIGNMENT_REFERENCE_INDEX (-1) will be returned. If the record does not have a header, an IllegalStateException * is thrown. * * @return Index in the sequence dictionary of the reference sequence. If the read has no reference sequence, or if * the reference name is not found in the sequence index, NO_ALIGNMENT_REFERENCE_INDEX (-1) is returned. * * @throws IllegalStateException if the reference index cannot be resolved because the SAMFileHeader for the * record is null. 
*/ public Integer getReferenceIndex() { if (null == mReferenceIndex) { // try to resolve the reference index if (NO_ALIGNMENT_REFERENCE_NAME.equals(mReferenceName)) { mReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; } else if (null != mHeader) { mReferenceIndex = mHeader.getSequenceIndex(mReferenceName); } else { throw new IllegalStateException("A non-null SAMFileHeader is required to resolve the reference index"); } } return mReferenceIndex; } /** * Updates the reference index. The record must have a valid SAMFileHeader unless the referenceIndex parameter equals * NO_ALIGNMENT_REFERENCE_INDEX, and the reference index must appear in the header's sequence dictionary. If the * reference index is valid, the reference name will also be resolved and updated to the name for the sequence * dictionary entry corresponding to the index. * * @param referenceIndex Must either equal NO_ALIGNMENT_REFERENCE_INDEX (-1) indicating no reference, or the * record must have a SAMFileHeader and the index must exist in the associated sequence * dictionary. * @throws IllegalStateException if the SAMFileHeader is null for this record or the reference index is not * found in the sequence dictionary for this record. 
*/
    public void setReferenceIndex(final int referenceIndex) {
        if (referenceIndex == NO_ALIGNMENT_REFERENCE_INDEX) {
            mReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX;
            mReferenceName = NO_ALIGNMENT_REFERENCE_NAME;
            return;
        }

        if (mHeader == null) {
            throw new IllegalStateException("A non-null SAMFileHeader is required to resolve the reference index");
        }

        final SAMSequenceRecord dictionaryEntry = mHeader.getSequence(referenceIndex);
        if (dictionaryEntry == null) {
            throw new IllegalArgumentException("Reference index " + referenceIndex + " not found in sequence dictionary.");
        }

        // Index and name are updated together so the two views stay in agreement.
        mReferenceIndex = referenceIndex;
        mReferenceName = dictionaryEntry.getSequenceName();
    }

    /**
     * @return Mate reference name, or NO_ALIGNMENT_REFERENCE_NAME (*) if the record has no mate reference name
     */
    public String getMateReferenceName() {
        return mMateReferenceName;
    }

    /**
     * Sets the mate reference name for this record. If the record has a valid SAMFileHeader and the mate reference
     * name is present in the associated sequence dictionary, the record's mate reference index will also be
     * updated with the corresponding sequence index. If mateReferenceName is NO_ALIGNMENT_REFERENCE_NAME, sets the
     * mate reference index to NO_ALIGNMENT_REFERENCE_INDEX.
* * @param mateReferenceName - must not be null */ public void setMateReferenceName(final String mateReferenceName) { if (null == mateReferenceName) { throw new IllegalArgumentException("Mate reference name must not be null"); } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mateReferenceName)) { mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; mMateReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; } else if (null != mHeader) { final int mateReferenceIndex = mHeader.getSequenceIndex(mateReferenceName); if (-1 != mateReferenceIndex) { setMateReferenceIndex(mateReferenceIndex); // sets mate reference name and index } else { mMateReferenceName = mateReferenceName.intern(); mMateReferenceIndex = null; } } else { mMateReferenceName = mateReferenceName.intern(); mMateReferenceIndex = null; } } /** * Returns the mate reference index for this record. * * If the mate reference name for this record has previously been resolved against the sequence dictionary, the * corresponding index is returned directly. Otherwise, the record must have a non-null SAMFileHeader that can be * used to resolve the index for the record's current mate reference name, unless the mate reference name is * NO_ALIGNMENT_REFERENCE_NAME. If the record has a header, and the name does not appear in the header's * sequence dictionary, the value NO_ALIGNMENT_REFERENCE_INDEX (-1) will be returned. If the record does not have * a header, an IllegalStateException is thrown. * * @return Index in the sequence dictionary of the mate reference sequence. If the read has no mate reference * sequence, or if the mate reference name is not found in the sequence index, NO_ALIGNMENT_REFERENCE_INDEX (-1) * is returned. * * @throws IllegalStateException if the mate reference index cannot be resolved because the SAMFileHeader for the * record is null. 
*/ public Integer getMateReferenceIndex() { if (null == mMateReferenceIndex) { // try to resolve the reference index if (NO_ALIGNMENT_REFERENCE_NAME.equals(mMateReferenceName)) { mMateReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; } else if (null != mHeader) { mMateReferenceIndex = mHeader.getSequenceIndex(mMateReferenceName); } else { throw new IllegalStateException("A non-null SAMFileHeader is required to resolve the mate reference index"); } } return mMateReferenceIndex; } /** * Updates the mate reference index. The record must have a valid SAMFileHeader, and the mate reference index must appear in * the header's sequence dictionary, unless the mateReferenceIndex parameter equals NO_ALIGNMENT_REFERENCE_INDEX. If the mate * reference index is valid, the mate reference name will also be resolved and updated to the name for the sequence dictionary * entry corresponding to the index. * * @param mateReferenceIndex Must either equal NO_ALIGNMENT_REFERENCE_INDEX (-1) indicating no reference, or the * record must have a SAMFileHeader and the index must exist in the associated sequence * dictionary. * @throws IllegalStateException if the SAMFileHeader is null for this record or the mate reference index is not * found in the sequence dictionary for this record. 
*/ public void setMateReferenceIndex(final int mateReferenceIndex) { if (mateReferenceIndex == NO_ALIGNMENT_REFERENCE_INDEX) { mMateReferenceIndex = NO_ALIGNMENT_REFERENCE_INDEX; mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; } else if (null == mHeader) { throw new IllegalStateException("A non-null SAMFileHeader is required to resolve the mate reference index"); } else { SAMSequenceRecord samSequence = mHeader.getSequence(mateReferenceIndex); if (null != samSequence) { mMateReferenceIndex = mateReferenceIndex; mMateReferenceName = samSequence.getSequenceName(); } else { throw new IllegalArgumentException("Reference index " + mateReferenceIndex + " not found in sequence dictionary."); } } } /** * @return 1-based inclusive leftmost position of the clipped sequence, or 0 if there is no position. */ public int getAlignmentStart() { return mAlignmentStart; } /** * @param value 1-based inclusive leftmost position of the clipped sequence, or 0 if there is no position. */ public void setAlignmentStart(final int value) { mAlignmentStart = value; // Clear cached alignment end mAlignmentEnd = NO_ALIGNMENT_START; // Change to alignmentStart could change indexing bin setIndexingBin(null); } /** * @return 1-based inclusive rightmost position of the clipped sequence, or 0 read if unmapped. */ public int getAlignmentEnd() { if (getReadUnmappedFlag()) { return NO_ALIGNMENT_START; } else if (this.mAlignmentEnd == NO_ALIGNMENT_START) { this.mAlignmentEnd = mAlignmentStart + getCigar().getReferenceLength() - 1; } return this.mAlignmentEnd; } /** * @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. * * Invalid to call on an unmapped read. 
*/
    public int getUnclippedStart() {
        final int clippedStart = getAlignmentStart();
        return SAMUtils.getUnclippedStart(clippedStart, getCigar());
    }

    /**
     * @return the alignment end (1-based, inclusive) adjusted for clipped bases. For example if the read
     * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped)
     * then this method will return 107.
     *
     * Invalid to call on an unmapped read.
     */
    public int getUnclippedEnd() {
        final int clippedEnd = getAlignmentEnd();
        return SAMUtils.getUnclippedEnd(clippedEnd, getCigar());
    }

    /**
     * @param offset 1-based location within the unclipped sequence or 0 if there is no position.
     *

* Non static version of the static function with the same name. * @return 1-based inclusive reference position of the unclipped sequence at a given offset, */ public int getReferencePositionAtReadPosition(final int offset) { return getReferencePositionAtReadPosition(this, offset); } /** * @param rec record to use * @param offset 1-based location within the unclipped sequence * @return 1-based inclusive reference position of the unclipped sequence at a given offset, * or 0 if there is no position. * For example, given the sequence NNNAAACCCGGG, cigar 3S9M, and an alignment start of 1, * and a (1-based)offset 10 (start of GGG) it returns 7 (1-based offset starting after the soft clip. * For example: given the sequence AAACCCGGGTTT, cigar 4M1D6M, an alignment start of 1, * an offset of 4 returns reference position 4, an offset of 5 returns reference position 6. * Another example: given the sequence AAACCCGGGTTT, cigar 4M1I6M, an alignment start of 1, * an offset of 4 returns reference position 4, an offset of 5 returns 0. */ public static int getReferencePositionAtReadPosition(final SAMRecord rec, final int offset) { if (offset == 0) return 0; for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) { if (CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) < offset) { continue; } else if (offset < alignmentBlock.getReadStart()) { return 0; } else { return alignmentBlock.getReferenceStart() + offset - alignmentBlock.getReadStart(); } } return 0; // offset not located in an alignment block } /** * @param pos 1-based reference position * return the offset * @return 1-based (to match getReferencePositionAtReadPosition behavior) inclusive position into the * unclipped sequence at a given reference position, or 0 if there is no such position. 
* * See examples in the static version below */
    public int getReadPositionAtReferencePosition(final int pos) {
        final boolean returnLastBaseIfDeleted = false;
        return getReadPositionAtReferencePosition(this, pos, returnLastBaseIfDeleted);
    }

    /**
     * @param pos 1-based reference position
     * @param returnLastBaseIfDeleted if true, and reference position matches a deleted base in the read, function will
     * return the offset
     * @return 1-based (to match getReferencePositionAtReadPosition behavior) inclusive position into the
     * unclipped sequence at a given reference position,
     * or 0 if there is no such position. If returnLastBaseIfDeleted is true deletions are assumed to "live" on the last read base
     * in the preceding block.
     *
     * Non-static version of static function with the same name. See examples below.
     */
    public int getReadPositionAtReferencePosition(final int pos, final boolean returnLastBaseIfDeleted) {
        return getReadPositionAtReferencePosition(this, pos, returnLastBaseIfDeleted);
    }

    /**
     * @param rec record to use
     * @param pos 1-based reference position
     * @param returnLastBaseIfDeleted if true, and reference position matches a deleted base in the read, function will
     * return the offset
     * @return 1-based (to match getReferencePositionAtReadPosition behavior) inclusive position into the
     * unclipped sequence at a given reference position,
     * or 0 if there is no such position. If returnLastBaseIfDeleted is true deletions are assumed to "live" on the last read base
     * in the preceding block.
     * For example, given the sequence NNNAAACCCGGG, cigar 3S9M, and an alignment start of 1,
     * and a (1-based)pos of 7 (start of GGG) it returns 10 (1-based offset including the soft clip.
     * For example: given the sequence AAACCCGGGT, cigar 4M1D6M, an alignment start of 1,
     * a reference position of 4 returns offset of 4, a reference of 5 also returns an offset 4 (using "left aligning") if returnLastBaseIfDeleted
     * and 0 otherwise.
* For example: given the sequence AAACtCGGGTT, cigar 4M1I6M, an alignment start of 1, * a position 4 returns an offset 5, a position of 5 returns 6 (the inserted base is the 5th offset), a position of 11 returns 0 since * that position in the reference doesn't overlap the read at all. * */ public static int getReadPositionAtReferencePosition(final SAMRecord rec, final int pos, final boolean returnLastBaseIfDeleted) { if (pos <= 0) { return 0; } int lastAlignmentOffset = 0; for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) { if (CoordMath.getEnd(alignmentBlock.getReferenceStart(), alignmentBlock.getLength()) >= pos) { if (pos < alignmentBlock.getReferenceStart()) { //There must have been a deletion block that skipped return returnLastBaseIfDeleted ? lastAlignmentOffset : 0; } else { return pos - alignmentBlock.getReferenceStart() + alignmentBlock.getReadStart() ; } } else { // record the offset to the last base in the current block, in case the next block starts too late lastAlignmentOffset = alignmentBlock.getReadStart() + alignmentBlock.getLength() - 1 ; } } // if we are here, the reference position was not overlapping the read at all return 0; } /** * @return 1-based inclusive leftmost position of the clipped mate sequence, or 0 if there is no position. */ public int getMateAlignmentStart() { return mMateAlignmentStart; } public void setMateAlignmentStart(final int mateAlignmentStart) { this.mMateAlignmentStart = mateAlignmentStart; } /** * @return insert size (difference btw 5' end of read & 5' end of mate), if possible, else 0. * Negative if mate maps to lower position than read. */ public int getInferredInsertSize() { return mInferredInsertSize; } public void setInferredInsertSize(final int inferredInsertSize) { this.mInferredInsertSize = inferredInsertSize; } /** * @return phred scaled mapping quality. 255 implies valid mapping but quality is hard to compute. 
*/ public int getMappingQuality() { return mMappingQuality; } public void setMappingQuality(final int value) { mMappingQuality = value; } public String getCigarString() { if (mCigarString == null && getCigar() != null) { mCigarString = TextCigarCodec.encode(getCigar()); } return mCigarString; } public void setCigarString(final String value) { mCigarString = value; mCigar = null; mAlignmentBlocks = null; // Clear cached alignment end mAlignmentEnd = NO_ALIGNMENT_START; // Change to cigar could change alignmentEnd, and thus indexing bin setIndexingBin(null); } /** * Do not modify the value returned by this method. If you want to change the Cigar, create a new * Cigar and call setCigar() or call setCigarString() * @return Cigar object for the read, or null if there is none. */ public Cigar getCigar() { if (mCigar == null && mCigarString != null) { mCigar = TextCigarCodec.decode(mCigarString); if (null != getHeader() && getValidationStringency() != ValidationStringency.SILENT && !this.getReadUnmappedFlag()) { // Don't know line number, and don't want to force read name to be decoded. SAMUtils.processValidationErrors(this.validateCigar(-1L), -1L, getValidationStringency()); } } return mCigar; } /** * This method is preferred over getCigar().getNumElements(), because for BAMRecord it may be faster. * @return number of cigar elements (number + operator) in the cigar string. */ public int getCigarLength() { return getCigar().numCigarElements(); } public void setCigar(final Cigar cigar) { initializeCigar(cigar); // Change to cigar could change alignmentEnd, and thus indexing bin setIndexingBin(null); } /** * For setting the Cigar string when BAMRecord has decoded it. Use this rather than setCigar() * so that indexing bin doesn't get clobbered. 
*/ protected void initializeCigar(final Cigar cigar) { this.mCigar = cigar; mCigarString = null; mAlignmentBlocks = null; // Clear cached alignment end mAlignmentEnd = NO_ALIGNMENT_START; } /** * Get the SAMReadGroupRecord for this SAMRecord. * @return The SAMReadGroupRecord from the SAMFileHeader for this SAMRecord, or null if * 1) this record has no RG tag, or 2) the header doesn't contain the read group with * the given ID.or 3) this record has no SAMFileHeader * @throws ClassCastException if RG tag does not have a String value. */ public SAMReadGroupRecord getReadGroup() { final String rgId = (String)getAttribute(SAMTagUtil.getSingleton().RG); if (rgId == null || getHeader() == null) { return null; } else { return getHeader().getReadGroup(rgId); } } /** * It is preferable to use the get*Flag() methods that handle the flag word symbolically. */ public int getFlags() { return mFlags; } public void setFlags(final int value) { mFlags = value; // Could imply change to readUnmapped flag, which could change indexing bin setIndexingBin(null); } /** * the read is paired in sequencing, no matter whether it is mapped in a pair. */ public boolean getReadPairedFlag() { return (mFlags & SAMFlag.READ_PAIRED.flag) != 0; } private void requireReadPaired() { if (!getReadPairedFlag()) { throw new IllegalStateException("Inappropriate call if not paired read"); } } /** * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment). */ public boolean getProperPairFlag() { requireReadPaired(); return getProperPairFlagUnchecked(); } private boolean getProperPairFlagUnchecked() { return (mFlags & SAMFlag.PROPER_PAIR.flag) != 0; } /** * the query sequence itself is unmapped. */ public boolean getReadUnmappedFlag() { return (mFlags & SAMFlag.READ_UNMAPPED.flag) != 0; } /** * the mate is unmapped. 
*/
    public boolean getMateUnmappedFlag() {
        requireReadPaired();
        return getMateUnmappedFlagUnchecked();
    }

    /** Reads the mate-unmapped bit without checking that the read is paired. */
    private boolean getMateUnmappedFlagUnchecked() {
        final int bit = SAMFlag.MATE_UNMAPPED.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * strand of the query (false for forward; true for reverse strand).
     */
    public boolean getReadNegativeStrandFlag() {
        final int bit = SAMFlag.READ_REVERSE_STRAND.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * strand of the mate (false for forward; true for reverse strand).
     */
    public boolean getMateNegativeStrandFlag() {
        requireReadPaired();
        return getMateNegativeStrandFlagUnchecked();
    }

    /** Reads the mate-reverse-strand bit without checking that the read is paired. */
    private boolean getMateNegativeStrandFlagUnchecked() {
        final int bit = SAMFlag.MATE_REVERSE_STRAND.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * the read is the first read in a pair.
     */
    public boolean getFirstOfPairFlag() {
        requireReadPaired();
        return getFirstOfPairFlagUnchecked();
    }

    /** Reads the first-of-pair bit without checking that the read is paired. */
    private boolean getFirstOfPairFlagUnchecked() {
        final int bit = SAMFlag.FIRST_OF_PAIR.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * the read is the second read in a pair.
     */
    public boolean getSecondOfPairFlag() {
        requireReadPaired();
        return getSecondOfPairFlagUnchecked();
    }

    /** Reads the second-of-pair bit without checking that the read is paired. */
    private boolean getSecondOfPairFlagUnchecked() {
        final int bit = SAMFlag.SECOND_OF_PAIR.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * the alignment is not primary (a read having split hits may have multiple primary alignment records).
     */
    public boolean getNotPrimaryAlignmentFlag() {
        final int bit = SAMFlag.NOT_PRIMARY_ALIGNMENT.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * the alignment is supplementary (TODO: further explanation?).
     */
    public boolean getSupplementaryAlignmentFlag() {
        final int bit = SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * the read fails platform/vendor quality checks.
     */
    public boolean getReadFailsVendorQualityCheckFlag() {
        final int bit = SAMFlag.READ_FAILS_VENDOR_QUALITY_CHECK.flag;
        return (mFlags & bit) != 0;
    }

    /**
     * the read is either a PCR duplicate or an optical duplicate.
*/ public boolean getDuplicateReadFlag() { return (mFlags & SAMFlag.DUPLICATE_READ.flag) != 0; } /** * the read is paired in sequencing, no matter whether it is mapped in a pair. */ public void setReadPairedFlag(final boolean flag) { setFlag(flag, SAMFlag.READ_PAIRED.flag); } /** * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment). */ public void setProperPairFlag(final boolean flag) { setFlag(flag, SAMFlag.PROPER_PAIR.flag); } /** * the query sequence itself is unmapped. This method name is misspelled. * Use setReadUnmappedFlag instead. * @deprecated */ public void setReadUmappedFlag(final boolean flag) { setReadUnmappedFlag(flag); } /** * the query sequence itself is unmapped. */ public void setReadUnmappedFlag(final boolean flag) { setFlag(flag, SAMFlag.READ_UNMAPPED.flag); // Change to readUnmapped could change indexing bin setIndexingBin(null); } /** * the mate is unmapped. */ public void setMateUnmappedFlag(final boolean flag) { setFlag(flag, SAMFlag.MATE_UNMAPPED.flag); } /** * strand of the query (false for forward; true for reverse strand). */ public void setReadNegativeStrandFlag(final boolean flag) { setFlag(flag, SAMFlag.READ_REVERSE_STRAND.flag); } /** * strand of the mate (false for forward; true for reverse strand). */ public void setMateNegativeStrandFlag(final boolean flag) { setFlag(flag, SAMFlag.MATE_REVERSE_STRAND.flag); } /** * the read is the first read in a pair. */ public void setFirstOfPairFlag(final boolean flag) { setFlag(flag, SAMFlag.FIRST_OF_PAIR.flag); } /** * the read is the second read in a pair. */ public void setSecondOfPairFlag(final boolean flag) { setFlag(flag, SAMFlag.SECOND_OF_PAIR.flag); } /** * the alignment is not primary (a read having split hits may have multiple primary alignment records). */ public void setNotPrimaryAlignmentFlag(final boolean flag) { setFlag(flag, SAMFlag.NOT_PRIMARY_ALIGNMENT.flag); } /** * the alignment is supplementary (TODO: further explanation?). 
*/
    public void setSupplementaryAlignmentFlag(final boolean flag) {
        final int bit = SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag;
        setFlag(flag, bit);
    }

    /**
     * the read fails platform/vendor quality checks.
     */
    public void setReadFailsVendorQualityCheckFlag(final boolean flag) {
        final int bit = SAMFlag.READ_FAILS_VENDOR_QUALITY_CHECK.flag;
        setFlag(flag, bit);
    }

    /**
     * the read is either a PCR duplicate or an optical duplicate.
     */
    public void setDuplicateReadFlag(final boolean flag) {
        final int bit = SAMFlag.DUPLICATE_READ.flag;
        setFlag(flag, bit);
    }

    /**
     * Tests if this record is either a secondary and/or supplementary alignment;
     * equivalent to {@code (getNotPrimaryAlignmentFlag() || getSupplementaryAlignmentFlag())}.
     */
    public boolean isSecondaryOrSupplementary() {
        if (getNotPrimaryAlignmentFlag()) {
            return true;
        }
        return getSupplementaryAlignmentFlag();
    }

    /**
     * Sets or clears a single bit pattern in the flag word.
     * @param flag true to set the bit(s), false to clear them
     * @param bit mask of the bit(s) to change
     */
    private void setFlag(final boolean flag, final int bit) {
        mFlags = flag ? (mFlags | bit) : (mFlags & ~bit);
    }

    /**
     * @return the stringency applied when lazily-decoded elements are validated.
     */
    public ValidationStringency getValidationStringency() {
        return this.mValidationStringency;
    }

    /**
     * Control validation of lazily-decoded elements.
     */
    public void setValidationStringency(final ValidationStringency validationStringency) {
        mValidationStringency = validationStringency;
    }

    /**
     * Get the value for a SAM tag.
     * WARNING: Some value types (e.g. byte[]) are mutable. It is dangerous to change one of these values in
     * place, because some SAMRecord implementations keep track of when attributes have been changed. If you
     * want to change an attribute value, call setAttribute() to replace the value.
     *
     * @param tag Two-character tag name.
     * @return Appropriately typed tag value, or null if the requested tag is not present.
     */
    public Object getAttribute(final String tag) {
        final short binaryTag = SAMTagUtil.getSingleton().makeBinaryTag(tag);
        return getAttribute(binaryTag);
    }

    /**
     * Get the tag value and attempt to coerce it into the requested type.
     * @param tag The requested tag.
     * @return The value of a tag, converted into a signed Integer if possible.
* @throws RuntimeException If the value is not an integer type, or will not fit in a signed Integer.
     */
    public Integer getIntegerAttribute(final String tag) {
        final Object raw = getAttribute(tag);
        // Absent tag and already-Integer values need no conversion.
        if (raw == null || raw instanceof Integer) {
            return (Integer) raw;
        }
        if (raw instanceof Number) {
            final long widened = ((Number) raw).longValue();
            final boolean fitsInInt = widened >= Integer.MIN_VALUE && widened <= Integer.MAX_VALUE;
            if (fitsInInt) {
                return (int) widened;
            }
            throw new RuntimeException("Value for tag " + tag + " is not in Integer range: " + widened);
        }
        throw new RuntimeException("Value for tag " + tag + " is not Number: " + raw.getClass());
    }

    /**
     * A convenience method that will return a valid unsigned integer as a Long,
     * or fail with an exception if the tag value is invalid.
     *
     * @param tag Two-character tag name.
     * @return valid unsigned integer associated with the tag, as a Long
     * @throws SAMException if the value is out of range for a 32-bit unsigned value, or not a Number
     */
    public Long getUnsignedIntegerAttribute(final String tag) throws SAMException {
        final short binaryTag = SAMTagUtil.getSingleton().makeBinaryTag(tag);
        return getUnsignedIntegerAttribute(binaryTag);
    }

    /**
     * A convenience method that will return a valid unsigned integer as a Long,
     * or fail with an exception if the tag value is invalid.
     *
     * @param tag Binary representation of a 2-char String tag as created by SAMTagUtil.
* @return valid unsigned integer associated with the tag, as a Long * @throws {@link htsjdk.samtools.SAMException} if the value is out of range for a 32-bit unsigned value, or not a Number */ public Long getUnsignedIntegerAttribute(final short tag) throws SAMException { final Object value = getAttribute(tag); if (value == null) { return null; } if (value instanceof Number) { final long lValue = ((Number)value).longValue(); if (SAMUtils.isValidUnsignedIntegerAttribute(lValue)) { return lValue; } else { throw new SAMException("Unsigned integer value of tag " + SAMTagUtil.getSingleton().makeStringTag(tag) + " is out of bounds for a 32-bit unsigned integer: " + lValue); } } else { throw new SAMException("Unexpected attribute value data type " + value.getClass() + " for tag " + SAMTagUtil.getSingleton().makeStringTag(tag)); } } /** * Get the tag value and attempt to coerce it into the requested type. * @param tag The requested tag. * @return The value of a tag, converted into a Short if possible. * @throws RuntimeException If the value is not an integer type, or will not fit in a Short. */ public Short getShortAttribute(final String tag) { final Object val = getAttribute(tag); if (val == null) return null; if (val instanceof Short) { return (Short)val; } if (!(val instanceof Number)) { throw new RuntimeException("Value for tag " + tag + " is not Number: " + val.getClass()); } final long longVal = ((Number)val).longValue(); if (longVal < Short.MIN_VALUE || longVal > Short.MAX_VALUE) { throw new RuntimeException("Value for tag " + tag + " is not in Short range: " + longVal); } return (short)longVal; } /** * Get the tag value and attempt to coerce it into the requested type. * @param tag The requested tag. * @return The value of a tag, converted into a Byte if possible. * @throws RuntimeException If the value is not an integer type, or will not fit in a Byte. 
*/
    public Byte getByteAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof Byte) {
            return (Byte) val;
        }
        if (!(val instanceof Number)) {
            throw new RuntimeException("Value for tag " + tag + " is not Number: " + val.getClass());
        }
        final long longVal = ((Number) val).longValue();
        if (longVal < Byte.MIN_VALUE || longVal > Byte.MAX_VALUE) {
            // The range being enforced here is Byte's; the message previously said "Short".
            throw new RuntimeException("Value for tag " + tag + " is not in Byte range: " + longVal);
        }
        return (byte) longVal;
    }

    /**
     * @param tag Two-character tag name.
     * @return the tag value as a String, or null if the tag is not present.
     * @throws SAMException if the value is present but is not a String.
     */
    public String getStringAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof String) {
            return (String) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a String: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the tag value as a Character, or null if the tag is not present.
     * @throws SAMException if the value is present but is not a Character.
     */
    public Character getCharacterAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof Character) {
            return (Character) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a Character: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the tag value as a Float, or null if the tag is not present.
     * @throws SAMException if the value is present but is not a Float.
     */
    public Float getFloatAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof Float) {
            return (Float) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a Float: " + val.getClass());
    }

    /**
     * Will work for signed byte array, unsigned byte array, or old-style hex array
     * @param tag Two-character tag name.
     * @return the stored array itself (not a copy), or null if the tag is not present.
     * @throws SAMException if the value is present but is not a byte[].
     */
    public byte[] getByteArrayAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof byte[]) {
            return (byte[]) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a byte[]: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the stored byte[], or null if the tag is not present.
     * @throws SAMException if the value is not a byte[] or was not stored as an unsigned array.
     */
    public byte[] getUnsignedByteArrayAttribute(final String tag) {
        final byte[] ret = getByteArrayAttribute(tag);
        if (ret != null) requireUnsigned(tag);
        return ret;
    }

    /**
     * Will work for signed byte array or old-style hex array
     * @param tag Two-character tag name.
     * @return the stored byte[], or null if the tag is not present.
     * @throws SAMException if the value is not a byte[] or was stored as an unsigned array.
     */
    public byte[] getSignedByteArrayAttribute(final String tag) {
        final byte[] ret = getByteArrayAttribute(tag);
        if (ret != null) requireSigned(tag);
        return ret;
    }

    /**
     * @param tag Two-character tag name.
     * @return the stored short[], or null if the tag is not present.
     * @throws SAMException if the value is not a short[] or was not stored as an unsigned array.
     */
    public short[] getUnsignedShortArrayAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof short[]) {
            requireUnsigned(tag);
            return (short[]) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a short[]: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the stored short[], or null if the tag is not present.
     * @throws SAMException if the value is not a short[] or was stored as an unsigned array.
     */
    public short[] getSignedShortArrayAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof short[]) {
            requireSigned(tag);
            return (short[]) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a short[]: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the stored int[], or null if the tag is not present.
     * @throws SAMException if the value is not an int[] or was not stored as an unsigned array.
     */
    public int[] getUnsignedIntArrayAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof int[]) {
            requireUnsigned(tag);
            return (int[]) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a int[]: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the stored int[], or null if the tag is not present.
     * @throws SAMException if the value is not an int[] or was stored as an unsigned array.
     */
    public int[] getSignedIntArrayAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val == null) return null;
        if (val instanceof int[]) {
            requireSigned(tag);
            return (int[]) val;
        }
        throw new SAMException("Value for tag " + tag + " is not a int[]: " + val.getClass());
    }

    /**
     * @param tag Two-character tag name.
     * @return the stored float[], or null if the tag is not present.
     * @throws SAMException if the value is present but is not a float[].
     */
    public float[] getFloatArrayAttribute(final String tag) {
        final Object val = getAttribute(tag);
        if (val != null && !(val instanceof float[])) {
            throw new SAMException("Value for tag " + tag + " is not a float[]: " + val.getClass());
        }
        return (float[]) val;
    }

    /**
     * @return True if this tag is an unsigned array, else false.
     * @throws SAMException if the tag is not present.
*/
    public boolean isUnsignedArrayAttribute(final String tag) {
        // A record without any attributes has a null list head; treat that as "tag not present"
        // (the documented SAMException) rather than failing with a NullPointerException.
        final SAMBinaryTagAndValue tmp = this.mAttributes == null
                ? null
                : this.mAttributes.find(SAMTagUtil.getSingleton().makeBinaryTag(tag));
        if (tmp != null) return tmp.isUnsignedArray();
        throw new SAMException("Tag " + tag + " is not present in this SAMRecord");
    }

    /**
     * Fails unless the array stored under the tag was stored as signed.
     * @throws SAMException if the tag is absent or holds an unsigned array
     */
    private void requireSigned(final String tag) {
        if (isUnsignedArrayAttribute(tag)) throw new SAMException("Value for tag " + tag + " is not signed");
    }

    /**
     * Fails unless the array stored under the tag was stored as unsigned.
     * @throws SAMException if the tag is absent or holds a signed array
     */
    private void requireUnsigned(final String tag) {
        if (!isUnsignedArrayAttribute(tag)) throw new SAMException("Value for tag " + tag + " is not unsigned");
    }

    /**
     * @see SAMRecord#getAttribute(java.lang.String)
     * @param tag Binary representation of a 2-char String tag as created by SAMTagUtil.
     * @return the stored value, or null if the record has no attributes or the tag is not among them.
     */
    public Object getAttribute(final short tag) {
        if (this.mAttributes == null) {
            return null;
        }
        final SAMBinaryTagAndValue tmp = this.mAttributes.find(tag);
        return tmp != null ? tmp.value : null;
    }

    /**
     * Set a named attribute onto the SAMRecord. Passing a null value causes the attribute to be cleared.
     * @param tag two-character tag name. See http://samtools.sourceforge.net/SAM1.pdf for standard and user-defined tags.
     * @param value Supported types are String, Char, Integer, Float,
     * Long (for values that fit into a signed or unsigned 32-bit integer only),
     * byte[], short[], int[], float[].
     * If value == null, tag is cleared.
     *
     * Byte and Short are allowed but discouraged. If written to a SAM file, these will be converted to Integer,
     * whereas if written to BAM, getAttribute() will return as Byte or Short, respectively.
     *
     * Long is allowed for values that fit into a signed or unsigned 32-bit integer only, but discouraged.
     *
     * To set unsigned byte[], unsigned short[] or unsigned int[] (which is discouraged because of poor Java language
     * support), setUnsignedArrayAttribute() must be used instead of this method.
     *
     * String values are not validated to ensure that they conform to SAM spec.
*/
    public void setAttribute(final String tag, final Object value) {
        final boolean isEmptyArray = value != null && value.getClass().isArray() && Array.getLength(value) == 0;
        if (isEmptyArray) {
            throw new IllegalArgumentException("Empty value passed for tag " + tag);
        }
        final short binaryTag = SAMTagUtil.getSingleton().makeBinaryTag(tag);
        setAttribute(binaryTag, value);
    }

    /**
     * Because Java does not support unsigned integer types, we think it is a bad idea to encode them in SAM
     * files. If you must do so, however, you must call this method rather than setAttribute, because calling
     * this method is the way to indicate that, e.g. a short array should be interpreted as unsigned shorts.
     * @param tag two-character tag name
     * @param value must be one of byte[], short[], int[]
     * @throws IllegalArgumentException if value is not an array, or is an empty array
     */
    public void setUnsignedArrayAttribute(final String tag, final Object value) {
        final boolean isArray = value.getClass().isArray();
        if (!isArray) {
            throw new IllegalArgumentException("Non-array passed to setUnsignedArrayAttribute for tag " + tag);
        }
        final int length = Array.getLength(value);
        if (length == 0) {
            throw new IllegalArgumentException("Empty array passed to setUnsignedArrayAttribute for tag " + tag);
        }
        final short binaryTag = SAMTagUtil.getSingleton().makeBinaryTag(tag);
        setAttribute(binaryTag, value, true);
    }

    /**
     * @see htsjdk.samtools.SAMRecord#setAttribute(java.lang.String, java.lang.Object)
     * @param tag Binary representation of a 2-char String tag as created by SAMTagUtil.
     */
    protected void setAttribute(final short tag, final Object value) {
        // Plain setAttribute never marks an array as unsigned.
        this.setAttribute(tag, value, false);
    }

    /**
     * Checks if the value is allowed as an attribute value.
* * @param value the value to be checked * @return true if the value is valid and false otherwise */ protected static boolean isAllowedAttributeValue(final Object value) { if (value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof String || value instanceof Character || value instanceof Float || value instanceof byte[] || value instanceof short[] || value instanceof int[] || value instanceof float[]) { return true; } // A special case for Longs: we require Long values to fit into either a uint32_t or an int32_t, // as that is what the BAM spec allows. if (value instanceof Long) { return SAMUtils.isValidUnsignedIntegerAttribute((Long) value) || ((Long) value >= Integer.MIN_VALUE && (Long) value <= Integer.MAX_VALUE); } return false; } protected void setAttribute(final short tag, final Object value, final boolean isUnsignedArray) { if (value == null) { // setting a tag value to null removes the tag: if (this.mAttributes != null) { this.mAttributes = this.mAttributes.remove(tag); } return; } if (isAllowedAttributeValue(value)) { final SAMBinaryTagAndValue tmp; if (!isUnsignedArray) { tmp = new SAMBinaryTagAndValue(tag, value); } else { if (!value.getClass().isArray() || value instanceof float[]) { throw new SAMException("Attribute type " + value.getClass() + " cannot be encoded as an unsigned array. Tag: " + SAMTagUtil.getSingleton().makeStringTag(tag)); } tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, value); } if (this.mAttributes == null) { this.mAttributes = tmp; } else { this.mAttributes = this.mAttributes.insert(tmp); } } else { throw new SAMException("Attribute type " + value.getClass() + " not supported. Tag: " + SAMTagUtil.getSingleton().makeStringTag(tag)); } } /** * Removes all attributes. */ public void clearAttributes() { mAttributes = null; } /** * Replace any existing attributes with the given linked item. 
*/ protected void setAttributes(final SAMBinaryTagAndValue attributes) { mAttributes = attributes; } /** * @return Pointer to the first of the tags. Returns null if there are no tags. */ protected SAMBinaryTagAndValue getBinaryAttributes() { return mAttributes; } /** * @return reference name, null if this is unmapped */ @Override public String getContig() { if( getReadUnmappedFlag()) { return null; } else { return getReferenceName(); } } /** * an alias of {@link #getAlignmentStart() * @return 1-based inclusive leftmost position of the clipped sequence, or 0 if there is no position. */ @Override public int getStart() { return getAlignmentStart(); } /** * an alias of {@link #getAlignmentEnd()} * @return 1-based inclusive rightmost position of the clipped sequence, or 0 read if unmapped. */ @Override public int getEnd() { return getAlignmentEnd(); } /** * Tag name and value of an attribute, for getAttributes() method. */ public static class SAMTagAndValue { public final String tag; public final Object value; public SAMTagAndValue(final String tag, final Object value) { this.tag = tag; this.value = value; } } /** * @return list of {tag, value} tuples */ public List getAttributes() { SAMBinaryTagAndValue binaryAttributes = getBinaryAttributes(); final List ret = new ArrayList(); while (binaryAttributes != null) { ret.add(new SAMTagAndValue(SAMTagUtil.getSingleton().makeStringTag(binaryAttributes.tag), binaryAttributes.value)); binaryAttributes = binaryAttributes.getNext(); } return ret; } Integer getIndexingBin() { return mIndexingBin; } /** * Used internally when writing BAMRecords. * @param mIndexingBin c.f. http://samtools.sourceforge.net/SAM1.pdf */ void setIndexingBin(final Integer mIndexingBin) { this.mIndexingBin = mIndexingBin; } /** * Does not change state of this. * @return indexing bin based on alignment start & end. 
*/ int computeIndexingBin() { // reg2bin has zero-based, half-open API final int alignmentStart = getAlignmentStart()-1; int alignmentEnd = getAlignmentEnd(); if (alignmentEnd <= 0) { // If alignment end cannot be determined (e.g. because this read is not really aligned), // then treat this as a one base alignment for indexing purposes. alignmentEnd = alignmentStart + 1; } return GenomicIndexUtil.reg2bin(alignmentStart, alignmentEnd); } /** * @return the SAMFileHeader for this record. If the header is null, the following SAMRecord methods may throw * exceptions: *

    *
  • getReferenceIndex
  • *
  • setReferenceIndex
  • *
  • getMateReferenceIndex
  • *
  • setMateReferenceIndex
  • *

* Record comparators (i.e. SAMRecordCoordinateComparator and SAMRecordDuplicateComparator) require records with * non-null header values. *

* A record with a null header may be validated by the isValid method, but the reference and mate reference indices, * read group, sequence dictionary, and alignment start will not be fully validated unless a header is present. *

* SAMTextWriter, BAMFileWriter, and CRAMFileWriter all require records to have a valid header in order to be
     * written. Any record that does not have a header at the time it is added to the writer will be updated to use the
     * header associated with the writer.
     */
    public SAMFileHeader getHeader() {
        // Plain field read; the result is null when no header is attached to this record.
        return mHeader;
    }

    /**
     * Sets the SAMFileHeader for this record. Setting the header into SAMRecord facilitates conversion between reference
     * sequence names and indices.
     *

* NOTE: If the record has a reference or mate reference name, the corresponding reference and mate reference * indices are resolved and updated using the sequence dictionary in the new header. setHeader does not throw an * exception if either the reference or mate reference name does not appear in the new header's sequence dictionary. *

* When the SAMFileHeader is set to null, the reference and mate reference indices are cleared. Therefore, calls to * the following SAMRecord methods on records with a null header may throw IllegalArgumentExceptions: *

    *
  • getReferenceIndex
  • *
  • setReferenceIndex
  • *
  • getMateReferenceIndex
  • *
  • setMateReferenceIndex
  • *

* Record comparators (i.e. SAMRecordCoordinateComparator and SAMRecordDuplicateComparator) require records with * non-null header values. *

* A record with a null header may be validated by the isValid method, but the reference and mate reference indices, * read group, sequence dictionary, and alignment start will not be fully validated unless a header is present. *

* SAMTextWriter, BAMFileWriter, and CRAMFileWriter all require records to have a valid header in order to be * written. Any record that does not have a header at the time it is added to the writer will be updated to use the * header associated with the writer. * * @param header contains sequence dictionary for this SAMRecord */ public void setHeader(final SAMFileHeader header) { this.mHeader = header; if (null == header) { // mark the reference indices as unresolved mReferenceIndex = null; mMateReferenceIndex = null; } else { // attempt to resolve the existing reference names and indices against the new sequence dictionary, but // don't throw if the names don't appear in the dictionary setReferenceName(mReferenceName); setMateReferenceName(mMateReferenceName); } } /** * If this record has a valid binary representation of the variable-length portion of a binary record stored, * return that byte array, otherwise return null. This will never be true for SAMRecords. It will be true * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length * portion has been changed. */ public byte[] getVariableBinaryRepresentation() { return null; } /** * Depending on the concrete implementation, the binary file size of attributes may be known without * computing them all. * @return binary file size of attribute, if known, else -1 */ public int getAttributesBinarySize() { return -1; } /** * * @return String representation of this. * @deprecated This method is not guaranteed to return a valid SAM text representation of the SAMRecord. * To get standard SAM text representation, use htsjdk.samtools.SAMRecord#getSAMString(). 
*/ public String format() { final StringBuilder buffer = new StringBuilder(); addField(buffer, getReadName(), null, null); addField(buffer, getFlags(), null, null); addField(buffer, getReferenceName(), null, "*"); addField(buffer, getAlignmentStart(), 0, "*"); addField(buffer, getMappingQuality(), 0, "0"); addField(buffer, getCigarString(), null, "*"); addField(buffer, getMateReferenceName(), null, "*"); addField(buffer, getMateAlignmentStart(), 0, "*"); addField(buffer, getInferredInsertSize(), 0, "*"); addField(buffer, getReadString(), null, "*"); addField(buffer, getBaseQualityString(), null, "*"); if (mAttributes != null) { SAMBinaryTagAndValue entry = getBinaryAttributes(); while (entry != null) { addField(buffer, formatTagValue(entry.tag, entry.value)); entry = entry.getNext(); } } return buffer.toString(); } private void addField(final StringBuilder buffer, final Object value, final Object defaultValue, final String defaultString) { if (safeEquals(value, defaultValue)) { addField(buffer, defaultString); } else if (value == null) { addField(buffer, ""); } else { addField(buffer, value.toString()); } } private void addField(final StringBuilder buffer, final String field) { if (buffer.length() > 0) { buffer.append('\t'); } buffer.append(field); } private String formatTagValue(final short tag, final Object value) { final String tagString = SAMTagUtil.getSingleton().makeStringTag(tag); if (value == null || value instanceof String) { return tagString + ":Z:" + value; } else if (value instanceof Integer || value instanceof Long || value instanceof Short || value instanceof Byte) { return tagString + ":i:" + value; } else if (value instanceof Character) { return tagString + ":A:" + value; } else if (value instanceof Float) { return tagString + ":f:" + value; } else if (value instanceof byte[]) { return tagString + ":H:" + StringUtil.bytesToHexString((byte[]) value); } else { throw new RuntimeException("Unexpected value type for tag " + tagString + ": " + value + " 
of class " + value.getClass().getName()); } } private boolean safeEquals(final Object o1, final Object o2) { if (o1 == o2) { return true; } else if (o1 == null || o2 == null) { return false; } else { return o1.equals(o2); } } /** * Force all lazily-initialized data members to be initialized. If a subclass overrides this method, * typically it should also call super method. */ protected void eagerDecode() { getCigar(); getCigarString(); } /** * Returns blocks of the read sequence that have been aligned directly to the * reference sequence. Note that clipped portions of the read and inserted and * deleted bases (vs. the reference) are not represented in the alignment blocks. */ public List getAlignmentBlocks() { if (this.mAlignmentBlocks == null) { this.mAlignmentBlocks = SAMUtils.getAlignmentBlocks(getCigar(), getAlignmentStart(), "read cigar"); } return this.mAlignmentBlocks; } /** * Run all validations of CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * @param recordNumber For error reporting. -1 if not known. * @return List of errors, or null if no errors. 
*/ public List validateCigar(final long recordNumber) { List ret = null; if (null != getHeader() && getValidationStringency() != ValidationStringency.SILENT && !this.getReadUnmappedFlag()) { ret = SAMUtils.validateCigar(this, getCigar(), getReferenceIndex(), getAlignmentBlocks(), recordNumber, "Read CIGAR"); } return ret; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof SAMRecord)) return false; final SAMRecord samRecord = (SAMRecord) o; // First check all the elements that do not require decoding if (mAlignmentStart != samRecord.mAlignmentStart) return false; if (mFlags != samRecord.mFlags) return false; if (mInferredInsertSize != samRecord.mInferredInsertSize) return false; if (mMappingQuality != samRecord.mMappingQuality) return false; if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false; if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null) return false; if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null) return false; if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null) return false; eagerDecode(); samRecord.eagerDecode(); if (mReadName != null ? !mReadName.equals(samRecord.mReadName) : samRecord.mReadName != null) return false; if (mAttributes != null ? !mAttributes.equals(samRecord.mAttributes) : samRecord.mAttributes != null) return false; if (!Arrays.equals(mBaseQualities, samRecord.mBaseQualities)) return false; if (mCigar != null ? !mCigar.equals(samRecord.mCigar) : samRecord.mCigar != null) return false; if (mMateReferenceName != null ? !mMateReferenceName.equals(samRecord.mMateReferenceName) : samRecord.mMateReferenceName != null) return false; if (!Arrays.equals(mReadBases, samRecord.mReadBases)) return false; if (mReferenceName != null ? 
!mReferenceName.equals(samRecord.mReferenceName) : samRecord.mReferenceName != null) return false; return true; } @Override public int hashCode() { eagerDecode(); int result = mReadName != null ? mReadName.hashCode() : 0; result = 31 * result + (mReadBases != null ? Arrays.hashCode(mReadBases) : 0); result = 31 * result + (mBaseQualities != null ? Arrays.hashCode(mBaseQualities) : 0); result = 31 * result + (mReferenceName != null ? mReferenceName.hashCode() : 0); result = 31 * result + mAlignmentStart; result = 31 * result + mMappingQuality; result = 31 * result + (mCigarString != null ? mCigarString.hashCode() : 0); result = 31 * result + mFlags; result = 31 * result + (mMateReferenceName != null ? mMateReferenceName.hashCode() : 0); result = 31 * result + mMateAlignmentStart; result = 31 * result + mInferredInsertSize; result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0); result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0); result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0); return result; } /** * Perform various validations of SAMRecord. * Note that this method deliberately returns null rather than Collections.emptyList() if there * are no validation errors, because callers tend to assume that if a non-null list is returned, it is modifiable. * * A record with null a header may be validated by the isValid method, but the reference and mate reference indices, * read group, sequence dictionary, and alignment start will not be fully validated unless a header is present. * * @return null if valid. If invalid, returns a list of error messages. * */ public List isValid() { return isValid(false); } /** * Perform various validations of SAMRecord. 
* Note that this method deliberately returns null rather than Collections.emptyList() if there * are no validation errors, because callers tend to assume that if a non-null list is returned, it is modifiable. * * A record with null a header may be validated by the isValid method, but the reference and mate reference indices, * read group, sequence dictionary, and alignment start will not be fully validated unless a header is present. * * @param firstOnly return only the first error if true, false otherwise * @return null if valid. If invalid, returns a list of error messages. */ public List isValid(final boolean firstOnly) { // ret is only instantiate if there are errors to report, in order to reduce GC in the typical case // in which everything is valid. It's ugly, but more efficient. ArrayList ret = null; if (!getReadPairedFlag()) { if (getProperPairFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getMateUnmappedFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_UNMAPPED, "Mate unmapped flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getMateNegativeStrandFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_NEG_STRAND, "Mate negative strand flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getFirstOfPairFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_FIRST_OF_PAIR, "First of pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getSecondOfPairFlagUnchecked()) { if (ret == null) ret = new ArrayList(); 
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SECOND_OF_PAIR, "Second of pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (null != getHeader() && getMateReferenceIndex() != NO_ALIGNMENT_REFERENCE_INDEX) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MATE_REF_INDEX, "MRNM should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } } else { final List errors = isValidReferenceIndexAndPosition(mMateReferenceIndex, mMateReferenceName, getMateAlignmentStart(), true, firstOnly); if (errors != null) { if (firstOnly) return errors; if (ret == null) ret = new ArrayList(); ret.addAll(errors); } if (!hasMateReferenceName() && !getMateUnmappedFlag()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_UNMAPPED, "Mapped mate should have mate reference name", getReadName())); if (firstOnly) return ret; } if (!getFirstOfPairFlagUnchecked() && !getSecondOfPairFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.PAIRED_READ_NOT_MARKED_AS_FIRST_OR_SECOND, "Paired read should be marked as first of pair or second of pair.", getReadName())); if (firstOnly) return ret; } /* TODO: PIC-97 This validation should be enabled, but probably at this point there are too many BAM files that have the proper pair flag set when read or mate is unmapped. 
if (getMateUnmappedFlag() && getProperPairFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } */ } if (getInferredInsertSize() > MAX_INSERT_SIZE || getInferredInsertSize() < -MAX_INSERT_SIZE) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INSERT_SIZE, "Insert size out of range", getReadName())); if (firstOnly) return ret; } if (getReadUnmappedFlag()) { if (getNotPrimaryAlignmentFlag()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_NOT_PRIM_ALIGNMENT, "Not primary alignment flag should not be set for unmapped read.", getReadName())); if (firstOnly) return ret; } if (getSupplementaryAlignmentFlag()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT, "Supplementary alignment flag should not be set for unmapped read.", getReadName())); if (firstOnly) return ret; } if (getMappingQuality() != 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be 0 for unmapped read.", getReadName())); if (firstOnly) return ret; } /* non-empty CIGAR on unmapped read is now allowed, because there are special reads when SAM is used to store assembly. */ /* TODO: PIC-97 This validation should be enabled, but probably at this point there are too many BAM files that have the proper pair flag set when read or mate is unmapped. 
if (getProperPairFlagUnchecked()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unmapped read.", getReadName())); } */ } else { if (getMappingQuality() >= 256) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be < 256.", getReadName())); if (firstOnly) return ret; } if (getCigarLength() == 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR should have > zero elements for mapped read.", getReadName())); /* todo - will uncomment once unit tests are added } else if (getCigar().getReadLength() != getReadLength()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR read length " + getCigar().getReadLength() + " doesn't match read length " + getReadLength(), getReadName())); */ if (firstOnly) return ret; } if (getHeader() != null && getHeader().getSequenceDictionary().size() == 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.MISSING_SEQUENCE_DICTIONARY, "Empty sequence dictionary.", getReadName())); if (firstOnly) return ret; } if (!hasReferenceName()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_READ_UNMAPPED, "Mapped read should have valid reference name", getReadName())); if (firstOnly) return ret; } /* Oops! We know this is broken in older BAM files, so this having this validation will cause all sorts of problems! 
if (getIndexingBin() != null && getIndexingBin() != computeIndexingBin()) { ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, "Indexing bin (" + getIndexingBin() + ") does not agree with computed value (" + computeIndexingBin() + ")", getReadName())); } */ } // Validate the RG ID is found in header final String rgId = (String)getAttribute(SAMTagUtil.getSingleton().RG); if (rgId != null && getHeader() != null && getHeader().getReadGroup(rgId) == null) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.READ_GROUP_NOT_FOUND, "RG ID on SAMRecord not found in header: " + rgId, getReadName())); if (firstOnly) return ret; } final List errors = isValidReferenceIndexAndPosition(mReferenceIndex, mReferenceName, getAlignmentStart(), false); if (errors != null) { if (ret == null) ret = new ArrayList(); ret.addAll(errors); if (firstOnly) return ret; } // TODO(mccowan): Is this asking "is this the primary alignment"? if (this.getReadLength() == 0 && !this.getNotPrimaryAlignmentFlag()) { final Object fz = getAttribute(SAMTagUtil.getSingleton().FZ); if (fz == null) { final String cq = (String)getAttribute(SAMTagUtil.getSingleton().CQ); final String cs = (String)getAttribute(SAMTagUtil.getSingleton().CS); if (cq == null || cq.length() == 0 || cs == null || cs.length() == 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, "Zero-length read without FZ, CS or CQ tag", getReadName())); if (firstOnly) return ret; } else if (!getReadUnmappedFlag()) { boolean hasIndel = false; for (final CigarElement cigarElement : getCigar().getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.INSERTION) { hasIndel = true; break; } } if (!hasIndel) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, "Colorspace read with zero-length bases but no 
indel", getReadName())); if (firstOnly) return ret; } } } } if (this.getReadLength() != getBaseQualities().length && !Arrays.equals(getBaseQualities(), NULL_QUALS)) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.MISMATCH_READ_LENGTH_AND_QUALS_LENGTH, "Read length does not match quals length", getReadName())); if (firstOnly) return ret; } if (this.getAlignmentStart() != NO_ALIGNMENT_START && this.getIndexingBin() != null && this.computeIndexingBin() != this.getIndexingBin()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, "bin field of BAM record does not equal value computed based on alignment start and end, and length of sequence to which read is aligned", getReadName())); if (firstOnly) return ret; } if (ret == null || ret.size() == 0) { return null; } return ret; } /** * Gets the source of this SAM record -- both the reader that retrieved the record and the position on disk from * whence it came. * @return The file source. Note that the reader will be null if not activated using SAMFileReader.enableFileSource(). */ public SAMFileSource getFileSource() { return mFileSource; } /** * Sets a marker providing the source reader for this file and the position in the file from which the read originated. * @param fileSource source of the given file. 
*/ protected void setFileSource(final SAMFileSource fileSource) { mFileSource = fileSource; } private List isValidReferenceIndexAndPosition(final Integer referenceIndex, final String referenceName, final int alignmentStart, final boolean isMate) { return isValidReferenceIndexAndPosition(referenceIndex, referenceName, alignmentStart, isMate, false); } private List isValidReferenceIndexAndPosition(final Integer referenceIndex, final String referenceName, final int alignmentStart, final boolean isMate, final boolean firstOnly) { final boolean hasReference = hasReferenceName(referenceIndex, referenceName); // ret is only instantiate if there are errors to report, in order to reduce GC in the typical case // in which everything is valid. It's ugly, but more efficient. ArrayList ret = null; if (!hasReference) { if (alignmentStart != 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should be 0 because reference name = *.", isMate), getReadName())); if (firstOnly) return ret; } } else { if (alignmentStart == 0) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should != 0 because reference name != *.", isMate), getReadName())); if (firstOnly) return ret; } if (getHeader() != null && getHeader().getSequenceDictionary().size() > 0) { final SAMSequenceRecord sequence = (referenceIndex != null? 
getHeader().getSequence(referenceIndex): getHeader().getSequence(referenceName)); if (sequence == null) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_REFERENCE_INDEX, buildMessage("Reference sequence not found in sequence dictionary.", isMate), getReadName())); if (firstOnly) return ret; } else { if (alignmentStart > sequence.getSequenceLength()) { if (ret == null) ret = new ArrayList(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start (" + alignmentStart + ") must be <= reference sequence length (" + sequence.getSequenceLength() + ") on reference " + sequence.getSequenceName(), isMate), getReadName())); if (firstOnly) return ret; } } } } return ret; } private String buildMessage(final String baseMessage, final boolean isMate) { return isMate ? "Mate " + baseMessage : baseMessage; } /** * Note that this does a shallow copy of everything, except for the attribute list, for which a copy of the list * is made, but the attributes themselves are copied by reference. This should be safe because callers should * never modify a mutable value returned by any of the get() methods anyway. */ @Override public Object clone() throws CloneNotSupportedException { final SAMRecord newRecord = (SAMRecord)super.clone(); if (mAttributes != null) { newRecord.mAttributes = this.mAttributes.copy(); } return newRecord; } /** * Returns a deep copy of the SAM record, with the following exceptions: * * - The header field, which shares the reference with the original record * - The file source field, which will always always be set to null in the copy * * Note that some fields, i.e. the cigar elements, alignment blocks, and * indexing bin, are not explicitly populated in the copy since they are lazily * generated on demand. 
* * Also note that this fails: * * original.deepCopy().equals(original) * * due to the fact that SAMBinaryTagAndValue.equals winds up calling object.equals on the * value field, which uses reference equality. * */ public SAMRecord deepCopy() { final SAMRecord newSAM = new SAMRecord(getHeader()); newSAM.setReadName(getReadName()); newSAM.setReadBases(Arrays.copyOf(getReadBases(), getReadLength())); final byte baseQualities[] = getBaseQualities(); newSAM.setBaseQualities(Arrays.copyOf(baseQualities, baseQualities.length)); newSAM.setReferenceName(getReferenceName()); newSAM.setAlignmentStart(getAlignmentStart()); // clears mAlignmentEnd newSAM.setMappingQuality(getMappingQuality()); newSAM.setCigarString(getCigarString()); // clears Cigar element and alignmentBlocks newSAM.setFileSource(null); newSAM.setFlags(getFlags()); newSAM.setMateReferenceName(getMateReferenceName()); newSAM.setMateAlignmentStart(getMateAlignmentStart()); newSAM.setInferredInsertSize(getInferredInsertSize()); if (null != getHeader()) { newSAM.setReferenceIndex(getReferenceIndex()); newSAM.setMateReferenceIndex(getMateReferenceIndex()); } else { newSAM.mReferenceIndex = null; newSAM.mMateReferenceIndex = null; } newSAM.setValidationStringency(getValidationStringency()); SAMBinaryTagAndValue attributes = getBinaryAttributes(); if (null != attributes) { newSAM.setAttributes(attributes.deepCopy()); } return newSAM; } /** Simple toString() that gives a little bit of useful info about the read. 
*/
    @Override
    public String toString() {
        // Layout: <name>[ 1/2| 2/2] <length>b <mapped-state>
        final StringBuilder text = new StringBuilder(64);
        text.append(getReadName());
        if (getReadPairedFlag()) {
            text.append(getFirstOfPairFlag() ? " 1/2" : " 2/2");
        }
        text.append(" ").append(getReadLength()).append("b");
        text.append(getReadUnmappedFlag() ? " unmapped read." : " aligned read.");
        return text.toString();
    }

    /**
     * Returns the record in the SAM line-based text format.  Fields are
     * separated by '\t' characters, and the String is terminated by '\n'.
     */
    public String getSAMString() {
        return SAMTextWriter.getSAMString(this);
    }

    /**
     * Returns the read name, followed by " 1/2" or " 2/2" when the read is paired
     * (first of pair or not, respectively).
     */
    public String getPairedReadName() {
        final StringBuilder name = new StringBuilder(64);
        name.append(getReadName());
        if (getReadPairedFlag()) {
            name.append(getFirstOfPairFlag() ? " 1/2" : " 2/2");
        }
        return name.toString();
    }

    /**
     * shortcut to

SAMFlag.getFlags( this.getFlags() );
* @returns a set of SAMFlag associated to this sam record */ public final Set getSAMFlags() { return SAMFlag.getFlags(this.getFlags()); } /** * Fetches the value of a transient attribute on the SAMRecord, of null if not set. * * The intended use for transient attributes is to store values that are 1-to-1 with the SAMRecord, * may be needed many times and are expensive to compute. These values can be computed lazily and * then stored as transient attributes to avoid frequent re-computation. */ public final Object getTransientAttribute(final Object key) { return (this.transientAttributes == null) ? null : this.transientAttributes.get(key); } /** * Sets the value of a transient attribute, and returns the previous value if defined. * * The intended use for transient attributes is to store values that are 1-to-1 with the SAMRecord, * may be needed many times and are expensive to compute. These values can be computed lazily and * then stored as transient attributes to avoid frequent re-computation. */ public final Object setTransientAttribute(final Object key, final Object value) { if (this.transientAttributes == null) this.transientAttributes = new HashMap(); return this.transientAttributes.put(key, value); } /** * Removes a transient attribute if it is stored, and returns the stored value. If there is not * a stored value, will return null. 
*/ public final Object removeTransientAttribute(final Object key) { if (this.transientAttributes != null) return this.transientAttributes.remove(key); else return null; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordComparator.java000066400000000000000000000032721263034757100243270ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.util.Comparator; /** * Interface for comparators that define the various SAM sort orders. */ public interface SAMRecordComparator extends Comparator { /** * Less stringent compare method than the regular compare. If the two records * are equal enough that their ordering in a sorted SAM file would be arbitrary, * this method returns 0. 
* @return negative if samRecord1 < samRecord2, 0 if equal, else positive */ public int fileOrderCompare(SAMRecord samRecord1, SAMRecord samRecord2); } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordCoordinateComparator.java000066400000000000000000000107501263034757100263360ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Comparator for sorting SAMRecords by coordinate. Note that the header is required because * the order of sequences in the header defines the major sort order. * * Ideally this method would only return 0 for completely equal SAMRecords, so that sort is * completely deterministic. This implementation does not achieve this completely, but it * comes pretty close, while avoiding decoding the variable length fields, except for read name, * which is decoded if coordinate and strand are equal. 
* * Extreme care must be taken to ensure the following: * if A == B, then B == A * if A < B, then B > A * if A < B && B < C, then A < C * */ public class SAMRecordCoordinateComparator implements SAMRecordComparator { public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { int cmp = fileOrderCompare(samRecord1, samRecord2); if (cmp != 0) { return cmp; } // Test of negative strand flag is not really necessary, because it is tested // with cmp if getFlags, but it is left here because that is the way it was done // in the past. if (samRecord1.getReadNegativeStrandFlag() == samRecord2.getReadNegativeStrandFlag()) { cmp = samRecord1.getReadName().compareTo(samRecord2.getReadName()); if (cmp != 0) return cmp; cmp = compareInts(samRecord1.getFlags(), samRecord2.getFlags()); if (cmp != 0) return cmp; cmp = compareInts(samRecord1.getMappingQuality(), samRecord2.getMappingQuality()); if (cmp != 0) return cmp; cmp = compareInts(samRecord1.getMateReferenceIndex(), samRecord2.getMateReferenceIndex()); if (cmp != 0) return cmp; cmp = compareInts(samRecord1.getMateAlignmentStart(), samRecord2.getMateAlignmentStart()); if (cmp != 0) return cmp; cmp = compareInts(samRecord1.getInferredInsertSize(), samRecord2.getInferredInsertSize()); return cmp; } else return (samRecord1.getReadNegativeStrandFlag()? 1: -1); } private int compareInts(int i1, int i2) { if (i1 < i2) return -1; else if (i1 > i2) return 1; else return 0; } /** * Less stringent compare method than the regular compare. If the two records * are equal enough that their ordering in a sorted SAM file would be arbitrary, * this method returns 0. If read is paired and unmapped, use the mate mapping to sort. * Records being compared must have non-null SAMFileHeaders. 
* * @return negative if samRecord1 < samRecord2, 0 if equal, else positive */ public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { if (null == samRecord1.getHeader() || null == samRecord2.getHeader()) { throw new IllegalArgumentException("Records must have non-null SAMFileHeaders to be compared"); } final int refIndex1 = samRecord1.getReferenceIndex(); final int refIndex2 = samRecord2.getReferenceIndex(); if (refIndex1 == -1) { return (refIndex2 == -1? 0: 1); } else if (refIndex2 == -1) { return -1; } final int cmp = refIndex1 - refIndex2; if (cmp != 0) { return cmp; } return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordDuplicateComparator.java000066400000000000000000000364131263034757100261650ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; import java.util.HashMap; import java.util.List; import java.util.Map; /** * Compares records based on if they should be considered PCR Duplicates (see MarkDuplicates). * * There are three orderings provided by this comparator: compare, duplicateSetCompare, and fileOrderCompare. * * Specify the headers when constructing this comparator if you would like to consider the library as the major sort key. * The records being compared must also have non-null SAMFileHeaders. * * @author nhomer */ public class SAMRecordDuplicateComparator implements SAMRecordComparator { /** An enum to provide type-safe keys for transient attributes the comparator puts on SAMRecords. */ private static enum Attr { LibraryId, ReadCoordinate, MateCoordinate } private static final byte FF = 0, FR = 1, F = 2, RF = 3, RR = 4, R = 5; private final Map libraryIds = new HashMap(); // from library string to library id private short nextLibraryId = 1; private ScoringStrategy scoringStrategy = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH; public SAMRecordDuplicateComparator() {} public SAMRecordDuplicateComparator(final List headers) { // pre-populate the library names for (final SAMFileHeader header : headers) { for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { final String libraryName = readGroup.getLibrary(); if (null != libraryName) { final short libraryId = this.nextLibraryId++; this.libraryIds.put(libraryName, libraryId); } } } } public void setScoringStrategy(final ScoringStrategy scoringStrategy) { this.scoringStrategy = scoringStrategy; } /** * Populates the set of transient attributes on SAMRecords if they are not already there. */ private void populateTransientAttributes(final SAMRecord... 
recs) { for (final SAMRecord rec : recs) { if (rec.getTransientAttribute(Attr.LibraryId) != null) continue; rec.setTransientAttribute(Attr.LibraryId, getLibraryId(rec)); rec.setTransientAttribute(Attr.ReadCoordinate, rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart()); rec.setTransientAttribute(Attr.MateCoordinate, getMateCoordinate(rec)); } } /** * Gets the library name from the header for the record. If the RG tag is not present on * the record, or the library isn't denoted on the read group, a constant string is * returned. */ private static String getLibraryName(final SAMRecord rec) { final String readGroupId = (String) rec.getAttribute("RG"); if (readGroupId != null) { final SAMFileHeader samHeader = rec.getHeader(); if (null != samHeader) { final SAMReadGroupRecord rg = samHeader.getReadGroup(readGroupId); if (rg != null) { final String libraryName = rg.getLibrary(); if (null != libraryName) return libraryName; } } } return "Unknown Library"; } /** Get the library ID for the given SAM record. */ private short getLibraryId(final SAMRecord rec) { final String library = getLibraryName(rec); Short libraryId = this.libraryIds.get(library); if (libraryId == null) { libraryId = this.nextLibraryId++; this.libraryIds.put(library, libraryId); } return libraryId; } /** * Convenience method for comparing two orientation bytes. This is critical if we have mapped reads compared to fragment reads. 
*/ private int compareOrientationByteCollapseOrientation(final int orientation1, final int orientation2) { // F == FR, F == FF // R == RF, R == RR if (F == orientation1 || R == orientation1) { // first orientation is fragment /** * We want * F == FR, F == FF * R == RF, R == RR */ if (F == orientation1) { if (F == orientation2 || FR == orientation2 || FF == orientation2) { return 0; } } else { // R == orientation1 if (R == orientation2 || RF == orientation2 || RR == orientation2) { return 0; } } } else if (F == orientation2 || R == orientation2) { // first orientation is paired, second is fragment return -compareOrientationByteCollapseOrientation(orientation2, orientation1); } return orientation1 - orientation2; } /** * Returns a single byte that encodes the orientation of the two reads in a pair. */ private static byte getPairedOrientationByte(final boolean read1NegativeStrand, final boolean read2NegativeStrand) { if (read1NegativeStrand) { if (read2NegativeStrand) return SAMRecordDuplicateComparator.RR; else return SAMRecordDuplicateComparator.RF; } else { if (read2NegativeStrand) return SAMRecordDuplicateComparator.FR; else return SAMRecordDuplicateComparator.FF; } } private int getFragmentOrientation(final SAMRecord record) { return record.getReadNegativeStrandFlag() ? 
SAMRecordDuplicateComparator.R : SAMRecordDuplicateComparator.F; } private int getPairedOrientation(final SAMRecord record) { if (record.getReadPairedFlag() && !record.getReadUnmappedFlag() && !record.getMateUnmappedFlag()) { return getPairedOrientationByte(record.getReadNegativeStrandFlag(), record.getMateNegativeStrandFlag()); } else { return getFragmentOrientation(record); } } private int getMateReferenceIndex(final SAMRecord record) { if (record.getReadPairedFlag() && !record.getReadUnmappedFlag() && !record.getMateUnmappedFlag()) { return record.getMateReferenceIndex(); } else { return -1; } } private int getMateCoordinate(final SAMRecord record) { if (record.getReadPairedFlag() && !record.getReadUnmappedFlag() && !record.getMateUnmappedFlag()) { return record.getMateNegativeStrandFlag() ? SAMUtils.getMateUnclippedEnd(record) : SAMUtils.getMateUnclippedStart(record); } else { return -1; } } /** Is one end of a pair, or the fragment, unmapped? */ private boolean hasUnmappedEnd(final SAMRecord record) { return (record.getReadUnmappedFlag() || (record.getReadPairedFlag() && record.getMateUnmappedFlag())); } /** Are both ends of a pair, or the fragment, mapped? */ private boolean hasMappedEnd(final SAMRecord record) { return (!record.getReadUnmappedFlag() || (record.getReadPairedFlag() && !record.getMateUnmappedFlag())); } /** Is this paired end and are both ends of a pair mapped */ private boolean pairedEndAndBothMapped(final SAMRecord record) { return (record.getReadPairedFlag() && !record.getReadUnmappedFlag() && !record.getMateUnmappedFlag()); } /** * Most stringent comparison. * * Two records are compared based on if they are duplicates of each other, and then based * on if they should be prioritized for being the most "representative". Typically, the representative * is the record in the SAM file that is *not* marked as a duplicate within a set of duplicates. * * Compare by file order, then duplicate scoring strategy, read name. 
* * If both reads are paired and both ends mapped, always prefer the first end over the second end. This is needed to * properly choose the first end for optical duplicate identification when both ends are mapped to the same position etc. */ public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { populateTransientAttributes(samRecord1, samRecord2); int cmp; // temporary variables for comparisons int samRecord1Value, samRecord2Value; cmp = fileOrderCompare(samRecord1, samRecord2); // the duplicate scoring strategy if (cmp == 0) { cmp = DuplicateScoringStrategy.compare(samRecord1, samRecord2, this.scoringStrategy, true); } // the read name if (cmp == 0) { cmp = samRecord1.getReadName().compareTo(samRecord2.getReadName()); } // needed for optical duplicate detection when both ends are mapped to the same position. if (cmp == 0) { if (samRecord1.getReadPairedFlag() && samRecord2.getReadPairedFlag()) { samRecord1Value = samRecord1.getFirstOfPairFlag() ? 0 : 1; samRecord2Value = samRecord2.getFirstOfPairFlag() ? 0 : 1; cmp = samRecord1Value - samRecord2Value; } } return cmp; } /** * Compares: Library identifier, reference index, read coordinate, orientation of the read (or read pair), mate's coordinate (if paired and mapped), * mapped ends, ... * * collapseOrientation - true if we want cases where fragment orientation to paired end orientation can be equal (ex. F == FR), false otherwise * considerNumberOfEndsMappedAndPairing - true if we want to prefer paired ends with both ends mapped over paired ends with only one end mapped, or paired ends with end * mapped over fragment reads, false otherwise. 
* */ private int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2, final boolean collapseOrientation, final boolean considerNumberOfEndsMappedAndPairing) { populateTransientAttributes(samRecord1, samRecord2); int cmp; if (null == samRecord1.getHeader() || null == samRecord2.getHeader()) { throw new IllegalArgumentException("Records must have non-null SAMFileHeaders to be compared"); } // temporary variables for comparisons int samRecord1Value, samRecord2Value; // library identifier { samRecord1Value = (Short) samRecord1.getTransientAttribute(Attr.LibraryId); samRecord2Value = (Short) samRecord2.getTransientAttribute(Attr.LibraryId); cmp = samRecord1Value - samRecord2Value; } // reference index if (cmp == 0) { samRecord1Value = samRecord1.getReferenceIndex(); samRecord2Value = samRecord2.getReferenceIndex(); // NB: this accounts for unmapped reads to be placed at the ends of the file if (samRecord1Value == -1) { cmp = (samRecord2Value == -1) ? 0 : 1; } else if (samRecord2Value == -1) { cmp = -1; } else { cmp = samRecord1Value - samRecord2Value; } } // read coordinate if (cmp == 0) { samRecord1Value = (Integer) samRecord1.getTransientAttribute(Attr.ReadCoordinate); samRecord2Value = (Integer) samRecord2.getTransientAttribute(Attr.ReadCoordinate); cmp = samRecord1Value - samRecord2Value; } // orientation if (cmp == 0) { samRecord1Value = getPairedOrientation(samRecord1); samRecord2Value = getPairedOrientation(samRecord2); if (collapseOrientation) { cmp = compareOrientationByteCollapseOrientation(samRecord1Value, samRecord2Value); } else { cmp = samRecord1Value - samRecord2Value; } } // both ends need to be mapped if (pairedEndAndBothMapped(samRecord1) && pairedEndAndBothMapped(samRecord2)) { // mate's reference index if (cmp == 0) { samRecord1Value = getMateReferenceIndex(samRecord1); samRecord2Value = getMateReferenceIndex(samRecord2); cmp = samRecord1Value - samRecord2Value; } // mate's coordinate if (cmp == 0) { samRecord1Value = (Integer) 
samRecord1.getTransientAttribute(Attr.MateCoordinate); samRecord2Value = (Integer) samRecord2.getTransientAttribute(Attr.MateCoordinate);; cmp = samRecord1Value - samRecord2Value; } } if (cmp == 0) { samRecord1Value = hasMappedEnd(samRecord1) ? 0 : 1; samRecord2Value = hasMappedEnd(samRecord2) ? 0 : 1; cmp = samRecord1Value - samRecord2Value; } // if both paired or both unpaired, then check if one of the two ends (or single end) is unmapped // else prefer the one that is paired end if (cmp == 0 && considerNumberOfEndsMappedAndPairing) { if (samRecord1.getReadPairedFlag() == samRecord2.getReadPairedFlag()) { // Is this unmapped or its mate? samRecord1Value = hasUnmappedEnd(samRecord1) ? 1 : 0; samRecord2Value = hasUnmappedEnd(samRecord2) ? 1 : 0; cmp = samRecord1Value - samRecord2Value; } else { // if we care if one is paired and the other is not cmp = samRecord1.getReadPairedFlag() ? -1 : 1; } } return cmp; } /** * Less stringent than compare, such that two records are equal enough such that their ordering within their duplicate set would be arbitrary. * * Major difference between this and fileOrderCompare is how we compare the orientation byte. Here we want: * F == FR, F == FF * R == RF, R == RR */ public int duplicateSetCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { return fileOrderCompare(samRecord1, samRecord2, true, false); } /** * Less stringent than duplicateSetCompare, such that two records are equal enough such that their ordering in a sorted SAM file would be arbitrary. */ public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { return fileOrderCompare(samRecord1, samRecord2, false, true); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordFactory.java000066400000000000000000000023161263034757100236250ustar00rootroot00000000000000package htsjdk.samtools; /** * Factory interface which allows plugging in of different classes for generating instances of * SAMRecord and BAMRecord when reading from SAM/BAM files. 
*
 * @author Tim Fennell
 */
public interface SAMRecordFactory {

    /**
     * Create a new SAMRecord to be filled in.
     *
     * @param header the file header handed to the new record
     * @return a new record whose fields are still to be populated by the caller
     */
    public SAMRecord createSAMRecord(SAMFileHeader header);

    /**
     * Create a new BAM Record.
     *
     * The scalar arguments are the fixed fields of the record and variableLengthBlock carries the
     * remaining bytes.
     * NOTE(review): presumably these mirror the on-disk BAM record layout -- confirm against the BAM reader.
     */
    public BAMRecord createBAMRecord(final SAMFileHeader header,
                                     final int referenceSequenceIndex,
                                     final int alignmentStart,
                                     final short readNameLength,
                                     final short mappingQuality,
                                     final int indexingBin,
                                     final int cigarLen,
                                     final int flags,
                                     final int readLen,
                                     final int mateReferenceSequenceIndex,
                                     final int mateAlignmentStart,
                                     final int insertSize,
                                     final byte[] variableLengthBlock);
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordIterator.java000077500000000000000000000046301263034757100240130ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
*/
package htsjdk.samtools;

import htsjdk.samtools.util.CloseableIterator;

/**
 * A general interface that adds functionality to a CloseableIterator of
 * SAMRecords.  Currently, this interface is implemented by iterators that
 * want to validate, as they are iterating, that the records in the
 * underlying SAM/BAM file are in a particular order.
 */
public interface SAMRecordIterator extends CloseableIterator {

    /**
     * Establishes that records returned by this iterator are expected to
     * be in the specified sort order.  If this method has been called,
     * then implementers must throw an IllegalStateException from next()
     * when a record is read that violates the sort order.  This method
     * may be called multiple times over the course of an iteration,
     * changing the expected sort, if desired -- from the time it is called,
     * it validates whatever sort is set, or stops validating if it
     * is set to null or SAMFileHeader.SortOrder.unsorted.  If this method
     * is not called, then no validation of the iterated records is done.
     *
     * @param sortOrder The order in which records are expected to be returned
     * @return  This SAMRecordIterator
     */
    public SAMRecordIterator assertSorted(SAMFileHeader.SortOrder sortOrder);

}
htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordQueryHashComparator.java000066400000000000000000000055431263034757100261640ustar00rootroot00000000000000/*
 * The MIT License
 *
 * Copyright (c) 2015 Tim Fennell
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.Murmur3; /** * SAMRecord comparator that provides an ordering based on a hash of the queryname. Has * the useful property that reads with the same name will be grouped together, but that * reads appear in an otherwise random order. Useful for when the read names in a BAM * are correlated to something else (e.g. position, read group), making a straight * queryname sort undesirable. * * @author Tim Fennell */ public class SAMRecordQueryHashComparator extends SAMRecordQueryNameComparator { private final Murmur3 hasher = new Murmur3(42); /** * Compares two records based on an integer hash of their read name's. If the hash * values are equal, falls back to the behaviour of SAMRecordQueryNameComparator * to break the tie. */ @Override public int compare(final SAMRecord lhs, final SAMRecord rhs) { final int retval = compareHashes(lhs, rhs); if (retval == 0) return super.compare(lhs, rhs); else return retval; } /** * Compares two records based on an integer hash of their read names. If the hash * values are equal, falls back to the behaviour of SAMRecordQueryNameComparator * to break the tie. */ @Override public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { final int retval = compareHashes(lhs, rhs); if (retval == 0) return super.fileOrderCompare(lhs, rhs); else return retval; } /** Compares the hash values for two records. 
*/ private int compareHashes(final SAMRecord lhs, final SAMRecord rhs) { return new Integer(this.hasher.hashUnencodedChars(lhs.getReadName())).compareTo(this.hasher.hashUnencodedChars(rhs.getReadName())); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordQueryNameComparator.java000066400000000000000000000072441263034757100261610ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Comparator for "queryname" ordering of SAMRecords. 
*/ public class SAMRecordQueryNameComparator implements SAMRecordComparator { public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { int cmp = fileOrderCompare(samRecord1, samRecord2); if (cmp != 0) { return cmp; } final boolean r1Paired = samRecord1.getReadPairedFlag(); final boolean r2Paired = samRecord2.getReadPairedFlag(); if (r1Paired || r2Paired) { if (!r1Paired) return 1; else if (!r2Paired) return -1; else if (samRecord1.getFirstOfPairFlag() && samRecord2.getSecondOfPairFlag()) return -1; else if (samRecord1.getSecondOfPairFlag() && samRecord2.getFirstOfPairFlag()) return 1; } if (samRecord1.getReadNegativeStrandFlag() != samRecord2.getReadNegativeStrandFlag()) { return (samRecord1.getReadNegativeStrandFlag()? 1: -1); } if (samRecord1.getNotPrimaryAlignmentFlag() != samRecord2.getNotPrimaryAlignmentFlag()) { return samRecord2.getNotPrimaryAlignmentFlag()? -1: 1; } if (samRecord1.getSupplementaryAlignmentFlag() != samRecord2.getSupplementaryAlignmentFlag()) { return samRecord2.getSupplementaryAlignmentFlag() ? -1 : 1; } final Integer hitIndex1 = samRecord1.getIntegerAttribute(SAMTag.HI.name()); final Integer hitIndex2 = samRecord2.getIntegerAttribute(SAMTag.HI.name()); if (hitIndex1 != null) { if (hitIndex2 == null) return 1; else { cmp = hitIndex1.compareTo(hitIndex2); if (cmp != 0) return cmp; } } else if (hitIndex2 != null) return -1; return 0; } /** * Less stringent compare method than the regular compare. If the two records * are equal enough that their ordering in a sorted SAM file would be arbitrary, * this method returns 0. * * @return negative if samRecord1 < samRecord2, 0 if equal, else positive */ public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { return compareReadNames(samRecord1.getReadName(), samRecord2.getReadName()); } /** * Encapsulate algorithm for comparing read names in queryname-sorted file, since there have been * conversations about changing the behavior. 
*/ public static int compareReadNames(final String readName1, final String readName2) { return readName1.compareTo(readName2); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordSetBuilder.java000066400000000000000000000572171263034757100242720ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.samtools.util.SequenceUtil; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Random; import java.util.TreeSet; /** * Factory class for creating SAMRecords for testing purposes. 
* Various methods can be called
 * to add new SAM records (or pairs of records) to a list which can then be returned at
 * any point. The records must reference human chromosomes (excluding randoms etc.).
 *
 * Although this is a class for testing, it is in the src tree because it is included in the sam jarfile.
 *
 * @author Tim Fennell
 */
public class SAMRecordSetBuilder implements Iterable {
    /** Names of the sequences placed in the header's sequence dictionary; contig arguments index into this array. */
    private static final String[] chroms = {
            "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10",
            "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
            "chr21", "chr22", "chrX", "chrY", "chrM"
    };
    /** Alphabet from which random read bases are drawn. */
    private static final byte[] BASES = {'A', 'C', 'G', 'T'};
    /** Read group id written to the RG attribute of created records unless a custom read group is set. */
    private static final String READ_GROUP_ID = "1";
    /** Sample name of the default read group. */
    private static final String SAMPLE = "FREE_SAMPLE";

    /** Source of random bases, qualities and first-of-pair choices; see setRandomSeed. */
    private final Random random = new Random();

    private SAMFileHeader header;
    /** The accumulated records: a sorted set when the builder sorts for the caller, otherwise a list. */
    private final Collection records;

    /** Length of generated bases/qualities and of the default "M" cigar. */
    private int readLength = 36;

    /** When non-null, its id is written to the PG attribute of created records. */
    private SAMProgramRecord programRecord = null;
    /** When non-null, its id replaces READ_GROUP_ID in the RG attribute of created records. */
    private SAMReadGroupRecord readGroup = null;
    /** When true, created mapped records also receive an NM attribute. */
    private boolean useNmFlag = false;

    /** When false, unmapped records are created without bases and qualities. */
    private boolean unmappedHasBasesAndQualities = true;

    public static final int DEFAULT_CHROMOSOME_LENGTH = 200000000;

    public static final ScoringStrategy DEFAULT_DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH;

    /**
     * Constructs a new SAMRecordSetBuilder with all the data needed to keep the records
     * sorted in coordinate order.
     */
    public SAMRecordSetBuilder() {
        this(true, SAMFileHeader.SortOrder.coordinate);
    }

    /**
     * Construct a new SAMRecordSetBuilder.
     *
     * @param sortOrder If sortForMe, defines the sort order.
     * @param sortForMe If true, keep the records created in sorted order.
*/ public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrder sortOrder) { this(sortForMe, sortOrder, true); } public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrder sortOrder, final boolean addReadGroup) { this(sortForMe, sortOrder, addReadGroup, DEFAULT_CHROMOSOME_LENGTH); } public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrder sortOrder, final boolean addReadGroup, final int defaultChromosomeLength) { this(sortForMe, sortOrder, addReadGroup, defaultChromosomeLength, DEFAULT_DUPLICATE_SCORING_STRATEGY); } public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrder sortOrder, final boolean addReadGroup, final int defaultChromosomeLength, final ScoringStrategy duplicateScoringStrategy) { final List sequences = new ArrayList(); for (final String chrom : chroms) { final SAMSequenceRecord sequenceRecord = new SAMSequenceRecord(chrom, defaultChromosomeLength); sequences.add(sequenceRecord); } this.header = new SAMFileHeader(); this.header.setSequenceDictionary(new SAMSequenceDictionary(sequences)); this.header.setSortOrder(sortOrder); if (sortForMe) { final SAMRecordComparator comparator; if (sortOrder == SAMFileHeader.SortOrder.queryname) { comparator = new SAMRecordQueryNameComparator(); } else { comparator = new SAMRecordCoordinateComparator(); } this.records = new TreeSet(comparator); } else { this.records = new ArrayList(); } if (addReadGroup) { final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(READ_GROUP_ID); readGroupRecord.setSample(SAMPLE); readGroupRecord.setPlatform("ILLUMINA"); final List readGroups = new ArrayList(); readGroups.add(readGroupRecord); this.header.setReadGroups(readGroups); } } public void setUnmappedHasBasesAndQualities(final boolean value) { this.unmappedHasBasesAndQualities = value; } public int size() { return this.records.size(); } /** * Set the seed of the random number generator for cases in which repeatable result is 
desired. * * @param seed */ public void setRandomSeed(final long seed) { random.setSeed(seed); } /** * Adds the given program record to the header, and assigns the PG tag to any SAMRecords * created after it has been added. May be called multiple times in order to assign different * PG IDs to different SAMRecords. programRecord may be null to stop assignment of PG tag. * It is up to the caller to ensure that program record IDs do not collide. */ public void setProgramRecord(final SAMProgramRecord programRecord) { this.programRecord = programRecord; if (programRecord != null) { this.header.addProgramRecord(programRecord); } } public void setUseNmFlag(final boolean useNmFlag) { this.useNmFlag = useNmFlag; } public void setReadGroup(final SAMReadGroupRecord readGroup) { this.readGroup = readGroup; if (readGroup != null) { this.header.addReadGroup(readGroup); } } /** Returns the accumulated list of sam records. */ public Collection getRecords() { return this.records; } public void setHeader(final SAMFileHeader header) { this.header = header.clone(); } /** The record should already have the DS and MC tags computed */ public void addRecord(final SAMRecord record) { if (record.getReadPairedFlag() && !record.getMateUnmappedFlag() && null == record.getAttribute(SAMTagUtil.getSingleton().MC)) { throw new SAMException("Mate Cigar tag (MC) not found in: " + record.getReadName()); } this.records.add(record); } /** Returns a CloseableIterator over the collection of SAMRecords. */ public CloseableIterator iterator() { return new CloseableIterator() { private final Iterator iterator = records.iterator(); public void close() { /** Do nothing. 
*/} public boolean hasNext() { return this.iterator.hasNext(); } public SAMRecord next() { return this.iterator.next(); } public void remove() { this.iterator.remove(); } }; } /** * Adds a fragment record (mapped or unmapped) to the set using the provided contig start and optionally the strand, * cigar string, quality string or default quality score. This does not modify the flag field, which should be updated * if desired before adding the return to the list of records. */ private SAMRecord createReadNoFlag(final String name, final int contig, final int start, final boolean negativeStrand, final boolean recordUnmapped, final String cigar, final String qualityString, final int defaultQuality) throws SAMException { final SAMRecord rec = new SAMRecord(this.header); rec.setReadName(name); if (chroms.length <= contig) { throw new SAMException("Contig too big [" + chroms.length + " < " + contig); } if (0 <= contig) { rec.setReferenceIndex(contig); rec.setReferenceName(chroms[contig]); rec.setAlignmentStart(start); } if (!recordUnmapped) { rec.setReadNegativeStrandFlag(negativeStrand); if (null != cigar) { rec.setCigarString(cigar); } else if (!rec.getReadUnmappedFlag()) { rec.setCigarString(readLength + "M"); } rec.setMappingQuality(255); } else { rec.setReadUnmappedFlag(true); } rec.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); if(useNmFlag){ rec.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTagFromCigar(rec)); } if (programRecord != null) { rec.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } if (readGroup != null) { rec.setAttribute(SAMTag.RG.name(), readGroup.getReadGroupId()); } if (!recordUnmapped || this.unmappedHasBasesAndQualities) { fillInBasesAndQualities(rec, qualityString, defaultQuality); } return rec; } /** * Adds a skeletal fragment (non-PE) record to the set using the provided * contig start and strand information. 
*/
    public SAMRecord addFrag(final String name, final int contig, final int start, final boolean negativeStrand) {
        // mapped, default cigar, random qualities
        final boolean recordUnmapped = false;
        final int randomQualities = -1;
        return addFrag(name, contig, start, negativeStrand, recordUnmapped, null, null, randomQualities);
    }

    /**
     * Adds a primary fragment record (mapped or unmapped) to the set using the provided contig start and
     * optionally the strand, cigar string, quality string or default quality score.
     */
    public SAMRecord addFrag(final String name, final int contig, final int start, final boolean negativeStrand, final boolean recordUnmapped, final String cigar,
                             final String qualityString, final int defaultQuality) throws SAMException {
        final boolean isSecondary = false;
        return addFrag(name, contig, start, negativeStrand, recordUnmapped, cigar, qualityString, defaultQuality, isSecondary);
    }

    /**
     * Adds a fragment record (mapped or unmapped) to the set using the provided contig start and optionally the strand,
     * cigar string, quality string or default quality score.  When isSecondary is true the record is flagged as
     * not-primary before it is added.
     */
    public SAMRecord addFrag(final String name, final int contig, final int start, final boolean negativeStrand, final boolean recordUnmapped, final String cigar,
                             final String qualityString, final int defaultQuality, final boolean isSecondary) throws SAMException {
        final SAMRecord fragment = createReadNoFlag(name, contig, start, negativeStrand, recordUnmapped, cigar, qualityString, defaultQuality);
        if (isSecondary) {
            fragment.setNotPrimaryAlignmentFlag(true);
        }
        this.records.add(fragment);
        return fragment;
    }

    /**
     * Adds a fragment record (mapped or unmapped) to the set using the provided contig start and optionally the strand,
     * cigar string, quality string or default quality score.
*/
    public SAMRecord addFrag(final String name, final int contig, final int start, final boolean negativeStrand, final boolean recordUnmapped, final String cigar,
                             final String qualityString, final int defaultQuality, final boolean isSecondary, final boolean isSupplementary) throws SAMException {
        final SAMRecord fragment = createReadNoFlag(name, contig, start, negativeStrand, recordUnmapped, cigar, qualityString, defaultQuality);
        if (isSecondary) {
            fragment.setNotPrimaryAlignmentFlag(true);
        }
        if (isSupplementary) {
            fragment.setSupplementaryAlignmentFlag(true);
        }
        this.records.add(fragment);
        return fragment;
    }

    /**
     * Fills in the bases and qualities for the given record. Quality data is randomly generated if the defaultQuality
     * is set to -1. Otherwise all qualities will be set to defaultQuality. If a quality string is provided that string
     * will be used instead of the defaultQuality.
     */
    private void fillInBasesAndQualities(final SAMRecord rec, final String qualityString, final int defaultQuality) {
        if (qualityString != null) {
            // explicit qualities: only the bases are generated
            fillInBases(rec);
            rec.setBaseQualityString(qualityString);
            return;
        }
        fillInBasesAndQualities(rec, defaultQuality);
    }

    /**
     * Randomly fills in the bases for the given record.  The number of bases is the builder's current read length.
     */
    private void fillInBases(final SAMRecord rec) {
        final byte[] randomBases = new byte[this.readLength];
        for (int idx = 0; idx < randomBases.length; idx++) {
            randomBases[idx] = BASES[this.random.nextInt(BASES.length)];
        }
        rec.setReadBases(randomBases);
    }

    /**
     * Adds an unmapped fragment read to the builder.
     */
    public void addUnmappedFragment(final String name) {
        final int noContig = -1;
        final int noStart = -1;
        addFrag(name, noContig, noStart, false, true, null, null, -1, false);
    }

    /**
     * Adds a skeletal pair of records to the set using the provided
     * contig starts.  The pair is assumed to be a well
     * formed pair sitting on a single contig.
*/ public void addPair(final String name, final int contig, final int start1, final int start2) { final SAMRecord end1 = new SAMRecord(this.header); final SAMRecord end2 = new SAMRecord(this.header); final boolean end1IsFirstOfPair = this.random.nextBoolean(); end1.setReadName(name); end1.setReferenceIndex(contig); end1.setAlignmentStart(start1); end1.setReadNegativeStrandFlag(false); end1.setCigarString(readLength + "M"); if(useNmFlag) end1.setAttribute(ReservedTagConstants.NM, 0); end1.setMappingQuality(255); end1.setReadPairedFlag(true); end1.setProperPairFlag(true); end1.setMateReferenceIndex(contig); end1.setAttribute(SAMTag.MC.name(), readLength + "M"); end1.setMateAlignmentStart(start2); end1.setMateNegativeStrandFlag(true); end1.setFirstOfPairFlag(end1IsFirstOfPair); end1.setSecondOfPairFlag(!end1IsFirstOfPair); end1.setInferredInsertSize((int) CoordMath.getLength(start1, CoordMath.getEnd(start2, this.readLength))); end1.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); if (programRecord != null) { end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } if (readGroup != null) { end1.setAttribute(SAMTag.RG.name(), readGroup.getReadGroupId()); } fillInBasesAndQualities(end1); end2.setReadName(name); end2.setReferenceIndex(contig); end2.setAlignmentStart(start2); end2.setReadNegativeStrandFlag(true); end2.setCigarString(readLength + "M"); if(useNmFlag) end2.setAttribute(ReservedTagConstants.NM,0); end2.setMappingQuality(255); end2.setReadPairedFlag(true); end2.setProperPairFlag(true); end2.setMateReferenceIndex(contig); end2.setAttribute(SAMTag.MC.name(), readLength + "M"); end2.setMateAlignmentStart(start1); end2.setMateNegativeStrandFlag(false); end2.setFirstOfPairFlag(!end1IsFirstOfPair); end2.setSecondOfPairFlag(end1IsFirstOfPair); end2.setInferredInsertSize(end1.getInferredInsertSize()); end2.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); if (programRecord != null) { end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } 
if (readGroup != null) { end2.setAttribute(SAMTag.RG.name(), readGroup.getReadGroupId()); } fillInBasesAndQualities(end2); this.records.add(end1); this.records.add(end2); } /** * Adds a pair of records (mapped or unmmapped) to the set using the provided contig starts. * The pair is assumed to be a well formed pair sitting on a single contig. */ public List addPair(final String name, final int contig, final int start1, final int start2, final boolean record1Unmapped, final boolean record2Unmapped, final String cigar1, final String cigar2, final boolean strand1, final boolean strand2, final int defaultQuality) { return this.addPair(name, contig, contig, start1, start2, record1Unmapped, record2Unmapped, cigar1, cigar2, strand1, strand2, false, false, defaultQuality); } /** * Adds a pair of records (mapped or unmmapped) to the set using the provided contig starts. * The pair is assumed to be a well formed pair sitting on a single contig. */ public List addPair(final String name, final int contig1, final int contig2, final int start1, final int start2, final boolean record1Unmapped, final boolean record2Unmapped, final String cigar1, final String cigar2, final boolean strand1, final boolean strand2, final boolean record1NonPrimary, final boolean record2NonPrimary, final int defaultQuality) { final List recordsList = new LinkedList(); final SAMRecord end1 = createReadNoFlag(name, contig1, start1, strand1, record1Unmapped, cigar1, null, defaultQuality); final SAMRecord end2 = createReadNoFlag(name, contig2, start2, strand2, record2Unmapped, cigar2, null, defaultQuality); end1.setReadPairedFlag(true); end1.setFirstOfPairFlag(true); if (!record1Unmapped && !record2Unmapped) { end1.setProperPairFlag(true); end2.setProperPairFlag(true); } end2.setReadPairedFlag(true); end2.setSecondOfPairFlag(true); if (record1NonPrimary) end1.setNotPrimaryAlignmentFlag(true); if (record2NonPrimary) end2.setNotPrimaryAlignmentFlag(true); if (record1NonPrimary) 
end1.setNotPrimaryAlignmentFlag(true); if (record2NonPrimary) end2.setNotPrimaryAlignmentFlag(true); // set mate info SamPairUtil.setMateInfo(end1, end2, true); recordsList.add(end1); recordsList.add(end2); records.add(end1); records.add(end2); return recordsList; } /** * Adds a pair of records (mapped or unmmapped) to the set using the provided contig starts. * The pair is assumed to be a well formed pair sitting on a single contig. */ public List addPair(final String name, final int contig, final int start1, final int start2, final boolean record1Unmapped, final boolean record2Unmapped, final String cigar1, final String cigar2, final boolean strand1, final boolean strand2, final boolean record1NonPrimary, final boolean record2NonPrimary, final int defaultQuality) { return addPair(name, contig, contig, start1, start2, record1Unmapped, record2Unmapped, cigar1, cigar2, strand1, strand2, record1NonPrimary, record2NonPrimary, defaultQuality); } /** * Adds a pair with both ends unmapped to the builder. 
*/
    public void addUnmappedPair(final String name) {
        // Which end becomes "first of pair" is chosen at random.
        final boolean end1IsFirstOfPair = this.random.nextBoolean();

        // Build both ends before touching the record set, so a failure leaves the set unchanged.
        final SAMRecord end1 = newUnmappedEnd(name, end1IsFirstOfPair);
        final SAMRecord end2 = newUnmappedEnd(name, !end1IsFirstOfPair);

        this.records.add(end1);
        this.records.add(end2);
    }

    /**
     * Creates one end of a fully unmapped pair: paired, unmapped, mate unmapped, not a proper pair,
     * tagged with the default read group and (when present) the program record.
     *
     * @param name        read name shared by both ends
     * @param firstOfPair true to flag the record as first of pair, false to flag it as second of pair
     */
    private SAMRecord newUnmappedEnd(final String name, final boolean firstOfPair) {
        final SAMRecord end = new SAMRecord(this.header);
        end.setReadName(name);
        end.setReadPairedFlag(true);
        end.setReadUnmappedFlag(true);
        end.setAttribute(SAMTag.MC.name(), null);
        end.setProperPairFlag(false);
        end.setFirstOfPairFlag(firstOfPair);
        end.setSecondOfPairFlag(!firstOfPair);
        end.setMateUnmappedFlag(true);
        end.setAttribute(SAMTag.RG.name(), READ_GROUP_ID);
        if (programRecord != null) {
            end.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId());
        }
        if (this.unmappedHasBasesAndQualities) {
            fillInBasesAndQualities(end);
        }
        return end;
    }

    /**
     * Fills in bases and qualities with randomly generated data.
     * Delegates to the two-argument overload with a default quality of -1.
     */
    private void fillInBasesAndQualities(final SAMRecord rec) {
        fillInBasesAndQualities(rec, -1);
    }

    /**
     * Fills in bases and qualities with a set default quality. If the defaultQuality is set to -1 quality scores will
     * be randomly generated.
     * The number of qualities generated is this builder's configured read length.
*/ private void fillInBasesAndQualities(final SAMRecord rec, final int defaultQuality) { final int length = this.readLength; final byte[] quals = new byte[length]; if (-1 != defaultQuality) { Arrays.fill(quals, (byte) defaultQuality); } else { for (int i = 0; i < length; ++i) { quals[i] = (byte) this.random.nextInt(50); } } rec.setBaseQualities(quals); fillInBases(rec); } /** * Creates samFileReader from the data in instance of this class * * @return SAMFileReader */ public SamReader getSamReader() { final File tempFile; try { tempFile = File.createTempFile("temp", ".sam"); } catch (final IOException e) { throw new RuntimeIOException("problems creating tempfile", e); } this.header.setAttribute("VN", "1.0"); final SAMFileWriter w = new SAMFileWriterFactory().makeBAMWriter(this.header, true, tempFile); for (final SAMRecord r : this.getRecords()) { w.addAlignment(r); } w.close(); final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(tempFile); tempFile.deleteOnExit(); return reader; } public SAMFileHeader getHeader() { return header; } public void setReadLength(final int readLength) { this.readLength = readLength; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMRecordUtil.java000066400000000000000000000063331263034757100231360ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; /** * @author alecw@broadinstitute.org */ public class SAMRecordUtil { /** List of String tags that must be reversed if present when a SAMRecord is reverseComplemented */ private static final short[] STRING_TAGS_TO_REVERSE = { SAMTagUtil.getSingleton().U2, SAMTagUtil.getSingleton().OQ }; /** * Reverse-complement all known sequence and base quality attributes of the SAMRecord. */ public static void reverseComplement(final SAMRecord rec) { final byte[] readBases = rec.getReadBases(); SequenceUtil.reverseComplement(readBases); rec.setReadBases(readBases); final byte qualities[] = rec.getBaseQualities(); reverseArray(qualities); rec.setBaseQualities(qualities); final byte[] sqTagValue = (byte[])rec.getAttribute(SAMTagUtil.getSingleton().SQ); if (sqTagValue != null) { SQTagUtil.reverseComplementSqArray(sqTagValue); rec.setAttribute(SAMTagUtil.getSingleton().SQ, sqTagValue); } final String e2TagValue = (String)rec.getAttribute(SAMTagUtil.getSingleton().E2); if (e2TagValue != null) { final byte[] secondaryBases = StringUtil.stringToBytes(e2TagValue); SequenceUtil.reverseComplement(secondaryBases); rec.setAttribute(SAMTagUtil.getSingleton().E2, StringUtil.bytesToString(secondaryBases)); } for (final short stringTag : STRING_TAGS_TO_REVERSE) { final String value = (String)rec.getAttribute(stringTag); if (value != null) { rec.setAttribute(stringTag, StringUtil.reverseString(value)); } } } /** * 
Reverse the given array in place. */ public static void reverseArray(final byte[] array) { final int lastIndex = array.length - 1; int i, j; for (i=0, j=lastIndex; i mSequences = new ArrayList(); private final Map mSequenceMap = new HashMap(); public SAMSequenceDictionary() { } public SAMSequenceDictionary(final List list) { this(); setSequences(list); } @XmlTransient //we use the field instead of getter/setter public List getSequences() { return Collections.unmodifiableList(mSequences); } public SAMSequenceRecord getSequence(final String name) { return mSequenceMap.get(name); } /** * Replaces the existing list of SAMSequenceRecords with the given list. * Reset the aliases * * @param list This value is used directly, rather than being copied. */ public void setSequences(final List list) { mSequences = list; mSequenceMap.clear(); int index = 0; for (final SAMSequenceRecord record : list) { record.setSequenceIndex(index++); if (mSequenceMap.put(record.getSequenceName(), record) != null) { throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + record.getSequenceName()); } } } public void addSequence(final SAMSequenceRecord sequenceRecord) { if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) { throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + sequenceRecord.getSequenceName()); } sequenceRecord.setSequenceIndex(mSequences.size()); mSequences.add(sequenceRecord); mSequenceMap.put(sequenceRecord.getSequenceName(), sequenceRecord); } /** * @return The SAMSequenceRecord with the given index, or null if index is out of range. */ public SAMSequenceRecord getSequence(final int sequenceIndex) { if (sequenceIndex < 0 || sequenceIndex >= mSequences.size()) { return null; } return mSequences.get(sequenceIndex); } /** * @return The index for the given sequence name, or -1 if the name is not found. 
*/
    public int getSequenceIndex(final String sequenceName) {
        final SAMSequenceRecord record = mSequenceMap.get(sequenceName);
        if (record == null) {
            return -1;
        }
        return record.getSequenceIndex();
    }

    /**
     * @return number of SAMSequenceRecord(s) in this dictionary
     */
    public int size() {
        return mSequences.size();
    }

    /**
     * @return The sum of the lengths of the sequences in this dictionary
     */
    public long getReferenceLength() {
        long len = 0L;
        for (final SAMSequenceRecord seq : getSequences()) {
            len += seq.getSequenceLength();
        }
        return len;
    }

    /**
     * @return true if the dictionary is empty
     */
    public boolean isEmpty() {
        return mSequences.isEmpty();
    }

    /** Format template for the messages thrown by assertSameDictionary; %s receives the specific mismatch. */
    private static final String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s.";

    /**
     * Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent
     * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
     * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}.
     * Aliases are ignored.
*
     * @throws AssertionError When the dictionaries are not the same, with some human-readable information as to why
     */
    public void assertSameDictionary(final SAMSequenceDictionary that) {
        if (this == that) {
            return;
        }

        // Walk both sequence lists in lock-step; a typed iterator lets next() be used without a cast.
        final Iterator<SAMSequenceRecord> thatSequences = that.mSequences.iterator();
        for (final SAMSequenceRecord thisSequence : mSequences) {
            if (!thatSequences.hasNext()) {
                throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thisSequence + " is present in only one dictionary"));
            }
            final SAMSequenceRecord thatSequence = thatSequences.next();
            if (!thatSequence.isSameSequence(thisSequence)) {
                throw new AssertionError(
                        String.format(DICT_MISMATCH_TEMPLATE, thatSequence + " was found when " + thisSequence + " was expected")
                );
            }
        }
        // Anything left over means the other dictionary is longer than this one.
        if (thatSequences.hasNext()) {
            throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thatSequences.next() + " is present in only one dictionary"));
        }
    }

    /** returns true if the two dictionaries are the same, aliases are NOT considered */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        final SAMSequenceDictionary that = (SAMSequenceDictionary) o;
        return mSequences.equals(that.mSequences);
    }

    /**
     * Add an alias to a SAMSequenceRecord. This can be used to provide some
     * alternate names for a given contig.
e.g: * 1,chr1,chr01,01,CM000663,NC_000001.10 e.g: * MT,chrM * * @param originalName * existing contig name * @param altName * new contig name * @return the contig associated to the 'originalName/altName' */ public SAMSequenceRecord addSequenceAlias(final String originalName, final String altName) { if (originalName == null) throw new IllegalArgumentException("original name cannot be null"); if (altName == null) throw new IllegalArgumentException("alt name cannot be null"); final SAMSequenceRecord originalSeqRecord = getSequence(originalName); if (originalSeqRecord == null) throw new IllegalArgumentException("Sequence " + originalName + " doesn't exist in dictionary."); // same name, nothing to do if (originalName.equals(altName)) return originalSeqRecord; final SAMSequenceRecord altSeqRecord = getSequence(altName); if (altSeqRecord != null) { // alias was already set to the same record if (altSeqRecord.equals(originalSeqRecord)) return originalSeqRecord; // alias was already set to another record throw new IllegalArgumentException("Alias " + altName + " was already set to " + altSeqRecord.getSequenceName()); } mSequenceMap.put(altName, originalSeqRecord); return originalSeqRecord; } /** * return a MD5 sum for ths dictionary, the checksum is re-computed each * time this method is called. * *

     * md5( (seq1.md5_if_available) + ' '+(seq2.name+seq2.length) + ' '+...)
     * 
* * @return a MD5 checksum for this dictionary or the empty string if it is * empty */ public String md5() { if (isEmpty()) return ""; try { final MessageDigest md5 = MessageDigest.getInstance("MD5"); md5.reset(); for (final SAMSequenceRecord samSequenceRecord : mSequences) { if (samSequenceRecord.getSequenceIndex() > 0) md5.update((byte) ' '); final String md5_tag = samSequenceRecord.getAttribute(SAMSequenceRecord.MD5_TAG); if (md5_tag != null) { md5.update(md5_tag.getBytes()); } else { md5.update(samSequenceRecord.getSequenceName().getBytes()); md5.update(String.valueOf(samSequenceRecord.getSequenceLength()).getBytes()); } } String hash = new BigInteger(1, md5.digest()).toString(16); if (hash.length() != 32) { final String zeros = "00000000000000000000000000000000"; hash = zeros.substring(0, 32 - hash.length()) + hash; } return hash; } catch (Exception e) { throw new RuntimeException(e); } } @Override public int hashCode() { return mSequences.hashCode(); } @Override public String toString() { return "SAMSequenceDictionary:( sequences:"+ size()+ " length:"+ getReferenceLength()+" "+ " md5:"+md5()+")"; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMSequenceRecord.java000066400000000000000000000227031263034757100237700ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.math.BigInteger; import java.net.URI; import java.net.URISyntaxException; import java.util.Arrays; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlValue; /** * Header information about a reference sequence. Corresponds to @SQ header record in SAM text header. */ @XmlRootElement(name="Reference") public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Cloneable { public static final long serialVersionUID = 1L; // AbstractSAMHeaderRecord implements Serializable private String mSequenceName = null; // Value must be interned() if it's ever set/modified private int mSequenceIndex = -1; private int mSequenceLength = 0; public static final String SEQUENCE_NAME_TAG = "SN"; public static final String SEQUENCE_LENGTH_TAG = "LN"; public static final String MD5_TAG = "M5"; public static final String ASSEMBLY_TAG = "AS"; public static final String URI_TAG = "UR"; public static final String SPECIES_TAG = "SP"; /** If one sequence has this length, and another sequence had a different length, isSameSequence will * not complain that they are different sequences. */ public static final int UNKNOWN_SEQUENCE_LENGTH = 0; /** * This is not a valid sequence name, because it is reserved in the MRNM field of SAM text format * to mean "same reference as RNAME field." 
*/ public static final String RESERVED_MRNM_SEQUENCE_NAME = "="; /** * The standard tags are stored in text header without type information, because the type of these tags is known. */ public static final Set STANDARD_TAGS = new HashSet(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, SPECIES_TAG)); // Split on any whitespace private static Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s"); // These are the chars matched by \\s. private static char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab /** a (private) empty constructor is required for JAXB.XML-serialisation */ @SuppressWarnings("unused") private SAMSequenceRecord() { } /** * @deprecated Use SAMSequenceRecord(final String name, final int sequenceLength) instead. * sequenceLength is required for the object to be considered valid. */ public SAMSequenceRecord(final String name) { this(name, UNKNOWN_SEQUENCE_LENGTH); } public SAMSequenceRecord(final String name, final int sequenceLength) { if (name != null) { if (SEQUENCE_NAME_SPLITTER.matcher(name).find()) { throw new SAMException("Sequence name contains invalid character: " + name); } validateSequenceName(name); mSequenceName = name.intern(); } mSequenceLength = sequenceLength; } @XmlValue public String getSequenceName() { return mSequenceName; } /* this private method is used by XML serialization */ @SuppressWarnings("unused") private void setSequenceName(final String name) { if (name != null) { mSequenceName = name.intern(); } else { mSequenceName = null; } } @XmlAttribute(name="length") public int getSequenceLength() { return mSequenceLength; } public void setSequenceLength(final int value) { mSequenceLength = value; } @XmlAttribute(name="assembly") public String getAssembly() { return (String) getAttribute(ASSEMBLY_TAG); } public void setAssembly(final String value) { setAttribute(ASSEMBLY_TAG, value); } @XmlAttribute(name="species") public String getSpecies() { return 
(String) getAttribute(SPECIES_TAG); } public void setSpecies(final String value) { setAttribute(SPECIES_TAG, value); } @XmlAttribute(name="md5") public String getMd5() { return (String) getAttribute(MD5_TAG); } public void setMd5(final String value) { setAttribute(MD5_TAG, value); } /** * @return Index of this record in the sequence dictionary it lives in. */ @XmlAttribute(name="index") public int getSequenceIndex() { return mSequenceIndex; } // Private state used only by SAM implementation. public void setSequenceIndex(final int value) { mSequenceIndex = value; } /** * Looser comparison than equals(). We look only at sequence index, sequence length, and MD5 tag value * (or sequence names, if there is no MD5 tag in either record. */ public boolean isSameSequence(final SAMSequenceRecord that) { if (this == that) return true; if (that == null) return false; if (mSequenceIndex != that.mSequenceIndex) return false; // PIC-439. Allow undefined length. if (mSequenceLength != UNKNOWN_SEQUENCE_LENGTH && that.mSequenceLength != UNKNOWN_SEQUENCE_LENGTH && mSequenceLength != that.mSequenceLength) return false; if (this.getAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getAttribute(SAMSequenceRecord.MD5_TAG) != null) { final BigInteger thisMd5 = new BigInteger((String)this.getAttribute(SAMSequenceRecord.MD5_TAG), 16); final BigInteger thatMd5 = new BigInteger((String)that.getAttribute(SAMSequenceRecord.MD5_TAG), 16); if (!thisMd5.equals(thatMd5)) { return false; } } else { if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the Strings } return true; } private URI makeURI(final String s) throws URISyntaxException { URI uri = new URI(s); if (uri.getScheme() == null) { uri = new URI("file", uri.getUserInfo(), uri.getHost(), uri.getPort(), uri.getPath(), uri.getQuery(), uri.getFragment()); } return uri; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof SAMSequenceRecord)) return false; 
final SAMSequenceRecord that = (SAMSequenceRecord) o; if (mSequenceIndex != that.mSequenceIndex) return false; if (mSequenceLength != that.mSequenceLength) return false; if (!attributesEqual(that)) return false; if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the name return true; } @Override public int hashCode() { return mSequenceName != null ? mSequenceName.hashCode() : 0; } Set getStandardTags() { return STANDARD_TAGS; } public final SAMSequenceRecord clone() { final SAMSequenceRecord ret = new SAMSequenceRecord(this.mSequenceName, this.mSequenceLength); ret.mSequenceIndex = this.mSequenceIndex; for (final Map.Entry entry : this.getAttributes()) { ret.setAttribute(entry.getKey(), entry.getValue()); } return ret; } /** * Truncate sequence name at first whitespace. */ public static String truncateSequenceName(final String sequenceName) { /* * Instead of using regex split, do it manually for better performance. return SEQUENCE_NAME_SPLITTER.split(sequenceName, 2)[0]; */ int truncateAt = sequenceName.length(); for (final char c : WHITESPACE_CHARS) { int index = sequenceName.indexOf(c); if (index != -1 && index < truncateAt) { truncateAt = index; } } return sequenceName.substring(0, truncateAt); } /** * Throw an exception if the sequence name is not valid. 
*/ public static void validateSequenceName(final String name) { if (RESERVED_MRNM_SEQUENCE_NAME.equals(name)) { throw new SAMException("'" + RESERVED_MRNM_SEQUENCE_NAME + "' is not a valid sequence name"); } } @Override public String toString() { return String.format( "SAMSequenceRecord(name=%s,length=%s,dict_index=%s,assembly=%s)", getSequenceName(), getSequenceLength(), getSequenceIndex(), getAssembly() ); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMSortOrderChecker.java000066400000000000000000000057731263034757100243010ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Encapsulates simple check for SAMRecord order. 
* @author alecw@broadinstitute.org */ public class SAMSortOrderChecker { private final SAMFileHeader.SortOrder sortOrder; private SAMRecord prev; private final SAMRecordComparator comparator; public SAMSortOrderChecker(final SAMFileHeader.SortOrder sortOrder) { this.sortOrder = sortOrder; switch (sortOrder) { case coordinate: comparator = new SAMRecordCoordinateComparator(); break; case queryname: comparator = new SAMRecordQueryNameComparator(); break; case duplicate: comparator = new SAMRecordDuplicateComparator(); break; case unsorted: default: comparator = null; break; } } /** * Check if given SAMRecord violates sort order relative to previous SAMRecord. * @return True if sort order is unsorted, if this is the first record, or if previous <= rec. */ public boolean isSorted(final SAMRecord rec) { if (comparator == null) { return true; } boolean ret = true; if (prev != null) { ret = comparator.fileOrderCompare(prev, rec) <= 0; } prev = rec; return ret; } public SAMRecord getPreviousRecord() { return prev; } /** * Return the sort key used for the given sort order. Useful in error messages. 
*/ public String getSortKey(final SAMRecord rec) { switch (sortOrder) { case coordinate: return rec.getReferenceName() + ":" + rec.getAlignmentStart(); case queryname: return rec.getReadName(); case unsorted: default: return null; } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTag.java000066400000000000000000000035241263034757100215740ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * The standard tags for a SAM record that are defined in the SAM spec. 
*/ public enum SAMTag { AM, AS, BC, BQ, CC, CM, CO, CP, CQ, CS, CT, E2, FI, FS, FT, FZ, GC, // for backwards compatibility GS, // for backwards compatibility GQ, // for backwards compatibility LB, H0, H1, H2, HI, IH, MC, MF, // for backwards compatibility MD, MQ, NH, NM, OQ, OP, OC, OF, OR, PG, PQ, PT, PU, QT, Q2, R2, RG, RT, S2, // for backwards compatibility SA, SM, SQ, // for backwards compatibility TC, U2, UQ } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTagUtil.java000066400000000000000000000125451263034757100224350ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.StringUtil; /** * Facility for converting between String and short representation of a SAM tag. short representation * is used by SAM JDK internally and is much more efficient. 
Callers are encouraged to obtain the short * value for a tag of interest once, and then use the SAMRecord attribute API that takes shorts rather than * Strings. * * @author alecw@broadinstitute.org */ public class SAMTagUtil { // Standard tags pre-computed for convenience public final short RG = makeBinaryTag(SAMTag.RG.name()); public final short LB = makeBinaryTag(SAMTag.LB.name()); public final short PU = makeBinaryTag(SAMTag.PU.name()); public final short PG = makeBinaryTag(SAMTag.PG.name()); public final short AS = makeBinaryTag(SAMTag.AS.name()); public final short SQ = makeBinaryTag(SAMTag.SQ.name()); public final short MQ = makeBinaryTag(SAMTag.MQ.name()); public final short NM = makeBinaryTag(SAMTag.NM.name()); public final short H0 = makeBinaryTag(SAMTag.H0.name()); public final short H1 = makeBinaryTag(SAMTag.H1.name()); public final short H2 = makeBinaryTag(SAMTag.H2.name()); public final short UQ = makeBinaryTag(SAMTag.UQ.name()); public final short PQ = makeBinaryTag(SAMTag.PQ.name()); public final short NH = makeBinaryTag(SAMTag.NH.name()); public final short IH = makeBinaryTag(SAMTag.IH.name()); public final short HI = makeBinaryTag(SAMTag.HI.name()); public final short MD = makeBinaryTag(SAMTag.MD.name()); public final short CS = makeBinaryTag(SAMTag.CS.name()); public final short CQ = makeBinaryTag(SAMTag.CQ.name()); public final short CM = makeBinaryTag(SAMTag.CM.name()); public final short R2 = makeBinaryTag(SAMTag.R2.name()); public final short Q2 = makeBinaryTag(SAMTag.Q2.name()); public final short S2 = makeBinaryTag(SAMTag.S2.name()); public final short CC = makeBinaryTag(SAMTag.CC.name()); public final short CP = makeBinaryTag(SAMTag.CP.name()); public final short SM = makeBinaryTag(SAMTag.SM.name()); public final short AM = makeBinaryTag(SAMTag.AM.name()); public final short MF = makeBinaryTag(SAMTag.MF.name()); public final short E2 = makeBinaryTag(SAMTag.E2.name()); public final short U2 = makeBinaryTag(SAMTag.U2.name()); public final 
short OQ = makeBinaryTag(SAMTag.OQ.name()); public final short FZ = makeBinaryTag(SAMTag.FZ.name()); public final short SA = makeBinaryTag(SAMTag.SA.name()); public final short MC = makeBinaryTag(SAMTag.MC.name()); private static SAMTagUtil singleton; // Cache of already-converted tags. Should speed up SAM text generation. // Not synchronized because race condition is not a problem. private final String[] stringTags = new String[Short.MAX_VALUE]; /** * Despite the fact that this class has state, it should be thread-safe because the cache * gets filled with the same values by any thread. */ public static SAMTagUtil getSingleton() { if (singleton == null) { singleton = new SAMTagUtil(); } return singleton; } /** * Convert from String representation of tag name to short representation. * * @param tag 2-character String representation of a tag name. * @return Tag name packed as 2 ASCII bytes in a short. */ public short makeBinaryTag(final String tag) { if (tag.length() != 2) { throw new IllegalArgumentException("String tag does not have length() == 2: " + tag); } return (short)(tag.charAt(1) << 8 | tag.charAt(0)); } /** * Convert from short representation of tag name to String representation. * * @param tag Tag name packed as 2 ASCII bytes in a short. * @return 2-character String representation of a tag name. 
*/ public String makeStringTag(final short tag) { String ret = stringTags[tag]; if (ret == null) { final byte[] stringConversionBuf = new byte[2]; stringConversionBuf[0] = (byte)(tag & 0xff); stringConversionBuf[1] = (byte)((tag >> 8) & 0xff); ret = StringUtil.bytesToString(stringConversionBuf); stringTags[tag] = ret; } return ret; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTestUtil.java000066400000000000000000000127301263034757100226350ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Misc methods for SAM-related unit tests. These are in the src tree rather than the tests tree * so that they will be included in sam.jar, and therefore can be used by tests outside of htsjdk.samtools. */ public class SAMTestUtil { /** * Indicates that a required sanity-check condition was not met. 
*/ public static class SanityCheckFailedException extends RuntimeException { public SanityCheckFailedException(String message) { super(message); } } /** * Basic sanity check for a pair of SAMRecords. * @throws SanityCheckFailedException if the sanity check failed */ public void assertPairValid(final SAMRecord firstEnd, final SAMRecord secondEnd) throws SanityCheckFailedException { assertEquals(firstEnd.getReadName(), secondEnd.getReadName()); assertTrue(firstEnd.getFirstOfPairFlag()); assertTrue(secondEnd.getSecondOfPairFlag()); assertFalse(secondEnd.getFirstOfPairFlag()); assertFalse(firstEnd.getSecondOfPairFlag()); if (!firstEnd.getReadUnmappedFlag() && !secondEnd.getReadUnmappedFlag()) { assertNotSame(firstEnd.getReadNegativeStrandFlag(), secondEnd.getReadNegativeStrandFlag()); } } /** * Basic sanity check for a SAMRecord. * @throws SanityCheckFailedException if the sanity check failed */ public void assertReadValid(final SAMRecord read) throws SanityCheckFailedException { assertEquals(read.getReadBases().length, read.getBaseQualities().length); // Note that it is possible to have an unmapped read that has a coordinate if (read.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { assertEquals(read.getAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); assertTrue(read.getReadUnmappedFlag()); } else { assertNotSame(read.getAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); } if (read.getReadUnmappedFlag()) { assertEquals(read.getMappingQuality(), SAMRecord.NO_MAPPING_QUALITY); assertEquals(read.getCigar().getCigarElements().size(), 0); } else { assertNotSame(read.getCigar().getCigarElements(), 0); } if (read.getReadPairedFlag()) { if (read.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { assertEquals(read.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); assertTrue(read.getMateUnmappedFlag()); } else { // Even if the mate is unmapped, if it has a reference name, it should have a position. 
assertNotSame(read.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); } if (read.getReadUnmappedFlag() || read.getMateUnmappedFlag() || !read.getReferenceName().equals(read.getMateReferenceName())) { assertEquals(read.getInferredInsertSize(), 0); } else { assertNotSame(read.getInferredInsertSize(), 0); } if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { assertNotSame(read.getReadNegativeStrandFlag(), read.getMateNegativeStrandFlag()); assertNotSame(read.getMateNegativeStrandFlag(), read.getReadName()); } } else { assertEquals(read.getInferredInsertSize(), 0); } } private static void assertEquals(T a, T b) { if (a == null) { if (b != null) { throw new SanityCheckFailedException("\"" + a + "\" does not equal \"" + b + "\""); } } else if (!a.equals(b)) { throw new SanityCheckFailedException("\"" + a + "\" does not equal \"" + b + "\""); } } private static void assertNotSame(T a, T b) { if (a != b) { throw new SanityCheckFailedException("\"" + a + "\" and \"" + b + "\" are not the same object"); } } private static void assertTrue(boolean condition) { if (!condition) { throw new SanityCheckFailedException("The condition is false"); } } private static void assertFalse(boolean condition) { if (condition) { throw new SanityCheckFailedException("The condition is true"); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTextHeaderCodec.java000066400000000000000000000507041263034757100240560ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and 
this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.DateParser; import htsjdk.samtools.util.LineReader; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.samtools.util.StringUtil; import java.io.BufferedWriter; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; /** * Parser for a SAM text header, and a generator of SAM text header. */ public class SAMTextHeaderCodec { private static final String HEADER_LINE_START = "@"; // These attributes are populated when parsing or generating private SAMFileHeader mFileHeader; private final TextTagCodec mTagCodec = new TextTagCodec(); // These attributes are populated when parsing text private String mCurrentLine; private LineReader mReader; private String mSource; private List sequences; private List readGroups; // Accumulate header while reading it from input. 
private final StringBuilder textHeader = new StringBuilder(); // For error reporting when parsing private ValidationStringency validationStringency = ValidationStringency.SILENT; // These attributes are populated when generating text private BufferedWriter writer; private static final String TAG_KEY_VALUE_SEPARATOR = ":"; private static final char TAG_KEY_VALUE_SEPARATOR_CHAR = ':'; private static final String FIELD_SEPARATOR = "\t"; private static final char FIELD_SEPARATOR_CHAR = '\t'; private static final Pattern FIELD_SEPARATOR_RE = Pattern.compile(FIELD_SEPARATOR); public static final String COMMENT_PREFIX = HEADER_LINE_START + HeaderRecordType.CO.name() + FIELD_SEPARATOR; /** * Reads text SAM header and converts to a SAMFileHeader object. * @param reader Where to get header text from. * @param source Name of the input file, for error messages. May be null. * @return complete header object. */ public SAMFileHeader decode(final LineReader reader, final String source) { mFileHeader = new SAMFileHeader(); mReader = reader; mSource = source; sequences = new ArrayList(); readGroups = new ArrayList(); while (advanceLine() != null) { final ParsedHeaderLine parsedHeaderLine = new ParsedHeaderLine(mCurrentLine); if (!parsedHeaderLine.isLineValid()) { continue; } switch (parsedHeaderLine.getHeaderRecordType()) { case HD: parseHDLine(parsedHeaderLine); break; case PG: parsePGLine(parsedHeaderLine); break; case RG: parseRGLine(parsedHeaderLine); break; case SQ: parseSQLine(parsedHeaderLine); break; case CO: mFileHeader.addComment(mCurrentLine); break; default: throw new IllegalStateException("Unrecognized header record type: " + parsedHeaderLine.getHeaderRecordType()); } } mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); mFileHeader.setReadGroups(readGroups); // Only store the header text if there was a parsing error or the it's less than 1MB on disk / 2MB in mem if (!mFileHeader.getValidationErrors().isEmpty() || textHeader.length() < (1024 * 
1024)) { mFileHeader.setTextHeader(textHeader.toString()); } SAMUtils.processValidationErrors(mFileHeader.getValidationErrors(), -1, validationStringency); return mFileHeader; } private String advanceLine() { final int nextChar = mReader.peek(); if (nextChar != '@') { return null; } mCurrentLine = mReader.readLine(); textHeader.append(mCurrentLine).append("\n"); return mCurrentLine; } /** * Transfer standard and non-standard tags from text representation to in-memory representation. * All values are now stored as Strings. * @param record attributes get set into this object. * @param textAttributes Map of tag type to value. Some values may be removed by this method. */ private void transferAttributes(final AbstractSAMHeaderRecord record, final Map textAttributes) { // All header tags are now of type String, so no need to distinguish standard from non-standard. for (final Map.Entry entry : textAttributes.entrySet()) { record.setAttribute(entry.getKey(), entry.getValue()); } } private void parsePGLine(final ParsedHeaderLine parsedHeaderLine) { assert(HeaderRecordType.PG.equals(parsedHeaderLine.getHeaderRecordType())); if (!parsedHeaderLine.requireTag(SAMProgramRecord.PROGRAM_GROUP_ID_TAG)) { return; } final SAMProgramRecord programRecord = new SAMProgramRecord(parsedHeaderLine.removeValue(SAMProgramRecord.PROGRAM_GROUP_ID_TAG)); transferAttributes(programRecord, parsedHeaderLine.mKeyValuePairs); mFileHeader.addProgramRecord(programRecord); } private void parseRGLine(final ParsedHeaderLine parsedHeaderLine) { assert(HeaderRecordType.RG.equals(parsedHeaderLine.getHeaderRecordType())); if (!parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_ID_TAG)) { return; } // Allow no SM tag if validation stringency is not strict. This call has the side effect of reporting an error // or throwing an exception depending on validation stringency if this is missing. 
parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG); final SAMReadGroupRecord samReadGroupRecord = new SAMReadGroupRecord(parsedHeaderLine.removeValue(SAMReadGroupRecord.READ_GROUP_ID_TAG)); transferAttributes(samReadGroupRecord, parsedHeaderLine.mKeyValuePairs); // Convert non-String attributes to the appropriate types final String predictedMedianInsertSize = (String)samReadGroupRecord.getAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG); if (predictedMedianInsertSize != null) { try { Integer.parseInt(predictedMedianInsertSize); samReadGroupRecord.setAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG,predictedMedianInsertSize); } catch (NumberFormatException e) { reportErrorParsingLine(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG + " is not numeric: " + predictedMedianInsertSize, SAMValidationError.Type.INVALID_PREDICTED_MEDIAN_INSERT_SIZE, e); } } final String dateRunProduced = (String)samReadGroupRecord.getAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG); if (dateRunProduced != null) { Object date; try { date = mTagCodec.decodeDate(dateRunProduced); } catch (DateParser.InvalidDateException e) { // Can't convert date string into Date object. Treat it as a string if validation // stringency allows it. 
date = dateRunProduced; reportErrorParsingLine(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG + " tag value '" + dateRunProduced + "' is not parseable as a date", SAMValidationError.Type.INVALID_DATE_STRING, e); } samReadGroupRecord.setAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, date.toString()); } readGroups.add(samReadGroupRecord); } private void parseSQLine(final ParsedHeaderLine parsedHeaderLine) { assert(HeaderRecordType.SQ.equals(parsedHeaderLine.getHeaderRecordType())); if (!parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_NAME_TAG) || !parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_LENGTH_TAG)) { return; } String sequenceName = parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_NAME_TAG); sequenceName = SAMSequenceRecord.truncateSequenceName(sequenceName); final SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord(sequenceName, Integer.parseInt(parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_LENGTH_TAG))); transferAttributes(samSequenceRecord, parsedHeaderLine.mKeyValuePairs); sequences.add(samSequenceRecord); } private void parseHDLine(final ParsedHeaderLine parsedHeaderLine) { assert(HeaderRecordType.HD.equals(parsedHeaderLine.getHeaderRecordType())); if (!parsedHeaderLine.requireTag(SAMFileHeader.VERSION_TAG)) { return; } transferAttributes(mFileHeader, parsedHeaderLine.mKeyValuePairs); } private void reportErrorParsingLine(String reason, final SAMValidationError.Type type, final Throwable nestedException) { reason = "Error parsing SAM header. " + reason + ". 
Line:\n" + mCurrentLine; if (validationStringency != ValidationStringency.STRICT) { final SAMValidationError error = new SAMValidationError(type, reason, null, mReader.getLineNumber()); error.setSource(mSource); mFileHeader.addValidationError(error); } else { String fileMessage = ""; if (mSource != null) { fileMessage = "File " + mSource; } throw new SAMFormatException(reason + "; " + fileMessage + "; Line number " + mReader.getLineNumber(), nestedException); } } private enum HeaderRecordType { HD, SQ, RG, PG, CO } /** * Takes a header line as a String and converts it into a HeaderRecordType, and a map of key:value strings. * If the line does not contain a recognized HeaderRecordType, then the line is considered invalid, and will * not have any key:value pairs. */ private class ParsedHeaderLine { private HeaderRecordType mHeaderRecordType; private final Map mKeyValuePairs = new LinkedHashMap(); private boolean lineValid = false; ParsedHeaderLine(final String line) { assert(line.startsWith(HEADER_LINE_START)); // Tab-separate String[] fields = new String[1024]; int numFields = StringUtil.split(line, fields, FIELD_SEPARATOR_CHAR); if (numFields == fields.length) { // Lots of fields, so fall back fields = FIELD_SEPARATOR_RE.split(line); numFields = fields.length; } // Parse the HeaderRecordType try { mHeaderRecordType = HeaderRecordType.valueOf(fields[0].substring(1)); } catch (IllegalArgumentException e) { reportErrorParsingLine("Unrecognized header record type", SAMValidationError.Type.UNRECOGNIZED_HEADER_TYPE, null); mHeaderRecordType = null; return; } // Do not parse key:value pairs for comment lines. 
if (mHeaderRecordType == HeaderRecordType.CO) { lineValid = true; return; } final String[] keyAndValue = new String[2]; // Parse they key:value pairs for (int i = 1; i < numFields; ++i) { if (StringUtil.splitConcatenateExcessTokens(fields[i], keyAndValue, TAG_KEY_VALUE_SEPARATOR_CHAR) != 2) { reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType + " key:value pair", SAMValidationError.Type.POORLY_FORMATTED_HEADER_TAG, null); continue; } if (mKeyValuePairs.containsKey(keyAndValue[0]) && ! mKeyValuePairs.get(keyAndValue[0]).equals(keyAndValue[1])) { reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType + " key:value pair " + keyAndValue[0] + ":" + keyAndValue[1] + " clashes with " + keyAndValue[0] + ":" + mKeyValuePairs.get(keyAndValue[0]), SAMValidationError.Type.HEADER_TAG_MULTIPLY_DEFINED, null); continue; } mKeyValuePairs.put(keyAndValue[0], keyAndValue[1]); } lineValid = true; } /** * True if the line is recognized as one of the valid HeaderRecordTypes. */ public boolean isLineValid() { return lineValid; } /** * Handling depends on the validation stringency. If the tag is not present, and stringency is strict, * an exception is thrown. If stringency is not strict, false is returned. * @param tag Must be present for the line to be considered value. * @return True if tag is present. 
*/ boolean requireTag(final String tag) { if (!mKeyValuePairs.containsKey(tag)) { reportErrorParsingLine(HEADER_LINE_START + mHeaderRecordType + " line missing " + tag + " tag", SAMValidationError.Type.HEADER_RECORD_MISSING_REQUIRED_TAG, null); return false; } return true; } /** * @return null if line is invalid, otherwise the parsed HeaderRecordType */ public HeaderRecordType getHeaderRecordType() { return mHeaderRecordType; } boolean containsKey(final String key) { return mKeyValuePairs.containsKey(key); } String getValue(final String key) { return mKeyValuePairs.get(key); } String removeValue(final String key) { final String ret = mKeyValuePairs.get(key); mKeyValuePairs.remove(key); return ret; } } /** * Convert SAMFileHeader from in-memory representation to text representation. Always writes * SAMFileHeader.CURRENT_VERSION as the version in the header. * @param writer where to write the header text. * @param header object to be converted to text. */ public void encode(final Writer writer, final SAMFileHeader header) { encode(writer, header, false); } /** * Convert SAMFileHeader from in-memory representation to text representation. * @param writer where to write the header text. * @param header object to be converted to text. * @param keepExistingVersionNumber If true, writes whatever version # was in the header. If false, writes * SAMFileHeader.CURRENT_VERSION. 
*/ public void encode(final Writer writer, final SAMFileHeader header, final boolean keepExistingVersionNumber) { mFileHeader = header; this.writer = new BufferedWriter(writer); writeHDLine(keepExistingVersionNumber); for (final SAMSequenceRecord sequenceRecord: header.getSequenceDictionary().getSequences()) { writeSQLine(sequenceRecord); } for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { writeRGLine(readGroup); } for (final SAMProgramRecord programRecord : header.getProgramRecords()) { writePGLine(programRecord); } for (final String comment : header.getComments()) { println(comment); } try { this.writer.flush(); } catch (IOException e) { throw new RuntimeIOException(e); } } private void println(final String s) { try { writer.append(s); writer.append("\n"); } catch (IOException e) { throw new RuntimeIOException(e); } } private void writePGLine(final SAMProgramRecord programRecord) { if (programRecord == null) { return; } final String[] fields = new String[2 + programRecord.getAttributes().size()]; fields[0] = HEADER_LINE_START + HeaderRecordType.PG; fields[1] = SAMProgramRecord.PROGRAM_GROUP_ID_TAG + TAG_KEY_VALUE_SEPARATOR + programRecord.getProgramGroupId(); encodeTags(programRecord, fields, 2); println(StringUtil.join(FIELD_SEPARATOR, fields)); } private void writeRGLine(final SAMReadGroupRecord readGroup) { final String[] fields = new String[2 + readGroup.getAttributes().size()]; fields[0] = HEADER_LINE_START + HeaderRecordType.RG; fields[1] = SAMReadGroupRecord.READ_GROUP_ID_TAG + TAG_KEY_VALUE_SEPARATOR + readGroup.getReadGroupId(); encodeTags(readGroup, fields, 2); println(StringUtil.join(FIELD_SEPARATOR, fields)); } private void writeHDLine(final boolean keepExistingVersionNumber) { final SAMFileHeader newHeader; if (keepExistingVersionNumber) { newHeader = mFileHeader; } else { // Make a copy of the header, excluding the version from the input header, so that // output get CURRENT_VERSION instead of whatever the version of the input 
header was. newHeader = new SAMFileHeader(); for (final Map.Entry entry : mFileHeader.getAttributes()) { if (!entry.getKey().equals(SAMFileHeader.VERSION_TAG)) { newHeader.setAttribute(entry.getKey(), entry.getValue()); } } } final String[] fields = new String[1 + newHeader.getAttributes().size()]; fields[0] = HEADER_LINE_START + HeaderRecordType.HD; encodeTags(newHeader, fields, 1); println(StringUtil.join(FIELD_SEPARATOR, fields)); } private void writeSQLine(final SAMSequenceRecord sequenceRecord) { final int numAttributes =sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0; final String[] fields = new String[3 + numAttributes]; fields[0] = HEADER_LINE_START + HeaderRecordType.SQ; fields[1] = SAMSequenceRecord.SEQUENCE_NAME_TAG + TAG_KEY_VALUE_SEPARATOR + sequenceRecord.getSequenceName(); fields[2] = SAMSequenceRecord.SEQUENCE_LENGTH_TAG + TAG_KEY_VALUE_SEPARATOR + Integer.toString(sequenceRecord.getSequenceLength()); encodeTags(sequenceRecord, fields, 3); println(StringUtil.join(FIELD_SEPARATOR, fields)); } /** * Encode all the attributes in the given object as text * @param rec object containing attributes, and knowledge of which are standard tags * @param fields where to put the text representation of the tags. Must be big enough to hold all tags. * @param offset where to start putting text tag representations. 
*/ private void encodeTags(final AbstractSAMHeaderRecord rec, final String[] fields, int offset) { for (final Map.Entry entry: rec.getAttributes()) { fields[offset++] = mTagCodec.encodeUntypedTag(entry.getKey(), entry.getValue()); } } public void setValidationStringency(final ValidationStringency validationStringency) { if (validationStringency == null) { throw new IllegalArgumentException("null validationStringency not allowed"); } this.validationStringency = validationStringency; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTextReader.java000066400000000000000000000202771263034757100231340ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.BufferedLineReader; import htsjdk.samtools.util.CloseableIterator; import java.io.File; import java.io.InputStream; /** * Internal class for reading SAM text files. 
*/ class SAMTextReader extends SamReader.ReaderImplementation { private SAMRecordFactory samRecordFactory; private BufferedLineReader mReader; private SAMFileHeader mFileHeader = null; private String mCurrentLine = null; private RecordIterator mIterator = null; private File mFile = null; private ValidationStringency validationStringency = ValidationStringency.DEFAULT_STRINGENCY; /** * Add information about the origin (reader and position) to SAM records. */ private SamReader mParentReader; /** * Prepare to read a SAM text file. * * @param stream Need not be buffered, as this class provides buffered reading. */ public SAMTextReader(final InputStream stream, final ValidationStringency validationStringency, final SAMRecordFactory factory) { mReader = new BufferedLineReader(stream); this.validationStringency = validationStringency; this.samRecordFactory = factory; readHeader(); } /** * Prepare to read a SAM text file. * * @param stream Need not be buffered, as this class provides buffered reading. * @param file For error reporting only. */ public SAMTextReader(final InputStream stream, final File file, final ValidationStringency validationStringency, final SAMRecordFactory factory) { this(stream, validationStringency, factory); mFile = file; } /** * If true, writes the source of every read into the source SAMRecords. * * @param enabled true to write source information into each SAMRecord. */ public void enableFileSource(final SamReader reader, final boolean enabled) { this.mParentReader = enabled ? 
reader : null; } void enableIndexCaching(final boolean enabled) { throw new UnsupportedOperationException("Cannot enable index caching for a SAM text reader"); } void enableIndexMemoryMapping(final boolean enabled) { throw new UnsupportedOperationException("Cannot enable index memory mapping for a SAM text reader"); } void enableCrcChecking(final boolean enabled) { // Do nothing - this has no meaning for SAM reading } void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } @Override public SamReader.Type type() { return SamReader.Type.SAM_TYPE; } public boolean hasIndex() { return false; } public BAMIndex getIndex() { throw new UnsupportedOperationException(); } public void close() { if (mReader != null) { try { mReader.close(); } finally { mReader = null; } } } public SAMFileHeader getFileHeader() { return mFileHeader; } public ValidationStringency getValidationStringency() { return validationStringency; } public void setValidationStringency(final ValidationStringency stringency) { this.validationStringency = stringency; } /** * There can only be one extant iterator on a SAMTextReader at a time. The previous one must * be closed before calling getIterator(). Because the input stream is not seekable, a subsequent * call to getIterator() returns an iterator that starts where the last one left off. * * @return Iterator of SAMRecords in file order. */ public CloseableIterator getIterator() { if (mReader == null) { throw new IllegalStateException("File reader is closed"); } if (mIterator != null) { throw new IllegalStateException("Iteration in progress"); } mIterator = new RecordIterator(); return mIterator; } /** * Generally loads data at a given point in the file. Unsupported for SAMTextReaders. * * @param fileSpan The file span. * @return An iterator over the given file span. 
*/ public CloseableIterator getIterator(final SAMFileSpan fileSpan) { throw new UnsupportedOperationException("Cannot directly iterate over regions within SAM text files."); } /** * Generally gets a pointer to the first read in the file. Unsupported for SAMTextReaders. * * @return An pointer to the first read in the file. */ public SAMFileSpan getFilePointerSpanningReads() { throw new UnsupportedOperationException("Cannot retrieve file pointers within SAM text files."); } /** * Unsupported for SAM text files. */ public CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { throw new UnsupportedOperationException("Cannot query SAM text files"); } @Override public CloseableIterator query(final QueryInterval[] intervals, final boolean contained) { throw new UnsupportedOperationException("Cannot query SAM text files"); } /** * Unsupported for SAM text files. */ public CloseableIterator queryAlignmentStart(final String sequence, final int start) { throw new UnsupportedOperationException("Cannot query SAM text files"); } public CloseableIterator queryUnmapped() { throw new UnsupportedOperationException("Cannot query SAM text files"); } private void readHeader() { final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); headerCodec.setValidationStringency(validationStringency); mFileHeader = headerCodec.decode(mReader, (mFile != null ? 
mFile.toString() : null)); advanceLine(); } private String advanceLine() { mCurrentLine = mReader.readLine(); return mCurrentLine; } /** * SAMRecord iterator for SAMTextReader */ private class RecordIterator implements CloseableIterator { private final SAMLineParser parser = new SAMLineParser(samRecordFactory, validationStringency, mFileHeader, mParentReader, mFile); private RecordIterator() { if (mReader == null) { throw new IllegalStateException("Reader is closed."); } } public void close() { SAMTextReader.this.close(); } public boolean hasNext() { return mCurrentLine != null; } public SAMRecord next() { if (!hasNext()) { throw new IllegalStateException("Cannot call next() on exhausted iterator"); } try { return parseLine(); } finally { advanceLine(); } } public void remove() { throw new UnsupportedOperationException("Not supported: remove"); } private SAMRecord parseLine() { return parser.parseLine(mCurrentLine, mReader.getLineNumber()); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTextWriter.java000066400000000000000000000146461263034757100232110ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.AsciiWriter; import htsjdk.samtools.util.RuntimeIOException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.StringWriter; import java.io.Writer; /** * Writer for text-format SAM files. */ public class SAMTextWriter extends SAMFileWriterImpl { private static final String FIELD_SEPARATOR = "\t"; private final Writer out; // For error reporting only. private final File file; private final TextTagCodec tagCodec = new TextTagCodec(); private final SAMTagUtil tagUtil = new SAMTagUtil(); /** * Constructs a SAMTextWriter that outputs to a Writer. * @param out Writer. */ public SAMTextWriter(Writer out) { this.out = out; this.file = null; } /** * Constructs a SAMTextWriter that writes to a File. * @param file Where to write the output. */ public SAMTextWriter(final File file) { try { this.file = file; this.out = new AsciiWriter(new FileOutputStream(file)); } catch (IOException e) { throw new RuntimeIOException(e); } } /** * Returns the Writer used by this instance. Useful for flushing the output. */ public Writer getWriter() { return out; } /** * Constructs a SAMTextWriter that writes to an OutputStream. The OutputStream * is wrapped in an AsciiWriter, which can be retrieved with getWriter(). * @param stream Need not be buffered because this class provides buffering. */ public SAMTextWriter(final OutputStream stream) { this.file = null; this.out = new AsciiWriter(stream); } /** * Write the record. * * @param alignment SAMRecord. 
*/ public void writeAlignment(final SAMRecord alignment) { try { out.write(alignment.getReadName()); out.write(FIELD_SEPARATOR); out.write(Integer.toString(alignment.getFlags())); out.write(FIELD_SEPARATOR); out.write(alignment.getReferenceName()); out.write(FIELD_SEPARATOR); out.write(Integer.toString(alignment.getAlignmentStart())); out.write(FIELD_SEPARATOR); out.write(Integer.toString(alignment.getMappingQuality())); out.write(FIELD_SEPARATOR); out.write(alignment.getCigarString()); out.write(FIELD_SEPARATOR); // == is OK here because these strings are interned if (alignment.getReferenceName() == alignment.getMateReferenceName() && SAMRecord.NO_ALIGNMENT_REFERENCE_NAME != alignment.getReferenceName()) { out.write("="); } else { out.write(alignment.getMateReferenceName()); } out.write(FIELD_SEPARATOR); out.write(Integer.toString(alignment.getMateAlignmentStart())); out.write(FIELD_SEPARATOR); out.write(Integer.toString(alignment.getInferredInsertSize())); out.write(FIELD_SEPARATOR); out.write(alignment.getReadString()); out.write(FIELD_SEPARATOR); out.write(alignment.getBaseQualityString()); SAMBinaryTagAndValue attribute = alignment.getBinaryAttributes(); while (attribute != null) { out.write(FIELD_SEPARATOR); final String encodedTag; if (attribute.isUnsignedArray()) { encodedTag = tagCodec.encodeUnsignedArray(tagUtil.makeStringTag(attribute.tag), attribute.value); } else { encodedTag = tagCodec.encode(tagUtil.makeStringTag(attribute.tag), attribute.value); } out.write(encodedTag); attribute = attribute.getNext(); } out.write("\n"); } catch (IOException e) { throw new RuntimeIOException(e); } } /* This method is called by SAMRecord.getSAMString(). 
*/ private static SAMTextWriter textWriter = null; private static StringWriter stringWriter = null; static synchronized String getSAMString(final SAMRecord alignment) { if (stringWriter == null) stringWriter = new StringWriter(); if (textWriter == null) textWriter = new SAMTextWriter(stringWriter); stringWriter.getBuffer().setLength(0); textWriter.writeAlignment(alignment); return stringWriter.toString(); } /** * Write the header text. This method can also be used to write * an arbitrary String, not necessarily the header. * * @param textHeader String containing the text to write. */ public void writeHeader(final String textHeader) { try { out.write(textHeader); } catch (IOException e) { throw new RuntimeIOException(e); } } /** * Do any required flushing here. */ public void finish() { try { out.close(); } catch (IOException e) { throw new RuntimeIOException(e); } } /** * For producing error messages. * * @return Output filename, or null if there isn't one. */ public String getFilename() { if (file == null) { return null; } return file.getAbsolutePath(); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMTools.java000066400000000000000000000072561263034757100221670ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import java.io.File; /** * Command line utility for manipulating SAM/BAM files. */ public class SAMTools { private String mCommand = null; private File mInputFile = null; public static void main(final String[] args) throws Exception { final int status = new SAMTools().run(args); if (status != 0) { System.exit(status); } } private SAMTools() { } private void usage() { System.out.println(); System.out.println("SAMTools version 0.1.0"); System.out.println("Tools for manipulating SAM/BAM files"); System.out.println(); System.out.println("Usage: SAMTools "); System.out.println(); System.out.println("Commands:"); System.out.println(" help"); System.out.println(" view "); System.out.println(); } private boolean parseArguments(final String[] args) { if (args.length == 0) { usage(); return true; } final String command = args[0]; final int argpos = 1; final int argcount = args.length - argpos; if (command.equals("help")) { usage(); return true; } else if (command.equals("view")) { if (argcount != 1) { usage(); return false; } mInputFile = new File(args[1]); if (!mInputFile.exists()) { System.out.println("Input file not found: " + mInputFile); return false; } } else { System.out.println("Unrecognized command: " + command); System.out.println(); usage(); return false; } mCommand = command; return true; } private int run(final String[] args) throws Exception { if (!parseArguments(args)) { return 1; } if (mCommand == null) { return 0; } 
if (mCommand.equals("view")) { return runView(); } return 1; } private int runView() { final SamReader reader = SamReaderFactory.makeDefault().open(mInputFile); final CloseableIterator iterator = reader.iterator(); while (iterator.hasNext()) { final SAMRecord record = iterator.next(); System.out.println(record.getSAMString()); } iterator.close(); return 0; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMUtils.java000066400000000000000000001360521263034757100221640ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.CigarUtil; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.RuntimeEOFException; import htsjdk.samtools.util.StringUtil; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.TreeMap; /** * Utilty methods. */ public final class SAMUtils { // Representation of bases, one for when in low-order nybble, one for when in high-order nybble. private static final byte COMPRESSED_EQUAL_LOW = 0; private static final byte COMPRESSED_A_LOW = 1; private static final byte COMPRESSED_C_LOW = 2; private static final byte COMPRESSED_M_LOW = 3; private static final byte COMPRESSED_G_LOW = 4; private static final byte COMPRESSED_R_LOW = 5; private static final byte COMPRESSED_S_LOW = 6; private static final byte COMPRESSED_V_LOW = 7; private static final byte COMPRESSED_T_LOW = 8; private static final byte COMPRESSED_W_LOW = 9; private static final byte COMPRESSED_Y_LOW = 10; private static final byte COMPRESSED_H_LOW = 11; private static final byte COMPRESSED_K_LOW = 12; private static final byte COMPRESSED_D_LOW = 13; private static final byte COMPRESSED_B_LOW = 14; private static final byte COMPRESSED_N_LOW = 15; private static final byte COMPRESSED_EQUAL_HIGH = COMPRESSED_EQUAL_LOW << 4; private static final byte COMPRESSED_A_HIGH = COMPRESSED_A_LOW << 4; private static final byte COMPRESSED_C_HIGH = COMPRESSED_C_LOW << 4; private static final byte COMPRESSED_G_HIGH = COMPRESSED_G_LOW << 4; private static final byte COMPRESSED_T_HIGH = (byte) (COMPRESSED_T_LOW << 4); private static final byte COMPRESSED_N_HIGH = (byte) (COMPRESSED_N_LOW << 4); private static 
final byte COMPRESSED_M_HIGH = (byte) (COMPRESSED_M_LOW << 4); private static final byte COMPRESSED_R_HIGH = (byte) (COMPRESSED_R_LOW << 4); private static final byte COMPRESSED_S_HIGH = (byte) (COMPRESSED_S_LOW << 4); private static final byte COMPRESSED_V_HIGH = (byte) (COMPRESSED_V_LOW << 4); private static final byte COMPRESSED_W_HIGH = (byte) (COMPRESSED_W_LOW << 4); private static final byte COMPRESSED_Y_HIGH = (byte) (COMPRESSED_Y_LOW << 4); private static final byte COMPRESSED_H_HIGH = (byte) (COMPRESSED_H_LOW << 4); private static final byte COMPRESSED_K_HIGH = (byte) (COMPRESSED_K_LOW << 4); private static final byte COMPRESSED_D_HIGH = (byte) (COMPRESSED_D_LOW << 4); private static final byte COMPRESSED_B_HIGH = (byte) (COMPRESSED_B_LOW << 4); private static final byte [] COMPRESSED_LOOKUP_TABLE = new byte[]{ '=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N' }; public static final int MAX_PHRED_SCORE = 93; /** * Convert from a byte array containing =AaCcGgTtNn represented as ASCII, to a byte array half as long, * with =, A, C, G, T converted to 0, 1, 2, 4, 8, 15. * * @param readBases Bases as ASCII bytes. * @return New byte array with bases represented as nybbles, in BAM binary format. */ static byte[] bytesToCompressedBases(final byte[] readBases) { final byte[] compressedBases = new byte[(readBases.length + 1) / 2]; int i; for (i = 1; i < readBases.length; i += 2) { compressedBases[i / 2] = (byte) (charToCompressedBaseHigh(readBases[i - 1]) | charToCompressedBaseLow(readBases[i])); } // Last nybble if (i == readBases.length) { compressedBases[i / 2] = charToCompressedBaseHigh((char) readBases[i - 1]); } return compressedBases; } /** * Convert from a byte array with basese stored in nybbles, with =, A, C, G, T represented as 0, 1, 2, 4, 8, 15, * to a a byte array containing =AaCcGgTtNn represented as ASCII. * * @param length Number of bases (not bytes) to convert. 
* @param compressedBases Bases represented as nybbles, in BAM binary format. * @param compressedOffset Byte offset in compressedBases to start. * @return New byte array with bases as ASCII bytes. */ public static byte[] compressedBasesToBytes(final int length, final byte[] compressedBases, final int compressedOffset) { final byte[] ret = new byte[length]; int i; for (i = 1; i < length; i += 2) { final int compressedIndex = i / 2 + compressedOffset; ret[i - 1] = compressedBaseToByteHigh(compressedBases[compressedIndex]); ret[i] = compressedBaseToByteLow(compressedBases[compressedIndex]); } // Last nybble if (i == length) { ret[i - 1] = compressedBaseToByteHigh(compressedBases[i / 2 + compressedOffset]); } return ret; } /** * Convert from ASCII byte to BAM nybble representation of a base in low-order nybble. * * @param base One of =AaCcGgTtNn. * @return Low-order nybble-encoded equivalent. */ private static byte charToCompressedBaseLow(final int base) { switch (base) { case '=': return COMPRESSED_EQUAL_LOW; case 'a': case 'A': return COMPRESSED_A_LOW; case 'c': case 'C': return COMPRESSED_C_LOW; case 'g': case 'G': return COMPRESSED_G_LOW; case 't': case 'T': return COMPRESSED_T_LOW; case 'n': case 'N': case '.': return COMPRESSED_N_LOW; // IUPAC ambiguity codes case 'M': case 'm': return COMPRESSED_M_LOW; case 'R': case 'r': return COMPRESSED_R_LOW; case 'S': case 's': return COMPRESSED_S_LOW; case 'V': case 'v': return COMPRESSED_V_LOW; case 'W': case 'w': return COMPRESSED_W_LOW; case 'Y': case 'y': return COMPRESSED_Y_LOW; case 'H': case 'h': return COMPRESSED_H_LOW; case 'K': case 'k': return COMPRESSED_K_LOW; case 'D': case 'd': return COMPRESSED_D_LOW; case 'B': case 'b': return COMPRESSED_B_LOW; default: throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Convert from ASCII byte to BAM nybble representation of a base in high-order nybble. * * @param base One of =AaCcGgTtNn. 
* @return High-order nybble-encoded equivalent. */ private static byte charToCompressedBaseHigh(final int base) { switch (base) { case '=': return COMPRESSED_EQUAL_HIGH; case 'a': case 'A': return COMPRESSED_A_HIGH; case 'c': case 'C': return COMPRESSED_C_HIGH; case 'g': case 'G': return COMPRESSED_G_HIGH; case 't': case 'T': return COMPRESSED_T_HIGH; case 'n': case 'N': case '.': return COMPRESSED_N_HIGH; // IUPAC ambiguity codes case 'M': case 'm': return COMPRESSED_M_HIGH; case 'R': case 'r': return COMPRESSED_R_HIGH; case 'S': case 's': return COMPRESSED_S_HIGH; case 'V': case 'v': return COMPRESSED_V_HIGH; case 'W': case 'w': return COMPRESSED_W_HIGH; case 'Y': case 'y': return COMPRESSED_Y_HIGH; case 'H': case 'h': return COMPRESSED_H_HIGH; case 'K': case 'k': return COMPRESSED_K_HIGH; case 'D': case 'd': return COMPRESSED_D_HIGH; case 'B': case 'b': return COMPRESSED_B_HIGH; default: throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Returns the byte corresponding to a certain nybble * @param base One of COMPRESSED_*_LOW, a low-order nybble encoded base. * @return ASCII base, one of ACGTN=. */ private static byte compressedBaseToByte(byte base){ try{ return COMPRESSED_LOOKUP_TABLE[base]; }catch(IndexOutOfBoundsException e){ throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Convert from BAM nybble representation of a base in low-order nybble to ASCII byte. * * @param base One of COMPRESSED_*_LOW, a low-order nybble encoded base. * @return ASCII base, one of ACGTN=. */ private static byte compressedBaseToByteLow(final int base) { return compressedBaseToByte((byte)(base & 0xf)); } /** * Convert from BAM nybble representation of a base in high-order nybble to ASCII byte. * * @param base One of COMPRESSED_*_HIGH, a high-order nybble encoded base. * @return ASCII base, one of ACGTN=. 
*/ private static byte compressedBaseToByteHigh(final int base) { return compressedBaseToByte((byte)((base >> 4) & 0xf)); } /** * Convert bases in place into canonical form, upper case, and with no-call represented as N. * * @param bases */ static void normalizeBases(final byte[] bases) { for (int i = 0; i < bases.length; ++i) { bases[i] = StringUtil.toUpperCase(bases[i]); if (bases[i] == '.') { bases[i] = 'N'; } } } /** * Convert an array of bytes, in which each byte is a binary phred quality score, to * printable ASCII representation of the quality scores, ala FASTQ format. *

* Equivalent to phredToFastq(data, 0, data.length) * * @param data Array of bytes in which each byte is a binar phred score. * @return String with ASCII representation of those quality scores. */ public static String phredToFastq(final byte[] data) { if (data == null) { return null; } return phredToFastq(data, 0, data.length); } /** * Convert an array of bytes, in which each byte is a binary phred quality score, to * printable ASCII representation of the quality scores, ala FASTQ format. * * @param buffer Array of bytes in which each byte is a binar phred score. * @param offset Where in buffer to start conversion. * @param length How many bytes of buffer to convert. * @return String with ASCII representation of those quality scores. */ public static String phredToFastq(final byte[] buffer, final int offset, final int length) { final char[] chars = new char[length]; for (int i = 0; i < length; i++) { chars[i] = phredToFastq(buffer[offset + i] & 0xFF); } return new String(chars); } /** * Convert a single binary phred score to printable ASCII representation, ala FASTQ format. * * @param phredScore binary phred score. * @return Printable ASCII representation of phred score. */ public static char phredToFastq(final int phredScore) { if (phredScore < 0 || phredScore > MAX_PHRED_SCORE) { throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); } return (char) (33 + phredScore); } /** * Convert a string with phred scores in printable ASCII FASTQ format to an array * of binary phred scores. * * @param fastq Phred scores in FASTQ printable ASCII format. * @return byte array of binary phred scores in which each byte corresponds to a character in the input string. 
*/
    public static byte[] fastqToPhred(final String fastq) {
        if (fastq == null) {
            return null;
        }
        final byte[] phred = new byte[fastq.length()];
        for (int pos = 0; pos < phred.length; ++pos) {
            phred[pos] = (byte) fastqToPhred(fastq.charAt(pos));
        }
        return phred;
    }

    /**
     * Converts printable qualities in Sanger fastq format to binary phred scores, in place.
     *
     * @param fastq Printable ASCII qualities; each element is overwritten with its binary score.
     * @throws IllegalArgumentException if an element is not a valid fastq quality character.
     */
    public static void fastqToPhred(final byte[] fastq) {
        int pos = 0;
        while (pos < fastq.length) {
            // Mask so the byte is interpreted as an unsigned character code.
            final char printable = (char) (fastq[pos] & 0xff);
            fastq[pos] = (byte) fastqToPhred(printable);
            pos++;
        }
    }

    /**
     * Convert a single printable ASCII FASTQ format phred score to binary phred score.
     *
     * @param ch Printable ASCII FASTQ format phred score, in the range 33..126.
     * @return Binary phred score (the character code minus 33).
     * @throws IllegalArgumentException if ch is outside the printable range.
     */
    public static int fastqToPhred(final char ch) {
        final boolean printable = ch >= 33 && ch <= 126;
        if (!printable) {
            throw new IllegalArgumentException("Invalid fastq character: " + ch);
        }
        return ch - 33;
    }

    /**
     * calculate the bin given an alignment in [beg,end)
     * Copied from SAM spec.
     *
     * @param beg 0-based start of read (inclusive)
     * @param end 0-based end of read (exclusive)
     * @return the value returned by GenomicIndexUtil.reg2bin for the same arguments.
     * @deprecated Use GenomicIndexUtil.reg2bin
     */
    static int reg2bin(final int beg, final int end) {
        return GenomicIndexUtil.reg2bin(beg, end);
    }

    /**
     * Handle a list of validation errors according to the validation stringency.
     *
     * @param validationErrors List of errors to report, or null if there are no errors.
     * @param samRecordIndex Record number of the SAMRecord corresponding to the validation errors, or -1 if
     * the record number is not known.
     * @param validationStringency If STRICT, throw a SAMFormatException. If LENIENT, print the validation
     * errors to stderr. If SILENT, do nothing.
*/ public static void processValidationErrors(final List validationErrors, final long samRecordIndex, final ValidationStringency validationStringency) { if (validationErrors != null && validationErrors.size() > 0) { for (final SAMValidationError validationError : validationErrors) { validationError.setRecordNumber(samRecordIndex); } if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException("SAM validation error: " + validationErrors.get(0)); } else if (validationStringency == ValidationStringency.LENIENT) { for (final SAMValidationError error : validationErrors) { System.err.println("Ignoring SAM validation error: " + error); } } } } public static void processValidationError(final SAMValidationError validationError, final ValidationStringency validationStringency) { if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException("SAM validation error: " + validationError); } else if (validationStringency == ValidationStringency.LENIENT) { System.err.println("Ignoring SAM validation error: " + validationError); } } private static final SAMHeaderRecordComparator HEADER_RECORD_COMPARATOR = new SAMHeaderRecordComparator( SAMReadGroupRecord.PLATFORM_UNIT_TAG, SAMReadGroupRecord.LIBRARY_TAG, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG, SAMReadGroupRecord.SEQUENCING_CENTER_TAG, SAMReadGroupRecord.PLATFORM_TAG, SAMReadGroupRecord.DESCRIPTION_TAG, SAMReadGroupRecord.READ_GROUP_ID_TAG // We don't actually want to compare with ID but it's suitable // "just in case" since it's the only one that's actually required ); /** * Calculate a hash code from identifying information in the RG (read group) records in a SAM file's * header. This hash code changes any time read groups are added or removed. Comparing one file's * hash code to another's tells you if the read groups in the BAM files are different. 
*/ public static String calculateReadGroupRecordChecksum(final File input, final File referenceFasta) { final String ENCODING = "UTF-8"; final MessageDigest digest; try { digest = MessageDigest.getInstance("MD5"); } catch (final NoSuchAlgorithmException nsae) { throw new Error("No MD5 algorithm was available in a Java JDK? Unheard-of!"); } // Sort the read group records by their first final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(input); final List sortedRecords = new ArrayList(reader.getFileHeader().getReadGroups()); Collections.sort(sortedRecords, HEADER_RECORD_COMPARATOR); for (final SAMReadGroupRecord rgRecord : sortedRecords) { final TreeMap sortedAttributes = new TreeMap(); for (final Map.Entry attributeEntry : rgRecord.getAttributes()) { sortedAttributes.put(attributeEntry.getKey(), attributeEntry.getValue()); } try { for (final Map.Entry sortedEntry : sortedAttributes.entrySet()) { if (!sortedEntry.getKey().equals(SAMReadGroupRecord.READ_GROUP_ID_TAG)) { // Redundant check, safety first digest.update(sortedEntry.getKey().getBytes(ENCODING)); digest.update(sortedEntry.getValue().getBytes(ENCODING)); } } } catch (final UnsupportedEncodingException uee) { throw new Error("No " + ENCODING + "!? WTH?"); } } // Convert to a String and pad to get the full 32 chars. final StringBuilder hashText = new StringBuilder((new BigInteger(1, digest.digest())).toString(16)); while (hashText.length() < 32) hashText.insert(0, "0"); CloserUtil.close(reader); return hashText.toString(); } /** * Chains program in front of the first "head" item in the list of * SAMProgramRecords in header. This method should not be used * when there are multiple chains of program groups in a header, only when * it can safely be assumed that there is only one chain. 
It correctly handles * the case where program has already been added to the header, so * it can be used whether creating a SAMProgramRecord with a constructor or when * calling SAMFileHeader.createProgramRecord(). */ public static void chainSAMProgramRecord(final SAMFileHeader header, final SAMProgramRecord program) { final List pgs = header.getProgramRecords(); if (pgs.size() > 0) { final List referencedIds = new ArrayList(); for (final SAMProgramRecord pg : pgs) { if (pg.getPreviousProgramGroupId() != null) { referencedIds.add(pg.getPreviousProgramGroupId()); } } for (final SAMProgramRecord pg : pgs) { // if record being chained has already been added, ignore it if (pg.getProgramGroupId().equals(program.getProgramGroupId())) { continue; } if (!referencedIds.contains(pg.getProgramGroupId())) { program.setPreviousProgramGroupId(pg.getProgramGroupId()); break; } } } } /** * Strip mapping information from a SAMRecord. * * WARNING: by clearing the secondary and supplementary flags, * this may have the affect of producing multiple distinct records with the * same read name and flags, which may lead to invalid SAM/BAM output. * Callers of this method should make sure to deal with this issue. */ public static void makeReadUnmapped(final SAMRecord rec) { if (rec.getReadNegativeStrandFlag()) { SAMRecordUtil.reverseComplement(rec); rec.setReadNegativeStrandFlag(false); } rec.setDuplicateReadFlag(false); rec.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); rec.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); rec.setInferredInsertSize(0); rec.setNotPrimaryAlignmentFlag(false); rec.setSupplementaryAlignmentFlag(false); rec.setProperPairFlag(false); rec.setReadUnmappedFlag(true); } /** * Strip mapping information from a SAMRecord, but preserve it in the 'O' tags if it isn't already set. 
*/
    public static void makeReadUnmappedWithOriginalTags(final SAMRecord rec) {
        final boolean alreadyPreserved = hasOriginalMappingInformation(rec);
        if (!alreadyPreserved) {
            // Capture position, cigar, flags and reference before they are wiped below.
            rec.setAttribute(SAMTag.OP.name(), rec.getAlignmentStart());
            rec.setAttribute(SAMTag.OC.name(), rec.getCigarString());
            rec.setAttribute(SAMTag.OF.name(), rec.getFlags());
            rec.setAttribute(SAMTag.OR.name(), rec.getReferenceName());
        }
        makeReadUnmapped(rec);
    }

    /**
     * See if any tags pertaining to original mapping information have been set.
     *
     * @param rec Record to inspect.
     * @return true if at least one of the OP, OC, OF or OR attributes is present.
     */
    public static boolean hasOriginalMappingInformation(final SAMRecord rec) {
        final String[] originalMappingTags = {
                SAMTag.OP.name(), SAMTag.OC.name(), SAMTag.OF.name(), SAMTag.OR.name()
        };
        for (final String tag : originalMappingTags) {
            if (rec.getAttribute(tag) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Determines whether a cigar has no element that both consumes read bases and consumes
     * reference bases (e.g. it is all soft-clipped).
     *
     * @param cigar Cigar to inspect.
     * @return true if no element consumes both read and reference bases; false as soon as one does.
     */
    public static boolean cigarMapsNoBasesToRef(final Cigar cigar) {
        for (final CigarElement element : cigar.getCigarElements()) {
            final CigarOperator operator = element.getOperator();
            if (operator.consumesReadBases() && operator.consumesReferenceBases()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests if the provided record is mapped entirely beyond the end of the reference (i.e., the alignment start is greater than the
     * length of the sequence to which the record is mapped).
     *
     * @param record must not have a null SamFileHeader
     * @return true if the reference sequence is shorter than the record's alignment start.
     * @throws SAMException if the record has no header.
     */
    public static boolean recordMapsEntirelyBeyondEndOfReference(final SAMRecord record) {
        final SAMFileHeader header = record.getHeader();
        if (header == null) {
            throw new SAMException("A non-null SAMHeader is required to resolve the mapping position: " + record.getReadName());
        }
        return header.getSequence(record.getReferenceIndex()).getSequenceLength() < record.getAlignmentStart();
    }

    /**
     * @return negative if mapq1 < mapq2, etc.
* Note that MAPQ(0) < MAPQ(255) < MAPQ(1) */ public static int compareMapqs(final int mapq1, final int mapq2) { if (mapq1 == mapq2) return 0; if (mapq1 == 0) return -1; else if (mapq2 == 0) return 1; else if (mapq1 == 255) return -1; else if (mapq2 == 255) return 1; else return mapq1 - mapq2; } /** * Hokey algorithm for combining two MAPQs into values that are comparable, being cognizant of the fact * that in MAPQ world, 1 > 255 > 0. In this algorithm, 255 is treated as if it were 0.01, so that * CombinedMapq(1,0) > CombinedMapq(255, 255) > CombinedMapq(0, 0). * The return value should not be used for anything other than comparing to the return value of other * invocations of this method. */ public static int combineMapqs(int m1, int m2) { if (m1 == 255) m1 = 1; else m1 *= 100; if (m2 == 255) m2 = 1; else m2 *= 100; return m1 + m2; } /** * Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file * offset after skipping over the text header and the sequence records. */ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) { try { return BAMFileReader.findVirtualOffsetOfFirstRecord(bamFile); } catch (final IOException ioe) { throw new RuntimeEOFException(ioe); } } /** * Given a Cigar, Returns blocks of the sequence that have been aligned directly to the * reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference) * are not represented in the alignment blocks. * * @param cigar The cigar containing the alignment information * @param alignmentStart The start (1-based) of the alignment * @param cigarTypeName The type of cigar passed - for error logging. 
* @return List of alignment blocks */ public static List getAlignmentBlocks(final Cigar cigar, final int alignmentStart, final String cigarTypeName) { if (cigar == null) return Collections.emptyList(); final List alignmentBlocks = new ArrayList(); int readBase = 1; int refBase = alignmentStart; for (final CigarElement e : cigar.getCigarElements()) { switch (e.getOperator()) { case H: break; // ignore hard clips case P: break; // ignore pads case S: readBase += e.getLength(); break; // soft clip read bases case N: refBase += e.getLength(); break; // reference skip case D: refBase += e.getLength(); break; case I: readBase += e.getLength(); break; case M: case EQ: case X: final int length = e.getLength(); alignmentBlocks.add(new AlignmentBlock(readBase, refBase, length)); readBase += length; refBase += length; break; default: throw new IllegalStateException("Case statement didn't deal with " + cigarTypeName + " op: " + e.getOperator()); } } return Collections.unmodifiableList(alignmentBlocks); } /** * @param alignmentStart The start (1-based) of the alignment * @param cigar The cigar containing the alignment information * @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. *

* Invalid to call on an unmapped read.
     * Invalid to call with cigar = null
     */
    public static int getUnclippedStart(final int alignmentStart, final Cigar cigar) {
        // Total length of the run of clipping elements at the front of the cigar.
        int leadingClip = 0;
        for (final CigarElement element : cigar.getCigarElements()) {
            final CigarOperator operator = element.getOperator();
            final boolean isClip = operator == CigarOperator.SOFT_CLIP || operator == CigarOperator.HARD_CLIP;
            if (!isClip) {
                break;
            }
            leadingClip += element.getLength();
        }
        return alignmentStart - leadingClip;
    }

    /**
     * @param alignmentEnd The end (1-based) of the alignment
     * @param cigar The cigar containing the alignment information
     * @return the alignment end (1-based, inclusive) adjusted for clipped bases. For example if the read
     * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped)
     * then this method will return 107.
     *

* Invalid to call on an unmapped read.
     * Invalid to call with cigar = null
     */
    public static int getUnclippedEnd(final int alignmentEnd, final Cigar cigar) {
        int unClippedEnd = alignmentEnd;
        final List<CigarElement> cigs = cigar.getCigarElements();
        // Walk backwards from the last element, adding on the trailing run of clipping elements.
        for (int i = cigs.size() - 1; i >= 0; --i) {
            final CigarElement cig = cigs.get(i);
            final CigarOperator op = cig.getOperator();
            if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) {
                unClippedEnd += cig.getLength();
            } else {
                break;
            }
        }
        return unClippedEnd;
    }

    /**
     * Returns the Mate Cigar String as stored in the attribute 'MC'.
     *
     * @param rec the SAM record
     * @return Mate Cigar String, or null if there is none.
     */
    public static String getMateCigarString(final SAMRecord rec) {
        return rec.getStringAttribute(SAMTag.MC.name());
    }

    /**
     * Returns the Mate Cigar or null if there is none.
     *
     * @param rec the SAM record
     * @param withValidation true if we are to validate the mate cigar before returning, false otherwise.
     * @return Cigar object for the read's mate, or null if there is none.
     */
    public static Cigar getMateCigar(final SAMRecord rec, final boolean withValidation) {
        final String mateCigarString = getMateCigarString(rec);
        if (mateCigarString == null) {
            return null;
        }

        final Cigar mateCigar = TextCigarCodec.decode(mateCigarString);
        // Validation is skipped entirely when the record's stringency is SILENT.
        if (withValidation && rec.getValidationStringency() != ValidationStringency.SILENT) {
            final List<AlignmentBlock> alignmentBlocks = getAlignmentBlocks(mateCigar, rec.getMateAlignmentStart(), "mate cigar");
            SAMUtils.processValidationErrors(
                    validateCigar(rec, mateCigar, rec.getMateReferenceIndex(), alignmentBlocks, -1, "Mate CIGAR"),
                    -1L,
                    rec.getValidationStringency());
        }
        return mateCigar;
    }

    /**
     * Returns the Mate Cigar or null if there is none. No validation is done on the returned cigar.
     *
     * @param rec the SAM record
     * @return Cigar object for the read's mate, or null if there is none.
*/
    public static Cigar getMateCigar(final SAMRecord rec) {
        final boolean withValidation = false;
        return getMateCigar(rec, withValidation);
    }

    /**
     * @param rec the SAM record
     * @return number of cigar elements (number + operator) in the mate cigar string, or 0 if
     * the record has no mate cigar.
     */
    public static int getMateCigarLength(final SAMRecord rec) {
        final Cigar mateCigar = getMateCigar(rec);
        if (mateCigar == null) {
            return 0;
        }
        return mateCigar.numCigarElements();
    }

    /**
     * This method uses the MateCigar value as determined from the attribute MC. It must be non-null.
     *
     * @param rec the SAM record
     * @return 1-based inclusive rightmost position of the clipped mate sequence.
     * @throws RuntimeException if the mate is flagged as unmapped.
     * @throws SAMException if the record has no mate cigar (MC attribute).
     */
    public static int getMateAlignmentEnd(final SAMRecord rec) {
        final boolean mateUnmapped = rec.getMateUnmappedFlag();
        if (mateUnmapped) {
            throw new RuntimeException("getMateAlignmentEnd called on an unmapped mate.");
        }

        final Cigar mateCigar = SAMUtils.getMateCigar(rec);
        if (null == mateCigar) {
            throw new SAMException("Mate CIGAR (Tag MC) not found.");
        }

        final int mateStart = rec.getMateAlignmentStart();
        return CoordMath.getEnd(mateStart, mateCigar.getReferenceLength());
    }

    /**
     * @param rec the SAM record
     * @return the mate alignment start (1-based, inclusive) adjusted for clipped bases. For example if the mate
     * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped)
     * then this method will return 96.
     *

* Invalid to call on an unmapped read. */ public static int getMateUnclippedStart(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) throw new RuntimeException("getMateUnclippedStart called on an unmapped mate."); final Cigar mateCigar = getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found."); } return SAMUtils.getUnclippedStart(rec.getMateAlignmentStart(), mateCigar); } /** * @param rec the SAM record * @return the mate alignment end (1-based, inclusive) adjusted for clipped bases. For example if the mate * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) * then this method will return 107. *

* Invalid to call on an unmapped read.
     */
    public static int getMateUnclippedEnd(final SAMRecord rec) {
        if (rec.getMateUnmappedFlag()) {
            throw new RuntimeException("getMateUnclippedEnd called on an unmapped mate.");
        }
        final Cigar mateCigar = SAMUtils.getMateCigar(rec);
        if (mateCigar == null) {
            throw new SAMException("Mate CIGAR (Tag MC) not found.");
        }
        return SAMUtils.getUnclippedEnd(getMateAlignmentEnd(rec), mateCigar);
    }

    /**
     * Returns blocks of the mate sequence that have been aligned directly to the
     * reference sequence. Note that clipped portions of the mate and inserted and
     * deleted bases (vs. the reference) are not represented in the alignment blocks.
     *
     * @param rec the SAM record
     * @return the alignment blocks computed from the mate cigar (MC attribute) and the mate alignment start
     */
    public static List<AlignmentBlock> getMateAlignmentBlocks(final SAMRecord rec) {
        return getAlignmentBlocks(getMateCigar(rec), rec.getMateAlignmentStart(), "mate cigar");
    }

    /**
     * Run all validations of a CIGAR (the read's own or its mate's).  These include validation that the CIGAR makes
     * sense independent of placement, plus validation that CIGAR + placement yields all bases with M operator within
     * the range of the reference.
     * <p>
     * The placement checks are skipped when {@code referenceIndex} is {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX}.
     * A record without a header yields a MISSING_HEADER error, and a reference index that the header's sequence
     * dictionary does not contain yields an INVALID_REFERENCE_INDEX error.
     *
     * @param rec             the SAM record
     * @param cigar           The cigar containing the alignment information
     * @param referenceIndex  The reference index
     * @param alignmentBlocks The alignment blocks (parsed from the cigar)
     * @param recordNumber    For error reporting.  -1 if not known.
     * @param cigarTypeName   For error reporting.  "Read CIGAR" or "Mate Cigar"
     * @return List of errors, or null if no errors.
     */
    public static List<SAMValidationError> validateCigar(final SAMRecord rec,
                                                         final Cigar cigar,
                                                         final Integer referenceIndex,
                                                         final List<AlignmentBlock> alignmentBlocks,
                                                         final long recordNumber,
                                                         final String cigarTypeName) {
        // Placement-independent checks first; the result is null when the cigar itself is fine.
        List<SAMValidationError> ret = cigar.isValid(rec.getReadName(), recordNumber);
        if (referenceIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
            final SAMFileHeader samHeader = rec.getHeader();
            if (null == samHeader) {
                if (ret == null) ret = new ArrayList<SAMValidationError>();
                ret.add(new SAMValidationError(SAMValidationError.Type.MISSING_HEADER,
                        cigarTypeName + " A non-null SAMHeader is required to validate cigar elements for: ", rec.getReadName(), recordNumber));
            } else {
                final SAMSequenceRecord sequence = samHeader.getSequence(referenceIndex);
                if (sequence == null) {
                    // The index is not in the sequence dictionary: report it instead of failing with a NullPointerException.
                    if (ret == null) ret = new ArrayList<SAMValidationError>();
                    ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_REFERENCE_INDEX,
                            cigarTypeName + " reference index " + referenceIndex + " not found in sequence dictionary", rec.getReadName(), recordNumber));
                } else {
                    final int referenceSequenceLength = sequence.getSequenceLength();
                    for (final AlignmentBlock alignmentBlock : alignmentBlocks) {
                        // Last reference base covered by the block (1-based, inclusive).
                        final int blockReferenceEnd = alignmentBlock.getReferenceStart() + alignmentBlock.getLength() - 1;
                        if (blockReferenceEnd > referenceSequenceLength) {
                            if (ret == null) ret = new ArrayList<SAMValidationError>();
                            ret.add(new SAMValidationError(SAMValidationError.Type.CIGAR_MAPS_OFF_REFERENCE,
                                    cigarTypeName + " M operator maps off end of reference", rec.getReadName(), recordNumber));
                            // One error per cigar is enough.
                            break;
                        }
                    }
                }
            }
        }
        return ret;
    }

    /**
     * Run all validations of the mate's CIGAR.  These include validation that the CIGAR makes sense independent of
     * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference.
     *
     * @param rec          the SAM record
     * @param recordNumber For error reporting.  -1 if not known.
     * @return List of errors, or null if no errors.
*/ public static List validateMateCigar(final SAMRecord rec, final long recordNumber) { List ret = null; if (rec.getValidationStringency() != ValidationStringency.SILENT) { if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { // The mateCigar will be defined if the mate is mapped if (getMateCigarString(rec) != null) { ret = SAMUtils.validateCigar(rec, getMateCigar(rec), rec.getMateReferenceIndex(), getMateAlignmentBlocks(rec), recordNumber, "Mate CIGAR"); } } else { if (getMateCigarString(rec) != null) { ret = new ArrayList(); if (rec.getMateUnmappedFlag()) { // If the Mate is unmapped, and the Mate Cigar String (MC Attribute) exists, that is a validation error. ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, "Mate CIGAR String (MC Attribute) present for a read whose mate is unmapped", rec.getReadName(), recordNumber)); } else { // If the Mate is not paired, and the Mate Cigar String (MC Attribute) exists, that is a validation error. ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, "Mate CIGAR String (MC Attribute) present for a read that is not paired", rec.getReadName(), recordNumber)); } } } } return ret; } /** * Checks to see if it is valid for this record to have a mate CIGAR (MC) and then if there is a mate CIGAR available. This is done by * checking that this record is paired, its mate is mapped, and that it returns a non-null mate CIGAR. * * @param rec * @return */ public static boolean hasMateCigar(SAMRecord rec) { // NB: use getMateCigarString rather than getMateCigar to avoid validation. return (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && null != SAMUtils.getMateCigarString(rec)); } /** * Returns a string that is the the read group ID and read name separated by a colon. This is meant to cannonically * identify a given record within a set of records. 
* * @param record * @return */ public static String getCanonicalRecordName(final SAMRecord record) { String name = record.getStringAttribute(ReservedTagConstants.READ_GROUP_ID); if (null == name) name = record.getReadName(); else name = name + ":" + record.getReadName(); return name; } /** * Returns the number of bases that need to be clipped due to overlapping pairs. If the record is not paired, * or the given record's start position is greater than its mate's start position, zero is automatically returned. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * * @param rec * @return the number of bases at the end of the read that need to be clipped such that there would be no overlapping bases with its mate. * Read bases include only those from insertion, match, or mismatch Cigar operators. */ public static int getNumOverlappingAlignedBasesToClip(final SAMRecord rec) { // NB: ignores how to handle supplemental records when present for both ends by just using the mate information in the record. if (!rec.getReadPairedFlag() || rec.getReadUnmappedFlag() || rec.getMateUnmappedFlag()) return 0; // Only clip records that are left-most in genomic order and overlapping. if (rec.getMateAlignmentStart() < rec.getAlignmentStart()) return 0; // right-most, so ignore. // Find the number of read bases after the given mate's alignment start. int numBasesToClip = 0; final int refStartPos = rec.getMateAlignmentStart(); // relative reference position after which we should start clipping final Cigar cigar = rec.getCigar(); int refPos = rec.getAlignmentStart(); for (final CigarElement el : cigar.getCigarElements()) { final CigarOperator operator = el.getOperator(); final int refBasesLength = operator.consumesReferenceBases() ? 
el.getLength() : 0; if (refStartPos <= refPos + refBasesLength - 1) { // add to clipped bases if (operator == CigarOperator.MATCH_OR_MISMATCH) { // M if (refStartPos < refPos) numBasesToClip += refBasesLength; // use all of the bases else numBasesToClip += (refPos + refBasesLength) - refStartPos; // since the mate's alignment start can be in the middle of a cigar element } else if (operator == CigarOperator.SOFT_CLIP || operator == CigarOperator.HARD_CLIP || operator == CigarOperator.PADDING || operator == CigarOperator.SKIPPED_REGION) { // ignore } else { // ID numBasesToClip += operator.consumesReadBases() ? el.getLength() : 0; // clip all the bases in the read from this operator } } refPos += refBasesLength; } if (numBasesToClip < 0) return 0; // left-most but not overlapping return numBasesToClip; } /** * Returns a (possibly new) record that has been clipped if isa mapped paired and has overlapping bases with its mate. * See {@link #getNumOverlappingAlignedBasesToClip(SAMRecord)} for how the number of overlapping bases is computed. * NB: this does not properly consider a cigar like: 100M20S10H. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * * @param record the record from which to clip bases. * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. * @return */ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, final boolean noSideEffects) { return clipOverlappingAlignedBases(record, getNumOverlappingAlignedBasesToClip(record), noSideEffects); } /** * Returns a (possibly new) SAMRecord with the given number of bases soft-clipped at the end of the read if is a mapped * paired and has overlapping bases with its mate. * NB: this does not properly consider a cigar like: 100M20S10H. * NB: This method assumes that the record's mate is not contained within the given record's alignment. 
* * @param record the record from which to clip bases. * @param numOverlappingBasesToClip the number of bases to clip at the end of the read. * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. * @return */ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, final int numOverlappingBasesToClip, final boolean noSideEffects) { // NB: ignores how to handle supplemental records when present for both ends by just using the mate information in the record. if (numOverlappingBasesToClip <= 0 || record.getReadUnmappedFlag() || record.getMateUnmappedFlag()) return record; try { final SAMRecord rec = noSideEffects ? ((SAMRecord)record.clone()) : record; // watch out for when the second read overlaps all of the first read if (rec.getMateAlignmentStart() <= rec.getAlignmentStart()) { // make it unmapped rec.setReadUnmappedFlag(true); return rec; } // 1-based index of first base in read to clip. int clipFrom = rec.getReadLength() - numOverlappingBasesToClip + 1; // we have to check if the last cigar element is soft-clipping, so we can subtract that from clipFrom final CigarElement cigarElement = rec.getCigar().getCigarElement(rec.getCigarLength()-1); if (CigarOperator.SOFT_CLIP == cigarElement.getOperator()) clipFrom -= cigarElement.getLength(); // FIXME: does not properly consider a cigar like: 100M20S10H // clip it, clip it good rec.setCigar(new Cigar(CigarUtil.softClipEndOfRead(clipFrom, rec.getCigar().getCigarElements()))); return rec; } catch (final CloneNotSupportedException e) { throw new SAMException(e.getMessage(), e); } } /** * Checks if a long attribute value is within the allowed range of a 32-bit unsigned integer. 
* * @param value a long value to check * @return true if value is >= 0 and <= {@link BinaryCodec#MAX_UINT}, and false otherwise */ public static boolean isValidUnsignedIntegerAttribute(long value) { return value >= 0 && value <= BinaryCodec.MAX_UINT; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SAMValidationError.java000066400000000000000000000232561263034757100241710ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import java.io.Serializable; /** * Class that encapsulates a validation error message as well as a type code so that * errors can be aggregated by type. 
* * @author Doug Voet */ public class SAMValidationError implements Serializable { public static final long serialVersionUID = 1L; public enum Severity { WARNING, ERROR } public enum Type { /** quality encodings out of range; appear to be Solexa or Illumina when Phred expected */ INVALID_QUALITY_FORMAT(Severity.WARNING), /** proper pair flag set for unpaired read */ INVALID_FLAG_PROPER_PAIR, /** mate unmapped flag set when mate is mapped or not set when mate is not mapped */ INVALID_FLAG_MATE_UNMAPPED, /** mate unmapped flag does not match read unmapped flag of mate */ MISMATCH_FLAG_MATE_UNMAPPED, /** mate negative strand flag set for unpaired read */ INVALID_FLAG_MATE_NEG_STRAND, /** mate negative strand flag does not match read negative strand flag of mate */ MISMATCH_FLAG_MATE_NEG_STRAND, /** first of pair flag set for unpaired read */ INVALID_FLAG_FIRST_OF_PAIR, /** second of pair flag set for unpaired read */ INVALID_FLAG_SECOND_OF_PAIR, /** pair flag set but not marked as first or second of pair */ PAIRED_READ_NOT_MARKED_AS_FIRST_OR_SECOND(Severity.WARNING), /** not primary alignment flag set for unmapped read */ INVALID_FLAG_NOT_PRIM_ALIGNMENT, /** supplementary alignment flag set for unmapped read */ INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT, /** mapped read flag not set for mapped read */ INVALID_FLAG_READ_UNMAPPED, /** * inferred insert size is out of range * @see SAMRecord#MAX_INSERT_SIZE */ INVALID_INSERT_SIZE, /** mapping quality set for unmapped read or is >= 256 */ INVALID_MAPPING_QUALITY, /** CIGAR string is empty for mapped read or not empty for unmapped read, or other CIGAR badness.
*/ INVALID_CIGAR, /** CIGAR string contains I followed by D, or vice versa */ ADJACENT_INDEL_IN_CIGAR(Severity.WARNING), /** mate reference index (MRNM) set for unpaired read */ INVALID_MATE_REF_INDEX, /** mate reference index (MRNM) does not match reference index of mate */ MISMATCH_MATE_REF_INDEX, /** reference index not found in sequence dictionary */ INVALID_REFERENCE_INDEX, /** alignment start cannot be correct */ INVALID_ALIGNMENT_START, /** mate alignment does not match alignment start of mate */ MISMATCH_MATE_ALIGNMENT_START, /** the record's mate fields do not match the corresponding fields of the mate */ MATE_FIELD_MISMATCH, /** the NM tag (nucleotide differences) is incorrect */ INVALID_TAG_NM, /** the NM tag (nucleotide differences) is missing */ MISSING_TAG_NM(Severity.WARNING), /** the sam/bam file is missing the header */ MISSING_HEADER, /** there is no sequence dictionary in the header */ MISSING_SEQUENCE_DICTIONARY, /** the header is missing read group information */ MISSING_READ_GROUP, /** the record is out of order */ RECORD_OUT_OF_ORDER, /** A read group ID on a SAMRecord is not found in the header */ READ_GROUP_NOT_FOUND, /** A SAMRecord is found with no read group id */ RECORD_MISSING_READ_GROUP(Severity.WARNING), /** Indexing bin set on SAMRecord does not agree with computed value. */ INVALID_INDEXING_BIN, MISSING_VERSION_NUMBER, INVALID_VERSION_NUMBER, TRUNCATED_FILE, MISMATCH_READ_LENGTH_AND_QUALS_LENGTH, EMPTY_READ, /** * Bases corresponding to M operator in CIGAR are beyond the end of the reference. */ CIGAR_MAPS_OFF_REFERENCE, /** Length of E2 (secondary base calls) and U2 (secondary base quals) tag values should match read length */ MISMATCH_READ_LENGTH_AND_E2_LENGTH, MISMATCH_READ_LENGTH_AND_U2_LENGTH, /** Secondary base calls should not be the same as primary, unless one or the other is N */ E2_BASE_EQUALS_PRIMARY_BASE(Severity.WARNING), /** BAM appears to be healthy, but is an older file so doesn't have terminator block.
*/ BAM_FILE_MISSING_TERMINATOR_BLOCK(Severity.WARNING), /** Header record is not one of the standard types */ UNRECOGNIZED_HEADER_TYPE, /** Header tag does not have colon */ POORLY_FORMATTED_HEADER_TAG, /** Header tag appears more than once in header line with different value */ HEADER_TAG_MULTIPLY_DEFINED, HEADER_RECORD_MISSING_REQUIRED_TAG, /** Date string is not ISO-8601 */ INVALID_DATE_STRING(Severity.WARNING), /** Unsigned integer tag value is deprecated in BAM. */ TAG_VALUE_TOO_LARGE, /** Invalid virtualFilePointer in index */ INVALID_INDEX_FILE_POINTER, /** PI tag value is not numeric. */ INVALID_PREDICTED_MEDIAN_INSERT_SIZE, /** Same read group id appears more than once */ DUPLICATE_READ_GROUP_ID, /** The read group is missing its PL (platform) field */ MISSING_PLATFORM_VALUE, /** The read group has an invalid value set for its PL field */ INVALID_PLATFORM_VALUE, /** Same program group id appears more than once */ DUPLICATE_PROGRAM_GROUP_ID, /** Read is marked as paired, but its pair was not found. */ MATE_NOT_FOUND, /** Both mates are marked as first of pair, or both mates are marked as second of pair. */ MATES_ARE_SAME_END, /** The Cigar String in the MC Tag does not match the Cigar String for the mate of this read. */ MISMATCH_MATE_CIGAR_STRING, /** There is a Cigar String (stored in the MC Tag) for a read whose mate is NOT mapped. */ MATE_CIGAR_STRING_INVALID_PRESENCE; public final Severity severity; /** Constructs a type whose severity is ERROR. */ private Type() { this.severity = Severity.ERROR; } private Type(final Severity severity) { this.severity = severity; } /** * @return Format for writing to histogram summary output. */ public String getHistogramString() { return this.severity.name() + ":" + this.name(); } } private final Type type; private final String message; private final String readName; private long recordNumber = -1; private String source; /** * Construct a SAMValidationError with unknown record number.
* @param type * @param message * @param readName May be null if readName is not known. */ public SAMValidationError(final Type type, final String message, final String readName) { this.type = type; this.message = message; this.readName = readName; } /** * Construct a SAMValidationError with possibly-known record number. * @param type * @param message * @param readName May be null if readName is not known. * @param recordNumber Position of the record in the SAM file it has been read from. -1 if not known. */ public SAMValidationError(final Type type, final String message, final String readName, final long recordNumber) { this(type, message, readName); this.recordNumber = recordNumber; } /** * Formats the error as the severity, then (each only when available) the source file, the record number * (only when positive) and the read name, followed by the message. */ public String toString() { final StringBuilder builder = new StringBuilder(); builder.append(type.severity.toString()); builder.append(": "); if (source != null) { builder.append("File ").append(source.toString()).append(", "); } if (recordNumber > 0) { builder.append("Record ").append(recordNumber).append(", "); } if (readName != null) { builder.append("Read name ").append(readName).append(", "); } return builder.append(message).toString(); } public Type getType() { return type; } public String getMessage() { return message; } /** may be null */ public String getReadName() { return readName; } /** 1-based. -1 if not known.
*/ public long getRecordNumber() { return recordNumber; } public void setRecordNumber(final long recordNumber) { this.recordNumber = recordNumber; } public String getSource() { return source; } public void setSource(final String source) { this.source = source; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SQTagUtil.java000066400000000000000000000142711263034757100223360ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; /** * Utility methods for encoding and decoding the SQ tag value of SAMRecord. * * @author alecw@broadinstitute.org */ public class SQTagUtil { /** * The ordinals of these are stored in the high-order 2 bits of each byte of the SQ tag. * Note that these have the convenient property that the binary complement of each ordinal, masked to * the two low-order bits, is the complementary base. 
*/ public enum SQBase { SQ_A('A'), SQ_C('C'), SQ_G('G'), SQ_T('T'); private final Character base; SQBase(final Character base) { this.base = base; } public Character getBase() { return base; } } /** * For complementing SQBase ordinals. */ private static final int COMPLEMENT_MASK = 3; private static final int QUALITY_MASK = 0x3f; public static final byte MAX_QUALITY = QUALITY_MASK; private static final int BASE_INDEX_SHIFT = 6; /** * Convert a pair of likelihoods into a value suitable for passing to baseAndProbDiffToSqValue. * @param secondBestLikelihood Probability of the 2nd-best base call. 1 > secondBestLikelihood > thirdBestLikelihood. * @param thirdBestLikelihood Probability of the 3rd-best base call. thirdBestLikelihood > 0. * @return ratio of input probabilities for storing in SQ tag. */ public static byte sqScaledProbabilityRatio(final double secondBestLikelihood, final double thirdBestLikelihood) { if (secondBestLikelihood >= 1.0 || thirdBestLikelihood <= 0 || thirdBestLikelihood > secondBestLikelihood) { throw new IllegalArgumentException("Likelihoods out of range. second best: " + secondBestLikelihood + "; third best: " + thirdBestLikelihood); } // Cap value at QUALITY_MASK return (byte)(Math.min(Math.round(-10.0 * Math.log10(thirdBestLikelihood/secondBestLikelihood)), QUALITY_MASK)); } /** * Compress a base and a log probabiliy difference (-10log10(p3/p2)) into * a single byte so that it can be output in a SAMRecord's SQ field. * * @param base the 2nd-best base. * @param probRatio the log probability difference between the secondary and tertiary bases (-10log10(p3/p2)), * rounded to an integer and capped so it fits in 6 bits. * @return a byte containing the index and the log probability difference. 
*/ public static byte baseAndProbDiffToSqValue(final SQBase base, final byte probRatio) { return baseAndProbDiffToSqValue(base.ordinal(), probRatio); } /** * Compress a base and a log probabiliy difference (-10log10(p3/p2)) into * a single byte so that it can be output in a SAMRecord's SQ field. * * @param base the 2nd-best base (A=0, C=1, G=2, T=3). * @param probRatio the log probability difference between the secondary and tertiary bases (-10log10(p3/p2)), * rounded to an integer and capped so it fits in 6 bits. If this value is > MAX_QUALITY, it is truncated to that. * @return a byte containing the index and the log probability difference. */ public static byte baseAndProbDiffToSqValue(final int base, final byte probRatio) { return (byte)((base << BASE_INDEX_SHIFT) | Math.min(probRatio, QUALITY_MASK)); } /** * Retrieve SQ-scaled probability ratio from SQ value. * @param sqValue * @return the log probability difference between the secondary and tertiary bases (-10log10(p3/p2)). */ public static byte sqValueToProbRatio(final byte sqValue) { return (byte)(sqValue & QUALITY_MASK); } /** * Retrieve the 2nd-best base call from SQ value. * @param sqValue * @return 2nd-best base call. */ public static SQBase sqValueToBase(final byte sqValue) { return SQBase.values()[sqValueToBaseOrdinal(sqValue)]; } /** * Retrieve the 2nd-best base call from SQ value. * @param sqValue * @return Ordinal of 2nd-best base call. */ public static int sqValueToBaseOrdinal(final byte sqValue) { return (sqValue & 0xff) >>> BASE_INDEX_SHIFT; } /** * Reverses and complements the sqValues in place. * @param sqArray Array of SQ-values, with 2nd-best base in high-order 2 bits, and probability diff * in low-order 6 bits. 
*/ public static void reverseComplementSqArray(final byte[] sqArray) { final int lastIndex = sqArray.length - 1; int i, j; for (i=0, j=lastIndex; i getIterator() { return getIterator(getFilePointerSpanningReads()); } @Override public CloseableIterator getIterator(SAMFileSpan chunks) { if (run == null) { throw new RuntimeException("Cannot create iterator - SRA run is uninitialized"); } if (virtualHeader == null) { throw new RuntimeException("Cannot create iterator - SAM file header is uninitialized"); } List chunkList = ((BAMFileSpan) chunks).getChunks(); final SRAIterator newIterator = new SRAIterator(acc, run, virtualHeader, cachedReferences, recordRangeInfo, chunkList); if (validationStringency != null) { newIterator.setValidationStringency(validationStringency); } return newIterator; } @Override public SAMFileSpan getFilePointerSpanningReads() { if (recordRangeInfo.getTotalRecordRangeLength() <= 0) { throw new RuntimeException("Cannot create file span - SRA file is empty"); } return new BAMFileSpan(new Chunk(0, recordRangeInfo.getTotalRecordRangeLength())); } @Override public CloseableIterator query(QueryInterval[] intervals, boolean contained) { BAMFileSpan span = new BAMFileSpan(); BrowseableBAMIndex index = getBrowseableIndex(); for (QueryInterval interval : intervals) { BAMFileSpan intervalSpan; if (!contained) { intervalSpan = index.getSpanOverlapping(interval.referenceIndex, interval.start, interval.end); } else { intervalSpan = getSpanContained(interval.referenceIndex, interval.start, interval.end); } span.add(intervalSpan); } return getIterator(span); } @Override public CloseableIterator queryAlignmentStart(String sequence, int start) { int sequenceIndex = virtualHeader.getSequenceIndex(sequence); if (sequenceIndex == -1) { throw new IllegalArgumentException("Unknown sequence '" + sequence + "' was passed to SRAFileReader"); } return getIterator(getSpanContained(sequenceIndex, start, -1)); } @Override public CloseableIterator queryUnmapped() { if 
(recordRangeInfo.getTotalRecordRangeLength() <= 0) { throw new RuntimeException("Cannot create file span - SRA file is empty"); } SAMFileSpan span = new BAMFileSpan(new Chunk(recordRangeInfo.getTotalReferencesLength(), recordRangeInfo.getTotalRecordRangeLength())); return getIterator(span); } @Override public void close() { } @Override public ValidationStringency getValidationStringency() { return validationStringency; } /** INDEXING */ /** * Returns true if the supported index is browseable, meaning the bins in it can be traversed * and chunk data inspected and retrieved. * * @return True if the index supports the BrowseableBAMIndex interface. False otherwise. */ @Override public boolean hasBrowseableIndex() { return true; } /** * Gets an index tagged with the BrowseableBAMIndex interface. Throws an exception if no such * index is available. * * @return An index with a browseable interface, if possible. * @throws SAMException if no such index is available. */ @Override public BrowseableBAMIndex getBrowseableIndex() { return index; } /** * Iterate through the given chunks in the file. * * @param chunks List of chunks for which to retrieve data. * @return An iterator over the given chunks. 
*/ @Override public SAMRecordIterator iterator(final SAMFileSpan chunks) { CloseableIterator it = getIterator(chunks); if (it == null) { return null; } return (SAMRecordIterator) it; } /** ReaderImplementation */ @Override void enableFileSource(final SamReader reader, final boolean enabled) { log.info("enableFileSource is not supported"); } @Override void enableIndexCaching(final boolean enabled) { log.info("enableIndexCaching is not supported"); } @Override void enableIndexMemoryMapping(final boolean enabled) { log.info("enableIndexMemoryMapping is not supported"); } @Override void enableCrcChecking(final boolean enabled) { log.info("enableCrcChecking is not supported"); } @Override void setSAMRecordFactory(final SAMRecordFactory factory) { log.info("setSAMRecordFactory is not supported"); } @Override void setValidationStringency(final ValidationStringency validationStringency) { this.validationStringency = validationStringency; } protected SRAIterator.RecordRangeInfo getRecordsRangeInfo() { return recordRangeInfo; } private SAMFileHeader loadSamHeader() throws ErrorMsg { if (run == null) { throw new RuntimeException("Cannot load SAMFileHeader - SRA run is uninitialized"); } String runName = run.getName(); SAMFileHeader header = new SAMFileHeader(); header.setSortOrder(SAMFileHeader.SortOrder.coordinate); ReadGroupIterator itRg = run.getReadGroups(); while (itRg.nextReadGroup()) { String rgName = itRg.getName(); if (rgName.isEmpty()) rgName = runName; SAMReadGroupRecord rg = new SAMReadGroupRecord(rgName); rg.setSample(runName); header.addReadGroup(rg); } ReferenceIterator itRef = run.getReferences(); while (itRef.nextReference()) { header.addSequence(new SAMSequenceRecord(itRef.getCanonicalName(), (int) itRef.getLength())); } return header; } private BAMFileSpan getSpanContained(int sequenceIndex, long start, long end) { if (recordRangeInfo.getTotalRecordRangeLength() <= 0) { throw new RuntimeException("Cannot create file span - SRA file is empty"); } long 
sequenceOffset = recordRangeInfo.getReferenceOffsets().get(sequenceIndex); long sequenceLength = recordRangeInfo.getReferenceLengthsAligned().get(sequenceIndex); if (end == -1) { end = sequenceLength; } if (start > sequenceLength) { throw new IllegalArgumentException("Sequence start position is larger than its length"); } if (end > sequenceLength) { throw new IllegalArgumentException("Sequence end position is larger than its length"); } return new BAMFileSpan(new Chunk(sequenceOffset + start, sequenceOffset + end)); } } htsjdk-2.0.1/src/java/htsjdk/samtools/SRAIndex.java000066400000000000000000000242161263034757100221360ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * =========================================================================== * */ package htsjdk.samtools; import java.util.ArrayList; import java.util.BitSet; import java.util.HashSet; import java.util.List; import java.util.Set; /** * Emulates BAM index so that we can request chunks of records from SRAFileReader * * Here is how it works: * SRA allows reading of alignments by Reference position fast, so we divide our "file" range for alignments as * a length of all references. Reading unaligned reads is then fast if we use read positions for lookup and (internally) * filter out aligned fragments. * * Total SRA "file" range is calculated as sum of all reference lengths plus number of reads (both aligned and unaligned) * in SRA archive. * * Now, we can use Chunks to lookup for aligned and unaligned fragments. * * We emulate BAM index bins by mapping SRA reference positions to bin numbers. * And then we map from bin number to list of chunks, which represent SRA "file" positions (which are simply reference * positions). * * We only emulate last level of BAM index bins (and they refer to a portion of reference SRA_BIN_SIZE bases long). * For all other bins RuntimeException will be returned (but since nobody else creates bins, except SRAIndex class * that is fine). * * But since the last level of bins was not meant to refer to fragments that only partially overlap bin reference * positions, we also return chunk that goes 5000 bases left before beginning of the bin to assure fragments that * start before the bin positions but still overlap with it can be retrieved by SRA reader. * Later we will add support to NGS API to get a maximum number of bases that we need to go left to retrieve such fragments. * * Created by andrii.nikitiuk on 9/4/15. 
*/ public class SRAIndex implements BrowseableBAMIndex { /** * Number of reference bases bins in last level can represent */ public static final int SRA_BIN_SIZE = 16 * 1024; /** * Chunks of that size will be created when using SRA index */ public static final int SRA_CHUNK_SIZE = 50000; /** * First bin number in last level */ private static final int SRA_BIN_INDEX_OFFSET = GenomicIndexUtil.LEVEL_STARTS[GenomicIndexUtil.LEVEL_STARTS.length - 1]; /** * How many bases should we go left on the reference to find all fragments that start before requested interval * but overlap with it */ private static final int MAX_FRAGMENT_OVERLAP = 5000; private SAMFileHeader header; private SRAIterator.RecordRangeInfo recordRangeInfo; /** * @param header sam header * @param recordRangeInfo info about record ranges withing SRA archive */ public SRAIndex(SAMFileHeader header, SRAIterator.RecordRangeInfo recordRangeInfo) { this.header = header; this.recordRangeInfo = recordRangeInfo; } /** * Gets the size (number of bins in) a given level of a BAM index. * @param levelNumber Level for which to inspect the size. * @return Size of the given level. */ @Override public int getLevelSize(int levelNumber) { if (levelNumber == GenomicIndexUtil.LEVEL_STARTS.length - 1) return GenomicIndexUtil.MAX_BINS - GenomicIndexUtil.LEVEL_STARTS[levelNumber]-1; else return GenomicIndexUtil.LEVEL_STARTS[levelNumber+1] - GenomicIndexUtil.LEVEL_STARTS[levelNumber]; } /** * SRA only operates on bins from last level * @param bin The bin for which to determine the level. * @return bin level */ @Override public int getLevelForBin(Bin bin) { if (bin.getBinNumber() < SRA_BIN_INDEX_OFFSET) { throw new RuntimeException("SRA only supports bins from the last level"); } return GenomicIndexUtil.LEVEL_STARTS.length - 1; } /** * Gets the first locus that this bin can index into. * @param bin The bin to test. 
* @return first position that associated with given bin number */ @Override public int getFirstLocusInBin(Bin bin) { if (bin.getBinNumber() < SRA_BIN_INDEX_OFFSET) { throw new RuntimeException("SRA only supports bins from the last level"); } return (bin.getBinNumber() - SRA_BIN_INDEX_OFFSET) * SRA_BIN_SIZE + 1; } /** * Gets the last locus that this bin can index into. * @param bin The bin to test. * @return last position that associated with given bin number */ @Override public int getLastLocusInBin(Bin bin) { if (bin.getBinNumber() < SRA_BIN_INDEX_OFFSET) { throw new RuntimeException("SRA only supports bins from the last level"); } return (bin.getBinNumber() - SRA_BIN_INDEX_OFFSET + 1) * SRA_BIN_SIZE; } /** * Provides a list of bins that contain bases at requested positions * @param referenceIndex sequence of desired SAMRecords * @param startPos 1-based start of the desired interval, inclusive * @param endPos 1-based end of the desired interval, inclusive * @return a list of bins that contain relevant data */ @Override public BinList getBinsOverlapping(int referenceIndex, int startPos, int endPos) { long refLength = recordRangeInfo.getReferenceLengthsAligned().get(referenceIndex); // convert to chunk address space within reference long refStartPos = startPos - 1; long refEndPos = endPos; if (refEndPos >= refLength) { throw new RuntimeException("refEndPos is larger than reference length"); } int firstBinNumber = (int)refStartPos / SRA_BIN_SIZE; int lastBinNumber = (int)(refEndPos - 1) / SRA_BIN_SIZE; int numberOfBins = ((int)refLength / SRA_BIN_SIZE) + 1; BitSet binBitSet = new BitSet(); binBitSet.set(0, SRA_BIN_INDEX_OFFSET, false); if (firstBinNumber > 0) { binBitSet.set(SRA_BIN_INDEX_OFFSET, SRA_BIN_INDEX_OFFSET + firstBinNumber, false); } binBitSet.set(SRA_BIN_INDEX_OFFSET + firstBinNumber, SRA_BIN_INDEX_OFFSET + lastBinNumber + 1, true); if (lastBinNumber + 1 < numberOfBins) { binBitSet.set(SRA_BIN_INDEX_OFFSET + lastBinNumber + 1, SRA_BIN_INDEX_OFFSET + 
numberOfBins, false); } return new BinList(referenceIndex, binBitSet); } @Override public BAMFileSpan getSpanOverlapping(Bin bin) { return new BAMFileSpan(getBinChunks(bin)); } @Override public BAMFileSpan getSpanOverlapping(int referenceIndex, int startPos, int endPos) { BinList binList = getBinsOverlapping(referenceIndex, startPos, endPos); BAMFileSpan result = new BAMFileSpan(); Set savedChunks = new HashSet(); for (Bin bin : binList) { List chunks = getSpanOverlapping(bin).getChunks(); for (Chunk chunk : chunks) { if (!savedChunks.contains(chunk)) { savedChunks.add(chunk); result.add(chunk); } } } return result; } /** * @return a position where aligned fragments end */ @Override public long getStartOfLastLinearBin() { int numberOfReferences = recordRangeInfo.getReferenceLengthsAligned().size(); long refOffset = recordRangeInfo.getReferenceOffsets().get(numberOfReferences - 1); long lastChunkNumber = recordRangeInfo.getReferenceLengthsAligned().get(numberOfReferences - 1) / SRA_CHUNK_SIZE; return lastChunkNumber * SRA_CHUNK_SIZE + refOffset; } @Override public BAMIndexMetaData getMetaData(int reference) { throw new UnsupportedOperationException("Getting of BAM index metadata for SRA is not implemented"); } @Override public void close() { } /** * @param bin Requested bin * @return chunks that represent all bases of requested bin */ private List getBinChunks(Bin bin) { if (bin.containsChunks()) { return bin.getChunkList(); } if (bin.getBinNumber() < SRA_BIN_INDEX_OFFSET) { throw new RuntimeException("SRA only supports bins from the last level"); } int binNumber = bin.getBinNumber() - SRA_BIN_INDEX_OFFSET; long refOffset = recordRangeInfo.getReferenceOffsets().get(bin.getReferenceSequence()); // move requested position MAX_FRAGMENT_OVERLAP bases behind, so that we take all the reads that overlap requested position int firstChunkCorrection = binNumber == 0 ? 
0 : -MAX_FRAGMENT_OVERLAP; long binGlobalOffset = binNumber * SRA_BIN_SIZE + refOffset; long firstChunkNumber = (binGlobalOffset + firstChunkCorrection) / SRA_CHUNK_SIZE; long lastChunkNumber = (binGlobalOffset + SRA_BIN_SIZE - 1) / SRA_CHUNK_SIZE; List chunks = new ArrayList(); for (long chunkNumber = firstChunkNumber; chunkNumber <= lastChunkNumber; chunkNumber++) { chunks.add(new Chunk(chunkNumber * SRA_CHUNK_SIZE, (chunkNumber + 1) * SRA_CHUNK_SIZE)); } return chunks; } } htsjdk-2.0.1/src/java/htsjdk/samtools/SRAIterator.java000066400000000000000000000227371263034757100226660ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ /** * Created by andrii.nikitiuk on 8/11/15. 
*/ package htsjdk.samtools; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.sra.ReferenceCache; import htsjdk.samtools.sra.SRAAccession; import htsjdk.samtools.sra.SRAAlignmentIterator; import htsjdk.samtools.sra.SRAUnalignmentIterator; import htsjdk.samtools.sra.SRAUtils; import ngs.ErrorMsg; import ngs.ReadCollection; import ngs.Reference; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; /** * SRA iterator which returns SAMRecords for requested list of chunks */ public class SRAIterator implements SAMRecordIterator { private ValidationStringency validationStringency; private SRAAccession accession; private ReadCollection run; private SAMFileHeader header; private ReferenceCache cachedReferences; private RecordRangeInfo recordRangeInfo; private Iterator chunksIterator; private Chunk currentChunk; private SRAAlignmentIterator alignmentIterator; private SRAUnalignmentIterator unalignmentIterator; /** * Describes record ranges info needed for emulating BAM index */ public static class RecordRangeInfo { private List referenceOffsets; private List referenceLengthsAligned; private long totalReferencesLength; private long numberOfReads; // is used for unaligned read space private long totalRecordRangeLength; /** * @param referenceLengthsAligned a list with lengths of each reference * @param numberOfReads total number of reads within SRA archive */ public RecordRangeInfo(List referenceLengthsAligned, long numberOfReads) { this.numberOfReads = numberOfReads; this.referenceLengthsAligned = referenceLengthsAligned; referenceOffsets = new ArrayList(); totalReferencesLength = 0; for (Long refLen : referenceLengthsAligned) { referenceOffsets.add(totalReferencesLength); totalReferencesLength += refLen; } totalRecordRangeLength = totalReferencesLength + this.numberOfReads; } public long getNumberOfReads() { return numberOfReads; } public long 
getTotalReferencesLength() { return totalReferencesLength; } public long getTotalRecordRangeLength() { return totalRecordRangeLength; } public final List getReferenceOffsets() { return Collections.unmodifiableList(referenceOffsets); } public final List getReferenceLengthsAligned() { return Collections.unmodifiableList(referenceLengthsAligned); } } /** * Loads record ranges needed for emulating BAM index * @param run read collection * @return record ranges */ public static RecordRangeInfo getRecordsRangeInfo(ReadCollection run) { try { return new RecordRangeInfo(SRAUtils.getReferencesLengthsAligned(run), SRAUtils.getNumberOfReads(run)); } catch (ErrorMsg e) { throw new RuntimeException(e); } } /** * @param run opened read collection * @param header sam header * @param cachedReferences list of cached references shared among all iterators from a single SRAFileReader * @param recordRangeInfo info about record ranges withing SRA archive * @param chunks used to determine which records the iterator should return */ public SRAIterator(SRAAccession accession, final ReadCollection run, final SAMFileHeader header, ReferenceCache cachedReferences, final RecordRangeInfo recordRangeInfo, final List chunks) { this.accession = accession; this.run = run; this.header = header; this.cachedReferences = cachedReferences; this.recordRangeInfo = recordRangeInfo; chunksIterator = chunks.iterator(); if (chunksIterator.hasNext()) { currentChunk = chunksIterator.next(); } hasNext(); } /** * NGS iterators implement a single method "nextObject" which return true if the operation was successful or * false if there are no more objects available. * That means that there is no way to check "hasNext" without actually moving the iterator forward. * Because of that all the logic of moving iterator forward is actually happens in "hasNext". * * Here is explanation of how it works: * Iterator holds a list of chunks of requested records. Here we have chunksIterator that walks though that list. 
* We walk though that list using chunksIterator. If current chunk can represent aligned fragments then we create * SRAAlignmentIterator iterator, pass the chunk into it and ask if it can find any record. If record was found, * we say that we have next; otherwise we check if the chunk can represent unaligned fragments and then create * SRAUnalignmentIterator if so and do the same steps as with alignemnt iterator. * * If record was not found in both SRAAlignmentIterator and SRAUnalignmentIterator (it is possible that reference * range has no alignments or that reads range has all aligned fragment), we try the next chunk. * * When there are no more chunks and both iterators have no more records we return false. * * @return true if there are more records available */ @Override public boolean hasNext() { while (currentChunk != null) { if (alignmentIterator == null) { if (currentChunk.getChunkStart() < recordRangeInfo.getTotalReferencesLength()) { alignmentIterator = new SRAAlignmentIterator(accession, run, header, cachedReferences, recordRangeInfo, currentChunk); if (validationStringency != null) { alignmentIterator.setValidationStringency(validationStringency); } } } if (alignmentIterator != null && alignmentIterator.hasNext()) { return true; } if (unalignmentIterator == null) { if (currentChunk.getChunkEnd() > recordRangeInfo.getTotalReferencesLength()) { unalignmentIterator = new SRAUnalignmentIterator(accession, run, header, recordRangeInfo, currentChunk); if (validationStringency != null) { unalignmentIterator.setValidationStringency(validationStringency); } } } if (unalignmentIterator != null && unalignmentIterator.hasNext()) { return true; } alignmentIterator = null; unalignmentIterator = null; if (chunksIterator.hasNext()) { currentChunk = chunksIterator.next(); } else { currentChunk = null; } } return false; } /** * Call hasNext to make sure that one of inner iterators points to the next record, the retrieve the record from * one of them. 
* @return lazy SRA record */ @Override public SAMRecord next() { if (!hasNext()) { throw new NoSuchElementException("No more records are available in SRAIterator"); } if (alignmentIterator != null && alignmentIterator.hasNext()) { return alignmentIterator.next(); } return unalignmentIterator.next(); } @Override public void remove() { throw new UnsupportedOperationException("Removal of records not implemented."); } @Override public void close() { } @Override public SAMRecordIterator assertSorted(final SortOrder sortOrder) { throw new UnsupportedOperationException("assertSorted is not implemented."); } public void setValidationStringency(ValidationStringency validationStringency) { this.validationStringency = validationStringency; if (alignmentIterator != null) { alignmentIterator.setValidationStringency(validationStringency); } if (unalignmentIterator != null) { unalignmentIterator.setValidationStringency(validationStringency); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SamFileHeaderMerger.java000066400000000000000000001127641263034757100243220ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.SequenceUtil; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; /** * Merges SAMFileHeaders that have the same sequences into a single merged header * object while providing read group translation for cases where read groups * clash across input headers. */ public class SamFileHeaderMerger { /** * A 4-digit base 36 number is going to be attached to colliding SAMFileHeaderRecords, * To do this we first create an array of values to convert integer remainders into * base 36 values, we use base 36 because we have 10 digits and 26 numbers */ private static final char[] INT_TO_BASE36 = new char[36]; static { int aVal = (int) 'A'; int zeroVal = (int) '0'; for (int i = 0; i < 10; i++) { INT_TO_BASE36[i] = (char) (zeroVal + i); } for (int i = 0; i < 26; i++) { INT_TO_BASE36[i + 10] = (char) (aVal + i); } } //Super Header to construct private final SAMFileHeader mergedHeader; private Collection readers; private final Collection headers; private int recordCounter; //Translation of old group ids to new group ids private final Map> samReadGroupIdTranslation = new IdentityHashMap>(); //the read groups from different files use the same group ids private boolean hasReadGroupCollisions = false; //the program records from different files use the same program record ids private boolean hasProgramGroupCollisions = false; //Translation of old program group 
ids to new program group ids private final Map> samProgramGroupIdTranslation = new IdentityHashMap>(); private boolean hasMergedSequenceDictionary = false; // Translation of old sequence dictionary ids to new dictionary ids // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for // large SAMFileHeaders. It is possible that two input files will have identical headers so that // the regular HashMap would fold them together, but the value stored in each of the two // Map entries will be the same, so it should not hurt anything. private final Map> samSeqDictionaryIdTranslationViaHeader = new IdentityHashMap>(); //HeaderRecordFactory that creates SAMReadGroupRecord instances. private static final HeaderRecordFactory READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory() { public SAMReadGroupRecord createRecord(final String id, final SAMReadGroupRecord srcReadGroupRecord) { return new SAMReadGroupRecord(id, srcReadGroupRecord); } }; //HeaderRecordFactory that creates SAMProgramRecord instances. private static final HeaderRecordFactory PROGRAM_RECORD_FACTORY = new HeaderRecordFactory() { public SAMProgramRecord createRecord(final String id, final SAMProgramRecord srcProgramRecord) { return new SAMProgramRecord(id, srcProgramRecord); } }; //comparator used to sort lists of program group and read group records private static final Comparator RECORD_ID_COMPARATOR = new Comparator() { public int compare(final AbstractSAMHeaderRecord o1, final AbstractSAMHeaderRecord o2) { return o1.getId().compareTo(o2.getId()); } }; /** * Create SAMFileHeader with additional information. Required that sequence dictionaries agree. 
* * @param readers sam file readers to combine * @param sortOrder sort order new header should have * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) */ public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { this(readers, sortOrder, false); } /** * Create SAMFileHeader with additional information. * * @param readers sam file readers to combine * @param sortOrder sort order new header should have * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that * all input sequence dictionaries be identical. * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) */ public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) { this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries); this.readers = readers; } /** * Create SAMFileHeader with additional information.. This is the preferred constructor. * * @param sortOrder sort order new header should have * @param headers sam file headers to combine * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that * all input sequence dictionaries be identical. 
*/ public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection headers, final boolean mergeDictionaries) { this.headers = new LinkedHashSet(headers); this.mergedHeader = new SAMFileHeader(); SAMSequenceDictionary sequenceDictionary; try { sequenceDictionary = getSequenceDictionary(headers); this.hasMergedSequenceDictionary = false; } catch (SequenceUtil.SequenceListsDifferException pe) { if (mergeDictionaries) { sequenceDictionary = mergeSequenceDictionaries(headers); this.hasMergedSequenceDictionary = true; } else { throw pe; } } this.mergedHeader.setSequenceDictionary(sequenceDictionary); // Set program that creates input alignments for (final SAMProgramRecord program : mergeProgramGroups(headers)) { this.mergedHeader.addProgramRecord(program); } // Set read groups for merged header final List readGroups = mergeReadGroups(headers); this.mergedHeader.setReadGroups(readGroups); this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); this.mergedHeader.setSortOrder(sortOrder); for (final SAMFileHeader header : headers) { for (final String comment : header.getComments()) { this.mergedHeader.addComment(comment); } } } // Utilility method to make use with old constructor private static List getHeadersFromReaders(final Collection readers) { final List headers = new ArrayList(readers.size()); for (final SamReader reader : readers) { headers.add(reader.getFileHeader()); } return headers; } /** * Checks to see if there are clashes where different readers are using the same read * group IDs. If yes, then those IDs that collided are remapped. * * @param headers headers to combine * @return new list of read groups constructed from all the readers */ private List mergeReadGroups(final Collection headers) { //prepare args for mergeHeaderRecords(..) 
call final HashSet idsThatAreAlreadyTaken = new HashSet(); final List> readGroupsToProcess = new LinkedList>(); for (final SAMFileHeader header : headers) { for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { //verify that there are no existing id collisions in this input file if (!idsThatAreAlreadyTaken.add(readGroup.getId())) throw new SAMException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")"); readGroupsToProcess.add(new HeaderRecordAndFileHeader(readGroup, header)); } idsThatAreAlreadyTaken.clear(); } final List result = new LinkedList(); recordCounter = 0; hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result); //sort the result list by record id Collections.sort(result, RECORD_ID_COMPARATOR); return result; } /** * Checks to see if there are clashes where different readers are using the same program * group IDs. If yes, then those IDs that collided are remapped. * * @param headers headers to combine * @return new list of program groups constructed from all the readers */ private List mergeProgramGroups(final Collection headers) { final List overallResult = new LinkedList(); //this Set will accumulate all SAMProgramRecord ids that have been encountered so far. 
final HashSet idsThatAreAlreadyTaken = new HashSet(); //need to process all program groups List> programGroupsLeftToProcess = new LinkedList>(); for (final SAMFileHeader header : headers) { for (final SAMProgramRecord programGroup : header.getProgramRecords()) { //verify that there are no existing id collisions in this input file if (!idsThatAreAlreadyTaken.add(programGroup.getId())) throw new SAMException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")"); programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader(programGroup, header)); } idsThatAreAlreadyTaken.clear(); } recordCounter = 0; //A program group header (lets say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to //another program group header (lets say ID=1 PN=A) to indicate that the given file was //processed by program A followed by program B. These PP attributes potentially //connect headers into one or more tree structures. Merging is done by //first merging all headers that don't have PP attributes (eg. tree roots), //then updating and merging all headers whose PPs point to the tree-root headers, //and so on until all program group headers are processed. //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots). 
List> currentProgramGroups = new LinkedList>(); for (final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { final HeaderRecordAndFileHeader pair = programGroupsLeftToProcessIterator.next(); if (pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) { programGroupsLeftToProcessIterator.remove(); currentProgramGroups.add(pair); } } //merge currentProgramGroups while (!currentProgramGroups.isEmpty()) { final List currentResult = new LinkedList(); hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult); //add currentResults to overallResults overallResult.addAll(currentResult); //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false); programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true); //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups), //and move them to the list of programGroupsToProcessNext. 
final LinkedList> programGroupsToProcessNext = new LinkedList>(); for (final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { final HeaderRecordAndFileHeader pairLeftToProcess = programGroupsLeftToProcessIterator.next(); final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); //find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file) for (final HeaderRecordAndFileHeader justProcessedPair : currentProgramGroups) { final String idJustProcessed = justProcessedPair.getHeaderRecord().getId(); if (pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) { programGroupsLeftToProcessIterator.remove(); programGroupsToProcessNext.add(pairLeftToProcess); break; } } } currentProgramGroups = programGroupsToProcessNext; } //verify that all records were processed if (!programGroupsLeftToProcess.isEmpty()) { final StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n"); for (final HeaderRecordAndFileHeader pair : programGroupsLeftToProcess) { final SAMProgramRecord record = pair.getHeaderRecord(); errorMsg.append("@PG ID:" + record.getProgramGroupId() + " PN:" + record.getProgramName() + " PP:" + record.getPreviousProgramGroupId() + "\n"); } throw new SAMException(errorMsg.toString()); } //sort the result list by record id Collections.sort(overallResult, RECORD_ID_COMPARATOR); return overallResult; } /** * Utility method that takes a list of program groups and remaps all their * ids (including ppIds if requested) using the given idTranslationTable. *

* NOTE: when remapping, this method creates new SAMProgramRecords and * doesn't mutate any records in the programGroups list. * * @param programGroups The program groups to translate. * @param idTranslationTable The translation table. * @param translatePpIds Whether ppIds should be translated as well. * @return The list of translated records. */ private List> translateIds( final List> programGroups, final Map> idTranslationTable, final boolean translatePpIds) { //go through programGroups and translate any IDs and PPs based on the idTranslationTable. final List> result = new LinkedList>(); for (final HeaderRecordAndFileHeader pair : programGroups) { final SAMProgramRecord record = pair.getHeaderRecord(); final String id = record.getProgramGroupId(); final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); final SAMFileHeader header = pair.getFileHeader(); final Map translations = idTranslationTable.get(header); //see if one or both ids need to be translated SAMProgramRecord translatedRecord = null; if (translations != null) { final String translatedId = translations.get(id); final String translatedPpId = translatePpIds ? 
translations.get(ppId) : null; final boolean needToTranslateId = translatedId != null && !translatedId.equals(id); final boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId); if (needToTranslateId && needToTranslatePpId) { translatedRecord = new SAMProgramRecord(translatedId, record); translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); } else if (needToTranslateId) { translatedRecord = new SAMProgramRecord(translatedId, record); } else if (needToTranslatePpId) { translatedRecord = new SAMProgramRecord(id, record); translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); } } if (translatedRecord != null) { result.add(new HeaderRecordAndFileHeader(translatedRecord, header)); } else { result.add(pair); //keep the original record } } return result; } /** * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds * records that have identical ids and attributes, it will collapse them * into one record. If it finds records that have identical ids but * non-identical attributes, this is treated as a collision. When collision happens, * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable. *

* NOTE: Non-collided records also get recorded in the idTranslationTable as
     * old-id to old-id. This way, an idTranslationTable lookup should never return null.
     *
     * @param headerRecords          The header records to merge.
     * @param headerRecordFactory    Constructs a specific subclass of AbstractSAMHeaderRecord.
     * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a
     *                               collision, and the headerRecord's id will be remapped. Every id handed out by
     *                               this method (original or remapped) is added to this set.
     * @param idTranslationTable     When records collide, their ids are remapped, and an old-id to new-id
     *                               mapping is added to the idTranslationTable. Non-collided records also get
     *                               recorded in the idTranslationTable as old-id to old-id. This way, an
     *                               idTranslationTable lookup should never return null.
     * @param result                 The list of merged header records; one record is appended for each distinct
     *                               combination of id and attributes.
     * @return True if there were collisions.
     */
    private boolean mergeHeaderRecords(final List> headerRecords, final HeaderRecordFactory headerRecordFactory,
                                       final HashSet idsThatAreAlreadyTaken, final Map> idTranslationTable, final List result) {

        //The outer Map bins the header records by their ids. The nested Map further collapses
        //header records which, in addition to having the same id, also have identical attributes.
        //In other words, each key in the nested map represents one or more
        //header records which have both identical ids and identical attributes. The List of
        //SAMFileHeaders keeps track of which readers these header record(s) came from.
        //Both levels are LinkedHashMaps, so the loops below visit ids and records in first-seen order.
        final Map>> idToRecord = new LinkedHashMap>>();

        //Populate the idToRecord data structure
        for (final HeaderRecordAndFileHeader pair : headerRecords) {
            final RecordType record = pair.getHeaderRecord();
            final SAMFileHeader header = pair.getFileHeader();
            final String recordId = record.getId();
            Map> recordsWithSameId = idToRecord.get(recordId);
            if (recordsWithSameId == null) {
                recordsWithSameId = new LinkedHashMap>();
                idToRecord.put(recordId, recordsWithSameId);
            }

            //The record itself is the key, so grouping relies on the record's equals()/hashCode().
            List fileHeaders = recordsWithSameId.get(record);
            if (fileHeaders == null) {
                fileHeaders = new LinkedList();
                recordsWithSameId.put(record, fileHeaders);
            }

            fileHeaders.add(header);
        }

        //Resolve any collisions between header records by remapping their ids.
        boolean hasCollisions = false;
        for (final Map.Entry>> entry : idToRecord.entrySet()) {
            final String recordId = entry.getKey();
            final Map> recordsWithSameId = entry.getValue();

            for (final Map.Entry> recordWithUniqueAttr : recordsWithSameId.entrySet()) {
                final RecordType record = recordWithUniqueAttr.getKey();
                final List fileHeaders = recordWithUniqueAttr.getValue();

                String newId;
                if (!idsThatAreAlreadyTaken.contains(recordId)) {
                    //don't remap 1st record. If there are more records
                    //with this id, they will be remapped in the 'else'.
                    newId = recordId;
                    idsThatAreAlreadyTaken.add(recordId);
                    ++recordCounter;
                } else {
                    //there is more than one record with this id.
                    hasCollisions = true;

                    //Below we tack on "." plus the running recordCounter rendered in base36 (not a random value).
                    //The counter is shared across ids rather than restarting from 0 for each one, because
                    //our old process of just counting from 0 upward and adding that to the previous id led to 1000s of hits on
                    //idsThatAreAlreadyTaken.contains just to resolve 1 collision when merging 1000s of similarly processed bams.
                    //The loop has an empty body: each test assigns the next candidate to newId and advances the counter.
                    while (idsThatAreAlreadyTaken.contains(newId = recordId + "." + positiveFourDigitBase36Str(recordCounter++))) ;

                    idsThatAreAlreadyTaken.add(newId);
                }

                //Record old-id -> new-id for every file header that contributed this record.
                for (final SAMFileHeader fileHeader : fileHeaders) {
                    Map readerTranslationTable = idTranslationTable.get(fileHeader);
                    if (readerTranslationTable == null) {
                        readerTranslationTable = new HashMap();
                        idTranslationTable.put(fileHeader, readerTranslationTable);
                    }
                    readerTranslationTable.put(recordId, newId);
                }

                result.add(headerRecordFactory.createRecord(newId, record));
            }
        }

        return hasCollisions;
    }

    /**
     * Convert a non-negative integer to base36, most significant digit first. The method is public and static.
     * Despite the name, the result is neither padded nor truncated to four digits: 0 yields "0" and larger
     * values yield as many digits as they need. A negative argument never enters the loop and yields the
     * empty string.
     *
     * @param leftOver Both the initial value and the running quotient
     * @return The base 36 representation of leftOver, built from the symbols in INT_TO_BASE36
     */
    public static String positiveFourDigitBase36Str(int leftOver) {
        if (leftOver == 0) {
            return "0";
        }

        final StringBuilder builder = new StringBuilder(10);

        while (leftOver > 0) {
            //digits come out least significant first; the builder is reversed before returning
            final int valueIndex = leftOver % 36;
            builder.append(INT_TO_BASE36[valueIndex]);
            leftOver /= 36;
        }

        return builder.reverse().toString();
    }

    /**
     * Get the sequences off the SAMFileHeader. The dictionary of the first header is returned; the dictionary of
     * every later header is checked against it with SequenceUtil.assertSequenceDictionariesEqual, which is
     * expected to throw a runtime exception if the sequences are different from one another.
     *
     * @param headers headers to pull sequences from
     * @return sequences from files. Each file should have the same sequence. Null if headers is empty.
     */
    private SAMSequenceDictionary getSequenceDictionary(final Collection headers) {
        SAMSequenceDictionary sequences = null;
        for (final SAMFileHeader header : headers) {

            if (sequences == null) {
                sequences = header.getSequenceDictionary();
            } else {
                final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
                SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences);
            }
        }

        return sequences;
    }

    /**
     * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries.
     *
     * @param headers headers to pull sequences from
     * @return sequences from files.
Each file should have the same sequence */ private SAMSequenceDictionary mergeSequenceDictionaries(final Collection headers) { SAMSequenceDictionary sequences = new SAMSequenceDictionary(); for (final SAMFileHeader header : headers) { final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); sequences = mergeSequences(sequences, currentSequences); } // second pass, make a map of the original seqeunce id -> new sequence id createSequenceMapping(headers, sequences); return sequences; } /** * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset. * * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals. * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict. * @return A new sequence dictionary that resulting from merging the two inputs. */ private SAMSequenceDictionary mergeSequences(final SAMSequenceDictionary mergeIntoDict, final SAMSequenceDictionary mergeFromDict) { // a place to hold the sequences that we haven't found a home for, in the order the appear in mergeFromDict. final LinkedList holder = new LinkedList(); // Return value will be created from this. final LinkedList resultingDict = new LinkedList(); for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) { resultingDict.add(sequenceRecord); } // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. int prevloc = -1; // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. SAMSequenceRecord previouslyMerged = null; for (final SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) { // Does it already exist in resultingDict? final int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName()); if (loc == -1) { // If doesn't already exist in resultingDict, save it an decide where to insert it later. 
holder.add(sequenceRecord.clone()); } else if (prevloc > loc) { // If sequenceRecord already exists in resultingDict, but prior to the previous one // from mergeIntoDict that already existed, cannot merge. throw new SAMException("Cannot merge sequence dictionaries because sequence " + sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() + " are in different orders in two input sequence dictionaries."); } else { // Since sequenceRecord already exists in resultingDict, don't need to add it. // Add in all the sequences prior to it that have been held in holder. resultingDict.addAll(loc, holder); // Remember the index of sequenceRecord so can check for merge imcompatibility. prevloc = loc + holder.size(); previouslyMerged = sequenceRecord; holder.clear(); } } // Append anything left in holder. if (holder.size() != 0) { resultingDict.addAll(holder); } return new SAMSequenceDictionary(resultingDict); } /** * Find sequence in list. * * @param list List to search for the sequence name. * @param sequenceName Name to search for. * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found. */ private static int getIndexOfSequenceName(final List list, final String sequenceName) { for (int i = 0; i < list.size(); ++i) { if (list.get(i).getSequenceName().equals(sequenceName)) { return i; } } return -1; } /** * create the sequence mapping. This map is used to convert the unmerged header sequence ID's to the merged * list of sequence id's. * * @param headers the collections of headers. * @param masterDictionary the superset dictionary we've created. 
*/ private void createSequenceMapping(final Collection headers, final SAMSequenceDictionary masterDictionary) { final LinkedList resultingDictStr = new LinkedList(); for (final SAMSequenceRecord r : masterDictionary.getSequences()) { resultingDictStr.add(r.getSequenceName()); } for (final SAMFileHeader header : headers) { final Map seqMap = new HashMap(); final SAMSequenceDictionary dict = header.getSequenceDictionary(); for (final SAMSequenceRecord rec : dict.getSequences()) { seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName())); } this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap); } } /** * Returns the read group id that should be used for the input read and RG id. * * @deprecated replaced by getReadGroupId(SAMFileHeader, String) */ public String getReadGroupId(final SamReader reader, final String originalReadGroupId) { return getReadGroupId(reader.getFileHeader(), originalReadGroupId); } /** Returns the read group id that should be used for the input read and RG id. 
*/
    public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) {
        // Two-level lookup: translation table for this header, then the id within it.
        return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId);
    }

    /**
     * @param reader                 one of the input files
     * @param originalProgramGroupId a program group ID from the above input file
     * @return new ID from the merged list of program groups in the output file
     * @deprecated replaced by getProgramGroupId(SAMFileHeader, String)
     */
    @Deprecated
    public String getProgramGroupId(final SamReader reader, final String originalProgramGroupId) {
        return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId);
    }

    /**
     * @param header                 one of the input headers
     * @param originalProgramGroupId a program group ID from the above input file
     * @return new ID from the merged list of program groups in the output file
     */
    public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) {
        // Two-level lookup: translation table for this header, then the id within it.
        return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId);
    }

    /** Returns true if there are read group duplicates within the merged headers. */
    public boolean hasReadGroupCollisions() {
        return this.hasReadGroupCollisions;
    }

    /** Returns true if there are program group duplicates within the merged headers. */
    public boolean hasProgramGroupCollisions() {
        return hasProgramGroupCollisions;
    }

    /** @return if we've merged the sequence dictionaries, return true */
    public boolean hasMergedSequenceDictionary() {
        return hasMergedSequenceDictionary;
    }

    /** Returns the merged header that should be written to any output merged file. */
    public SAMFileHeader getMergedHeader() {
        return this.mergedHeader;
    }

    /**
     * Returns the collection of readers that this header merger is working with. May return null.
     *
     * @deprecated replaced by getHeaders()
     */
    @Deprecated
    public Collection<SamReader> getReaders() {
        return this.readers;
    }

    /**
     * Returns the collection of headers that this header merger is working with.
*/ public Collection getHeaders() { return this.headers; } /** * returns the new mapping for a specified reader, given it's old sequence index * * @param reader the reader * @param oldReferenceSequenceIndex the old sequence (also called reference) index * @return the new index value * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer) */ public Integer getMergedSequenceIndex(final SamReader reader, final Integer oldReferenceSequenceIndex) { return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex); } /** * Another mechanism for getting the new sequence index, for situations in which the reader is not available. * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work. * * @param header The original header for the input record in question. * @param oldReferenceSequenceIndex The original sequence index. * @return the new index value that is compatible with the merged sequence index. */ public Integer getMergedSequenceIndex(final SAMFileHeader header, final Integer oldReferenceSequenceIndex) { final Map mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header); if (mapping == null) { throw new SAMException("No sequence dictionary mapping available for header: " + header); } final Integer newIndex = mapping.get(oldReferenceSequenceIndex); if (newIndex == null) { throw new SAMException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header); } return newIndex; } /** * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate * specific subclasses of AbstractSAMHeaderRecord. */ private static interface HeaderRecordFactory { /** * Constructs a new instance of RecordType. * * @param id The id of the new record. * @param srcRecord Except for the id, the new record will be a copy of this source record. 
*/ public RecordType createRecord(final String id, RecordType srcRecord); } /** * Struct that groups together a subclass of AbstractSAMHeaderRecord with the * SAMFileHeader that it came from. */ private static class HeaderRecordAndFileHeader { private final RecordType headerRecord; private final SAMFileHeader samFileHeader; public HeaderRecordAndFileHeader(final RecordType headerRecord, final SAMFileHeader samFileHeader) { this.headerRecord = headerRecord; this.samFileHeader = samFileHeader; } public RecordType getHeaderRecord() { return headerRecord; } public SAMFileHeader getFileHeader() { return samFileHeader; } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SamFileValidator.java000066400000000000000000001132631263034757100237100ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ package htsjdk.samtools; import htsjdk.samtools.SAMValidationError.Type; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileWalker; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.FastqQualityFormat; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.QualityEncodingDetector; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintWriter; import java.util.AbstractMap; import java.util.ArrayList; import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; /** * Validates SAM files as follows: *

    *
  • checks sam file header for sequence dictionary
  • *
  • checks sam file header for read groups
  • *
  • for each sam record *
      *
    • reports error detected by SAMRecord.isValid()
    • *
    • validates NM (nucleotide differences) exists and matches reality
    • *
    • validates mate fields agree with data in the mate record
    • *
    *
  • *
* * @author Doug Voet * @see SAMRecord#isValid() */ public class SamFileValidator { private Histogram errorsByType = new Histogram(); private final PrintWriter out; private PairEndInfoMap pairEndInfoByName; private ReferenceSequenceFileWalker refFileWalker = null; private boolean verbose = false; private int maxVerboseOutput = 100; private SAMSortOrderChecker orderChecker; private Set errorsToIgnore = EnumSet.noneOf(Type.class); private boolean ignoreWarnings = false; private boolean bisulfiteSequenced = false; private boolean validateIndex = false; private boolean sequenceDictionaryEmptyAndNoWarningEmitted = false; private final int maxTempFiles; private final static Log log = Log.getInstance(SamFileValidator.class); public SamFileValidator(final PrintWriter out, final int maxTempFiles) { this.out = out; this.maxTempFiles = maxTempFiles; } /** * Sets one or more error types that should not be reported on. */ public void setErrorsToIgnore(final Collection types) { if (!types.isEmpty()) { this.errorsToIgnore = EnumSet.copyOf(types); } } public void setIgnoreWarnings(final boolean ignoreWarnings) { this.ignoreWarnings = ignoreWarnings; } /** * Outputs validation summary report to out. * * @param samReader records to validate * @param reference if null, NM tag validation is skipped * @return boolean true if there are no validation errors, otherwise false */ public boolean validateSamFileSummary(final SamReader samReader, final ReferenceSequenceFile reference) { init(reference, samReader.getFileHeader()); validateSamFile(samReader, out); boolean result = errorsByType.isEmpty(); if (errorsByType.getCount() > 0) { // Convert to a histogram with String IDs so that WARNING: or ERROR: can be prepended to the error type. 
final Histogram errorsAndWarningsByType = new Histogram("Error Type", "Count"); for (final Histogram.Bin bin : errorsByType.values()) { errorsAndWarningsByType.increment(bin.getId().getHistogramString(), bin.getValue()); } final MetricsFile metricsFile = new MetricsFile(); errorsByType.setBinLabel("Error Type"); errorsByType.setValueLabel("Count"); metricsFile.setHistogram(errorsAndWarningsByType); metricsFile.write(out); } cleanup(); return result; } /** * Outputs validation error details to out. * * @param samReader records to validate * @param reference if null, NM tag validation is skipped * processing will stop after this threshold has been reached * @return boolean true if there are no validation errors, otherwise false */ public boolean validateSamFileVerbose(final SamReader samReader, final ReferenceSequenceFile reference) { init(reference, samReader.getFileHeader()); try { validateSamFile(samReader, out); } catch (MaxOutputExceededException e) { out.println("Maximum output of [" + maxVerboseOutput + "] errors reached."); } boolean result = errorsByType.isEmpty(); cleanup(); return result; } public void validateBamFileTermination(final File inputFile) { BufferedInputStream inputStream = null; try { inputStream = IOUtil.toBufferedStream(new FileInputStream(inputFile)); if (!BlockCompressedInputStream.isValidFile(inputStream)) { return; } final BlockCompressedInputStream.FileTermination terminationState = BlockCompressedInputStream.checkTermination(inputFile); if (terminationState.equals(BlockCompressedInputStream.FileTermination.DEFECTIVE)) { addError(new SAMValidationError(Type.TRUNCATED_FILE, "BAM file has defective last gzip block", inputFile.getPath())); } else if (terminationState.equals(BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK)) { addError(new SAMValidationError(Type.BAM_FILE_MISSING_TERMINATOR_BLOCK, "Older BAM file -- does not have terminator block", inputFile.getPath())); } } catch (IOException e) { throw new 
SAMException("IOException", e); } finally { if (inputStream != null) { CloserUtil.close(inputStream); } } } private void validateSamFile(final SamReader samReader, final PrintWriter out) { try { validateHeader(samReader.getFileHeader()); orderChecker = new SAMSortOrderChecker(samReader.getFileHeader().getSortOrder()); validateSamRecordsAndQualityFormat(samReader, samReader.getFileHeader()); validateUnmatchedPairs(); if (validateIndex) { try { BamIndexValidator.exhaustivelyTestIndex(samReader); } catch (Exception e) { addError(new SAMValidationError(Type.INVALID_INDEX_FILE_POINTER, e.getMessage(), null)); } } if (errorsByType.isEmpty()) { out.println("No errors found"); } } finally { out.flush(); } } /** * Report on reads marked as paired, for which the mate was not found. */ private void validateUnmatchedPairs() { final InMemoryPairEndInfoMap inMemoryPairMap; if (pairEndInfoByName instanceof CoordinateSortedPairEndInfoMap) { // For the coordinate-sorted map, need to detect mate pairs in which the mateReferenceIndex on one end // does not match the readReference index on the other end, so the pairs weren't united and validated. inMemoryPairMap = new InMemoryPairEndInfoMap(); CloseableIterator> it = ((CoordinateSortedPairEndInfoMap) pairEndInfoByName).iterator(); while (it.hasNext()) { Map.Entry entry = it.next(); PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey()); if (pei != null) { // Found a mismatch btw read.mateReferenceIndex and mate.readReferenceIndex List errors = pei.validateMates(entry.getValue(), entry.getKey()); for (final SAMValidationError error : errors) { addError(error); } } else { // Mate not found. inMemoryPairMap.put(entry.getValue().mateReferenceIndex, entry.getKey(), entry.getValue()); } } it.close(); } else { inMemoryPairMap = (InMemoryPairEndInfoMap) pairEndInfoByName; } // At this point, everything in InMemoryMap is a read marked as a pair, for which a mate was not found. 
for (final Map.Entry entry : inMemoryPairMap) { addError(new SAMValidationError(Type.MATE_NOT_FOUND, "Mate not found for paired read", entry.getKey())); } } /** * SAM record and quality format validations are combined into a single method because validation must be completed * in only a single pass of the SamRecords (because a SamReader's iterator() method may not return the same * records on a subsequent call). */ private void validateSamRecordsAndQualityFormat(final Iterable samRecords, final SAMFileHeader header) { final SAMRecordIterator iter = (SAMRecordIterator) samRecords.iterator(); final ProgressLogger progress = new ProgressLogger(log, 10000000, "Validated Read"); final QualityEncodingDetector qualityDetector = new QualityEncodingDetector(); try { while (iter.hasNext()) { final SAMRecord record = iter.next(); qualityDetector.add(record); final long recordNumber = progress.getCount() + 1; final Collection errors = record.isValid(); if (errors != null) { for (final SAMValidationError error : errors) { error.setRecordNumber(recordNumber); addError(error); } } validateMateFields(record, recordNumber); final boolean hasValidSortOrder = validateSortOrder(record, recordNumber); validateReadGroup(record, header); final boolean cigarIsValid = validateCigar(record, recordNumber); if (cigarIsValid) { try { validateNmTag(record, recordNumber); } catch (SAMException e) { if (hasValidSortOrder) { // If a CRAM file has an invalid sort order, the ReferenceFileWalker will throw a // SAMException due to an out of order request when retrieving reference bases during NM // tag validation; rethrow the exception only if the sort order is valid, otherwise // swallow the exception and carry on validating throw e; } } } validateSecondaryBaseCalls(record, recordNumber); validateTags(record, recordNumber); if (sequenceDictionaryEmptyAndNoWarningEmitted && !record.getReadUnmappedFlag()) { addError(new SAMValidationError(Type.MISSING_SEQUENCE_DICTIONARY, "Sequence dictionary is 
empty", null)); sequenceDictionaryEmptyAndNoWarningEmitted = false; } progress.record(record); } try { if (progress.getCount() > 0) { // Avoid exception being thrown as a result of no qualities being read final FastqQualityFormat format = qualityDetector.generateBestGuess(QualityEncodingDetector.FileContext.SAM, FastqQualityFormat.Standard); if (format != FastqQualityFormat.Standard) { addError(new SAMValidationError(Type.INVALID_QUALITY_FORMAT, String.format("Detected %s quality score encoding, but expected %s.", format, FastqQualityFormat.Standard), null)); } } } catch (SAMException e) { addError(new SAMValidationError(Type.INVALID_QUALITY_FORMAT, e.getMessage(), null)); } } catch (SAMFormatException e) { // increment record number because the iterator behind the SAMFileReader // reads one record ahead so we will get this failure one record ahead final String msg = "SAMFormatException on record " + progress.getCount() + 1; out.println(msg); throw new SAMException(msg, e); } catch (FileTruncatedException e) { addError(new SAMValidationError(Type.TRUNCATED_FILE, "File is truncated", null)); } finally { iter.close(); } } private void validateReadGroup(final SAMRecord record, final SAMFileHeader header) { final SAMReadGroupRecord rg = record.getReadGroup(); if (rg == null) { addError(new SAMValidationError(Type.RECORD_MISSING_READ_GROUP, "A record is missing a read group", record.getReadName())); } else if (header.getReadGroup(rg.getId()) == null) { addError(new SAMValidationError(Type.READ_GROUP_NOT_FOUND, "A record has a read group not found in the header: ", record.getReadName() + ", " + rg.getReadGroupId())); } } /** * Report error if a tag value is a Long. 
*/ private void validateTags(final SAMRecord record, final long recordNumber) { for (final SAMRecord.SAMTagAndValue tagAndValue : record.getAttributes()) { if (tagAndValue.value instanceof Long) { addError(new SAMValidationError(Type.TAG_VALUE_TOO_LARGE, "Numeric value too large for tag " + tagAndValue.tag, record.getReadName(), recordNumber)); } } } private void validateSecondaryBaseCalls(final SAMRecord record, final long recordNumber) { final String e2 = (String) record.getAttribute(SAMTag.E2.name()); if (e2 != null) { if (e2.length() != record.getReadLength()) { addError(new SAMValidationError(Type.MISMATCH_READ_LENGTH_AND_E2_LENGTH, String.format("E2 tag length (%d) != read length (%d)", e2.length(), record.getReadLength()), record.getReadName(), recordNumber)); } final byte[] bases = record.getReadBases(); final byte[] secondaryBases = StringUtil.stringToBytes(e2); for (int i = 0; i < Math.min(bases.length, secondaryBases.length); ++i) { if (SequenceUtil.isNoCall(bases[i]) || SequenceUtil.isNoCall(secondaryBases[i])) { continue; } if (SequenceUtil.basesEqual(bases[i], secondaryBases[i])) { addError(new SAMValidationError(Type.E2_BASE_EQUALS_PRIMARY_BASE, String.format("Secondary base call (%c) == primary base call (%c)", (char) secondaryBases[i], (char) bases[i]), record.getReadName(), recordNumber)); break; } } } final String u2 = (String) record.getAttribute(SAMTag.U2.name()); if (u2 != null && u2.length() != record.getReadLength()) { addError(new SAMValidationError(Type.MISMATCH_READ_LENGTH_AND_U2_LENGTH, String.format("U2 tag length (%d) != read length (%d)", u2.length(), record.getReadLength()), record.getReadName(), recordNumber)); } } private boolean validateCigar(final SAMRecord record, final long recordNumber) { if (record.getReadUnmappedFlag()) { return true; } return validateCigar(record, recordNumber, true); } private boolean validateMateCigar(final SAMRecord record, final long recordNumber) { return validateCigar(record, recordNumber, false); } 
/**
 * Validates either the record's own CIGAR or its mate CIGAR and registers every error found.
 * The record's validation stringency is switched to LENIENT for the duration of the check and is
 * always put back afterwards, even if the check throws.
 *
 * @param record       the record to check
 * @param recordNumber the number reported with any error raised for this record
 * @param isReadCigar  true to validate the read's CIGAR, false to validate the mate CIGAR
 * @return true if the validation produced no errors (a null error list counts as no errors)
 */
private boolean validateCigar(final SAMRecord record, final long recordNumber, final boolean isReadCigar) {
    final ValidationStringency savedStringency = record.getValidationStringency();
    record.setValidationStringency(ValidationStringency.LENIENT);
    final List<SAMValidationError> errors;
    try {
        errors = isReadCigar ? record.validateCigar(recordNumber) : SAMUtils.validateMateCigar(record, recordNumber);
    } finally {
        // Restore the caller's stringency no matter how the validation ended.
        record.setValidationStringency(savedStringency);
    }
    if (errors == null) {
        return true;
    }
    for (final SAMValidationError error : errors) {
        addError(error);
    }
    return errors.isEmpty();
}

/**
 * Asks the order checker whether the record is in sort order and registers a RECORD_OUT_OF_ORDER
 * error, naming the previously seen record, when it is not.
 *
 * @return true if the order checker accepted the record
 */
private boolean validateSortOrder(final SAMRecord record, final long recordNumber) {
    // Fetch the previous record before calling isSorted(), exactly as the original code does.
    final SAMRecord prev = orderChecker.getPreviousRecord();
    final boolean isValidSortOrder = orderChecker.isSorted(record);
    if (!isValidSortOrder) {
        addError(new SAMValidationError(
                Type.RECORD_OUT_OF_ORDER,
                String.format(
                        "The record is out of [%s] order, prior read name [%s], prior coordinates [%d:%d]",
                        record.getHeader().getSortOrder().name(),
                        prev.getReadName(),
                        prev.getReferenceIndex(),
                        prev.getAlignmentStart()),
                record.getReadName(),
                recordNumber));
    }
    return isValidSortOrder;
}

/**
 * Prepares per-run state: picks the mate-information map implementation based on the header's sort
 * order and, when a reference is supplied, creates the walker used to read reference sequences.
 */
private void init(final ReferenceSequenceFile reference, final SAMFileHeader header) {
    if (header.getSortOrder() == SAMFileHeader.SortOrder.coordinate) {
        this.pairEndInfoByName = new CoordinateSortedPairEndInfoMap();
    } else {
        this.pairEndInfoByName = new InMemoryPairEndInfoMap();
    }
    if (reference != null) {
        this.refFileWalker = new ReferenceSequenceFileWalker(reference);
    }
}

/** Drops the references to the per-run state set up for a validation run. */
private void cleanup() {
    this.errorsByType = null;
    this.pairEndInfoByName = null;
    this.refFileWalker = null;
}

private void validateNmTag(final SAMRecord record, final long recordNumber) { if (!record.getReadUnmappedFlag()) { final Integer tagNucleotideDiffs = record.getIntegerAttribute(ReservedTagConstants.NM); if (tagNucleotideDiffs == null) { addError(new SAMValidationError( Type.MISSING_TAG_NM, "NM tag (nucleotide differences) is missing", record.getReadName(),
recordNumber)); } else if (refFileWalker != null) { final ReferenceSequence refSequence = refFileWalker.get(record.getReferenceIndex()); final int actualNucleotideDiffs = SequenceUtil.calculateSamNmTag(record, refSequence.getBases(), 0, isBisulfiteSequenced()); if (!tagNucleotideDiffs.equals(actualNucleotideDiffs)) { addError(new SAMValidationError( Type.INVALID_TAG_NM, "NM tag (nucleotide differences) in file [" + tagNucleotideDiffs + "] does not match reality [" + actualNucleotideDiffs + "]", record.getReadName(), recordNumber)); } } } } private void validateMateFields(final SAMRecord record, final long recordNumber) { if (!record.getReadPairedFlag() || record.isSecondaryOrSupplementary()) { return; } validateMateCigar(record, recordNumber); final PairEndInfo pairEndInfo = pairEndInfoByName.remove(record.getReferenceIndex(), record.getReadName()); if (pairEndInfo == null) { pairEndInfoByName.put(record.getMateReferenceIndex(), record.getReadName(), new PairEndInfo(record, recordNumber)); } else { final List errors = pairEndInfo.validateMates(new PairEndInfo(record, recordNumber), record.getReadName()); for (final SAMValidationError error : errors) { addError(error); } } } private void validateHeader(final SAMFileHeader fileHeader) { for (final SAMValidationError error : fileHeader.getValidationErrors()) { addError(error); } if (fileHeader.getVersion() == null) { addError(new SAMValidationError(Type.MISSING_VERSION_NUMBER, "Header has no version number", null)); } else if (!SAMFileHeader.ACCEPTABLE_VERSIONS.contains(fileHeader.getVersion())) { addError(new SAMValidationError(Type.INVALID_VERSION_NUMBER, "Header version: " + fileHeader.getVersion() + " does not match any of the acceptable versions: " + StringUtil.join(", ", SAMFileHeader.ACCEPTABLE_VERSIONS.toArray(new String[0])), null)); } if (fileHeader.getSequenceDictionary().isEmpty()) { sequenceDictionaryEmptyAndNoWarningEmitted = true; } if (fileHeader.getReadGroups().isEmpty()) { addError(new 
SAMValidationError(Type.MISSING_READ_GROUP, "Read groups is empty", null)); } final List pgs = fileHeader.getProgramRecords(); for (int i = 0; i < pgs.size() - 1; i++) { for (int j = i + 1; j < pgs.size(); j++) { if (pgs.get(i).getProgramGroupId().equals(pgs.get(j).getProgramGroupId())) { addError(new SAMValidationError(Type.DUPLICATE_PROGRAM_GROUP_ID, "Duplicate " + "program group id: " + pgs.get(i).getProgramGroupId(), null)); } } } final List rgs = fileHeader.getReadGroups(); final Set readGroupIDs = new HashSet(); for (final SAMReadGroupRecord record : rgs) { final String readGroupID = record.getReadGroupId(); if (readGroupIDs.contains(readGroupID)) { addError(new SAMValidationError(Type.DUPLICATE_READ_GROUP_ID, "Duplicate " + "read group id: " + readGroupID, null)); } else { readGroupIDs.add(readGroupID); } final String platformValue = record.getPlatform(); if (platformValue == null || "".equals(platformValue)) { addError(new SAMValidationError(Type.MISSING_PLATFORM_VALUE, "A platform (PL) attribute was not found for read group ", readGroupID)); } else { // NB: cannot be null, so not catching a NPE try { SAMReadGroupRecord.PlatformValue.valueOf(platformValue.toUpperCase()); } catch (IllegalArgumentException e) { addError(new SAMValidationError(Type.INVALID_PLATFORM_VALUE, "The platform (PL) attribute (" + platformValue + ") + was not one of the valid values for read group ", readGroupID)); } } } } private void addError(final SAMValidationError error) { // Just ignore an error if it's of a type we're not interested in if (this.errorsToIgnore.contains(error.getType())) return; if (this.ignoreWarnings && error.getType().severity == SAMValidationError.Severity.WARNING) return; this.errorsByType.increment(error.getType()); if (verbose) { out.println(error); out.flush(); if (this.errorsByType.getCount() >= maxVerboseOutput) { throw new MaxOutputExceededException(); } } } /** * Control verbosity * * @param verbose True in order to emit a message per error or warning. 
* @param maxVerboseOutput If verbose, emit no more than this many messages. Ignored if !verbose. */ public void setVerbose(final boolean verbose, final int maxVerboseOutput) { this.verbose = verbose; this.maxVerboseOutput = maxVerboseOutput; } public boolean isBisulfiteSequenced() { return bisulfiteSequenced; } public void setBisulfiteSequenced(boolean bisulfiteSequenced) { this.bisulfiteSequenced = bisulfiteSequenced; } public SamFileValidator setValidateIndex(boolean validateIndex) { // The SAMFileReader must also have IndexCaching enabled to have the index validated, // samReader.enableIndexCaching(true); this.validateIndex = validateIndex; return this; } public static class ValidationMetrics extends MetricBase { } /** * This class is used so we don't have to store the entire SAMRecord in memory while we wait * to find a record's mate and also to store the record number. */ private static class PairEndInfo { private final int readAlignmentStart; private final int readReferenceIndex; private final boolean readNegStrandFlag; private final boolean readUnmappedFlag; private final String readCigarString; private final int mateAlignmentStart; private final int mateReferenceIndex; private final boolean mateNegStrandFlag; private final boolean mateUnmappedFlag; private final String mateCigarString; private final boolean firstOfPairFlag; private final long recordNumber; public PairEndInfo(final SAMRecord record, final long recordNumber) { this.recordNumber = recordNumber; this.readAlignmentStart = record.getAlignmentStart(); this.readNegStrandFlag = record.getReadNegativeStrandFlag(); this.readReferenceIndex = record.getReferenceIndex(); this.readUnmappedFlag = record.getReadUnmappedFlag(); this.readCigarString = record.getCigarString(); this.mateAlignmentStart = record.getMateAlignmentStart(); this.mateNegStrandFlag = record.getMateNegativeStrandFlag(); this.mateReferenceIndex = record.getMateReferenceIndex(); this.mateUnmappedFlag = record.getMateUnmappedFlag(); final 
Object mcs = record.getAttribute(SAMTag.MC.name()); this.mateCigarString = (mcs != null) ? (String) mcs : null; this.firstOfPairFlag = record.getFirstOfPairFlag(); } private PairEndInfo(int readAlignmentStart, int readReferenceIndex, boolean readNegStrandFlag, boolean readUnmappedFlag, String readCigarString, int mateAlignmentStart, int mateReferenceIndex, boolean mateNegStrandFlag, boolean mateUnmappedFlag, String mateCigarString, boolean firstOfPairFlag, long recordNumber) { this.readAlignmentStart = readAlignmentStart; this.readReferenceIndex = readReferenceIndex; this.readNegStrandFlag = readNegStrandFlag; this.readUnmappedFlag = readUnmappedFlag; this.readCigarString = readCigarString; this.mateAlignmentStart = mateAlignmentStart; this.mateReferenceIndex = mateReferenceIndex; this.mateNegStrandFlag = mateNegStrandFlag; this.mateUnmappedFlag = mateUnmappedFlag; this.mateCigarString = mateCigarString; this.firstOfPairFlag = firstOfPairFlag; this.recordNumber = recordNumber; } public List validateMates(final PairEndInfo mate, final String readName) { final List errors = new ArrayList(); validateMateFields(this, mate, readName, errors); validateMateFields(mate, this, readName, errors); // Validations that should not be repeated on both ends if (this.firstOfPairFlag == mate.firstOfPairFlag) { final String whichEnd = this.firstOfPairFlag ? 
"first" : "second"; errors.add(new SAMValidationError( Type.MATES_ARE_SAME_END, "Both mates are marked as " + whichEnd + " of pair", readName, this.recordNumber )); } return errors; } private void validateMateFields(final PairEndInfo end1, final PairEndInfo end2, final String readName, final List errors) { if (end1.mateAlignmentStart != end2.readAlignmentStart) { errors.add(new SAMValidationError( Type.MISMATCH_MATE_ALIGNMENT_START, "Mate alignment does not match alignment start of mate", readName, end1.recordNumber)); } if (end1.mateNegStrandFlag != end2.readNegStrandFlag) { errors.add(new SAMValidationError( Type.MISMATCH_FLAG_MATE_NEG_STRAND, "Mate negative strand flag does not match read negative strand flag of mate", readName, end1.recordNumber)); } if (end1.mateReferenceIndex != end2.readReferenceIndex) { errors.add(new SAMValidationError( Type.MISMATCH_MATE_REF_INDEX, "Mate reference index (MRNM) does not match reference index of mate", readName, end1.recordNumber)); } if (end1.mateUnmappedFlag != end2.readUnmappedFlag) { errors.add(new SAMValidationError( Type.MISMATCH_FLAG_MATE_UNMAPPED, "Mate unmapped flag does not match read unmapped flag of mate", readName, end1.recordNumber)); } if ((end1.mateCigarString != null) && (!end1.mateCigarString.equals(end2.readCigarString))) { errors.add(new SAMValidationError( Type.MISMATCH_MATE_CIGAR_STRING, "Mate CIGAR string does not match CIGAR string of mate", readName, end1.recordNumber)); } // Note - don't need to validate that the mateCigarString is a valid cigar string, since this // will be validated by validateCigar on the mate's record itself. 
} } /** * Thrown in addError indicating that maxVerboseOutput has been exceeded and processing should stop */ private static class MaxOutputExceededException extends SAMException { MaxOutputExceededException() { super("maxVerboseOutput exceeded."); } } interface PairEndInfoMap extends Iterable> { void put(int mateReferenceIndex, String key, PairEndInfo value); PairEndInfo remove(int mateReferenceIndex, String key); CloseableIterator> iterator(); } private class CoordinateSortedPairEndInfoMap implements PairEndInfoMap { private final CoordinateSortedPairInfoMap onDiskMap = new CoordinateSortedPairInfoMap(maxTempFiles, new Codec()); public void put(int mateReferenceIndex, String key, PairEndInfo value) { onDiskMap.put(mateReferenceIndex, key, value); } public PairEndInfo remove(int mateReferenceIndex, String key) { return onDiskMap.remove(mateReferenceIndex, key); } public CloseableIterator> iterator() { return onDiskMap.iterator(); } private class Codec implements CoordinateSortedPairInfoMap.Codec { private DataInputStream in; private DataOutputStream out; public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); } public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); } public void encode(final String key, final PairEndInfo record) { try { out.writeUTF(key); out.writeInt(record.readAlignmentStart); out.writeInt(record.readReferenceIndex); out.writeBoolean(record.readNegStrandFlag); out.writeBoolean(record.readUnmappedFlag); out.writeUTF(record.readCigarString); out.writeInt(record.mateAlignmentStart); out.writeInt(record.mateReferenceIndex); out.writeBoolean(record.mateNegStrandFlag); out.writeBoolean(record.mateUnmappedFlag); // writeUTF can't take null, so store a null mateCigarString as an empty string out.writeUTF(record.mateCigarString != null ? 
record.mateCigarString : ""); out.writeBoolean(record.firstOfPairFlag); out.writeLong(record.recordNumber); } catch (IOException e) { throw new SAMException("Error spilling PairInfo to disk", e); } } public Map.Entry decode() { try { final String key = in.readUTF(); final int readAlignmentStart = in.readInt(); final int readReferenceIndex = in.readInt(); final boolean readNegStrandFlag = in.readBoolean(); final boolean readUnmappedFlag = in.readBoolean(); final String readCigarString = in.readUTF(); final int mateAlignmentStart = in.readInt(); final int mateReferenceIndex = in.readInt(); final boolean mateNegStrandFlag = in.readBoolean(); final boolean mateUnmappedFlag = in.readBoolean(); // read mateCigarString - note that null value is stored as an empty string final String mcs = in.readUTF(); final String mateCigarString = !mcs.isEmpty() ? mcs : null; final boolean firstOfPairFlag = in.readBoolean(); final long recordNumber = in.readLong(); final PairEndInfo rec = new PairEndInfo(readAlignmentStart, readReferenceIndex, readNegStrandFlag, readUnmappedFlag, readCigarString, mateAlignmentStart, mateReferenceIndex, mateNegStrandFlag, mateUnmappedFlag, mateCigarString, firstOfPairFlag, recordNumber); return new AbstractMap.SimpleEntry(key, rec); } catch (IOException e) { throw new SAMException("Error reading PairInfo from disk", e); } } } } private static class InMemoryPairEndInfoMap implements PairEndInfoMap { private final Map map = new HashMap(); public void put(int mateReferenceIndex, String key, PairEndInfo value) { if (mateReferenceIndex != value.mateReferenceIndex) throw new IllegalArgumentException("mateReferenceIndex does not agree with PairEndInfo"); map.put(key, value); } public PairEndInfo remove(int mateReferenceIndex, String key) { return map.remove(key); } public CloseableIterator> iterator() { final Iterator> it = map.entrySet().iterator(); return new CloseableIterator>() { public void close() { // do nothing } public boolean hasNext() { return 
it.hasNext(); } public Map.Entry next() { return it.next(); } public void remove() { it.remove(); } }; } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SamFiles.java000066400000000000000000000033411263034757100222200ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.cram.CRAIIndex; import htsjdk.samtools.cram.build.CramIO; import java.io.File;

/**
 * Helpers for locating files that accompany a SAM/BAM/CRAM file.
 *
 * @author mccowan
 */
public class SamFiles {

    /**
     * Looks next to {@code samFile} for its index and returns the first candidate that is an existing
     * regular file. Candidates are probed in this order:
     * <ol>
     * <li>for a BAM name: the name with the BAM extension replaced by the BAM index suffix;</li>
     * <li>for a CRAM name: the name with the CRAM extension replaced by the CRAI suffix, then the full
     * name with the CRAI suffix appended;</li>
     * <li>in every case: the full name with the BAM index suffix appended.</li>
     * </ol>
     *
     * @return The index for the provided SAM, or null if one was not found.
     */
    public static File findIndex(final File samFile) {
        final String directory = samFile.getParent();
        final String samName = samFile.getName();

        File index = null;
        if (samName.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION)) {
            index = existingFileOrNull(directory,
                    swapSuffix(samName, BamFileIoUtils.BAM_FILE_EXTENSION, BAMIndex.BAMIndexSuffix));
        } else if (samName.endsWith(CramIO.CRAM_FILE_EXTENSION)) {
            index = existingFileOrNull(directory,
                    swapSuffix(samName, CramIO.CRAM_FILE_EXTENSION, CRAIIndex.CRAI_INDEX_SUFFIX));
            if (index == null) {
                index = existingFileOrNull(directory, samName + CRAIIndex.CRAI_INDEX_SUFFIX);
            }
        }

        if (index == null) {
            // Last resort for every input: the complete file name followed by the BAM index suffix.
            index = existingFileOrNull(directory, samName + BAMIndex.BAMIndexSuffix);
        }
        return index;
    }

    /** Returns {@code name} with its trailing {@code oldSuffix} replaced by {@code newSuffix}. */
    private static String swapSuffix(final String name, final String oldSuffix, final String newSuffix) {
        return name.substring(0, name.length() - oldSuffix.length()) + newSuffix;
    }

    /** Returns the file {@code directory/name} if it is an existing regular file, otherwise null. */
    private static File existingFileOrNull(final String directory, final String name) {
        final File candidate = new File(directory, name);
        return candidate.isFile() ? candidate : null;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SamIndexes.java000066400000000000000000000062341263034757100225610ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.cram.CRAIIndex; import htsjdk.samtools.seekablestream.SeekableBufferedStream; import htsjdk.samtools.seekablestream.SeekableStream; import
java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL;

/**
 * A helper class to read BAI and CRAI indexes. Main goal is to provide BAI stream as a sort of common API
 * for all index types.
 * <p>
 * Each constant carries the file-name suffix and the leading magic bytes used to recognise that index type.
 * <p>
 * Created by vadim on 14/08/2015.
 */
public enum SamIndexes {
    // Spelled out byte by byte so the magic does not depend on the platform default charset.
    BAI(BAMIndex.BAMIndexSuffix, new byte[]{(byte) 'B', (byte) 'A', (byte) 'I', (byte) 1}),
    // CRAI is gzipped text, so its magic is the same as {@link java.util.zip.GZIPInputStream#GZIP_MAGIC}
    CRAI(CRAIIndex.CRAI_INDEX_SUFFIX, new byte[]{(byte) 0x1f, (byte) 0x8b});

    public final String fileNameSuffix;
    public final byte[] magic;

    SamIndexes(final String fileNameSuffix, final byte[] magic) {
        this.fileNameSuffix = fileNameSuffix;
        this.magic = magic;
    }

    /**
     * Opens an index file as a BAI stream, choosing the index type from the file name.
     *
     * @return the BAI stream, or null if the name ends with neither known suffix
     * @throws IOException if the file cannot be opened
     */
    public static InputStream openIndexFileAsBaiOrNull(final File file, final SAMSequenceDictionary dictionary) throws IOException {
        return openIndexUrlAsBaiOrNull(file.toURI().toURL(), dictionary);
    }

    /**
     * Opens an index URL as a BAI stream, choosing the index type from the (case-insensitive) suffix of
     * the URL's file part. A CRAI index is passed through {@code CRAIIndex.openCraiFileAsBaiStream}.
     *
     * @return the BAI stream, or null if the URL ends with neither known suffix
     * @throws IOException if the URL cannot be opened
     */
    public static InputStream openIndexUrlAsBaiOrNull(final URL url, final SAMSequenceDictionary dictionary) throws IOException {
        // Locale.ROOT: the comparison must not depend on the JVM default locale (e.g. Turkish dotless i).
        final String path = url.getFile().toLowerCase(java.util.Locale.ROOT);
        if (path.endsWith(BAI.fileNameSuffix.toLowerCase(java.util.Locale.ROOT))) {
            return url.openStream();
        }
        if (path.endsWith(CRAI.fileNameSuffix.toLowerCase(java.util.Locale.ROOT))) {
            return CRAIIndex.openCraiFileAsBaiStream(url.openStream(), dictionary);
        }
        return null;
    }

    /**
     * Sniffs the leading magic bytes of a stream and returns it as a BAI stream. The input is wrapped in
     * a {@link BufferedInputStream} so the sniffed bytes can be rewound with mark/reset.
     *
     * @return a stream positioned at the start of the index data, or null if neither magic matched
     * @throws IOException if reading the stream fails
     */
    public static InputStream asBaiStreamOrNull(final InputStream inputStream, final SAMSequenceDictionary dictionary) throws IOException {
        final BufferedInputStream bis = new BufferedInputStream(inputStream);

        bis.mark(BAI.magic.length);
        final boolean isBai = doesStreamStartWith(bis, BAI.magic);
        bis.reset();
        if (isBai) {
            return bis;
        }

        bis.mark(CRAI.magic.length);
        final boolean isCrai = doesStreamStartWith(bis, CRAI.magic);
        bis.reset();
        if (isCrai) {
            return CRAIIndex.openCraiFileAsBaiStream(bis, dictionary);
        }

        return null;
    }

    /**
     * Seekable variant of {@link #asBaiStreamOrNull}: sniffs the magic bytes from offset 0 and rewinds
     * with {@code seek(0)} after every probe.
     *
     * @return a stream positioned at offset 0 of the index data, or null if neither magic matched
     * @throws IOException if reading or seeking fails
     */
    public static SeekableStream asBaiSeekableStreamOrNull(final SeekableStream inputStream, final SAMSequenceDictionary dictionary) throws IOException {
        final SeekableBufferedStream bis = new SeekableBufferedStream(inputStream);

        bis.seek(0);
        if (doesStreamStartWith(bis, BAI.magic)) {
            bis.seek(0);
            return bis;
        }

        bis.seek(0);
        if (doesStreamStartWith(bis, CRAI.magic)) {
            bis.seek(0);
            return CRAIIndex.openCraiFileAsBaiStream(bis, dictionary);
        }

        // Rewind with seek(0) like every other path in this method. The previous reset() had no matching
        // mark(), and InputStream.reset() throws IOException by default, which would defeat the
        // "or null" contract for unrecognised streams.
        bis.seek(0);
        return null;
    }

    /** Returns true if the next bytes read from {@code is} equal {@code bytes}; stops at the first mismatch or end of stream. */
    private static boolean doesStreamStartWith(final InputStream is, final byte[] bytes) throws IOException {
        for (final byte b : bytes) {
            if (is.read() != (0xFF & b)) {
                return false;
            }
        }
        return true;
    }
}
htsjdk-2.0.1/src/java/htsjdk/samtools/SamInputResource.java000066400000000000000000000236571263034757100240010ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.sra.SRAAccession; import htsjdk.samtools.util.Lazy; import htsjdk.samtools.util.RuntimeIOException; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; /** * Describes a SAM-like resource, including its data (where the records are), and optionally an index. *

* A data or index source may originate from a {@link java.io.File}, {@link java.io.InputStream}, {@link URL}, or * {@link htsjdk.samtools.seekablestream.SeekableStream}; look for the appropriate overload for * {@code htsjdk.samtools.SamInputResource#of()}. * * @author mccowan */ public class SamInputResource { private final InputResource source; private InputResource index; SamInputResource(final InputResource data) { this(data, null); } SamInputResource(final InputResource source, final InputResource index) { if (source == null) throw new NullPointerException("source"); this.source = source; this.index = index; } /** The resource that is the SAM data (e.g., records) */ InputResource data() { return source; } /** * The resource that is the SAM index * * @return null, if no index is defined for this resource */ InputResource indexMaybe() { return index; } @Override public String toString() { return String.format("data=%s;index=%s", source, index); } /** Creates a {@link SamInputResource} reading from the provided resource, with no index. */ public static SamInputResource of(final File file) { return new SamInputResource(new FileInputResource(file)); } /** Creates a {@link SamInputResource} reading from the provided resource, with no index. */ public static SamInputResource of(final InputStream inputStream) { return new SamInputResource(new InputStreamInputResource(inputStream)); } /** Creates a {@link SamInputResource} reading from the provided resource, with no index. */ public static SamInputResource of(final URL url) { return new SamInputResource(new UrlInputResource(url)); } /** Creates a {@link SamInputResource} reading from the provided resource, with no index. 
*/ public static SamInputResource of(final SeekableStream seekableStream) { return new SamInputResource(new SeekableStreamInputResource(seekableStream)); } public static SamInputResource of(final SRAAccession acc) { return new SamInputResource(new SRAInputResource(acc)); } /** Creates a {@link SamInputResource} from a string specifying *either* a url or a file path */ public static SamInputResource of(final String string) { try { URL url = new URL(string); // this will throw if its not a url return of(url); } catch (MalformedURLException e) { // ignore } return of(new File(string)); } /** Updates the index to point at the provided resource, then returns itself. */ public SamInputResource index(final File file) { this.index = new FileInputResource(file); return this; } /** Updates the index to point at the provided resource, then returns itself. */ public SamInputResource index(final InputStream inputStream) { this.index = new InputStreamInputResource(inputStream); return this; } /** Updates the index to point at the provided resource, then returns itself. */ public SamInputResource index(final URL url) { this.index = new UrlInputResource(url); return this; } /** Updates the index to point at the provided resource, then returns itself. */ public SamInputResource index(final SeekableStream seekableStream) { this.index = new SeekableStreamInputResource(seekableStream); return this; } } /** * Describes an arbitrary input source, which is something that can be accessed as either a * {@link htsjdk.samtools.seekablestream.SeekableStream} or {@link java.io.InputStream}. A concrete implementation of this class exists for * each of {@link InputResource.Type}. */ abstract class InputResource { protected InputResource(final Type type) {this.type = type;} enum Type { FILE, URL, SEEKABLE_STREAM, INPUT_STREAM, SRA_ACCESSION } private final Type type; final Type type() { return type; } /** Returns null if this resource cannot be represented as a {@link File}. 
*/ abstract File asFile(); /** Returns null if this resource cannot be represented as a {@link URL}. */ abstract URL asUrl(); /** Returns null if this resource cannot be represented as a {@link htsjdk.samtools.seekablestream.SeekableStream}. */ abstract SeekableStream asUnbufferedSeekableStream(); /** All resource types support {@link java.io.InputStream} generation. */ abstract InputStream asUnbufferedInputStream(); /** SRA archive resource */ abstract SRAAccession asSRAAccession(); @Override public String toString() { final String childToString; switch (type()) { case FILE: childToString = asFile().toString(); break; case INPUT_STREAM: childToString = asUnbufferedInputStream().toString(); break; case SEEKABLE_STREAM: childToString = asUnbufferedSeekableStream().toString(); break; case URL: childToString = asUrl().toString(); break; case SRA_ACCESSION: childToString = asSRAAccession().toString(); break; default: throw new IllegalStateException(); } return String.format("%s:%s", type(), childToString); } } class FileInputResource extends InputResource { final File fileResource; final Lazy lazySeekableStream = new Lazy(new Lazy.LazyInitializer() { @Override public SeekableStream make() { try { return new SeekableFileStream(fileResource); } catch (final FileNotFoundException e) { throw new RuntimeIOException(e); } } }); FileInputResource(final File fileResource) { super(Type.FILE); this.fileResource = fileResource; } @Override public File asFile() { return fileResource; } @Override public URL asUrl() { return null; } @Override public SeekableStream asUnbufferedSeekableStream() { return lazySeekableStream.get(); } @Override public InputStream asUnbufferedInputStream() { return asUnbufferedSeekableStream(); } @Override public SRAAccession asSRAAccession() { return null; } } class UrlInputResource extends InputResource { final URL urlResource; final Lazy lazySeekableStream = new Lazy(new Lazy.LazyInitializer() { @Override public SeekableStream make() { try { return 
SeekableStreamFactory.getInstance().getStreamFor(urlResource); } catch (final IOException ioe) { throw new RuntimeIOException(ioe); } } }); UrlInputResource(final URL urlResource) { super(Type.URL); this.urlResource = urlResource; } @Override public File asFile() { return null; } @Override public URL asUrl() { return urlResource; } @Override public SeekableStream asUnbufferedSeekableStream() { return lazySeekableStream.get(); } @Override public InputStream asUnbufferedInputStream() { return asUnbufferedSeekableStream(); } @Override public SRAAccession asSRAAccession() { return null; } } class SeekableStreamInputResource extends InputResource { final SeekableStream seekableStreamResource; SeekableStreamInputResource(final SeekableStream seekableStreamResource) { super(Type.SEEKABLE_STREAM); this.seekableStreamResource = seekableStreamResource; } @Override File asFile() { return null; } @Override URL asUrl() { return null; } @Override SeekableStream asUnbufferedSeekableStream() { return seekableStreamResource; } @Override InputStream asUnbufferedInputStream() { return asUnbufferedSeekableStream(); } @Override public SRAAccession asSRAAccession() { return null; } } class InputStreamInputResource extends InputResource { final InputStream inputStreamResource; InputStreamInputResource(final InputStream inputStreamResource) { super(Type.INPUT_STREAM); this.inputStreamResource = inputStreamResource; } @Override File asFile() { return null; } @Override URL asUrl() { return null; } @Override SeekableStream asUnbufferedSeekableStream() { return null; } @Override InputStream asUnbufferedInputStream() { return inputStreamResource; } @Override public SRAAccession asSRAAccession() { return null; } } class SRAInputResource extends InputResource { final SRAAccession accession; SRAInputResource(final SRAAccession accession) { super(Type.SRA_ACCESSION); this.accession = accession; } @Override File asFile() { return null; } @Override URL asUrl() { return null; } @Override 
SeekableStream asUnbufferedSeekableStream() { return null; } @Override InputStream asUnbufferedInputStream() { return null; } @Override public SRAAccession asSRAAccession() { return accession; } }htsjdk-2.0.1/src/java/htsjdk/samtools/SamPairUtil.java000066400000000000000000000564601263034757100227210ustar00rootroot00000000000000/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.PeekableIterator; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Queue; /** * Utility methods for pairs of SAMRecords */ public class SamPairUtil { /** * The possible orientations of paired reads. * * F = mapped to forward strand * R = mapped to reverse strand * * FR means the read that's mapped to the forward strand comes before the * read mapped to the reverse strand when their 5'-end coordinates are * compared. 
*/ public static enum PairOrientation { FR, // ( 5' --F--> <--R-- 5' ) - aka. innie RF, // ( <--R-- 5' 5' --F--> ) - aka. outie TANDEM // ( 5' --F--> 5' --F--> or ( <--R-- 5' <--R-- 5' ) } /** * Computes the pair orientation of the given SAMRecord. * @param r * @return PairOrientation of the given SAMRecord. * @throws IllegalArgumentException If the record is not a paired read, or * one or both reads are unmapped. */ public static PairOrientation getPairOrientation(final SAMRecord r) { final boolean readIsOnReverseStrand = r.getReadNegativeStrandFlag(); if(r.getReadUnmappedFlag() || !r.getReadPairedFlag() || r.getMateUnmappedFlag()) { throw new IllegalArgumentException("Invalid SAMRecord: " + r.getReadName() + ". This method only works for SAMRecords " + "that are paired reads with both reads aligned."); } if(readIsOnReverseStrand == r.getMateNegativeStrandFlag() ) { return PairOrientation.TANDEM; } final long positiveStrandFivePrimePos = ( readIsOnReverseStrand ? r.getMateAlignmentStart() //mate's 5' position ( x---> ) : r.getAlignmentStart() ); //read's 5' position ( x---> ) final long negativeStrandFivePrimePos = ( readIsOnReverseStrand ? r.getAlignmentEnd() //read's 5' position ( <---x ) : r.getAlignmentStart() + r.getInferredInsertSize() ); //mate's 5' position ( <---x ) return ( positiveStrandFivePrimePos < negativeStrandFivePrimePos ? PairOrientation.FR : PairOrientation.RF ); } // TODO: KT and TF say this is more complicated than what I have here public static boolean isProperPair(final SAMRecord firstEnd, final SAMRecord secondEnd, final List expectedOrientations) { // are both records mapped? 
if (firstEnd.getReadUnmappedFlag() || secondEnd.getReadUnmappedFlag()) { return false; } if (firstEnd.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { return false; } // AND are they both mapped to the same chromosome if (!firstEnd.getReferenceName().equals(secondEnd.getReferenceName())) { return false; } // AND is the pair orientation in the set of expected orientations final PairOrientation actual = getPairOrientation(firstEnd); return expectedOrientations.contains(actual); } public static void assertMate(final SAMRecord firstOfPair, final SAMRecord secondOfPair) { // Validate paired reads arrive as first of pair, then second of pair if (firstOfPair == null) { throw new SAMException( "First record does not exist - cannot perform mate assertion!"); } else if (secondOfPair == null) { throw new SAMException( firstOfPair.toString() + " is missing its mate"); } else if (!firstOfPair.getReadPairedFlag()) { throw new SAMException( "First record is not marked as paired: " + firstOfPair.toString()); } else if (!secondOfPair.getReadPairedFlag()) { throw new SAMException( "Second record is not marked as paired: " + secondOfPair.toString()); } else if (!firstOfPair.getFirstOfPairFlag()) { throw new SAMException( "First record is not marked as first of pair: " + firstOfPair.toString()); } else if (!secondOfPair.getSecondOfPairFlag()) { throw new SAMException( "Second record is not marked as second of pair: " + secondOfPair.toString()); } else if (!firstOfPair.getReadName().equals(secondOfPair.getReadName())) { throw new SAMException( "First [" + firstOfPair.getReadName() + "] and Second [" + secondOfPair.getReadName() + "] readnames do not match!"); } } /** * Obtain the secondOfPair mate belonging to the firstOfPair SAMRecord * (assumed to be in the next element of the specified samRecordIterator) * @param samRecordIterator the iterator assumed to contain the secondOfPair SAMRecord in the * next element in the iteration * @param firstOfPair the firstOfPair 
SAMRecord * @return the secondOfPair SAMRecord * @throws SAMException when the secondOfPair mate cannot be obtained due to assertion failures */ public static SAMRecord obtainAssertedMate(final Iterator samRecordIterator, final SAMRecord firstOfPair) { if (samRecordIterator.hasNext()) { final SAMRecord secondOfPair = samRecordIterator.next(); assertMate(firstOfPair, secondOfPair); return secondOfPair; } else { throw new SAMException( "Second record does not exist: " + firstOfPair.getReadName()); } } /** * Compute SAMRecord insert size * @param firstEnd * @param secondEnd * @return note that when storing insert size on the secondEnd, the return value must be negated. */ public static int computeInsertSize(final SAMRecord firstEnd, final SAMRecord secondEnd) { if (firstEnd.getReadUnmappedFlag() || secondEnd.getReadUnmappedFlag()) { return 0; } if (!firstEnd.getReferenceName().equals(secondEnd.getReferenceName())) { return 0; } final int firstEnd5PrimePosition = firstEnd.getReadNegativeStrandFlag()? firstEnd.getAlignmentEnd(): firstEnd.getAlignmentStart(); final int secondEnd5PrimePosition = secondEnd.getReadNegativeStrandFlag()? secondEnd.getAlignmentEnd(): secondEnd.getAlignmentStart(); final int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1; return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment; } /** * Write the mate info for two SAMRecords * @param rec1 the first SAM record. Must have a non-null SAMFileHeader. * @param rec2 the second SAM record. Must have a non-null SAMFileHeader. * @param setMateCigar true if we are to update/create the Mate CIGAR (MC) optional tag, false if we are to clear any mate cigar tag that is present. 
*/ public static void setMateInfo(final SAMRecord rec1, final SAMRecord rec2, final boolean setMateCigar) { // If neither read is unmapped just set their mate info if (!rec1.getReadUnmappedFlag() && !rec2.getReadUnmappedFlag()) { rec1.setMateReferenceIndex(rec2.getReferenceIndex()); rec1.setMateAlignmentStart(rec2.getAlignmentStart()); rec1.setMateNegativeStrandFlag(rec2.getReadNegativeStrandFlag()); rec1.setMateUnmappedFlag(false); rec1.setAttribute(SAMTag.MQ.name(), rec2.getMappingQuality()); rec2.setMateReferenceIndex(rec1.getReferenceIndex()); rec2.setMateAlignmentStart(rec1.getAlignmentStart()); rec2.setMateNegativeStrandFlag(rec1.getReadNegativeStrandFlag()); rec2.setMateUnmappedFlag(false); rec2.setAttribute(SAMTag.MQ.name(), rec1.getMappingQuality()); if (setMateCigar) { rec1.setAttribute(SAMTag.MC.name(), rec2.getCigarString()); rec2.setAttribute(SAMTag.MC.name(), rec1.getCigarString()); } else { rec1.setAttribute(SAMTag.MC.name(), null); rec2.setAttribute(SAMTag.MC.name(), null); } } // Else if they're both unmapped set that straight else if (rec1.getReadUnmappedFlag() && rec2.getReadUnmappedFlag()) { rec1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec1.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec1.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec1.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec1.setMateNegativeStrandFlag(rec2.getReadNegativeStrandFlag()); rec1.setMateUnmappedFlag(true); rec1.setAttribute(SAMTag.MQ.name(), null); rec1.setAttribute(SAMTag.MC.name(), null); rec1.setInferredInsertSize(0); rec2.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec2.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec2.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec2.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec2.setMateNegativeStrandFlag(rec1.getReadNegativeStrandFlag()); rec2.setMateUnmappedFlag(true); rec2.setAttribute(SAMTag.MQ.name(), null); 
rec2.setAttribute(SAMTag.MC.name(), null); rec2.setInferredInsertSize(0); } // And if only one is mapped copy it's coordinate information to the mate else { final SAMRecord mapped = rec1.getReadUnmappedFlag() ? rec2 : rec1; final SAMRecord unmapped = rec1.getReadUnmappedFlag() ? rec1 : rec2; unmapped.setReferenceIndex(mapped.getReferenceIndex()); unmapped.setAlignmentStart(mapped.getAlignmentStart()); mapped.setMateReferenceIndex(unmapped.getReferenceIndex()); mapped.setMateAlignmentStart(unmapped.getAlignmentStart()); mapped.setMateNegativeStrandFlag(unmapped.getReadNegativeStrandFlag()); mapped.setMateUnmappedFlag(true); // For the mapped read, set it's mateCigar to null, since the other read must be unmapped mapped.setAttribute(SAMTag.MC.name(), null); mapped.setInferredInsertSize(0); unmapped.setMateReferenceIndex(mapped.getReferenceIndex()); unmapped.setMateAlignmentStart(mapped.getAlignmentStart()); unmapped.setMateNegativeStrandFlag(mapped.getReadNegativeStrandFlag()); unmapped.setMateUnmappedFlag(false); // For the unmapped read, set it's mateCigar to the mate's Cigar, since the mate must be mapped if (setMateCigar) unmapped.setAttribute(SAMTag.MC.name(), mapped.getCigarString()); else unmapped.setAttribute(SAMTag.MC.name(), null); unmapped.setInferredInsertSize(0); } final int insertSize = SamPairUtil.computeInsertSize(rec1, rec2); rec1.setInferredInsertSize(insertSize); rec2.setInferredInsertSize(-insertSize); } /** * Write the mate info for two SAMRecords * @param rec1 the first SAM record * @param rec2 the second SAM record * @param header the SAM file header * @param setMateCigar true if we are to update/create the Mate CIGAR (MC) optional tag, false if we are to clear any mate cigar tag that is present. */ @Deprecated public static void setMateInfo(final SAMRecord rec1, final SAMRecord rec2, final SAMFileHeader header, final boolean setMateCigar) { setMateInfo(rec1, rec2, setMateCigar); } /** * Write the mate info for two SAMRecords. 
This will always clear/remove any mate cigar tag that is present. * @param rec1 the first SAM record * @param rec2 the second SAM record * @param header the SAM file header */ public static void setMateInfo(final SAMRecord rec1, final SAMRecord rec2, final SAMFileHeader header) { setMateInfo(rec1, rec2, false); } /** * Sets mate pair information appropriately on a supplemental SAMRecord (e.g. from a split alignment) * using the primary alignment of the read's mate. * @param supplemental a supplemental alignment for the mate pair of the primary supplied * @param matePrimary the primary alignment of the the mate pair of the supplemental * @param setMateCigar true if we are to update/create the Mate CIGAR (MC) optional tag, false if we are to clear any mate cigar tag that is present. */ public static void setMateInformationOnSupplementalAlignment( final SAMRecord supplemental, final SAMRecord matePrimary, final boolean setMateCigar) { supplemental.setMateReferenceIndex(matePrimary.getReferenceIndex()); supplemental.setMateAlignmentStart(matePrimary.getAlignmentStart()); supplemental.setMateNegativeStrandFlag(matePrimary.getReadNegativeStrandFlag()); supplemental.setMateUnmappedFlag(matePrimary.getReadUnmappedFlag()); supplemental.setInferredInsertSize(-matePrimary.getInferredInsertSize()); if (setMateCigar && !matePrimary.getReadUnmappedFlag()) { supplemental.setAttribute(SAMTag.MC.name(), matePrimary.getCigarString()); } else { supplemental.setAttribute(SAMTag.MC.name(), null); } } /** * Sets mate pair information appropriately on a supplemental SAMRecord (e.g. from a split alignment) * using the primary alignment of the read's mate. 
* @param supplemental a supplemental alignment for the mate pair of the primary supplied * @param matePrimary the primary alignment of the the mate pair of the supplemental */ public static void setMateInformationOnSupplementalAlignment( final SAMRecord supplemental, final SAMRecord matePrimary) { setMateInformationOnSupplementalAlignment(supplemental, matePrimary, false); } /** * This method will clear any mate cigar already present. */ public static void setProperPairAndMateInfo(final SAMRecord rec1, final SAMRecord rec2, final SAMFileHeader header, final List exepectedOrientations) { setProperPairAndMateInfo(rec1, rec2, header, exepectedOrientations, false); } /** * @param rec1 * @param rec2 * @param header * @param exepectedOrientations * @param addMateCigar true if we are to update/create the Mate CIGAR (MC) optional tag, false if we are to clear any mate cigar tag that is present. */ public static void setProperPairAndMateInfo(final SAMRecord rec1, final SAMRecord rec2, final SAMFileHeader header, final List exepectedOrientations, final boolean addMateCigar) { setMateInfo(rec1, rec2, header, addMateCigar); setProperPairFlags(rec1, rec2, exepectedOrientations); } public static void setProperPairFlags(final SAMRecord rec1, final SAMRecord rec2, final List expectedOrientations) { final boolean properPair = (!rec1.getReadUnmappedFlag() && !rec2.getReadUnmappedFlag()) ? isProperPair(rec1, rec2, expectedOrientations) : false; rec1.setProperPairFlag(properPair); rec2.setProperPairFlag(properPair); } /** * A class to iterate through SAMRecords and set mate information on the given records, and optionally * set the mate cigar tag (true by default). 
*/ public static class SetMateInfoIterator extends PeekableIterator { private final Queue records = new LinkedList(); private final boolean setMateCigar; private final boolean ignoreMissingMates; private long numMateCigarsAdded = 0; /** * By default, the mate cigar tag is set * @param iterator the iterator to wrap */ public SetMateInfoIterator(final Iterator iterator) { this(iterator, true); } /** * @param iterator the iterator to wrap * @param setMateCigar true if we are to update/create the Mate CIGAR (MC) optional tag, false if we are to clear any mate cigar tag that is present. */ public SetMateInfoIterator(final Iterator iterator, final boolean setMateCigar) { this(iterator, setMateCigar, false); } /** * @param iterator the iterator to wrap * @param setMateCigar true if we are to update/create the Mate CIGAR (MC) optional tag, false if we are to clear any mate cigar tag that is present. * @param ignoreMissingMates set this to true if we are to ignore missing mates, otherwise an exception will be thrown when a missing mate is encountered */ public SetMateInfoIterator(final Iterator iterator, final boolean setMateCigar, final boolean ignoreMissingMates) { super(iterator); this.setMateCigar = setMateCigar; this.ignoreMissingMates = ignoreMissingMates; } /** * @return the current number of mate cigars added. This could be more than the number of records returned. */ public long getNumMateCigarsAdded() { return this.numMateCigarsAdded; } public boolean hasNext() { return (!records.isEmpty() || super.hasNext()); } /** * Populate this.records if necessary. */ private void advance() { // No need to advance if we have records remaining if (!records.isEmpty()) return; /** * Get all records with the same name, and then identify the canonical first and second end to which we * want to set mate info. 
*/ SAMRecord firstPrimaryRecord = null, secondPrimaryRecord = null; final SAMRecord first = super.peek(); // peek so we consider it in the following loop boolean containsSupplementalRecord = false; while (super.hasNext() && super.peek().getReadName().equals(first.getReadName())) { final SAMRecord record = super.next(); // We must make sure that we find only one "primary" alignments for each end if (record.getReadPairedFlag()) { if (!record.isSecondaryOrSupplementary()) { if (record.getFirstOfPairFlag()) { if (null != firstPrimaryRecord) { throw new SAMException("Found two records that are paired, not supplementary, and first of the pair"); } firstPrimaryRecord = record; } else if (record.getSecondOfPairFlag()) { if (null != secondPrimaryRecord) { throw new SAMException("Found two records that are paired, not supplementary, and second of the pair"); } secondPrimaryRecord = record; } } if (record.getSupplementaryAlignmentFlag()) containsSupplementalRecord = true; } records.add(record); } // TODO: should we check that we do not have a mix of paired and fragment reads? 
// we must find both records to update the mate info if (null != firstPrimaryRecord && null != secondPrimaryRecord) { // Update mate info SamPairUtil.setMateInfo(firstPrimaryRecord, secondPrimaryRecord, this.setMateCigar); if (this.setMateCigar) this.numMateCigarsAdded += 2; // Set mate information on supplemental records if (containsSupplementalRecord) { for (final SAMRecord record : records) { if (record.getReadPairedFlag() && record.getSupplementaryAlignmentFlag()) { if (record.getFirstOfPairFlag()) { SamPairUtil.setMateInformationOnSupplementalAlignment(record, secondPrimaryRecord, this.setMateCigar); } else { SamPairUtil.setMateInformationOnSupplementalAlignment(record, firstPrimaryRecord, this.setMateCigar); } this.numMateCigarsAdded++; } } } } else if (!this.ignoreMissingMates) { if (null != firstPrimaryRecord && firstPrimaryRecord.getReadPairedFlag()) { throw new SAMException("Missing second read of pair: " + firstPrimaryRecord.getReadName()); } else if (null != secondPrimaryRecord && secondPrimaryRecord.getReadPairedFlag()) { throw new SAMException("Missing first read of pair: " + secondPrimaryRecord.getReadName()); } } } public SAMRecord next() { advance(); if (records.isEmpty()) throw new IllegalStateException("Unexpectedly found an empty record list"); return this.records.poll(); } public SAMRecord peek() { advance(); if (records.isEmpty()) throw new IllegalStateException("Unexpectedly found an empty record list"); return this.records.peek(); } } } htsjdk-2.0.1/src/java/htsjdk/samtools/SamReader.java000066400000000000000000000614531263034757100223700ustar00rootroot00000000000000package htsjdk.samtools; import htsjdk.samtools.util.CloseableIterator; import java.io.Closeable; import java.text.MessageFormat; /** * Describes functionality for objects that produce {@link SAMRecord}s and associated information. 
* * Currently, only deprecated readers implement this directly; actual readers implement this * via {@link ReaderImplementation} and {@link PrimitiveSamReader}, which {@link SamReaderFactory} * converts into full readers by using {@link PrimitiveSamReaderToSamReaderAdapter}. * * @author mccowan */ public interface SamReader extends Iterable, Closeable { /** Describes a type of SAM file. */ public abstract class Type { /** A string representation of this type. */ abstract String name(); /** The recommended file extension for SAMs of this type, without a period. */ public abstract String fileExtension(); /** The recommended file extension for SAM indexes of this type, without a period, or null if this type is not associated with indexes. */ abstract String indexExtension(); static class TypeImpl extends Type { final String name, fileExtension, indexExtension; TypeImpl(final String name, final String fileExtension, final String indexExtension) { this.name = name; this.fileExtension = fileExtension; this.indexExtension = indexExtension; } @Override String name() { return name; } @Override public String fileExtension() { return fileExtension; } @Override String indexExtension() { return indexExtension; } @Override public String toString() { return String.format("TypeImpl{name='%s', fileExtension='%s', indexExtension='%s'}", name, fileExtension, indexExtension); } } public static Type SRA_TYPE = new TypeImpl("SRA", "sra", null); public static Type CRAM_TYPE = new TypeImpl("CRAM", "cram", "crai"); public static Type BAM_TYPE = new TypeImpl("BAM", "bam", "bai"); public static Type SAM_TYPE = new TypeImpl("SAM", "sam", null); } /** * Facet for index-related operations. */ public interface Indexing { /** * Retrieves the index for the given file type. Ensure that the index is of the specified type. * * @return An index of the given type. 
*/
        public BAMIndex getIndex();

        /**
         * Returns true if the supported index is browseable, meaning the bins in it can be traversed
         * and chunk data inspected and retrieved.
         *
         * @return True if the index supports the BrowseableBAMIndex interface.  False otherwise.
         */
        public boolean hasBrowseableIndex();

        /**
         * Gets an index tagged with the BrowseableBAMIndex interface.  Throws an exception if no such
         * index is available.
         *
         * @return An index with a browseable interface, if possible.
         * @throws SAMException if no such index is available.
         */
        public BrowseableBAMIndex getBrowseableIndex();

        /**
         * Iterate through the given chunks in the file.
         *
         * @param chunks List of chunks for which to retrieve data.
         * @return An iterator over the given chunks.
         */
        public SAMRecordIterator iterator(final SAMFileSpan chunks);

        /**
         * Gets a pointer spanning all reads in the BAM file.
         *
         * @return Unbounded pointer to the first record, in chunk format.
         */
        public SAMFileSpan getFilePointerSpanningReads();

    }

    /**
     * @return the {@link SAMFileHeader} exposed by this reader
     */
    public SAMFileHeader getFileHeader();

    /**
     * @return the {@link htsjdk.samtools.SamReader.Type} of this {@link htsjdk.samtools.SamReader}
     */
    public Type type();

    /**
     * @return a human readable description of the resource backing this sam reader
     */
    public String getResourceDescription();

    /**
     * @return true if this is a BAM file, and has an index
     */
    public boolean hasIndex();

    /**
     * Exposes the {@link SamReader.Indexing} facet of this {@link SamReader}.
     *
     * @throws java.lang.UnsupportedOperationException If {@link #hasIndex()} returns false.
     */
    public Indexing indexing();

    /**
     * Iterate through file in order.  For a SAMFileReader constructed from an InputStream, and for any SAM file,
     * a 2nd iteration starts where the 1st one left off.  For a BAM constructed from a SeekableStream or File, each new iteration
     * starts at the first record.
     *

* Only a single open iterator on a SAM or BAM file may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. */ public SAMRecordIterator iterator(); /** * Iterate over records that match the given interval. Only valid to call this if hasIndex() == true. *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. You can use a second SAMFileReader to iterate * in parallel over the same underlying file. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match the interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that * is in the query region. * * @param sequence Reference sequence of interest. * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. * @param contained If true, each SAMRecord returned will have its alignment completely contained in the * interval of interest. If false, the alignment of the returned SAMRecords need only overlap the interval of interest. * @return Iterator over the SAMRecords matching the interval. */ public SAMRecordIterator query(final String sequence, final int start, final int end, final boolean contained); /** * Iterate over records that overlap the given interval. Only valid to call this if hasIndex() == true. *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match the interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that * is in the query region. * * @param sequence Reference sequence of interest. * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. * @return Iterator over the SAMRecords overlapping the interval. */ public SAMRecordIterator queryOverlapping(final String sequence, final int start, final int end); /** * Iterate over records that are contained in the given interval. Only valid to call this if hasIndex() == true. *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match the interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that * is in the query region. * * @param sequence Reference sequence of interest. * @param start 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. * @param end 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. * @return Iterator over the SAMRecords contained in the interval. */ public SAMRecordIterator queryContained(final String sequence, final int start, final int end); /** * Iterate over records that match one of the given intervals. This may be more efficient than querying * each interval separately, because multiple reads of the same SAMRecords is avoided. *

* Only valid to call this if hasIndex() == true. *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. You can use a second SAMFileReader to iterate * in parallel over the same underlying file. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match an interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @param contained If true, each SAMRecord returned will have its alignment completely contained in one of the
     *                  intervals of interest.  If false, the alignment of the returned SAMRecords need only overlap one of
     *                  the intervals of interest.
     * @return Iterator over the SAMRecords matching the interval.
     */
    public SAMRecordIterator query(final QueryInterval[] intervals, final boolean contained);

    /**
     * Iterate over records that overlap any of the given intervals.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     *

* Only valid to call this if hasIndex() == true. *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match the interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @return Iterator over the SAMRecords overlapping any of the intervals.
     */
    public SAMRecordIterator queryOverlapping(final QueryInterval[] intervals);

    /**
     * Iterate over records that are contained in the given interval.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     *

* Only valid to call this if hasIndex() == true. *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match the interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @return Iterator over the SAMRecords contained in any of the intervals.
     */
    public SAMRecordIterator queryContained(final QueryInterval[] intervals);

    /**
     * NOTE(review): undocumented in the original. Presumably iterates over the records that have
     * no reference sequence assigned; the mate lookup in this file falls back to it when a mate
     * has no reference index. Confirm against the reader implementations.
     *
     * @return Iterator over the matching SAMRecords.
     */
    public SAMRecordIterator queryUnmapped();

    /**
     * Iterate over records that map to the given sequence and start at the given position. Only valid to call this if hasIndex() == true.
     *

* Only a single open iterator on a given SAMFileReader may be extant at any one time. If you want to start * a second iteration, the first one must be closed first. *

* Note that indexed lookup is not perfectly efficient in terms of disk I/O. I.e. some SAMRecords may be read * and then discarded because they do not match the interval of interest. *

* Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that * matches the arguments. * * @param sequence Reference sequence of interest. * @param start Alignment start of interest. * @return Iterator over the SAMRecords with the given alignment start. */ public SAMRecordIterator queryAlignmentStart(final String sequence, final int start); /** * Fetch the mate for the given read. Only valid to call this if hasIndex() == true. * This will work whether the mate has a coordinate or not, so long as the given read has correct * mate information. This method iterates over the SAM file, so there may not be an unclosed * iterator on the SAM file when this method is called. *

* Note that it is not possible to call queryMate when iterating over the SAMFileReader, because queryMate * requires its own iteration, and there cannot be two simultaneous iterations on the same SAMFileReader. The * work-around is to open a second SAMFileReader on the same input file, and call queryMate on the second * reader. * * @param rec Record for which mate is sought. Must be a paired read. * @return rec's mate, or null if it cannot be found. */ public SAMRecord queryMate(final SAMRecord rec); /** * The minimal subset of functionality needed for a {@link SAMRecord} data source. * {@link SamReader} itself is somewhat large and bulky, but the core functionality can be captured in * relatively few methods, which are included here. For documentation, see the corresponding methods * in {@link SamReader}. * * See also: {@link PrimitiveSamReaderToSamReaderAdapter}, {@link ReaderImplementation} * */ public interface PrimitiveSamReader { Type type(); boolean hasIndex(); BAMIndex getIndex(); SAMFileHeader getFileHeader(); CloseableIterator getIterator(); CloseableIterator getIterator(SAMFileSpan fileSpan); SAMFileSpan getFilePointerSpanningReads(); CloseableIterator query(QueryInterval[] intervals, boolean contained); CloseableIterator queryAlignmentStart(String sequence, int start); CloseableIterator queryUnmapped(); void close(); ValidationStringency getValidationStringency(); } /** * Decorator for a {@link SamReader.PrimitiveSamReader} that expands its functionality into a {@link SamReader}, * given the backing {@link SamInputResource}. * * Wraps the {@link Indexing} interface as well, which was originally separate from {@link SamReader} but in practice * the two are always implemented by the same class. 
*
     */
    class PrimitiveSamReaderToSamReaderAdapter implements SamReader, Indexing {
        final PrimitiveSamReader p;
        final SamInputResource resource;

        /**
         * @param p the primitive reader every call is delegated to
         * @param resource the resource backing {@code p}; used for {@link #getResourceDescription()}
         */
        public PrimitiveSamReaderToSamReaderAdapter(final PrimitiveSamReader p, final SamInputResource resource) {
            this.p = p;
            this.resource = resource;
        }

        /** @return the wrapped primitive reader */
        PrimitiveSamReader underlyingReader() {
            return p;
        }

        @Override
        public SAMRecordIterator queryOverlapping(final String sequence, final int start, final int end) {
            return query(sequence, start, end, false);
        }

        @Override
        public SAMRecordIterator queryOverlapping(final QueryInterval[] intervals) {
            return query(intervals, false);
        }

        @Override
        public SAMRecordIterator queryContained(final String sequence, final int start, final int end) {
            return query(sequence, start, end, true);
        }

        @Override
        public SAMRecordIterator queryContained(final QueryInterval[] intervals) {
            return query(intervals, true);
        }

        /**
         * Wraps the boilerplate code for querying a record's mate, which is common across many implementations.
         *
         * @param rec Record for which mate is sought.  Must be a paired read.
         * @return rec's mate, or null if it cannot be found.
         * @throws IllegalArgumentException if rec is not paired, or is flagged as both or neither end of the pair
         * @throws SAMFormatException if an unpaired record shares rec's name, or more than one candidate mate is found
         */
        @Override
        public SAMRecord queryMate(final SAMRecord rec) {
            if (!rec.getReadPairedFlag()) {
                throw new IllegalArgumentException("queryMate called for unpaired read.");
            }
            if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) {
                throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both.");
            }
            final boolean firstOfPair = rec.getFirstOfPairFlag();

            // A mate without a reference index is looked up among the unmapped records,
            // otherwise at the mate coordinates recorded on rec.
            final CloseableIterator<SAMRecord> it;
            if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
                it = queryUnmapped();
            } else {
                it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart());
            }
            try {
                SAMRecord mateRec = null;
                while (it.hasNext()) {
                    final SAMRecord next = it.next();
                    if (!next.getReadPairedFlag()) {
                        if (rec.getReadName().equals(next.getReadName())) {
                            throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName());
                        }
                        continue;
                    }
                    // Skip records that are the same end of the pair as rec.
                    if (firstOfPair) {
                        if (next.getFirstOfPairFlag()) continue;
                    } else {
                        if (next.getSecondOfPairFlag()) continue;
                    }
                    if (rec.getReadName().equals(next.getReadName())) {
                        // Keep scanning after the first hit so that duplicates are detected.
                        if (mateRec != null) {
                            throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() +
                                    " for " + (firstOfPair ? "second" : "first") + " end.");
                        }
                        mateRec = next;
                    }
                }
                return mateRec;
            } finally {
                it.close();
            }
        }

        @Override
        public boolean hasBrowseableIndex() {
            return hasIndex() && getIndex() instanceof BrowseableBAMIndex;
        }

        @Override
        public BrowseableBAMIndex getBrowseableIndex() {
            final BAMIndex index = getIndex();
            if (!(index instanceof BrowseableBAMIndex))
                throw new SAMException("Cannot return index: index created by BAM is not browseable.");
            return BrowseableBAMIndex.class.cast(index);
        }

        @Override
        public SAMRecordIterator iterator() {
            return AssertingIterator.of(p.getIterator());
        }

        @Override
        public SAMRecordIterator iterator(final SAMFileSpan chunks) {
            return AssertingIterator.of(p.getIterator(chunks));
        }

        @Override
        public void close() {
            p.close();
        }

        @Override
        public SAMFileSpan getFilePointerSpanningReads() {
            return p.getFilePointerSpanningReads();
        }

        @Override
        public SAMFileHeader getFileHeader() {
            return p.getFileHeader();
        }

        @Override
        public Type type() {
            return p.type();
        }

        @Override
        public String getResourceDescription() {
            return this.resource.toString();
        }

        @Override
        public boolean hasIndex() {
            return p.hasIndex();
        }

        /** This adapter implements {@link Indexing} itself, so the facet is the adapter. */
        @Override
        public Indexing indexing() {
            return this;
        }

        @Override
        public BAMIndex getIndex() {
            return p.getIndex();
        }

        @Override
        public SAMRecordIterator query(final QueryInterval[] intervals, final boolean contained) {
            return AssertingIterator.of(p.query(intervals, contained));
        }

        /** Translates the sequence name to its index in the file header and queries that single interval. */
        @Override
        public SAMRecordIterator query(final String sequence, final int start, final int end, final boolean contained) {
            return query(new QueryInterval[]{new QueryInterval(getFileHeader().getSequenceIndex(sequence), start, end)}, contained);
        }

        @Override
        public SAMRecordIterator queryUnmapped() {
            return AssertingIterator.of(p.queryUnmapped());
        }

        @Override
        public SAMRecordIterator queryAlignmentStart(final String sequence, final int start) {
            return AssertingIterator.of(p.queryAlignmentStart(sequence, start));
        }
    }

    /**
     * Wraps a record iterator and, once {@link #assertSorted} has installed a comparator,
     * checks on every {@link #next()} that the records arrive in that order.
     */
    static class AssertingIterator implements SAMRecordIterator {

        /** Static factory equivalent to the constructor. */
        static AssertingIterator of(final CloseableIterator<SAMRecord> iterator) {
            return new AssertingIterator(iterator);
        }

        private final CloseableIterator<SAMRecord> wrappedIterator;
        private SAMRecord previous = null;
        private SAMRecordComparator comparator = null;

        public AssertingIterator(final CloseableIterator<SAMRecord> iterator) {
            wrappedIterator = iterator;
        }

        /**
         * Enables order checking for the given sort order; a null or unsorted order disables it.
         *
         * @return this iterator
         */
        @Override
        public SAMRecordIterator assertSorted(final SAMFileHeader.SortOrder sortOrder) {
            if (sortOrder == null || sortOrder == SAMFileHeader.SortOrder.unsorted) {
                comparator = null;
                return this;
            }

            comparator = sortOrder.getComparatorInstance();
            return this;
        }

        /**
         * @return the next record from the wrapped iterator
         * @throws IllegalStateException if order checking is enabled and the record sorts before its predecessor
         */
        @Override
        public SAMRecord next() {
            final SAMRecord result = wrappedIterator.next();
            if (comparator != null) {
                if (previous != null) {
                    if (comparator.fileOrderCompare(previous, result) > 0) {
                        throw new IllegalStateException(MessageFormat.format(
                                "Records {0} ({1}:{2}) should come after {3} ({4}:{5}) when sorting with {6}",
                                previous.getReadName(),
                                previous.getReferenceName(),
                                previous.getAlignmentStart(),
                                result.getReadName(),
                                result.getReferenceName(),
                                result.getAlignmentStart(),
                                comparator.getClass().getName())
                        );
                    }
                }
                // The predecessor is only tracked while order checking is enabled.
                previous = result;
            }
            return result;
        }

        @Override
        public void close() {
            wrappedIterator.close();
        }

        @Override
        public boolean hasNext() {
            return wrappedIterator.hasNext();
        }

        @Override
        public void remove() {
            wrappedIterator.remove();
        }
    }

    /**
     * Internal interface for SAM/BAM/CRAM file reader implementations,
     * as distinct from non-file-based readers.
     *
     * Implemented as an abstract class to enforce better access control.
     *
     * TODO -- Many of these methods only apply for a subset of implementations,
     * TODO -- and either no-op or throw an exception for the others.
     * TODO -- We should consider refactoring things to avoid this;
     * TODO -- perhaps we can get away with not having this class at all.
 */
abstract class ReaderImplementation implements PrimitiveSamReader {
    // Every member below is an abstract, package-private hook with no body in this class;
    // the actual behavior is defined by each concrete reader implementation.

    /**
     * Switches the "file source" feature on or off for the given reader.
     * NOTE(review): presumably backs Option.INCLUDE_SOURCE_IN_RECORDS - confirm against the factory.
     */
    abstract void enableFileSource(final SamReader reader, final boolean enabled);

    /** Switches index caching on or off. */
    abstract void enableIndexCaching(final boolean enabled);

    /** Switches memory mapping of the index on or off. */
    abstract void enableIndexMemoryMapping(final boolean enabled);

    /**
     * Switches CRC checking on or off.
     * NOTE(review): presumably backs Option.VALIDATE_CRC_CHECKSUMS - confirm against the factory.
     */
    abstract void enableCrcChecking(final boolean enabled);

    /** Sets the {@link SAMRecordFactory} this implementation should use. */
    abstract void setSAMRecordFactory(final SAMRecordFactory factory);

    /** Sets the {@link ValidationStringency} this implementation should use. */
    abstract void setValidationStringency(final ValidationStringency validationStringency);
}
} htsjdk-2.0.1/src/java/htsjdk/samtools/SamReaderFactory.java000066400000000000000000000521221263034757100237110ustar00rootroot00000000000000package htsjdk.samtools; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.EnumSet; import java.util.zip.GZIPInputStream; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.sra.SRAAccession; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.BlockCompressedStreamConstants; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; /** *

Describes the functionality for producing {@link SamReader}, and offers a handful of static generators.

*
 *     SamReaderFactory.makeDefault().open(new File("/my/bam.bam"));
 * 
*

Example: Configure a factory

*
 *      final {@link SamReaderFactory} factory =
 *          SamReaderFactory.makeDefault()
 *              .enable({@link Option#INCLUDE_SOURCE_IN_RECORDS}, {@link Option#VALIDATE_CRC_CHECKSUMS})
 *              .validationStringency({@link ValidationStringency#SILENT});
 *
 * 
*

Example: Open two bam files from different sources, using different options

*
 *     final {@link SamReaderFactory} factory =
 *          SamReaderFactory.makeDefault()
 *              .enable({@link Option#INCLUDE_SOURCE_IN_RECORDS}, {@link Option#VALIDATE_CRC_CHECKSUMS})
 *              .validationStringency({@link ValidationStringency#SILENT});
 *
 *     // File-based bam
 *     final {@link SamReader} fileReader = factory.open(new File("/my/bam.bam"));
 *
 *     // HTTP-hosted BAM with index from an arbitrary stream
 *     final SeekableStream myBamIndexStream = ...
 *     final {@link SamInputResource} resource =
 *          {@link SamInputResource}.of(new URL("http://example.com/data.bam")).index(myBamIndexStream);
 *     final {@link SamReader} complicatedReader = factory.open(resource);
 * 
* * @author mccowan */ public abstract class SamReaderFactory { private static ValidationStringency defaultValidationStringency = ValidationStringency.DEFAULT_STRINGENCY; abstract public SamReader open(final File file); abstract public SamReader open(final SamInputResource resource); abstract public ValidationStringency validationStringency(); abstract public ReferenceSource referenceSource(); /** Set this factory's {@link htsjdk.samtools.SAMRecordFactory} to the provided one, then returns itself. */ abstract public SamReaderFactory samRecordFactory(final SAMRecordFactory samRecordFactory); /** Enables the provided {@link Option}s, then returns itself. */ abstract public SamReaderFactory enable(final Option... options); /** Disables the provided {@link Option}s, then returns itself. */ abstract public SamReaderFactory disable(final Option... options); /** Sets a specific Option to a boolean value. * */ abstract public SamReaderFactory setOption(final Option option, boolean value); /** Sets the specified reference sequence * */ abstract public SamReaderFactory referenceSequence(File referenceSequence); /** Sets the specified reference sequence * */ abstract public SamReaderFactory referenceSource(ReferenceSource referenceSequence); /** Utility method to open the file get the header and close the file */ abstract public SAMFileHeader getFileHeader(File samFile); /** Reapplies any changed options to the reader * */ abstract public void reapplyOptions(SamReader reader); /** Set this factory's {@link ValidationStringency} to the provided one, then returns itself. 
*/ abstract public SamReaderFactory validationStringency(final ValidationStringency validationStringency); private static SamReaderFactoryImpl DEFAULT = new SamReaderFactoryImpl(Option.DEFAULTS, defaultValidationStringency, DefaultSAMRecordFactory.getInstance()); public static void setDefaultValidationStringency(final ValidationStringency defaultValidationStringency) { SamReaderFactory.defaultValidationStringency = defaultValidationStringency; // The default may have changed, so reset the default SamReader DEFAULT = new SamReaderFactoryImpl(Option.DEFAULTS, defaultValidationStringency, DefaultSAMRecordFactory.getInstance()); } /** Creates a copy of the default {@link SamReaderFactory}. */ public static SamReaderFactory makeDefault() { return SamReaderFactoryImpl.copyOf(DEFAULT); } /** * Creates an "empty" factory with no enabled {@link Option}s, {@link ValidationStringency#DEFAULT_STRINGENCY}, and * {@link htsjdk.samtools.DefaultSAMRecordFactory}. */ public static SamReaderFactory make() { return new SamReaderFactoryImpl(EnumSet.noneOf(Option.class), ValidationStringency.DEFAULT_STRINGENCY, DefaultSAMRecordFactory.getInstance()); } private static class SamReaderFactoryImpl extends SamReaderFactory { private final static Log LOG = Log.getInstance(SamReaderFactory.class); private final EnumSet