pax_global_header 0000666 0000000 0000000 00000000064 12700552063 0014512 g ustar 00root root 0000000 0000000 52 comment=978e8e46cba44595ece623e5387749284a86b74b
skewer-0.2.2/ 0000775 0000000 0000000 00000000000 12700552063 0013013 5 ustar 00root root 0000000 0000000 skewer-0.2.2/LICENSE 0000664 0000000 0000000 00000002101 12700552063 0014012 0 ustar 00root root 0000000 0000000 The MIT License (MIT)
Copyright (c) 2013-2014 by Hongshan Jiang
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
skewer-0.2.2/Makefile 0000664 0000000 0000000 00000001661 12700552063 0014457 0 ustar 00root root 0000000 0000000 CXX=g++
# Build rules for skewer (GNU make).
#   make            optimised build (-O2)
#   make debug      re-runs make with DEBUG=1 (-O0 -g -Wall -D_DEBUG)
#   make clean      remove objects and the executable
#   make install    install the executable into $(DESTDIR)$(PREFIX)/bin
DEBUG ?= 0
# User-tunable compiler flags. The mandatory -c is part of the compile rule,
# so overriding CXXFLAGS from the environment or the command line cannot
# turn the compile step into a link step.
CXXFLAGS ?=
LDFLAGS = -pthread
LDLIBS = -lrt
PREFIX ?= /usr/local

ifneq ($(OS),Windows_NT)
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
# No -lrt on Darwin.
LDFLAGS = -lpthread
LDLIBS =
endif
endif

SRC = src
SOURCES = $(SRC)/main.cpp $(SRC)/parameter.cpp $(SRC)/matrix.cpp $(SRC)/fastq.cpp
OBJECTS = $(SOURCES:.cpp=.o)
EXECUTABLE = skewer

ifeq ($(strip $(DEBUG)),1)
CXXFLAGS += -O0 -g -Wall -D_DEBUG
else
CXXFLAGS += -O2
endif

.PHONY: all debug clean install

all: $(EXECUTABLE)

# Objects left over from a build in the other mode are not rebuilt
# automatically; run `make clean` first when switching.
debug:
	$(MAKE) DEBUG=1

$(EXECUTABLE): $(OBJECTS)
	$(CXX) $(LDFLAGS) $(OBJECTS) -o $@ $(LDLIBS)

%.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $@

# Header dependencies (maintained by hand).
$(SRC)/main.o: $(SRC)/parameter.h $(SRC)/matrix.h $(SRC)/fastq.h $(SRC)/common.h
$(SRC)/parameter.o: $(SRC)/parameter.h $(SRC)/fastq.h $(SRC)/common.h
$(SRC)/matrix.o: $(SRC)/matrix.h $(SRC)/common.h
$(SRC)/fastq.o: $(SRC)/fastq.h $(SRC)/common.h

# Clean
clean:
	$(RM) $(OBJECTS) $(EXECUTABLE)

# Install
install: $(EXECUTABLE)
	install -d $(DESTDIR)$(PREFIX)/bin
	install -m 0755 $(EXECUTABLE) $(DESTDIR)$(PREFIX)/bin
skewer-0.2.2/README.md 0000664 0000000 0000000 00000002370 12700552063 0014274 0 ustar 00root root 0000000 0000000 skewer
======
skewer (transferred from https://sourceforge.net/projects/skewer) implements the bit-masked k-difference matching algorithm dedicated to the task of adapter trimming and it is specially designed for processing next-generation sequencing (NGS) paired-end sequences.
### Citation
Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and accurate adapter trimmer for next-generation sequencing paired-end reads. BMC Bioinformatics, 15, 182.
### Features
* Detection and removal of adapter sequences
* Insertion and deletion allowed in pattern matching
* Targeted at Single End, Paired End (PE), and Long Mate Pair (LMP) reads
* Demultiplexing of barcoded sequencing runs
* Multi-threading support
* Trimming based on phred quality scores
* IUPAC characters for barcodes and adapters
* Compressed input and output support
### Installation from binary
Copy skewer to your favorite BIN directory, and make sure the PATH environment variable is correctly set. For example:
$ mkdir -p ~/bin
$ cp -p skewer ~/bin/
$ echo 'export PATH=~/bin:$PATH' >> ~/.bashrc
$ source ~/.bashrc
### Installation from source codes
Change into the source directory, then:
$ make
$ sudo make install
skewer-0.2.2/src/ 0000775 0000000 0000000 00000000000 12700552063 0013602 5 ustar 00root root 0000000 0000000 skewer-0.2.2/src/common.h 0000664 0000000 0000000 00000004472 12700552063 0015252 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2014 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef _COMMON_H
#define _COMMON_H
// Project-wide integer aliases.
// NOTE(review): uint32/int32 are aliases of `long`, which is 64 bits wide on LP64 platforms (64-bit Linux/macOS) — confirm nothing relies on them being exactly 32 bits.
typedef unsigned long long uint64;
typedef unsigned long uint32;
typedef unsigned short uint16;
typedef long long int64;
typedef long int32;
typedef short int16;
// NOTE(review): `#ifndef` only tests for a macro named `uint`; it does not detect a `uint` typedef supplied by system headers.
#ifndef uint
typedef unsigned int uint;
#endif
typedef unsigned char uchar;
// Compile-time limits. Presumably: longest path handled, longest adapter sequence and maximum number of adapters — verify against the users of these constants.
const int MAX_PATH = 255;
const int MAX_ADAPTER_LEN = 64;
const int MAX_ADAPTER_CNT = 96;
// Three integers attached to every RECORD (see the `idx` member in fastq.h).
// NOTE(review): presumably `pos`/`pos2` are match positions in the first/second read and `bc` is a barcode index — confirm against the code that fills them in.
typedef struct tag_INDEX{
int pos;
int pos2;
int bc;
}INDEX;
// Trimming modes.
// The two low bits select HEAD (1), TAIL (2) or ANY (3); 4, 8 and 16 are added
// for the PE, MP and AP families respectively (e.g. TRIM_PE_TAIL == 4 + 2).
// NOTE(review): TRIM_MODE_CNT is 14, which is neither the number of enumerators nor the largest value + 1 — confirm how it is used before relying on it as a bound.
enum TRIM_MODE{
TRIM_DEFAULT = 0,
TRIM_HEAD = 1,
TRIM_TAIL = 2,
TRIM_ANY = 3,
TRIM_PE = 4,
TRIM_PE_HEAD = 5,
TRIM_PE_TAIL = 6,
TRIM_PE_ANY = 7,
TRIM_MP = 8,
TRIM_MP_HEAD = 9,
TRIM_MP_TAIL = 10,
TRIM_MP_ANY = 11,
TRIM_AP = 16,
TRIM_AP_HEAD = 17,
TRIM_MODE_CNT = 14
};
#endif // _COMMON_H
skewer-0.2.2/src/fastq.cpp 0000664 0000000 0000000 00000017701 12700552063 0015432 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2014 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "fastq.h"
// Display names for enum FASTQ_FORMAT; the entries are in enumerator order
// (SANGER_FASTQ, SOLEXA_FASTQ, UNKNOWN_FASTQ, CONTRADICT_FASTQ, FASTA), so the
// table can be indexed directly with a FASTQ_FORMAT value.
const char * FASTQ_FORMAT_NAME[FASTQ_FORMAT_CNT] = {
"Sanger/Illumina 1.8+ FASTQ",
"Solexa/Illumina 1.3+/Illumina 1.5+ FASTQ",
"Unknown format",
"Contradict formats",
"FASTA"
};
////////////////////////
// Creates a reader with no stream attached and an all-zero record buffer.
cFQ::cFQ()
{
    // An all-zero RECORD leaves every LINE with s == NULL, n == 0 and a == 0,
    // i.e. "nothing allocated yet" for the later getline()/realloc() calls.
    memset(&rec, 0, sizeof(rec));
    rno = 0;
    next_pos = 0L;
    offset = 0L;
    in = NULL;
}
// Releases the four line buffers owned by the internal record.
cFQ::~cFQ()
{
    // free(NULL) is a no-op, so buffers that were never allocated need no guard.
    free(rec.id.s);
    free(rec.seq.s);
    free(rec.com.s);
    free(rec.qual.s);
}
// Attaches an already opened stream and restarts the byte offset and the
// record counter. The stream is not owned: it is never closed by cFQ.
void cFQ::associateFile(FILE * fp)
{
    rno = 0;
    offset = 0L;
    in = fp;
}
// Reads one line from `in` into l (buffer grown by getline as needed) and
// returns the stored length, which is getline()'s result: -1 on EOF/error.
inline int cFQ::read_line(LINE &l)
{
    l.n = getline(&l.s, &l.a, in);
    return l.n;
}
// readRecord
// return value:
// -1: EOF
// -2: ERROR
// 1: sucess
int cFQ::readRecord(RECORD *pRecord)
{
if(pRecord == NULL){
pRecord = &rec;
}
tag = fgetc(in);
if(tag == EOF)
return -1;
read_line(pRecord->id);
if(tag == '>') {
pRecord->qual.n = 0;
// read fasta instead
char c = fgetc(in);
pRecord->seq.n = 0;
pRecord->com.n = 0;
while (c != '>' && c != EOF) {
if (pRecord->seq.a <= size_t(pRecord->seq.n+1)) {
pRecord->seq.s=(char *)realloc(pRecord->seq.s, pRecord->seq.a=(pRecord->seq.a * 3 / 2 + 64));
}
if (!isspace(c))
pRecord->seq.s[pRecord->seq.n++]=c;
c = fgetc(in);
}
if (c != EOF) {
ungetc(c, in);
}
}
else{
read_line(pRecord->seq);
read_line(pRecord->com);
read_line(pRecord->qual);
}
rno++;
offset += 1 + pRecord->id.n + pRecord->seq.n + pRecord->com.n + pRecord->qual.n;
if(tag == '>'){
if(pRecord->seq.n <= 0){
fprintf(stderr, "Malformed fasta record %d: empty sequence\n", rno);
return -2;
}
return 1;
}
if (tag != '@' || pRecord->com.s[0] != '+' || pRecord->seq.n != pRecord->qual.n) {
const char *errtyp = (pRecord->seq.n != pRecord->qual.n) ? "length mismatch" :
pRecord->id.s[0] != '@' ? "no '@' for id" : "no '+' for comment";
fprintf(stderr, "Malformed fastq record %d: %s\n", rno, errtyp);
return -2;
}
// win32-safe chomp
pRecord->seq.s[--pRecord->seq.n] = '\0';
if (pRecord->seq.s[pRecord->seq.n-1] == '\r') {
pRecord->seq.s[--pRecord->seq.n] = '\0';
}
pRecord->qual.s[--pRecord->qual.n] = '\0';
if (pRecord->qual.s[pRecord->qual.n-1] == '\r') {
pRecord->qual.s[--pRecord->qual.n] = '\0';
}
return 1;
}
///////////////////
// subroutines
// Returns the text after the last '.' in f, or "" when f contains no '.'.
// The whole string is searched, so a '.' in a directory part counts too.
const char *fext(const char *f) {
    const char *dot = strrchr(f, '.');
    if (dot == NULL) {
        return "";
    }
    return dot + 1;
}
///////////////////////
// external functions
// Appends src to the NUL-terminated buffer dst as ONE single-quoted shell word.
// An embedded single quote is written as '\'' so the name cannot terminate the
// quoting. dst must have room for 4 * strlen(src) + 3 additional bytes.
static void appendShellQuoted(char * dst, const char * src)
{
    char * p = dst + strlen(dst);
    *p++ = '\'';
    for(; *src != '\0'; src++){
        if(*src == '\''){
            memcpy(p, "'\\''", 4);
            p += 4;
        }
        else{
            *p++ = *src;
        }
    }
    *p++ = '\'';
    *p = '\0';
}
// Opens fileName for reading or writing.
// Names ending in ".gz" or ".zip" are piped through gzip/gunzip or zip/unzip
// with popen() (bGz is then true and the stream must be closed with pclose,
// see gzclose); any other name is opened with fopen().
// For mode "r" the file is first probed with fopen so that a missing file is
// reported here instead of by the child process.
// On failure the returned CFILE has fp == NULL.
CFILE gzopen(const char * fileName, const char * mode)
{
    // maybe use zlib some day?
    CFILE cf;
    cf.fp = NULL;
    cf.bGz = false;
    const char * ext = fext(fileName);
    if(strcmp(mode, "r") == 0){
        FILE * probe = fopen(fileName, "r");
        if(probe == NULL){
            return cf;
        }
        fclose(probe);
    }
    const bool bWrite = (strchr(mode, 'w') != NULL);
    const bool bGzip = (strcmp(ext, "gz") == 0);
    const bool bZip = (strcmp(ext, "zip") == 0);
    if(bGzip || bZip){
        // worst case every character is a quote (4 bytes each) + command text
        char * cmd = (char *)malloc(4 * strlen(fileName) + 100);
        if(cmd == NULL){
            return cf;
        }
        if(bGzip){
            strcpy(cmd, bWrite ? "gzip --rsyncable > " : "gunzip -c ");
            appendShellQuoted(cmd, fileName);
        }
        else{
            strcpy(cmd, bWrite ? "zip -q " : "unzip -p ");
            appendShellQuoted(cmd, fileName);
            if(bWrite){
                strcat(cmd, " -");
            }
        }
        cf.fp = popen(cmd, mode);
        cf.bGz = true;
        free(cmd);
    }
    else{
        cf.fp = fopen(fileName, mode);
        cf.bGz = false;
    }
    return cf;
}
// Closes a stream returned by gzopen with the matching call (pclose for piped
// streams, fclose otherwise) and clears f->fp.
// Returns -1 when f or f->fp is NULL, otherwise the result of pclose/fclose.
int gzclose(CFILE *f)
{
    if(f == NULL || f->fp == NULL){
        return -1;
    }
    int status;
    if(f->bGz){
        status = pclose(f->fp);
    }
    else{
        status = fclose(f->fp);
    }
    f->fp = NULL;
    return status;
}
// Returns the (estimated) uncompressed size of fileName in bytes, or 0 when
// the file cannot be opened or measured.
// For names ending in ".gz" the size is taken from the last four bytes of the
// file (little-endian, i.e. the size modulo 2^32); when that value is smaller
// than twice the compressed size, whole multiples of 2^32 are added until it
// is not. For any other name the plain file size is returned.
int64 gzsize(const char * fileName)
{
    FILE * fp = fopen(fileName, "rb");
    if(fp == NULL)
        return 0L;
    const size_t len = strlen(fileName);
    const bool bCompressed = (len > 3) && (strcmp(fileName + len - 3, ".gz") == 0);
    int64 file_length = 0L;
    if(bCompressed){
        unsigned char buffer[4];
        // Only trust the trailer when all four bytes were actually read.
        if(fseek(fp, -4L, SEEK_END) == 0 && fread(buffer, 1, 4, fp) == 4){
            // 64-bit constant: `1L << 32` is undefined where long has 32 bits.
            const int64 wrap = int64(1) << 32;
            const int64 compress_length = ftell(fp);
            file_length = (int64(buffer[3]) << 24) | (int64(buffer[2]) << 16)
                        | (int64(buffer[1]) << 8) | int64(buffer[0]);
            if(file_length < 2 * compress_length){
                file_length += (((2 * compress_length) - file_length + wrap - 1) / wrap) * wrap;
            }
        }
    }
    else{
        if(fseek(fp, 0, SEEK_END) == 0){
            file_length = ftell(fp);
        }
    }
    fclose(fp);
    return file_length;
}
// Returns the longest sequence length among the first 100 records of
// fileName; -1 when the file cannot be opened or a record is malformed.
int gzreadlen(char * fileName)
{
    CFILE cf = gzopen(fileName, "r");
    if(cf.fp == NULL){
        fprintf(stderr, "Can not open %s for reading\n", fileName);
        return -1;
    }
    cFQ fq;
    fq.associateFile(cf.fp);
    int longest = 0;
    int remaining = 100;
    while(remaining-- > 0){
        const int status = fq.readRecord();
        if(status < -1){ // error
            longest = -1;
            break;
        }
        if(status <= 0){ // end of input
            break;
        }
        if(fq.rec.seq.n > longest){
            longest = fq.rec.seq.n;
        }
    }
    gzclose(&cf);
    return longest;
}
// gzformat
// Determines one FASTQ_FORMAT for a set of input files: a record without a
// comment line marks the file as FASTA, a quality-character test against 74
// selects SOLEXA_FASTQ, and files whose detected formats disagree yield
// CONTRADICT_FASTQ.
// NOTE(review): both `for` headers below are incomplete in this copy — the text
// between a '<' and the following '>' is missing (per-file open / record loop,
// and the per-character quality checks). Restore them from the upstream
// source before building; the comments here describe only what is visible.
enum FASTQ_FORMAT gzformat(char * fileNames[], int nFileCnt)
{
CFILE cf;
cFQ fq;
int i, j;
char *str;
char chr;
FASTQ_FORMAT format = UNKNOWN_FASTQ, format_new;
for(i=0; i 0){
if(fq.rec.com.n == 0){
format_new = FASTA;
break;
}
for(j=0, str=fq.rec.qual.s; j 74){
format_new = SOLEXA_FASTQ;
break;
}
}
// leaving the scan early means a decisive character was found
if(j < fq.rec.qual.n) break;
}
gzclose(&cf);
// first decided file fixes the format; a later, different decision is a conflict
if(format == UNKNOWN_FASTQ){
format = format_new;
}
else{
if(format_new != format && format_new != UNKNOWN_FASTQ){
format = CONTRADICT_FASTQ;
break;
}
}
}
return format;
}
// Copies at most n characters of src to dest and always NUL-terminates dest.
// dest must have room for n + 1 bytes; exactly min(strlen(src), n) + 1 bytes
// are written. With n <= 0 only the terminator is written.
void gzstrncpy (char * dest, const char * src, int n)
{
    while(n > 0 && *src != '\0'){
        *dest++ = *src++;
        n--;
    }
    *dest = '\0';
}
// Stores a monotonic time stamp in *tp.
// On __MACH__ builds the value is the time elapsed since the first call,
// derived from mach_absolute_time(); elsewhere it is
// clock_gettime(CLOCK_MONOTONIC). Only differences between two results are
// meaningful.
void versatile_gettime(struct timespec *tp)
{
#ifdef __MACH__
    static double orwl_timebase = 0.0;
    static uint64_t orwl_timestart = 0;
    if(!orwl_timestart){
        mach_timebase_info_data_t tb = { 0 };
        mach_timebase_info(&tb);
        orwl_timebase = tb.numer;
        orwl_timebase /= tb.denom;
        orwl_timestart = mach_absolute_time();
    }
    // elapsed nanoseconds since the first call
    double diff = (mach_absolute_time() - orwl_timestart) * orwl_timebase;
    tp->tv_sec = diff * ORWL_NANO;
    // remainder after the whole seconds just stored in *tp
    tp->tv_nsec = diff - (tp->tv_sec * ORWL_GIGA);
#else
    clock_gettime(CLOCK_MONOTONIC, tp);
#endif
}
skewer-0.2.2/src/fastq.h 0000664 0000000 0000000 00000007066 12700552063 0015102 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2014 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef _FASTQ_H
#define _FASTQ_H
// standard libs
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef __MACH__
#include
#define ORWL_NANO (+1.0E-9)
#define ORWL_GIGA UINT64_C(1000000000)
#endif
#include "common.h"
// One text line with its buffer bookkeeping, in the shape getline() expects.
typedef struct tag_LINE {
// heap buffer holding the line (allocated/grown by getline or realloc)
char *s;
// number of characters currently stored in s (getline's return value)
int n;
// allocated size of s in bytes
size_t a;
}LINE;
// Classification stored in RECORD::tag. The values index the TAG_NAME table
// in OutputTaggedRecord (main.cpp), so both must be kept in the same order.
typedef enum{
TAG_NORMAL = 0,
TAG_BLURRY = 1,
TAG_BADQUAL = 2,
TAG_EMPTY = 3,
TAG_SHORT = 4,
TAG_CONTAMINANT = 5,
TAG_UNDETERMINED = 6,
TAG_LONG = 7
}REC_TAG;
// One sequence record: three bit-field flags, an INDEX and the four text lines.
typedef struct tag_REC{
uint32 tag:7; // REC_TAG
// NOTE(review): meaning of bExchange and nCnt is not visible here — confirm against the trimming code
uint32 bExchange:1;
uint32 nCnt:24;
INDEX idx;
// header line without its leading '@' / '>' (that character is stored in cFQ::tag)
LINE id;
// sequence line
LINE seq;
// '+' line of a FASTQ record; n == 0 for FASTA records
LINE com;
// quality line; n == 0 for FASTA records
LINE qual;
}RECORD;
// Sequential FASTQ/FASTA record reader working on a caller-supplied FILE*.
class cFQ
{
private:
// int size;
public:
// first character of the record read last ('@' for FASTQ, '>' for FASTA)
char tag;
// built-in record filled by readRecord() when no target is passed
RECORD rec;
// RECORD *pBuffer;
// int cnt;
// input stream set by associateFile(); not owned, never closed by this class
FILE * in;
// running count of characters consumed by readRecord()
int64 offset;
// NOTE(review): only zeroed in the constructor within fastq.cpp — use not visible here
int64 next_pos;
// number of records read since associateFile()
int rno;
private:
inline int read_line(LINE &l);
public:
cFQ();
~cFQ();
// bool InitBuffer(int nBuffSize=256);
// void DestroyBuffer();
// void clearBuffer();
void associateFile(FILE * fp);
// returns 1 on success, -1 on EOF, -2 on a malformed record
int readRecord(RECORD *pRecord=NULL);
// int readRecord2Buffer();
// RECORD * getLastRecord();
inline int64 tell(){ return offset; }
};
// Stream handle returned by gzopen().
typedef struct tag_CFILE{
// the stream, or NULL when opening failed / after gzclose()
FILE * fp;
// true when fp comes from popen() and therefore has to be closed with pclose()
bool bGz;
}CFILE;
// Input formats reported by gzformat(). The values index FASTQ_FORMAT_NAME;
// FASTQ_FORMAT_CNT is the number of real entries.
enum FASTQ_FORMAT{
SANGER_FASTQ = 0,
SOLEXA_FASTQ = 1,
UNKNOWN_FASTQ = 2,
CONTRADICT_FASTQ = 3,
FASTA = 4,
FASTQ_FORMAT_CNT = 5
};
// display names indexed by FASTQ_FORMAT
extern const char * FASTQ_FORMAT_NAME[FASTQ_FORMAT_CNT];
// open a file, possibly compressed (.gz / .zip go through an external tool via popen);
// does NOT exit on failure: the returned CFILE then has fp == NULL
extern CFILE gzopen(const char * fileName, const char *mode);
// close a CFILE with the matching pclose/fclose; -1 when nothing was open
extern int gzclose(CFILE *f);
// (estimated) uncompressed size in bytes, 0 when the file cannot be opened
extern int64 gzsize(const char * fileName);
// common format of the given input files
extern enum FASTQ_FORMAT gzformat(char * fileNames[], int nFileCnt);
// longest sequence among the first 100 records, -1 on error
extern int gzreadlen(char * fileName);
// bounded string copy that always terminates dest
extern void gzstrncpy (char * dest, const char * src, int n);
// time function
extern void versatile_gettime(struct timespec *tp);
#endif // _FASTQ_H
skewer-0.2.2/src/main.cpp 0000664 0000000 0000000 00000233017 12700552063 0015240 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2016 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "common.h"
#include "fastq.h"
#include "parameter.h"
#include "matrix.h"
using namespace std;
// Writes a record with " TAG=<name>" appended to its id line, as FASTQ when
// the record has a comment line and as FASTA otherwise.
// The id is printed up to (not including) its first newline; the record
// itself is left unmodified.
inline void OutputTaggedRecord(FILE * fpOut, RECORD * pRecord)
{
    // refer to "enum REC_TAG" defined in "fastq.h"
    static const char * const TAG_NAME[8] = { "NORMAL", "BLURRY", "BADQUAL", "EMPTY", "SHORT", "CONTAMINANT", "UNDETERMINED", "LONG" };
    // tag is a 7-bit field, so it can hold values beyond the table
    const unsigned int tagIdx = pRecord->tag;
    const char * tagName = (tagIdx < 8) ? TAG_NAME[tagIdx] : "UNKNOWN";
    // strcspn instead of strtok: no hidden static state, no write into id.s,
    // and an empty id yields length 0 instead of a NULL pointer
    const int idLen = int(strcspn(pRecord->id.s, "\n"));
    if(pRecord->com.n > 0){ // fastq
        fprintf(fpOut, "@%.*s TAG=%s\n%s\n+\n%s\n", idLen, pRecord->id.s, tagName, pRecord->seq.s, pRecord->qual.s);
    }
    else{ // fasta
        fprintf(fpOut, ">%.*s TAG=%s\n%s\n", idLen, pRecord->id.s, tagName, pRecord->seq.s);
    }
}
inline void OutputMaskedRecord(FILE * fpOut, RECORD * pRecord, int offset, int len)
{
int i;
for(i=0; iseq.s[i] = tolower(pRecord->seq.s[i]);
}
for(i=offset+len; iseq.n; i++){
pRecord->seq.s[i] = tolower(pRecord->seq.s[i]);
}
if(pRecord->com.n > 0) // fastq
fprintf(fpOut, "@%s%s\n+\n%s\n", pRecord->id.s, pRecord->seq.s, pRecord->qual.s);
else // fasta
fprintf(fpOut, ">%s%s\n", pRecord->id.s, pRecord->seq.s);
}
// Writes the record unchanged: FASTQ when it has a comment line, FASTA
// otherwise. The id is printed as stored, i.e. including its line terminator.
inline void OutputEntireRecord(FILE * fpOut, RECORD * pRecord)
{
    const bool bFastq = (pRecord->com.n > 0);
    if(bFastq){
        fprintf(fpOut, "@%s%s\n+\n%s\n", pRecord->id.s, pRecord->seq.s, pRecord->qual.s);
        return;
    }
    fprintf(fpOut, ">%s%s\n", pRecord->id.s, pRecord->seq.s);
}
inline void OutputEntireRecordFilledWithNs(FILE * fpOut, RECORD * pRecord, int offset, int len)
{
int len2 = pRecord->seq.n - offset - len;
string s1 = string(offset, 'N');
string s2 = string(len2, 'N');
if(pRecord->com.n > 0){ // fastq
string q1 = string(offset, '!');
string q2 = string(len2, '!');
fprintf(fpOut, "@%s%s%.*s%s\n+\n%s%.*s%s\n", pRecord->id.s, s1.c_str(), len, pRecord->seq.s + offset, s2.c_str(),
q1.c_str(), len, pRecord->qual.s + offset, q2.c_str());
}
else // fasta
fprintf(fpOut, ">%s%s%.*s%s\n", pRecord->id.s, s1.c_str(), len, pRecord->seq.s + offset, s2.c_str());
}
// Writes only the window [offset, offset + len) of the record: sequence and,
// for FASTQ, the matching slice of the quality string.
inline void OutputPartialRecord(FILE * fpOut, RECORD * pRecord, int offset, int len)
{
    const char * seqStart = pRecord->seq.s + offset;
    if(pRecord->com.n > 0){ // fastq
        const char * qualStart = pRecord->qual.s + offset;
        fprintf(fpOut, "@%s%.*s\n+\n%.*s\n", pRecord->id.s, len, seqStart, len, qualStart);
    }
    else{ // fasta
        fprintf(fpOut, ">%s%.*s\n", pRecord->id.s, len, seqStart);
    }
}
// Run-wide state: counters per outcome, the read-length histogram, all output
// file handles, and the settings copied from cParameter for the worker code.
class cStats
{
// time stamps taken by start() / end()
struct timespec tpstart, tpend;
private:
bool bPaired;
size_t minLen;
size_t maxLen;
// longest read length passed to incrementCount() so far
size_t maxReadLen;
// number of bins allocated in pHist
size_t allocLen;
// number of entries allocated in pBarcode
size_t nBarcodes;
// read-length histogram: pHist[len] counts reads of length len
long * pHist;
// per-barcode counters
long * pBarcode;
// NOTE(review): the element type of this vector is missing in this copy
vector * pBarcodeNames;
const char * pDecorate;
public:
// counters reported by printSummary()
long nBlurry;
long nBad;
long nContaminant;
long nUndetermined;
long nEmpty;
long nShort;
long nLong;
long nTrimAvail;
long nUntrimAvail;
// heap arrays of handles opened in openOutputFiles(); fpOuts/fpOuts2 hold nFiles/nFiles2 entries, the others a single entry
CFILE *fpOuts;
CFILE *fpOuts2;
CFILE *fpMasked;
CFILE *fpMasked2;
CFILE *fpExcluded;
CFILE *fpExcluded2;
CFILE fpUntrim;
CFILE fpUntrim2;
CFILE fpBarcodes;
CFILE fpMapfile;
int nFiles;
int nFiles2;
bool bBarcode;
bool bStdout;
// for multiple threads
int64 total_file_length;
cFQ * pfq;
cFQ * pfq2;
// raw FILE* shortcuts set in InitGlobalAttributes(); they alias the CFILE handles above
FILE * fpOut;
FILE * fpOut2;
FILE * fpMask;
FILE * fpMask2;
FILE * fpExcl;
FILE * fpExcl2;
FILE * fpBarcode;
int minAverageQual;
int minEndQual;
bool bFivePrimeEnd;
bool bFilterNs;
bool bFilterUndetermined;
bool bRedistribute;
bool bQuiet;
bool bMatepair;
bool bFillWithNs;
bool bCutTail;
int iCutF, iCutR;
// Minimum read length, narrowed to int.
int getMinLen(){
    return static_cast<int>(minLen);
}
// Maximum read length, narrowed to int.
int getMaxLen(){
    return static_cast<int>(maxLen);
}
public:
cStats(){
nBlurry = nBad = nContaminant = nUndetermined = nEmpty = nShort = nLong = 0L;
nTrimAvail = nUntrimAvail = 0;
bPaired = false;
pHist = NULL;
pBarcode = NULL;
bBarcode = false;
bStdout = false;
bFilterNs = false;
bFilterUndetermined = false;
bFillWithNs = false;
bRedistribute = false;
bCutTail = false;
minLen = allocLen = 0;
maxLen = INT_MAX;
nBarcodes = 0;
iCutF = iCutR = 0;
pDecorate = "";
fpOuts = fpOuts2 = NULL;
fpMasked = fpMasked2 = NULL;
fpMask = fpMask2 = NULL;
fpExcl = fpExcl2 = NULL;
fpExcluded = fpExcluded2 = NULL;
nFiles = nFiles2 = 0;
fpUntrim.fp = fpUntrim2.fp = NULL;
fpBarcodes.fp = NULL;
fpMapfile.fp = NULL;
}
// Closes every handle that is still open and frees the handle arrays and the
// counter arrays. gzclose() ignores handles whose fp is NULL.
~cStats(){
    gzclose(&fpMapfile);
    gzclose(&fpBarcodes);
    gzclose(&fpUntrim2);
    gzclose(&fpUntrim);
    // per-barcode outputs, closed from the last opened back to the first
    if(fpOuts2 != NULL){
        while(nFiles2 > 0){
            gzclose(&fpOuts2[--nFiles2]);
        }
        free(fpOuts2);
        fpOuts2 = NULL;
        nFiles2 = 0;
    }
    if(fpOuts != NULL){
        while(nFiles > 0){
            gzclose(&fpOuts[--nFiles]);
        }
        free(fpOuts);
        fpOuts = NULL;
        nFiles = 0;
    }
    // single-entry arrays
    CFILE ** singles[4] = { &fpMasked, &fpMasked2, &fpExcluded, &fpExcluded2 };
    for(int k = 0; k < 4; k++){
        CFILE * & slot = *singles[k];
        if(slot != NULL){
            gzclose(&slot[0]);
            free(slot);
            slot = NULL;
        }
    }
    if(pBarcode != NULL){
        delete [] pBarcode;
        pBarcode = NULL;
        nBarcodes = 0;
    }
    if(pHist != NULL){
        delete [] pHist;
        pHist = NULL;
        allocLen = 0;
    }
}
// Allocates the read-length histogram (50 zeroed bins to start with) and, when
// barcodes are in use, one zeroed counter per output file. Also copies
// minLen, iCutF and iCutR from the parameters. Returns false if an
// allocation yields NULL.
bool initHist(cParameter * pParameter){
    const size_t initialBins = 50;
    minLen = pParameter->minLen;
    maxReadLen = 0;
    pHist = new long[initialBins];
    if(pHist == NULL){
        return false;
    }
    allocLen = initialBins;
    memset(pHist, 0, allocLen * sizeof(long));
    nBarcodes = 0;
    iCutF = pParameter->iCutF;
    iCutR = pParameter->iCutR;
    if(pParameter->bBarcode){
        pBarcode = new long[pParameter->output.size()];
        if(pBarcode == NULL){
            return false;
        }
        nBarcodes = pParameter->output.size();
        memset(pBarcode, 0, nBarcodes * sizeof(long));
        pBarcodeNames = &pParameter->barcodeNames;
        bBarcode = true;
    }
    return true;
}
// Copies the settings and stream shortcuts the worker code reads from this
// object. Call after openOutputFiles(): unless output goes to stdout, the
// first primary output handle is dereferenced here.
// Note that the parameters total_file_length and bPaired shadow the members
// of the same name.
void InitGlobalAttributes(cParameter * pParameter, int64 total_file_length, bool bPaired, cFQ * pFq, cFQ * pFq2){
    // global attributes used by threads
    this->total_file_length = total_file_length;
    pfq = pFq;
    if(pParameter->bStdout){
        fpOut = stdout;
    }
    else{
        fpOut = fpOuts[0].fp;
    }
    pDecorate = pParameter->pDecorate;
    if(bPaired){
        pfq2 = pFq2;
        fpOut2 = fpOuts2[0].fp;
        fpBarcode = fpBarcodes.fp;
    }
    // optional streams: NULL when the corresponding file was not requested
    fpMask = NULL;
    if(fpMasked != NULL) fpMask = fpMasked[0].fp;
    fpMask2 = NULL;
    if(fpMasked2 != NULL) fpMask2 = fpMasked2[0].fp;
    fpExcl = NULL;
    if(fpExcluded != NULL) fpExcl = fpExcluded[0].fp;
    fpExcl2 = NULL;
    if(fpExcluded2 != NULL) fpExcl2 = fpExcluded2[0].fp;
    // quality thresholds are stored as raw characters (base + score); 0 disables them
    minAverageQual = 0;
    if(pParameter->minAverageQual > 0){
        minAverageQual = pParameter->baseQual + pParameter->minAverageQual;
    }
    minEndQual = 0;
    if(pParameter->minEndQual > 0){
        minEndQual = pParameter->baseQual + pParameter->minEndQual;
    }
    minLen = pParameter->minLen;
    if(pParameter->maxLen > 0){
        maxLen = pParameter->maxLen;
    }
    else{
        maxLen = INT_MAX;
    }
    bFivePrimeEnd = ((pParameter->trimMode & TRIM_ANY) == TRIM_HEAD);
    bQuiet = pParameter->bQuiet || pParameter->bStdin;
    bFilterNs = pParameter->bFilterNs;
    bFilterUndetermined = pParameter->bFilterUndetermined;
    bRedistribute = pParameter->bRedistribute;
    bCutTail = pParameter->bCutTail;
    bFillWithNs = pParameter->bFillWithNs;
}
// Opens every output stream requested in pParameter: the primary outputs and
// the optional untrimmed / masked / barcode / map / excluded files.
// Nothing is opened when output goes to stdout.
// Returns false when memory runs out or when a primary, untrimmed, barcode or
// map file cannot be opened. Handles opened before the failure stay recorded
// in this object and are closed by the destructor. A masked or excluded file
// that cannot be opened is only reported; its handle stays NULL.
bool openOutputFiles(cParameter * pParameter){
    bPaired = (pParameter->nFileCnt >= 2);
    bStdout = pParameter->bStdout;
    assert(!(bStdout && (bPaired | bBarcode)));
    if(bStdout){
        return true;
    }
    fpOuts = (CFILE *)calloc(pParameter->output.size(), sizeof(CFILE));
    if(fpOuts == NULL){
        fprintf(stderr, "Can not allocate memory for file handles for writing\n");
        return false;
    }
    for(nFiles=0; nFiles<int(pParameter->output.size()); nFiles++){
        fpOuts[nFiles] = gzopen(pParameter->output[nFiles].c_str(), "w");
        if(fpOuts[nFiles].fp == NULL){
            fprintf(stderr, "Can not open %s for writing\n", pParameter->output[nFiles].c_str());
            // a primary output that is missing would later be written through a NULL FILE*
            return false;
        }
    }
    if(!pParameter->untrimmed.empty()){
        fpUntrim = gzopen(pParameter->untrimmed.c_str(), "w");
        if(fpUntrim.fp == NULL){
            fprintf(stderr, "Can not open %s for writing\n", pParameter->untrimmed.c_str());
            return false;
        }
    }
    else if(pParameter->bWriteMasked) {
        fpMasked = (CFILE *)calloc(1, sizeof(CFILE));
        if(fpMasked == NULL){
            fprintf(stderr, "Can not allocate memory for file handles for writing\n");
            return false;
        }
        fpMasked[0] = gzopen(pParameter->masked[0].c_str(), "w");
        if(fpMasked[0].fp == NULL){
            fprintf(stderr, "Can not open masked for writing\n");
        }
        if (bPaired) {
            fpMasked2 = (CFILE *)calloc(1, sizeof(CFILE));
            if(fpMasked2 == NULL){
                fprintf(stderr, "Can not allocate memory for file handles for writing\n");
                return false;
            }
            fpMasked2[0] = gzopen(pParameter->masked2[0].c_str(), "w");
            if(fpMasked2[0].fp == NULL){
                fprintf(stderr, "Can not open masked for writing\n");
            }
        }
    }
    if(bPaired){
        fpOuts2 = (CFILE *)calloc(pParameter->output2.size(), sizeof(CFILE));
        if(fpOuts2 == NULL){
            fprintf(stderr, "Can not allocate memory for file handles for writing\n");
            return false;
        }
        for(nFiles2=0; nFiles2<int(pParameter->output2.size()); nFiles2++){
            fpOuts2[nFiles2] = gzopen(pParameter->output2[nFiles2].c_str(), "w");
            if(fpOuts2[nFiles2].fp == NULL){
                fprintf(stderr, "Can not open %s for writing\n", pParameter->output2[nFiles2].c_str());
                return false;
            }
        }
        if(!pParameter->untrimmed2.empty()){
            fpUntrim2 = gzopen(pParameter->untrimmed2.c_str(), "w");
            if(fpUntrim2.fp == NULL){
                fprintf(stderr, "Can not open %s for writing\n", pParameter->untrimmed2.c_str());
                return false;
            }
        }
        if(!pParameter->barcodes.empty()){
            fpBarcodes = gzopen(pParameter->barcodes.c_str(), "w");
            if(fpBarcodes.fp == NULL){
                fprintf(stderr, "Can not open %s for writing\n", pParameter->barcodes.c_str());
                return false;
            }
        }
        if(!pParameter->mapfile.empty()){
            fpMapfile = gzopen(pParameter->mapfile.c_str(), "w");
            if(fpMapfile.fp == NULL){
                fprintf(stderr, "Can not open %s for writing\n", pParameter->mapfile.c_str());
                return false;
            }
        }
    }
    if(pParameter->bWriteExcluded) {
        fpExcluded = (CFILE *)calloc(1, sizeof(CFILE));
        if(fpExcluded == NULL){
            fprintf(stderr, "Can not allocate memory for file handles for writing\n");
            return false;
        }
        fpExcluded[0] = gzopen(pParameter->excluded[0].c_str(), "w");
        if(fpExcluded[0].fp == NULL){
            fprintf(stderr, "Can not open excluded for writing\n");
        }
        if (bPaired) {
            fpExcluded2 = (CFILE *)calloc(1, sizeof(CFILE));
            if(fpExcluded2 == NULL){
                fprintf(stderr, "Can not allocate memory for file handles for writing\n");
                return false;
            }
            fpExcluded2[0] = gzopen(pParameter->excluded2[0].c_str(), "w");
            if(fpExcluded2[0].fp == NULL){
                fprintf(stderr, "Can not open excluded for writing\n");
            }
        }
    }
    return true;
}
// Adds one read of length readLen to the histogram, growing the bin array
// (to readLen * 3 / 2 + 32 bins, new bins zeroed) when readLen does not fit,
// and tracks the longest length seen. Returns false if the new array is NULL.
bool incrementCount(size_t readLen){
    if(allocLen < readLen + 1){
        const size_t grownLen = readLen * 3 / 2 + 32;
        long * grown = new long[grownLen];
        if(grown == NULL){
            return false;
        }
        if(pHist != NULL){
            memcpy(grown, pHist, allocLen * sizeof(long));
            delete [] pHist;
        }
        // zero only the bins that did not exist before
        memset(grown + allocLen, 0, (grownLen - allocLen) * sizeof(long));
        pHist = grown;
        allocLen = grownLen;
    }
    maxReadLen = (readLen > maxReadLen) ? readLen : maxReadLen;
    pHist[readLen]++;
    return true;
}
// Counts one read for barcode index bc; returns false (and counts nothing)
// when bc is outside the allocated counters.
bool incrementBarcode(size_t bc){
    const bool bInRange = (bc < nBarcodes);
    if(bInRange){
        pBarcode[bc]++;
    }
    return bInRange;
}
void printHist(FILE * fp, bool bLeadingRtn=true){
char buffer[100];
long sum = nTrimAvail + nUntrimAvail;
sprintf(buffer, "%ld", sum);
int width = int(strlen(buffer));
if(bLeadingRtn)
fprintf(fp, "\n");
int i;
if(bBarcode && (nTrimAvail > 0) ){
fprintf(fp, "Barcode dispatch after trimming:\n");
fprintf(fp, "category \tcount\tpercentage:\n");
for(i=0; i 100) width = 100;
int i;
int point = int(min(ratio, 1.0) * width);
for(i=0; i';
for(; itm_wday],
mon_name[loctime->tm_mon],
loctime->tm_mday, loctime->tm_hour,
loctime->tm_min, loctime->tm_sec,
1900 + loctime->tm_year);
return result;
}
// Prints the current time stamp followed by " >> message" to fp.
// flag bit 0x02 adds a newline before, bit 0x01 a newline after the text.
// Color 2 is requested when fp is stdout, -1 otherwise.
void printTime(const char * message, FILE * fp, int flag=0x1){
    const bool bLeadingNewline = (flag & 0x02) != 0;
    const bool bTrailingNewline = (flag & 0x01) != 0;
    if(bLeadingNewline){
        fprintf(fp, "\n");
    }
    fputs(timeStamp(), fp);
    int color = -1;
    if(fp == stdout){
        color = 2;
    }
    color_fprintf(color, fp, " >> %s", message);
    if(bTrailingNewline){
        fprintf(fp, "\n");
    }
}
// Records the starting time stamp used by printDiffTime().
void start(){
versatile_gettime(&tpstart);
}
// Records the ending time stamp used by printDiffTime().
void end(){
versatile_gettime(&tpend);
}
// Prints the time between start() and end() as " (<seconds>s)" with
// millisecond precision, followed by a newline unless bRnt is false.
void printDiffTime(FILE * fp, bool bRnt=true){
    const double wholeSeconds = (tpend.tv_sec - tpstart.tv_sec);
    const double nanoSeconds = (tpend.tv_nsec - tpstart.tv_nsec);
    fprintf(fp, " (%.3lfs)", wholeSeconds + nanoSeconds / 1e9);
    if(bRnt){
        fprintf(fp, "\n");
    }
}
// Prints the processing summary: the total number of reads (or read pairs),
// one line per filter category that applies, and the number of reads still
// available, split into trimmed and untrimmed.
// Percentages are relative to the total; with an empty input they print as
// 0.00 instead of dividing by zero.
void printSummary(FILE * fp){
    char buffer[100];
    long sum = nBlurry + nBad + nContaminant + nUndetermined + nEmpty + nShort + nLong + nTrimAvail + nUntrimAvail;
    // the count columns are as wide as the total
    snprintf(buffer, sizeof(buffer), "%ld", sum);
    int width = int(strlen(buffer));
    // every numerator is 0 when sum is 0, so any non-zero divisor gives 0.00
    const double total = (sum > 0) ? double(sum) : 1.0;
    const char * entity = bPaired ? "read pairs" : "reads";
    fprintf(fp, "%.*ld %s processed; of these:\n", width, sum, entity);
    if(bFilterNs)
        fprintf(fp, "%*ld (%5.2f%%) degenerative %s filtered out\n", width, nBlurry, (nBlurry * 100.0) / total, entity);
    if(nBad > 0)
        fprintf(fp, "%*ld (%5.2f%%) %s filtered out by quality control\n", width, nBad, (nBad * 100.0) / total, entity);
    if(nContaminant > 0)
        fprintf(fp, "%*ld (%5.2f%%) non-junction %s filtered out by contaminant control\n", width, nContaminant, (nContaminant * 100.0) / total, entity);
    if(bFilterUndetermined)
        fprintf(fp, "%*ld (%5.2f%%) undetermined %s filtered out by contaminant control\n", width, nUndetermined, (nUndetermined * 100.0) / total, entity);
    if(minLen > 0){
        if(minLen > 1)
            fprintf(fp, "%*ld (%5.2f%%) short %s filtered out after trimming by size control\n", width, nShort, (nShort * 100.0) / total, entity);
        fprintf(fp, "%*ld (%5.2f%%) empty %s filtered out after trimming by size control\n", width, nEmpty, (nEmpty * 100.0) / total, entity);
    }
    if(nLong > 0)
        fprintf(fp, "%*ld (%5.2f%%) long %s filtered out after trimming by size control\n", width, nLong, (nLong * 100.0) / total, entity);
    long nAvailSum = nTrimAvail + nUntrimAvail;
    fprintf(fp, "%*ld (%5.2f%%) %s available", width, nAvailSum, (nAvailSum * 100.0) / total, entity);
    if(nAvailSum > 0){
        fprintf(fp, "; of these:\n");
        if(nTrimAvail > 0)
            fprintf(fp, "%*ld (%5.2f%%) %s %s available after processing\n", width, nTrimAvail, (nTrimAvail * 100.0) / nAvailSum, pDecorate, entity);
        if(nUntrimAvail > 0)
            fprintf(fp, "%*ld (%5.2f%%) un%s %s available after processing\n", width, nUntrimAvail, (nUntrimAvail * 100.0) / nAvailSum, pDecorate, entity);
    }
    else{
        fprintf(fp, ".\n");
    }
}
// Write a tab-separated sample mapping table to fpMapfile.fp.
// Each data row holds a sample id, a barcode, the forward and reverse primers
// and a fixed "NA" description.
// Returns true on every visible path, including when no map file is open.
bool writeMapFile(cParameter * pParameter){
if(fpMapfile.fp == NULL){
return true;
}
int i, bc, bc2;
// Header row naming the five columns.
fprintf(fpMapfile.fp, "#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tReversePrimer\tDescription\n");
string sampleId, barcode, fw_primer, rv_primer;
// NOTE(review): the loop header below looks truncated in this copy. The loop
// condition, the increment and the assignments to bc / bc2 are missing, so it
// must be restored from the upstream source before building.
for(i=0; irowNames[bc] + pParameter->colNames[bc2];
// Barcode and primers are looked up by the forward (bc) and reverse (bc2) indices.
barcode = cMatrix::fw_barcodes[bc] + cMatrix::rv_barcodes[bc2];
fw_primer = cMatrix::fw_primers[bc];
rv_primer = cMatrix::rv_primers[bc2];
fprintf(fpMapfile.fp, "%s\t%s\t%s\t%s\tNA\n", sampleId.c_str(), barcode.c_str(), fw_primer.c_str(), rv_primer.c_str());
}
return true;
}
};
// Kind of work item exchanged between worker threads through the task queue.
typedef enum{
// Read the next block of records into the shared buffer and process it.
TASK_READ = 0,
// Write an already processed block of records to the output files.
TASK_WRITE = 1,
// Tells a worker loop to exit.
TASK_END = 2
}TASK_TYPE;
// One unit of work for a worker thread.
typedef struct{
// What the worker should do with this block.
TASK_TYPE type;
// Number of records in the block; filled in when a TASK_WRITE is created.
int nItemCnt;
// Distance, in records, between the start ids of consecutive blocks.
int nBlockSize;
// Id of the first record of the block; workers reduce it modulo the buffer size to find the slot.
int64 startId;
}TASK;
class cTaskManager
{
private:
deque queue;
pthread_mutex_t mutex;
pthread_mutex_t mutex_cnt;
pthread_mutex_t mutex_item;
int nItemCnt;
int nBlockSize;
int nBufferSize;
bool bFinished;
int64 nextId;
public:
bool bSingleBlock;
cTaskManager(){
pthread_mutex_init(&mutex, NULL);
pthread_mutex_init(&mutex_cnt, NULL);
pthread_mutex_init(&mutex_item, NULL);
}
~cTaskManager(){
pthread_mutex_destroy(&mutex_item);
pthread_mutex_destroy(&mutex_cnt);
pthread_mutex_destroy(&mutex);
}
void initialize(int nSize, int nBlockSize, int64 id = 0L){
TASK task;
task.type = TASK_READ;
task.nItemCnt = 0;
task.nBlockSize = nBlockSize;
task.startId = id;
nextId = id;
nBufferSize = nSize;
this->nBlockSize = nBlockSize;
bSingleBlock = (nSize == nBlockSize);
nItemCnt = 0;
bFinished = false;
queue.clear();
queue.push_back(task);
}
void finish(){
bFinished = true;
}
bool IsFinished(){
return bFinished;
}
bool getTask(TASK & task){
bool bRet;
pthread_mutex_lock(&mutex);
if(queue.empty()){
bRet = false;
}
else{
bRet = true;
task = queue.front();
queue.pop_front();
}
pthread_mutex_unlock(&mutex);
return bRet;
}
void addTask(TASK & task){
pthread_mutex_lock(&mutex);
queue.push_back(task);
pthread_mutex_unlock(&mutex);
}
void insertTask(TASK & task){
pthread_mutex_lock(&mutex);
queue.push_front(task);
pthread_mutex_unlock(&mutex);
}
bool increaseCnt(){
bool bFull;
pthread_mutex_lock(&mutex_cnt);
if(nItemCnt + nBlockSize <= nBufferSize){
bFull = false;
nItemCnt += nBlockSize;
}
else{
bFull = true;
}
pthread_mutex_unlock(&mutex_cnt);
return !bFull;
}
void decreaseCnt(){
pthread_mutex_lock(&mutex_cnt);
nItemCnt -= nBlockSize;
pthread_mutex_unlock(&mutex_cnt);
}
int getItemCnt(int id, RECORD *pRecord){
int nCnt;
pthread_mutex_lock(&mutex_item);
nCnt = pRecord->nCnt;
if(nCnt <= 0){
nextId = id;
}
pthread_mutex_unlock(&mutex_item);
return nCnt;
}
bool setItemCnt(int id, RECORD *pRecord, int nCnt){
bool bDependent;
pthread_mutex_lock(&mutex_item);
if(id == nextId)
bDependent = false;
else{
bDependent = true;
pRecord->nCnt = nCnt;
}
pthread_mutex_unlock(&mutex_item);
return bDependent;
}
};
// Per-thread argument bundle handed to a worker thread.
class cData{
public:
// Index of the worker thread this bundle belongs to.
int tid;
// Statistics and run-time settings read and updated by the worker.
cStats * pStats;
// Task scheduler the worker pulls tasks from.
cTaskManager * pTaskMan;
// Start of the record buffer the worker reads into and writes from.
RECORD * pBuffer;
// Buffer size the worker uses to wrap record ids into buffer slots.
int size;
};
// Bookkeeping for the pool of worker threads.
typedef struct tag_mtaux_t{
// Number of worker threads; also the length of the two arrays below.
int n_threads;
// Array of n_threads pthread handles.
pthread_t *tid;
// Array of n_threads per-thread argument bundles.
cData *w;
} mtaux_t;
class cWork {
friend class cData;
cTaskManager taskManager;
RECORD * pBuffer;
int size;
bool bPaired;
cFQ fq;
cFQ fq2;
//int minLen;
//bool bFivePrimeEnd;
mtaux_t *mt; // for multi-threading
private:
inline bool fldEqual(char *a, char *b)
{
while (*a && *b){
if(*a != *b) return false;
if( (*a == ' ') || (*a == '/') ) return true;
++a;
++b;
}
return true;
}
public:
pthread_attr_t attr;
public:
cWork(){
mt = NULL;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
pBuffer = NULL;
size = 0;
}
~cWork(){
if(mt != NULL){
if(mt->w != NULL)
delete [] mt->w;
if(mt->tid != NULL)
free(mt->tid);
free(mt);
mt = NULL;
}
DestroyBuffer();
pthread_attr_destroy(&attr);
}
bool InitBuffer(int nBuffSize, bool bPaired=false){
pBuffer = (RECORD *)calloc(nBuffSize * (1 + bPaired), sizeof(RECORD));
size = nBuffSize;
this->bPaired = bPaired;
return (pBuffer != NULL);
}
void DestroyBuffer(){
if(pBuffer == NULL) return;
RECORD *pBuf;
int i;
for(pBuf=pBuffer,i=size * (1+bPaired); i>0; i--,pBuf++){
if(pBuf->id.s != NULL)
free(pBuf->id.s);
if(pBuf->seq.s != NULL)
free(pBuf->seq.s);
if(pBuf->com.s != NULL)
free(pBuf->com.s);
if(pBuf->qual.s != NULL)
free(pBuf->qual.s);
}
free(pBuffer);
pBuffer = NULL;
size = 0;
}
bool Init(cParameter * pParameter, cStats * pStats, int64 total_file_length, FILE * fp, FILE * fp2=NULL){
mt = (mtaux_t *)calloc(1, sizeof(mtaux_t));
if(mt == NULL)
return false;
mt->n_threads = (pParameter->nThreads <= 1) ? 1: pParameter->nThreads;
mt->tid = (pthread_t *)calloc(mt->n_threads, sizeof(pthread_t));
mt->w = new cData[mt->n_threads];
if( (mt->tid == NULL) || (mt->w == NULL) )
return false;
bool bPaired = (fp2 != NULL);
fq.associateFile(fp);
if(bPaired) fq2.associateFile(fp2);
int nBasicSize = (total_file_length > 8 * 100 * 1024L * 1024L) ? 10 : ((total_file_length / 100 / 1024 / 1024) + 2);
int nBlockSize = nBasicSize * mt->n_threads;
int nSize = (mt->n_threads * 2 - 1) * nBlockSize; // 1, 3, 5, 7, ..., 31
if(!InitBuffer(nSize, bPaired))
return false;
pStats->InitGlobalAttributes(pParameter, total_file_length, bPaired, &fq, &fq2);
int i;
for(i=0; in_threads; i++){
mt->w[i].tid = i;
mt->w[i].pStats = pStats;
mt->w[i].pTaskMan = &taskManager;
mt->w[i].pBuffer = pBuffer;
mt->w[i].size = size;
}
cMatrix::InitParameters(pParameter->trimMode, pParameter->epsilon, pParameter->delta, pParameter->baseQual, pParameter->bShareAdapter);
cMatrix::iMinOverlap = pParameter->minK;
vector *pAdapters;
TRIM_MODE trimMode = ((pParameter->trimMode & TRIM_ANY) == TRIM_DEFAULT) ? TRIM_TAIL : TRIM_MODE(pParameter->trimMode & TRIM_ANY);
pAdapters = &pParameter->adapters;
for(i=0; isize()); i++){
cMatrix::AddAdapter(cMatrix::firstAdapters, (char *)(*pAdapters)[i].c_str(), (*pAdapters)[i].length(), trimMode);
}
if(!pParameter->bShareAdapter){
pAdapters = &pParameter->adapters2;
for(i=0; isize()); i++){
cMatrix::AddAdapter(cMatrix::secondAdapters,(char *)(*pAdapters)[i].c_str(), (*pAdapters)[i].length(), trimMode);
}
}
cMatrix::CalculateIndices(pParameter->bMatrix, pParameter->rowNames.size(), pParameter->colNames.size());
if(bPaired){
if( (pParameter->trimMode & TRIM_MP) != 0 ){
pAdapters = &pParameter->juncAdapters;
for(i=0; isize()); i++){
cMatrix::AddAdapter(cMatrix::junctionAdapters,(char *)(*pAdapters)[i].c_str(), (*pAdapters)[i].length(), TRIM_ANY);
}
cMatrix::CalculateJunctionLengths();
}
if( (pParameter->trimMode & TRIM_AP) != 0 ){
if(pStats->fpMapfile.fp != NULL){
cMatrix::InitBarcodes(cMatrix::firstAdapters, pParameter->iCutF, (pParameter->bShareAdapter ? cMatrix::firstAdapters : cMatrix::secondAdapters), pParameter->iCutR);
}
}
}
taskManager.initialize(nSize, nBlockSize);
return true;
}
mtaux_t * getMultiThreadingPointer(){
return mt;
}
};
// Worker thread body for single-end input (pthread start-routine signature).
// `data` is the cData bundle of this thread. The thread keeps pulling tasks
// from the shared cTaskManager:
//   TASK_READ  - reserve buffer room, read a block of records, queue the next
//                read, tag and trim each record, then hand the block over for
//                writing;
//   TASK_WRITE - write consecutive processed blocks and update the counters
//                held in cStats.
// Returns NULL once the queue is empty and the manager reports it is finished.
void * mt_worker(void * data)
{
cData * pData = (cData *)data;
cTaskManager *pTaskMan = pData->pTaskMan;
cStats * pStats = pData->pStats;
// Cache the run settings in locals.
int64 file_length = pStats->total_file_length;
cFQ * pfq = pStats->pfq;
FILE *fpOut = pStats->fpOut;
FILE *fpMask = pStats->fpMask;
FILE *fpExcl = pStats->fpExcl;
int minAverageQual = pStats->minAverageQual;
int minEndQual = pStats->minEndQual;
int minLen = pStats->getMinLen();
int maxLen = pStats->getMaxLen();
bool bFivePrimeEnd = pStats->bFivePrimeEnd;
bool bBarcode = pStats->bBarcode;
bool bCutTail = pStats->bCutTail;
RECORD * pBuffer, *pRecord;
TASK task;
int size, rc, nItemCnt, nCnt;
int64 startId;
pBuffer = pData->pBuffer;
size = pData->size;
rc = 0;
int64 cur_pos;
double cur_ratio;
int pos;
while(true){
// Poll for a task; give up once the queue is empty and the input is finished.
while(!pTaskMan->getTask(task)){
if(pTaskMan->IsFinished()){
task.type = TASK_END;
break;
}
usleep(1);
}
if(task.type == TASK_END){
break;
}
startId = task.startId;
if(task.type == TASK_READ){
if(!pTaskMan->increaseCnt()){ // reach the buffer size
pTaskMan->addTask(task); // perform reading later
usleep(1);
continue;
}
// read records from input file to buffer
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment and the start of the read call are missing, so it must be
// restored from the upstream source.
for(pRecord=&pBuffer[startId % size], nItemCnt=0; nItemCntreadRecord(pRecord);
if(rc < 0){
break;
}
}
// Report progress in steps of 1/10000 of the input length.
if(!pStats->bQuiet){
cur_pos = pfq->tell();
if(cur_pos >= pfq->next_pos){
cur_ratio = int64(cur_pos * 10000 / file_length) / 10000.0;
pStats->progress(cur_ratio, 50);
pfq->next_pos = int64(((cur_ratio * 10000 + 1) * file_length + 9999)/10000);
}
}
if(rc < 0){ // error or end of file
pTaskMan->finish();
if(rc < -1) continue; // error
if(nItemCnt == 0) continue; // no record read
}
// Re-queue the read task, advanced by one block, before processing this one.
task.startId += task.nBlockSize;
pTaskMan->addTask(task); // save next task for parallelism
// process the records
for(pRecord=&pBuffer[startId % size], nCnt=0; nCnt < nItemCnt; nCnt++, pRecord++){
if( pStats->bFilterNs && cMatrix::isBlurry(pRecord->seq.s, pRecord->seq.n)){
pRecord->tag = TAG_BLURRY;
continue;
}
if( (minAverageQual > 0) && !cMatrix::checkQualities((uchar *)pRecord->qual.s, pRecord->qual.n, minAverageQual) ){
pRecord->tag = TAG_BADQUAL;
continue;
}
pRecord->tag = TAG_NORMAL;
pRecord->idx = cMatrix::findAdapter(pRecord->seq.s, pRecord->seq.n, (uchar *)pRecord->qual.s, pRecord->qual.n);
// Clamp a negative position to zero.
if(pRecord->idx.pos < 0){
pRecord->idx.pos = 0;
}
if( (minEndQual > 0) && (pRecord->idx.pos > 0) && (pRecord->qual.n > 0) ){ // not found
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->idx.pos, pRecord->qual.n), minEndQual);
}
}
// Publish the block. A false return means the caller has to queue the write itself.
pRecord = &pBuffer[startId % size];
if(!pTaskMan->setItemCnt(startId, pRecord, nItemCnt)){
task.type = TASK_WRITE;
task.startId = startId;
task.nItemCnt = nItemCnt;
if(pTaskMan->bSingleBlock)
pTaskMan->insertTask(task);
else
pTaskMan->addTask(task);
}
continue;
}
// task.type == TASK_WRITE
pRecord = &pBuffer[startId % size];
nItemCnt = task.nItemCnt;
// Keep writing consecutive blocks for as long as the next one is already processed.
do{
// write to file
pRecord->nCnt = 0; // reset
for(nCnt=0; nCnt < nItemCnt; nCnt++, pRecord++){
if(pRecord->tag == TAG_BLURRY){
pStats->nBlurry++;
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
if(pRecord->tag == TAG_BADQUAL){
pStats->nBad++;
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
// TAG_NORMAL
pos = pRecord->idx.pos;
// Size control, lower bound: classify as empty or short and exclude.
if(pos < minLen){
if(pos <= 0) {
pStats->nEmpty++;
pRecord->tag = TAG_EMPTY;
}
else {
pStats->nShort++;
pRecord->tag = TAG_SHORT;
}
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
// Size control, upper bound: exclude, or cut down to maxLen when bCutTail is set.
if(pos > maxLen){
if(!bCutTail){
pStats->nLong++;
pRecord->tag = TAG_LONG;
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
pos = maxLen;
}
// In barcode mode the output file depends on the barcode index (idx.bc - 1).
if(bBarcode){
int bc = pRecord->idx.bc - 1;
if(bc < 0){
fpOut = pStats->fpUntrim.fp;
}
else{
fpOut = pStats->fpOuts[bc].fp;
pStats->incrementBarcode(bc);
}
}
// bFivePrimeEnd passes (seq.n - pos, pos) to the output helpers; otherwise (0, pos).
if(bFivePrimeEnd){
if( (fpMask != NULL) && (pos < pRecord->seq.n) ){
OutputMaskedRecord(fpMask, pRecord, pRecord->seq.n - pos, pos);
}
if(pStats->bFillWithNs){ // for equal-read-length requirement of some applications
OutputEntireRecordFilledWithNs(fpOut, pRecord, pRecord->seq.n - pos, pos);
}
else{
OutputPartialRecord(fpOut, pRecord, pRecord->seq.n - pos, pos);
}
}
else{
if( (fpMask != NULL) && (pos < pRecord->seq.n) ){
OutputMaskedRecord(fpMask, pRecord, 0, pos);
}
if(pStats->bFillWithNs){ // for equal-read-length requirement of some applications
OutputEntireRecordFilledWithNs(fpOut, pRecord, 0, pos);
}
else{
OutputPartialRecord(fpOut, pRecord, 0, pos);
}
}
// Count the record as trimmed or untrimmed.
if(bBarcode){
if(pRecord->idx.bc == 0)
pStats->nUntrimAvail++;
else
pStats->nTrimAvail++;
}
else{
if(pos < pRecord->seq.n)
pStats->nTrimAvail++;
else
pStats->nUntrimAvail++;
}
pStats->incrementCount(size_t(pos));
}
// Release the buffer slots of the block just written and look at the next block.
pTaskMan->decreaseCnt();
startId += task.nBlockSize;
pRecord = &pBuffer[startId % size];
nItemCnt = pTaskMan->getItemCnt(startId, pRecord);
}while(nItemCnt > 0);
}
return NULL;
}
// Worker thread body for single-end input that locates adapters with
// cMatrix::findAdaptersInARead and cuts iCutF / iCutR bases.
// `data` is the cData bundle of this thread. Tasks come from the shared
// cTaskManager and are handled as follows:
//   TASK_READ  - reserve buffer room, read a block of records, queue the next
//                read, tag and trim each record, then hand the block over for
//                writing;
//   TASK_WRITE - write consecutive processed blocks and update the counters
//                held in cStats.
// Returns NULL once the queue is empty and the manager reports it is finished.
void * mt_worker_ap(void * data)
{
cData * pData = (cData *)data;
cTaskManager *pTaskMan = pData->pTaskMan;
cStats * pStats = pData->pStats;
// Cache the run settings in locals.
int64 file_length = pStats->total_file_length;
cFQ * pfq = pStats->pfq;
FILE *fpOut = pStats->fpOut;
FILE *fpMask = pStats->fpMask;
FILE *fpExcl = pStats->fpExcl;
FILE *fpBarcode = pStats->fpBarcode;
int minAverageQual = pStats->minAverageQual;
int minEndQual = pStats->minEndQual;
int minLen = pStats->getMinLen();
int maxLen = pStats->getMaxLen();
bool bBarcode = pStats->bBarcode;
int iCutF = pStats->iCutF;
int iCutR = pStats->iCutR;
bool bCutTail = pStats->bCutTail;
// Scratch buffers, zeroed only when a barcode file is open; nothing else in this function uses them.
char barcodeSeq[25];
char barcodeQua[25];
if(fpBarcode != NULL){
memset(barcodeSeq, 0, 25 * sizeof(char));
memset(barcodeQua, 0, 25 * sizeof(char));
}
// Both cuts together must fit in 24 characters (the scratch buffers hold 25).
assert( (iCutF + iCutR) <= 24 );
RECORD *pBuffer, *pRecord;
TASK task;
int size, rc, nItemCnt, nCnt;
int flag;
int64 startId;
pBuffer = pData->pBuffer;
size = pData->size;
rc = 0;
int64 cur_pos;
double cur_ratio;
int pos;
while(true){
// Poll for a task; give up once the queue is empty and the input is finished.
while(!pTaskMan->getTask(task)){
if(pTaskMan->IsFinished()){
task.type = TASK_END;
break;
}
usleep(1);
}
if(task.type == TASK_END){
break;
}
startId = task.startId;
if(task.type == TASK_READ){
if(!pTaskMan->increaseCnt()){ // reach the buffer size
pTaskMan->addTask(task); // perform reading later
usleep(1);
continue;
}
// read records from input file to buffer
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment and the start of the read call are missing, so it must be
// restored from the upstream source.
for(pRecord=&pBuffer[startId % size], nItemCnt=0; nItemCntreadRecord(pRecord);
if(rc < 0){
break;
}
}
// Report progress in steps of 1/10000 of the input length.
if(!pStats->bQuiet){
cur_pos = pfq->tell();
if(cur_pos >= pfq->next_pos){
cur_ratio = int64(cur_pos * 10000 / file_length) / 10000.0;
pStats->progress(cur_ratio, 50);
pfq->next_pos = int64(((cur_ratio * 10000 + 1) * file_length + 9999)/10000);
}
}
if(rc < 0){ // error or end of file
pTaskMan->finish();
if(rc < -1) continue; // error
if(nItemCnt == 0) continue; // no record read
}
// Re-queue the read task, advanced by one block, before processing this one.
task.startId += task.nBlockSize;
pTaskMan->addTask(task); // save next task for parallelism
// process the records
for(pRecord=&pBuffer[startId % size], nCnt=0; nCnt < nItemCnt; nCnt++, pRecord++){
if( pStats->bFilterNs && cMatrix::isBlurry(pRecord->seq.s, pRecord->seq.n)){
pRecord->tag = TAG_BLURRY;
continue;
}
if( (minAverageQual > 0) && !cMatrix::checkQualities((uchar *)pRecord->qual.s, pRecord->qual.n, minAverageQual) ){
pRecord->tag = TAG_BADQUAL;
continue;
}
pRecord->tag = TAG_NORMAL;
flag = cMatrix::findAdaptersInARead(pRecord->seq.s, pRecord->seq.n, (uchar *)pRecord->qual.s, pRecord->qual.n, pRecord->idx);
// TODO:
// flag 0 keeps seq.n - iCutF bases, flag 1 keeps seq.n - iCutR and sets bExchange.
if(flag >= 0){
pRecord->bExchange = (flag == 1);
if(flag == 0){
pRecord->idx.pos = pRecord->seq.n - iCutF;
}
else{
pRecord->idx.pos = pRecord->seq.n - iCutR;
}
if(pRecord->idx.pos < 0){
pRecord->idx.pos = 0;
}
}
if( minEndQual > 0 ){ // TODO: quality trimming from 5' end
if( (pRecord->idx.pos > 0) && (pRecord->qual.n > 0) ){
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->idx.pos, pRecord->qual.n), minEndQual);
}
}
}
// Publish the block. A false return means the caller has to queue the write itself.
pRecord = &pBuffer[startId % size];
if(!pTaskMan->setItemCnt(startId, pRecord, nItemCnt)){
task.type = TASK_WRITE;
task.startId = startId;
task.nItemCnt = nItemCnt;
if(pTaskMan->bSingleBlock)
pTaskMan->insertTask(task);
else
pTaskMan->addTask(task);
}
continue;
}
// task.type == TASK_WRITE
pRecord = &pBuffer[startId % size];
nItemCnt = task.nItemCnt;
// Keep writing consecutive blocks for as long as the next one is already processed.
do{
// write to file
pRecord->nCnt = 0; // reset
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment and the start of the TAG_BLURRY test are missing, so it must
// be restored from the upstream source.
for(nCnt=0; nCnttag == TAG_BLURRY){
pStats->nBlurry++;
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
if(pRecord->tag == TAG_BADQUAL){
pStats->nBad++;
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
// TAG_NORMAL
pos = pRecord->idx.pos;
// Size control, lower bound: classify as empty or short and exclude.
if(pos < minLen){
if(pos <= 0) {
pStats->nEmpty++;
pRecord->tag = TAG_EMPTY;
}
else{
pStats->nShort++;
pRecord->tag = TAG_SHORT;
}
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
// Size control, upper bound: exclude, or cut down to maxLen when bCutTail is set.
if(pos > maxLen){
if(!bCutTail){
pStats->nLong++;
pRecord->tag = TAG_LONG;
if(fpExcl != NULL) {
OutputTaggedRecord(fpExcl, pRecord);
}
continue;
}
pos = maxLen;
}
// Barcode mode writes the whole record to the file chosen by idx.bc.
if(bBarcode){
if(pRecord->idx.bc < 0){ // to be checked
fpOut = pStats->fpUntrim.fp;
}
else{
fpOut = pStats->fpOuts[pRecord->idx.bc].fp;
pStats->incrementBarcode(pRecord->idx.bc);
}
OutputEntireRecord(fpOut, pRecord);
}
else{
// The cut length depends on the bExchange flag set during processing.
int iCut = (pRecord->bExchange ? iCutR : iCutF);
if(fpMask != NULL){
if(pos < pRecord->seq.n - iCut)
OutputMaskedRecord(fpMask, pRecord, iCut, pos);
}
if(pStats->bFillWithNs){ // for equal-read-length requirement of some applications
OutputEntireRecordFilledWithNs(fpOut, pRecord, iCut, pos);
}
else{
OutputPartialRecord(fpOut, pRecord, iCut, pos);
}
}
if(pRecord->idx.bc < 0){ // assigned
pStats->nUntrimAvail++;
}
else{
pStats->nTrimAvail++;
}
pStats->incrementCount(size_t(pos));
}
// Release the buffer slots of the block just written and look at the next block.
pTaskMan->decreaseCnt();
startId += task.nBlockSize;
pRecord = &pBuffer[startId % size];
nItemCnt = pTaskMan->getItemCnt(startId, pRecord);
}while(nItemCnt > 0);
}
return NULL;
}
// Worker thread body for paired-end input where both mates are analysed
// together by cMatrix::findAdapterWithPE.
// Pairs are stored interleaved in the buffer: slot 2*k holds the first mate
// and slot 2*k+1 the second. The tag of a pair is read from the first mate.
// `data` is the cData bundle of this thread. Tasks come from the shared
// cTaskManager and are handled as follows:
//   TASK_READ  - reserve buffer room, read a block of pairs, queue the next
//                read, tag and trim each pair, then hand the block over for
//                writing;
//   TASK_WRITE - write consecutive processed blocks and update the counters
//                held in cStats.
// Returns NULL once the queue is empty and the manager reports it is finished.
void * mt_worker2(void * data)
{
cData * pData = (cData *)data;
cTaskManager *pTaskMan = pData->pTaskMan;
cStats * pStats = pData->pStats;
// Cache the run settings in locals.
int64 file_length = pStats->total_file_length;
cFQ * pfq = pStats->pfq;
cFQ * pfq2 = pStats->pfq2;
FILE *fpOut = pStats->fpOut;
FILE *fpOut2 = pStats->fpOut2;
FILE *fpMask = pStats->fpMask;
FILE *fpMask2 = pStats->fpMask2;
FILE *fpExcl = pStats->fpExcl;
FILE *fpExcl2 = pStats->fpExcl2;
int minAverageQual = pStats->minAverageQual;
int minEndQual = pStats->minEndQual;
int minLen = pStats->getMinLen();
int maxLen = pStats->getMaxLen();
bool bBarcode = pStats->bBarcode;
bool bCutTail = pStats->bCutTail;
RECORD *pBuffer, *pRecord, *pRecord2;
TASK task;
int size2, rc, rc2, nItemCnt, nCnt;
int64 startId;
pBuffer = pData->pBuffer;
// Two buffer entries per pair.
size2 = pData->size * 2;
rc = rc2 = 0;
int64 cur_pos;
double cur_ratio;
int pos, pos2, mLen;
int rLen, qLen;
int rLen2, qLen2;
while(true){
// Poll for a task; give up once the queue is empty and the input is finished.
while(!pTaskMan->getTask(task)){
if(pTaskMan->IsFinished()){
task.type = TASK_END;
break;
}
usleep(1);
}
if(task.type == TASK_END){
break;
}
startId = task.startId;
if(task.type == TASK_READ){
if(!pTaskMan->increaseCnt()){ // reach the buffer size
pTaskMan->addTask(task); // perform reading later
usleep(1);
continue;
}
// read records from input file to buffer
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment, the assignment of pRecord2 and the start of the first read
// call are missing, so it must be restored from the upstream source.
for(pRecord=&pBuffer[(startId << 1) % size2], nItemCnt=0; nItemCntreadRecord(pRecord);
rc2 = pfq2->readRecord(pRecord2);
if( (rc < 0) || (rc2 < 0) ){
break;
}
}
// Progress is measured over both input files together.
if(!pStats->bQuiet){
cur_pos = pfq->tell() + pfq2->tell();
if(cur_pos >= pfq->next_pos){
cur_ratio = int64(cur_pos * 10000 / file_length) / 10000.0;
pStats->progress(cur_ratio, 50);
pfq->next_pos = int64(((cur_ratio * 10000 + 1) * file_length + 9999)/10000);
}
}
if( (rc < 0) || (rc2 < 0) ){ // error or end of file
pTaskMan->finish();
if( (rc < -1) || (rc2 < -1) ) continue; // error
if(nItemCnt == 0) continue; // no record read
}
// Re-queue the read task, advanced by one block, before processing this one.
task.startId += task.nBlockSize;
pTaskMan->addTask(task); // save next task for parallelism
// process the records
for(pRecord=&pBuffer[(startId << 1) % size2], nCnt=0; nCnt < nItemCnt; nCnt++, pRecord+=2){
pRecord2 = pRecord + 1;
// A pair is rejected for quality only when both mates fail the check.
if(minAverageQual > 0){
if( !cMatrix::checkQualities((uchar *)pRecord->qual.s, pRecord->qual.n, minAverageQual) &&
!cMatrix::checkQualities((uchar *)pRecord2->qual.s, pRecord2->qual.n, minAverageQual) ){
pRecord->tag = pRecord2->tag = TAG_BADQUAL;
continue;
}
}
pRecord->tag = TAG_NORMAL;
rLen = pRecord->seq.n;
qLen = pRecord->qual.n;
rLen2 = pRecord2->seq.n;
qLen2 = pRecord2->qual.n;
if(cMatrix::findAdapterWithPE(pRecord->seq.s, pRecord2->seq.s, rLen, rLen2,
(uchar *)pRecord->qual.s, (uchar *)pRecord2->qual.s, qLen, qLen2,
pRecord->idx, pRecord2->idx)){ // trimmed
pos = pRecord->idx.pos;
pos2 = pRecord2->idx.pos;
// Only pairs that keep at least minLen on both mates are combined and checked for Ns.
if( (pos >= minLen) && (pos2 >= minLen) ){
cMatrix::combinePairSeqs(pRecord->seq.s, pRecord2->seq.s, pos, pos2,
(uchar *)pRecord->qual.s, (uchar *)pRecord2->qual.s, (int)qLen, (int)qLen2);
if(pStats->bFilterNs){
if(cMatrix::isBlurry(pRecord->seq.s, pos) && cMatrix::isBlurry(pRecord2->seq.s, pos2)){
pRecord->tag = pRecord2->tag = TAG_BLURRY;
}
}
}
}
else{
// Not trimmed: the N filter is applied to the full-length mates.
pos = rLen;
pos2 = rLen2;
if(pStats->bFilterNs){
if( cMatrix::isBlurry(pRecord->seq.s, rLen) && cMatrix::isBlurry(pRecord2->seq.s, rLen2) ){
pRecord->tag = pRecord2->tag = TAG_BLURRY;
}
}
}
if( (minEndQual > 0) && (pRecord->tag == TAG_NORMAL) ){ // trimmed by quality
if( (pRecord->qual.n > 0) && (pos > 0) )
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pos, pRecord->qual.n), minEndQual);
if( (pRecord2->qual.n > 0) && (pos2 > 0) )
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, min(pos2, pRecord2->qual.n), minEndQual);
}
}
// Publish the block. A false return means the caller has to queue the write itself.
pRecord = &pBuffer[(startId << 1) % size2];
if(!pTaskMan->setItemCnt(startId, pRecord, nItemCnt)){
task.type = TASK_WRITE;
task.startId = startId;
task.nItemCnt = nItemCnt;
if(pTaskMan->bSingleBlock)
pTaskMan->insertTask(task);
else
pTaskMan->addTask(task);
}
continue;
}
// task.type == TASK_WRITE
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = task.nItemCnt;
// Keep writing consecutive blocks for as long as the next one is already processed.
do{
// write to file
pRecord->nCnt = 0; // reset
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment, the assignment of pRecord2 and the start of the TAG_BLURRY
// test are missing, so it must be restored from the upstream source.
for(nCnt=0; nCnttag == TAG_BLURRY){
pStats->nBlurry++;
// Excluded pairs are only written when both exclusion files are open.
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pRecord->tag == TAG_BADQUAL){
pStats->nBad++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// TAG_NORMAL
// Barcode mode uses the full read lengths, otherwise the trimmed positions.
if(bBarcode){
pos = pRecord->seq.n;
pos2 = pRecord2->seq.n;
}
else{
pos = pRecord->idx.pos;
pos2 = pRecord2->idx.pos;
}
// Size control, lower bound: one failing mate excludes the pair.
if( (pos < minLen) || (pos2 < minLen) ){
if( (pos <= 0) || (pos2 <= 0) ) {
pStats->nEmpty++;
pRecord->tag = pRecord2->tag = TAG_EMPTY;
}
else {
pStats->nShort++;
pRecord->tag = pRecord2->tag = TAG_SHORT;
}
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// Size control, upper bound: exclude, or cut each mate down to maxLen when bCutTail is set.
if( (pos > maxLen) || (pos2 > maxLen) ){
if(!bCutTail){
pStats->nLong++;
pRecord->tag = pRecord2->tag = TAG_LONG;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pos > maxLen) pos = maxLen;
if(pos2 > maxLen) pos2 = maxLen;
}
// In barcode mode the output files depend on the barcode index of the first mate.
if(bBarcode){
if(pRecord->idx.bc < 0){
fpOut = pStats->fpUntrim.fp;
fpOut2 = pStats->fpUntrim2.fp;
}
else{
fpOut = pStats->fpOuts[pRecord->idx.bc].fp;
fpOut2 = pStats->fpOuts2[pRecord->idx.bc].fp;
pStats->incrementBarcode(pRecord->idx.bc);
}
}
// Masked output is written for both mates as soon as either one was shortened.
if( (fpMask != NULL) && (fpMask2 != NULL) ){
if( (pos < pRecord->seq.n) || (pos2 < pRecord2->seq.n) ){
OutputMaskedRecord(fpMask, pRecord, 0, pos);
OutputMaskedRecord(fpMask2, pRecord2, 0, pos2);
}
}
if(pStats->bFillWithNs){ // for equal-read-length requirement of some applications
OutputEntireRecordFilledWithNs(fpOut, pRecord, 0, pos);
OutputEntireRecordFilledWithNs(fpOut2, pRecord2, 0, pos2);
}
else{
OutputPartialRecord(fpOut, pRecord, 0, pos);
OutputPartialRecord(fpOut2, pRecord2, 0, pos2);
}
// Count the pair as trimmed or untrimmed.
if(bBarcode){
if(pRecord->idx.bc < 0){ // assigned
pStats->nUntrimAvail++;
}
else{
pStats->nTrimAvail++;
}
}
else{
if(pRecord->idx.pos < pRecord->seq.n || pRecord2->idx.pos < pRecord2->seq.n) // trimmed
pStats->nTrimAvail++;
else
pStats->nUntrimAvail++;
}
// The length recorded for a pair is the integer mean of both mates.
mLen = (pos + pos2) / 2;
pStats->incrementCount(size_t(mLen));
}
// Release the buffer slots of the block just written and look at the next block.
pTaskMan->decreaseCnt();
startId += task.nBlockSize;
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = pTaskMan->getItemCnt(startId, pRecord);
}while(nItemCnt > 0);
}
return NULL;
}
// Worker thread body for paired-end input where each mate is searched on its
// own: cMatrix::findAdapter for the first mate, cMatrix::findAdapter2 for the
// second.
// Pairs are stored interleaved in the buffer: slot 2*k holds the first mate
// and slot 2*k+1 the second. The tag of a pair is read from the first mate.
// `data` is the cData bundle of this thread. Tasks come from the shared
// cTaskManager and are handled as follows:
//   TASK_READ  - reserve buffer room, read a block of pairs, queue the next
//                read, tag and trim each pair, then hand the block over for
//                writing;
//   TASK_WRITE - write consecutive processed blocks and update the counters
//                held in cStats.
// Returns NULL once the queue is empty and the manager reports it is finished.
void * mt_worker2_sep(void * data)
{
cData * pData = (cData *)data;
cTaskManager *pTaskMan = pData->pTaskMan;
cStats * pStats = pData->pStats;
// Cache the run settings in locals.
int64 file_length = pStats->total_file_length;
cFQ * pfq = pStats->pfq;
cFQ * pfq2 = pStats->pfq2;
FILE *fpOut = pStats->fpOut;
FILE *fpOut2 = pStats->fpOut2;
FILE *fpMask = pStats->fpMask;
FILE *fpMask2 = pStats->fpMask2;
FILE *fpExcl = pStats->fpExcl;
FILE *fpExcl2 = pStats->fpExcl2;
int minAverageQual = pStats->minAverageQual;
int minEndQual = pStats->minEndQual;
int minLen = pStats->getMinLen();
int maxLen = pStats->getMaxLen();
bool bFivePrimeEnd = pStats->bFivePrimeEnd;
bool bBarcode = pStats->bBarcode;
bool bCutTail = pStats->bCutTail;
RECORD *pBuffer, *pRecord, *pRecord2;
TASK task;
int size2, rc, rc2, nItemCnt, nCnt;
int64 startId;
pBuffer = pData->pBuffer;
// Two buffer entries per pair.
size2 = pData->size * 2;
rc = rc2 = 0;
int64 cur_pos;
double cur_ratio;
int pos, pos2, mLen;
while(true){
// Poll for a task; give up once the queue is empty and the input is finished.
while(!pTaskMan->getTask(task)){
if(pTaskMan->IsFinished()){
task.type = TASK_END;
break;
}
usleep(1);
}
if(task.type == TASK_END){
break;
}
startId = task.startId;
if(task.type == TASK_READ){
if(!pTaskMan->increaseCnt()){ // reach the buffer size
pTaskMan->addTask(task); // perform reading later
usleep(1);
continue;
}
// read records from input file to buffer
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment, the assignment of pRecord2 and the start of the first read
// call are missing, so it must be restored from the upstream source.
for(pRecord=&pBuffer[(startId << 1) % size2], nItemCnt=0; nItemCntreadRecord(pRecord);
rc2 = pfq2->readRecord(pRecord2);
if( (rc < 0) || (rc2 < 0) ){
break;
}
}
// Progress is measured over both input files together.
if(!pStats->bQuiet){
cur_pos = pfq->tell() + pfq2->tell();
if(cur_pos >= pfq->next_pos){
cur_ratio = int64(cur_pos * 10000 / file_length) / 10000.0;
pStats->progress(cur_ratio, 50);
pfq->next_pos = int64(((cur_ratio * 10000 + 1) * file_length + 9999)/10000);
}
}
if( (rc < 0) || (rc2 < 0) ){ // error or end of file
pTaskMan->finish();
if( (rc < -1) || (rc2 < -1) ) continue; // error
if(nItemCnt == 0) continue; // no record read
}
// Re-queue the read task, advanced by one block, before processing this one.
task.startId += task.nBlockSize;
pTaskMan->addTask(task); // save next task for parallelism
// process the records
for(pRecord=&pBuffer[(startId << 1) % size2], nCnt=0; nCnt < nItemCnt; nCnt++, pRecord+=2){
pRecord2 = pRecord + 1;
// A pair is rejected by the N filter only when both mates are blurry.
if( pStats->bFilterNs &&
(cMatrix::isBlurry(pRecord->seq.s, pRecord->seq.n) &&
cMatrix::isBlurry(pRecord2->seq.s, pRecord2->seq.n)) ){
pRecord->tag = pRecord2->tag = TAG_BLURRY;
continue;
}
// Likewise, a pair is rejected for quality only when both mates fail the check.
if(minAverageQual > 0){
if( !cMatrix::checkQualities((uchar *)pRecord->qual.s, pRecord->qual.n, minAverageQual) &&
!cMatrix::checkQualities((uchar *)pRecord2->qual.s, pRecord2->qual.n, minAverageQual) ){
pRecord->tag = pRecord2->tag = TAG_BADQUAL;
continue;
}
}
pRecord->tag = TAG_NORMAL;
pRecord->idx = cMatrix::findAdapter(pRecord->seq.s, pRecord->seq.n, (uchar *)pRecord->qual.s, pRecord->qual.n);
pRecord2->idx = cMatrix::findAdapter2(pRecord2->seq.s, pRecord2->seq.n, (uchar *)pRecord2->qual.s, pRecord2->qual.n);
// Clamp negative positions to zero.
if(pRecord->idx.pos < 0){
pRecord->idx.pos = 0;
}
if(pRecord2->idx.pos < 0){
pRecord2->idx.pos = 0;
}
if( minEndQual > 0 ){
if( (pRecord->idx.pos > 0) && (pRecord->qual.n > 0) ){
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->idx.pos, pRecord->qual.n), minEndQual);
}
if( (pRecord2->idx.pos > 0) && (pRecord2->qual.n > 0) ){
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, min(pRecord2->idx.pos, pRecord2->qual.n), minEndQual);
}
}
}
// Publish the block. A false return means the caller has to queue the write itself.
pRecord = &pBuffer[(startId << 1) % size2];
if(!pTaskMan->setItemCnt(startId, pRecord, nItemCnt)){
task.type = TASK_WRITE;
task.startId = startId;
task.nItemCnt = nItemCnt;
if(pTaskMan->bSingleBlock)
pTaskMan->insertTask(task);
else
pTaskMan->addTask(task);
}
continue;
}
// task.type == TASK_WRITE
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = task.nItemCnt;
// Keep writing consecutive blocks for as long as the next one is already processed.
do{
// write to file
pRecord->nCnt = 0; // reset
// NOTE(review): the next line looks truncated in this copy. The loop condition,
// the increment, the assignment of pRecord2 and the start of the TAG_BLURRY
// test are missing, so it must be restored from the upstream source.
for(nCnt=0; nCnttag == TAG_BLURRY){
pStats->nBlurry++;
// Excluded pairs are only written when both exclusion files are open.
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pRecord->tag == TAG_BADQUAL){
pStats->nBad++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// TAG_NORMAL
// Barcode mode uses the full read lengths, otherwise the trimmed positions.
if(bBarcode){
pos = pRecord->seq.n;
pos2 = pRecord2->seq.n;
}
else{
pos = pRecord->idx.pos;
pos2 = pRecord2->idx.pos;
}
// Size control, lower bound: one failing mate excludes the pair.
if( (pos < minLen) || (pos2 < minLen) ){
if( (pos <= 0) || (pos2 <= 0) ) {
pStats->nEmpty++;
pRecord->tag = pRecord2->tag = TAG_EMPTY;
}
else {
pStats->nShort++;
pRecord->tag = pRecord2->tag = TAG_SHORT;
}
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// Size control, upper bound: exclude, or cut each mate down to maxLen when bCutTail is set.
if( (pos > maxLen) || (pos2 > maxLen) ){
if(!bCutTail){
pStats->nLong++;
pRecord->tag = pRecord2->tag = TAG_LONG;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pos > maxLen) pos = maxLen;
if(pos2 > maxLen) pos2 = maxLen;
}
// In barcode mode the two barcode indices are combined through cMatrix::indices.
if(bBarcode){
int bc = cMatrix::indices[pRecord->idx.bc][pRecord2->idx.bc];
if( bc < 0){
fpOut = pStats->fpUntrim.fp;
fpOut2 = pStats->fpUntrim2.fp;
}
else{
fpOut = pStats->fpOuts[bc].fp;
fpOut2 = pStats->fpOuts2[bc].fp;
pStats->incrementBarcode(bc);
}
}
// bFivePrimeEnd passes (seq.n - pos, pos) to the output helpers; otherwise (0, pos).
if(bFivePrimeEnd){
if( (fpMask != NULL) && (fpMask2 != NULL) ){
if( (pos < pRecord->seq.n) || (pos2 < pRecord2->seq.n) ){
OutputMaskedRecord(fpMask, pRecord, pRecord->seq.n - pos, pos);
OutputMaskedRecord(fpMask2, pRecord2, pRecord2->seq.n - pos2, pos2);
}
}
if(pStats->bFillWithNs){ // for equal-read-length requirement of some applications
OutputEntireRecordFilledWithNs(fpOut, pRecord, pRecord->seq.n - pos, pos);
OutputEntireRecordFilledWithNs(fpOut2, pRecord2, pRecord2->seq.n - pos2, pos2);
}
else{
OutputPartialRecord(fpOut, pRecord, pRecord->seq.n - pos, pos);
OutputPartialRecord(fpOut2, pRecord2, pRecord2->seq.n - pos2, pos2);
}
}
else{
if( (fpMask != NULL) && (fpMask2 != NULL) ){
if( (pos < pRecord->seq.n) || (pos2 < pRecord2->seq.n) ){
OutputMaskedRecord(fpMask, pRecord, 0, pos);
OutputMaskedRecord(fpMask2, pRecord2, 0, pos2);
}
}
if(pStats->bFillWithNs){ // for equal-read-length requirement of some applications
OutputEntireRecordFilledWithNs(fpOut, pRecord, 0, pos);
OutputEntireRecordFilledWithNs(fpOut2, pRecord2, 0, pos2);
}
else{
OutputPartialRecord(fpOut, pRecord, 0, pos);
OutputPartialRecord(fpOut2, pRecord2, 0, pos2);
}
}
// Count the pair as trimmed or untrimmed.
if(bBarcode){
if(cMatrix::indices[pRecord->idx.bc][pRecord2->idx.bc] < 0){ // assigned
pStats->nUntrimAvail++;
}
else{
pStats->nTrimAvail++;
}
}
else{
if(pRecord->idx.pos < pRecord->seq.n || pRecord2->idx.pos < pRecord2->seq.n) // trimmed
pStats->nTrimAvail++;
else
pStats->nUntrimAvail++;
}
// The length recorded for a pair is the integer mean of both mates.
mLen = (pos + pos2) / 2;
pStats->incrementCount(size_t(mLen));
}
// Release the buffer slots of the block just written and look at the next block.
pTaskMan->decreaseCnt();
startId += task.nBlockSize;
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = pTaskMan->getItemCnt(startId, pRecord);
}while(nItemCnt > 0);
}
return NULL;
}
void * mt_worker2_ap(void * data)
{
cData * pData = (cData *)data;
cTaskManager *pTaskMan = pData->pTaskMan;
cStats * pStats = pData->pStats;
int64 file_length = pStats->total_file_length;
cFQ * pfq = pStats->pfq;
cFQ * pfq2 = pStats->pfq2;
FILE *fpOut = pStats->fpOut;
FILE *fpOut2 = pStats->fpOut2;
FILE *fpMask = pStats->fpMask;
FILE *fpMask2 = pStats->fpMask2;
FILE *fpExcl = pStats->fpExcl;
FILE *fpExcl2 = pStats->fpExcl2;
FILE *fpBarcode = pStats->fpBarcode;
int minAverageQual = pStats->minAverageQual;
int minEndQual = pStats->minEndQual;
int minLen = pStats->getMinLen();
int maxLen = pStats->getMaxLen();
bool bBarcode = pStats->bBarcode;
int iCutF = pStats->iCutF;
int iCutR = pStats->iCutR;
char barcodeSeq[25];
char barcodeQua[25];
bool bCutTail = pStats->bCutTail;
if(fpBarcode != NULL){
memset(barcodeSeq, 0, 25 * sizeof(char));
memset(barcodeQua, 0, 25 * sizeof(char));
}
assert( (iCutF + iCutR) <= 24 );
RECORD *pBuffer, *pRecord, *pRecord2;
TASK task;
int size2, rc, rc2, nItemCnt, nCnt;
int flag;
int64 startId;
pBuffer = pData->pBuffer;
size2 = pData->size * 2;
rc = rc2 = 0;
int64 cur_pos;
double cur_ratio;
int pos, pos2, mLen;
while(true){
while(!pTaskMan->getTask(task)){
if(pTaskMan->IsFinished()){
task.type = TASK_END;
break;
}
usleep(1);
}
if(task.type == TASK_END){
break;
}
startId = task.startId;
if(task.type == TASK_READ){
if(!pTaskMan->increaseCnt()){ // reach the buffer size
pTaskMan->addTask(task); // perform reading later
usleep(1);
continue;
}
// read records from input file to buffer
for(pRecord=&pBuffer[(startId << 1) % size2], nItemCnt=0; nItemCntreadRecord(pRecord);
rc2 = pfq2->readRecord(pRecord2);
if( (rc < 0) || (rc2 < 0) ){
break;
}
}
if(!pStats->bQuiet){
cur_pos = pfq->tell() + pfq2->tell();
if(cur_pos >= pfq->next_pos){
cur_ratio = int64(cur_pos * 10000 / file_length) / 10000.0;
pStats->progress(cur_ratio, 50);
pfq->next_pos = int64(((cur_ratio * 10000 + 1) * file_length + 9999)/10000);
}
}
if( (rc < 0) || (rc2 < 0) ){ // error or end of file
pTaskMan->finish();
if( (rc < -1) || (rc2 < -1) ) continue; // error
if(nItemCnt == 0) continue; // no record read
}
task.startId += task.nBlockSize;
pTaskMan->addTask(task); // save next task for parallelism
// process the records
for(pRecord=&pBuffer[(startId << 1) % size2], nCnt=0; nCnt < nItemCnt; nCnt++, pRecord+=2){
pRecord2 = pRecord + 1;
if( pStats->bFilterNs &&
(cMatrix::isBlurry(pRecord->seq.s, pRecord->seq.n) &&
cMatrix::isBlurry(pRecord2->seq.s, pRecord2->seq.n)) ){
pRecord->tag = pRecord2->tag = TAG_BLURRY;
continue;
}
if(minAverageQual > 0){
if( !cMatrix::checkQualities((uchar *)pRecord->qual.s, pRecord->qual.n, minAverageQual) &&
!cMatrix::checkQualities((uchar *)pRecord2->qual.s, pRecord2->qual.n, minAverageQual) ){
pRecord->tag = pRecord2->tag = TAG_BADQUAL;
continue;
}
}
pRecord->tag = TAG_NORMAL;
flag = cMatrix::findAdaptersBidirectionally(pRecord->seq.s, pRecord->seq.n, (uchar *)pRecord->qual.s, pRecord->qual.n,
pRecord2->seq.s, pRecord2->seq.n, (uchar *)pRecord2->qual.s, pRecord2->qual.n, pRecord->idx, pRecord2->idx);
if(flag >= 0){
pRecord->bExchange = (flag == 1);
if(flag == 0){
pRecord->idx.pos = pRecord->seq.n - iCutF;
pRecord2->idx.pos = pRecord2->seq.n - iCutR;
}
else{
pRecord->idx.pos = pRecord->seq.n - iCutR;
pRecord2->idx.pos = pRecord2->seq.n - iCutF;
}
if(pRecord->idx.pos < 0){
pRecord->idx.pos = 0;
}
if(pRecord2->idx.pos < 0){
pRecord2->idx.pos = 0;
}
}
if( minEndQual > 0 ){ // TODO: quality trimming from 5' end
if( (pRecord->idx.pos > 0) && (pRecord->qual.n > 0) ){
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->idx.pos, pRecord->qual.n), minEndQual);
}
if( (pRecord2->idx.pos > 0) && (pRecord2->qual.n > 0) ){
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, min(pRecord2->idx.pos, pRecord2->qual.n), minEndQual);
}
}
}
pRecord = &pBuffer[(startId << 1) % size2];
if(!pTaskMan->setItemCnt(startId, pRecord, nItemCnt)){
task.type = TASK_WRITE;
task.startId = startId;
task.nItemCnt = nItemCnt;
if(pTaskMan->bSingleBlock)
pTaskMan->insertTask(task);
else
pTaskMan->addTask(task);
}
continue;
}
// task.type == TASK_WRITE
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = task.nItemCnt;
do{
// write to file
pRecord->nCnt = 0; // reset
for(nCnt=0; nCnttag == TAG_BLURRY){
pStats->nBlurry++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pRecord->tag == TAG_BADQUAL){
pStats->nBad++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// TAG_NORMAL
pos = pRecord->idx.pos;
pos2 = pRecord2->idx.pos;
if( (pos < minLen) || (pos2 < minLen) ){
if( (pos <= 0) || (pos2 <= 0) ) {
pStats->nEmpty++;
pRecord->tag = pRecord2->tag = TAG_EMPTY;
}
else {
pStats->nShort++;
pRecord->tag = pRecord2->tag = TAG_SHORT;
}
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if( (pos > maxLen) || (pos2 > maxLen) ){
if(!bCutTail){
pStats->nLong++;
pRecord->tag = pRecord2->tag = TAG_LONG;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pos > maxLen) pos = maxLen;
if(pos2 > maxLen) pos2 = maxLen;
}
if(bBarcode){
if(pRecord->idx.bc < 0){
fpOut = pStats->fpUntrim.fp;
fpOut2 = pStats->fpUntrim2.fp;
}
else{
if(pRecord->bExchange){
fpOut2 = pStats->fpOuts[pRecord->idx.bc].fp;
fpOut = pStats->fpOuts2[pRecord->idx.bc].fp;
}
else{
fpOut = pStats->fpOuts[pRecord->idx.bc].fp;
fpOut2 = pStats->fpOuts2[pRecord->idx.bc].fp;
}
pStats->incrementBarcode(pRecord->idx.bc);
}
OutputEntireRecord(fpOut, pRecord);
OutputEntireRecord(fpOut2, pRecord2);
}
else{
int iCut = (pRecord->bExchange ? iCutR : iCutF);
int iCut2 = (pRecord->bExchange ? iCutF : iCutR);
if( (fpMask != NULL) && (fpMask2 != NULL) ){
if( (pos < pRecord->seq.n - iCut) || (pos2 < pRecord2->seq.n - iCut2) ){
OutputMaskedRecord(fpMask, pRecord, iCut, pos);
OutputMaskedRecord(fpMask2, pRecord2, iCut2, pos2);
}
}
if(pStats->bFillWithNs){
OutputEntireRecordFilledWithNs(fpOut, pRecord, iCut, pos);
OutputEntireRecordFilledWithNs(fpOut2, pRecord2, iCut2, pos2);
}
else{
OutputPartialRecord(fpOut, pRecord, iCut, pos);
OutputPartialRecord(fpOut2, pRecord2, iCut2, pos2);
}
}
if( (fpBarcode != NULL) && (pRecord->idx.bc >= 0) ){
if( (pRecord->com.n > 0) && (pRecord2->com.n > 0) ){ // fastq
if(cMatrix::PrepareBarcode(barcodeSeq, pRecord->idx.bc, pRecord->seq.s, iCutF, pRecord2->seq.s, iCutR, barcodeQua, pRecord->qual.s, pRecord2->qual.s)){
fprintf(fpBarcode, "@%s%s\n+\n%s\n", pRecord->id.s, barcodeSeq, barcodeQua);
}
else{
fprintf(fpBarcode, "@%s%.*s%.*s\n+\n%.*s%.*s\n", pRecord->id.s, iCutF, pRecord->seq.s, iCutR, pRecord2->seq.s,
iCutF, pRecord->qual.s, iCutR, pRecord2->qual.s);
}
}
else{ // fasta
if(cMatrix::PrepareBarcode(barcodeSeq, pRecord->idx.bc, pRecord->seq.s, iCutF, pRecord2->seq.s, iCutR)){
fprintf(fpBarcode, ">%s%s\n", pRecord->id.s, barcodeSeq);
}
else{
fprintf(fpBarcode, ">%s%.*s%.*s\n", pRecord->id.s, iCutF, pRecord->seq.s, iCutR, pRecord2->seq.s);
}
}
}
if(pRecord->idx.bc < 0){ // assigned
pStats->nUntrimAvail++;
}
else{
pStats->nTrimAvail++;
}
mLen = (pos + pos2) / 2;
pStats->incrementCount(size_t(mLen));
}
pTaskMan->decreaseCnt();
startId += task.nBlockSize;
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = pTaskMan->getItemCnt(startId, pRecord);
}while(nItemCnt > 0);
}
return NULL;
}
/*
 * Worker-thread entry point for paired-end input in the mode that
 * processPairedFiles() labels TRIM_MP.
 *
 * data: pointer to this worker's cData (task manager, statistics object and
 * the shared record buffer).
 * Returns NULL in all cases.
 *
 * The worker repeatedly pulls a task from the task manager:
 *  - TASK_READ: read a block of record pairs into the shared buffer, tag and
 *    trim each pair, then hand the block over for writing.
 *  - TASK_WRITE: emit consecutive processed blocks to the output files and
 *    update the counters in pStats.
 * Record pairs are stored interleaved in pBuffer: read 1 at an even slot,
 * its mate in the slot right after it (pRecord2 = pRecord + 1).
 */
void * mt_worker2_mp(void * data)
{
cData * pData = (cData *)data;
cTaskManager *pTaskMan = pData->pTaskMan;
cStats * pStats = pData->pStats;
int64 file_length = pStats->total_file_length;
cFQ * pfq = pStats->pfq;
cFQ * pfq2 = pStats->pfq2;
FILE *fpOut = pStats->fpOut;
FILE *fpOut2 = pStats->fpOut2;
FILE *fpMask = pStats->fpMask;
FILE *fpMask2 = pStats->fpMask2;
FILE *fpExcl = pStats->fpExcl;
FILE *fpExcl2 = pStats->fpExcl2;
int minAverageQual = pStats->minAverageQual;
int minEndQual = pStats->minEndQual;
int minLen = pStats->getMinLen();
int maxLen = pStats->getMaxLen();
// combined length limit for a pair; kept at INT_MAX to avoid overflow when doubling
int maxLen2 = (maxLen == INT_MAX) ? INT_MAX : (maxLen * 2);
bool bBarcode = pStats->bBarcode;
bool bCutTail = pStats->bCutTail;
bool bRedistribute = pStats->bRedistribute;
RECORD *pBuffer, *pRecord, *pRecord2;
TASK task;
int size2, rc, rc2, nItemCnt, nCnt;
int64 startId;
pBuffer = pData->pBuffer;
// two buffer slots per pair, so the ring size in RECORDs is twice pData->size
size2 = pData->size * 2;
rc = rc2 = 0;
int64 cur_pos;
double cur_ratio;
int pos, pos2;
int rLen, qLen;
int rLen2, qLen2;
while(true){
// poll for a task; give up once the task manager reports that it is finished
while(!pTaskMan->getTask(task)){
if(pTaskMan->IsFinished()){
task.type = TASK_END;
break;
}
usleep(1);
}
if(task.type == TASK_END){
break;
}
startId = task.startId;
if(task.type == TASK_READ){
if(!pTaskMan->increaseCnt()){ // reach the buffer size
pTaskMan->addTask(task); // perform reading later
usleep(1);
continue;
}
// read records from input file to buffer
// NOTE(review): the next line looks truncated (text between '<' and '>' seems to
// have been lost, including the loop condition and the pRecord2/rc assignments);
// restore it from the upstream source before compiling.
for(pRecord=&pBuffer[(startId << 1) % size2], nItemCnt=0; nItemCntreadRecord(pRecord);
rc2 = pfq2->readRecord(pRecord2);
if( (rc < 0) || (rc2 < 0) ){
break;
}
}
if(!pStats->bQuiet){
// progress is reported in steps of 1/10000 of the combined input length
// NOTE(review): divides by file_length; confirm it can never be 0 here
cur_pos = pfq->tell() + pfq2->tell();
if(cur_pos >= pfq->next_pos){
cur_ratio = int64(cur_pos * 10000 / file_length) / 10000.0;
pStats->progress(cur_ratio, 50);
pfq->next_pos = int64(((cur_ratio * 10000 + 1) * file_length + 9999)/10000);
}
}
if( (rc < 0) || (rc2 < 0) ){ // error or end of file
pTaskMan->finish();
if( (rc < -1) || (rc2 < -1) ) continue; // error
if(nItemCnt == 0) continue; // no record read
}
task.startId += task.nBlockSize;
pTaskMan->addTask(task); // save next task for parallelism
// process the records
for(pRecord=&pBuffer[(startId << 1) % size2], nCnt=0; nCnt < nItemCnt; nCnt++, pRecord+=2){
pRecord2 = pRecord + 1;
// a pair is rejected for quality only when BOTH reads fail the check
if(minAverageQual > 0){
if( !cMatrix::checkQualities((uchar *)pRecord->qual.s, pRecord->qual.n, minAverageQual) &&
!cMatrix::checkQualities((uchar *)pRecord2->qual.s, pRecord2->qual.n, minAverageQual) ){
pRecord->tag = pRecord2->tag = TAG_BADQUAL;
continue;
}
}
pRecord->tag = TAG_NORMAL;
rLen = pRecord->seq.n;
qLen = pRecord->qual.n;
rLen2 = pRecord2->seq.n;
qLen2 = pRecord2->qual.n;
if(cMatrix::findAdapterWithPE(pRecord->seq.s, pRecord2->seq.s, rLen, rLen2,
(uchar *)pRecord->qual.s, (uchar *)pRecord2->qual.s, qLen, qLen2,
pRecord->idx, pRecord2->idx)){ // trimmed
pos = pRecord->idx.pos;
pos2 = pRecord2->idx.pos;
if( (pos >= minLen) && (pos2 >= minLen) ){
cMatrix::combinePairSeqs(pRecord->seq.s, pRecord2->seq.s, pos, pos2,
(uchar *)pRecord->qual.s, (uchar *)pRecord2->qual.s, (int)qLen, (int)qLen2);
if(pStats->bFilterNs){
if( cMatrix::isBlurry(pRecord->seq.s, pos) && cMatrix::isBlurry(pRecord2->seq.s, pos2) ){
pRecord->tag = pRecord2->tag = TAG_BLURRY;
}
}
}
}
else{
// no adapter found by the paired-end search: both reads keep their full length
pos = rLen;
pos2 = rLen2;
if(pStats->bFilterNs){
// NOTE(review): the second call passes rLen (length of read 1) for read 2;
// rLen2 looks intended - confirm
if( cMatrix::isBlurry(pRecord->seq.s, rLen) && cMatrix::isBlurry(pRecord2->seq.s, rLen) ){
pRecord->tag = pRecord2->tag = TAG_BLURRY;
}
}
}
if( (pRecord->tag == TAG_NORMAL) && (pos >= minLen) && (pos2 >= minLen) ) {
// search each read for a junction adapter; negative positions are clamped to 0
pRecord->idx = cMatrix::findJuncAdapter(pRecord->seq.s, pos, (uchar *)pRecord->qual.s, qLen);
if(pRecord->idx.pos < 0){
pRecord->idx.pos = 0;
}
pRecord2->idx = cMatrix::findJuncAdapter(pRecord2->seq.s, pos2, (uchar *)pRecord2->qual.s, qLen2);
if(pRecord2->idx.pos < 0){
pRecord2->idx.pos = 0;
}
if( (pos < rLen) || (pos2 < rLen2) ) { // trimmed
// both reads must report the same non-zero junction adapter, and the two kept
// parts plus the junction must add up to the longer trimmed length;
// otherwise the pair is tagged as a contaminant
if( (pRecord->idx.bc == 0) || (pRecord2->idx.bc == 0) || (pRecord->idx.bc != pRecord2->idx.bc) ){
pRecord->tag = pRecord2->tag = TAG_CONTAMINANT;
}
else{
if(pRecord->idx.pos + pRecord2->idx.pos + cMatrix::junctionLengths[pRecord->idx.bc] != max(pos, pos2)){
pRecord->tag = pRecord2->tag = TAG_CONTAMINANT;
}
}
if(minEndQual > 0){
if(pRecord->qual.n > 0)
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->idx.pos, pRecord->qual.n), minEndQual);
if(pRecord2->qual.n > 0)
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, min(pRecord2->idx.pos, pRecord2->qual.n), minEndQual);
}
}
else{
// untrimmed pair: cases are distinguished by which read reported idx.bc == 0
if( (pRecord->idx.bc == 0) && (pRecord2->idx.bc == 0) ){ // case D
if(pStats->bFilterUndetermined)
pRecord->tag = pRecord2->tag = TAG_UNDETERMINED;
else{
if(minEndQual > 0){
if(pRecord->qual.n > 0)
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->qual.n, pRecord->idx.pos), minEndQual);
if(pRecord2->qual.n > 0)
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, min(pRecord2->qual.n, pRecord2->idx.pos), minEndQual);
}
}
}
else{
if(pRecord->idx.bc == 0){ // case B
if( (pRecord2->idx.pos >= minLen) && (pRecord2->seq.n >= rLen) && (pRecord2->qual.n >= qLen) ){
if(bRedistribute){
pRecord->idx = cMatrix::mergePE(pRecord->seq.s, pRecord2->seq.s, rLen, (uchar *)pRecord->qual.s, (uchar *)pRecord2->qual.s, qLen, pRecord2->idx.pos, cMatrix::junctionLengths[pRecord2->idx.bc]);
}
if(minEndQual > 0){
// quality-trim the extension that lies in the mate's quality string;
// note that this local 'pos' shadows the function-level 'pos'
int pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s + pRecord2->idx.pos, pRecord->idx.pos - pRecord->seq.n, minEndQual);
if(pos != pRecord->idx.pos - pRecord->seq.n){
if(pos == 0){
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, pRecord->qual.n, minEndQual);
}
else{
pRecord->idx.pos = pRecord->seq.n + pos;
}
}
}
}
}
else if(pRecord2->idx.bc == 0){ // case C
// mirror image of case B with the roles of the two reads exchanged
if( (pRecord->idx.pos >= minLen) && (pRecord->seq.n >= rLen) && (pRecord->qual.n >= qLen) ){
if(bRedistribute){
pRecord2->idx = cMatrix::mergePE(pRecord2->seq.s, pRecord->seq.s, rLen, (uchar *)pRecord2->qual.s, (uchar *)pRecord->qual.s, qLen, pRecord->idx.pos, cMatrix::junctionLengths[pRecord->idx.bc]);
}
if(minEndQual > 0){
int pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s + pRecord->idx.pos, pRecord2->idx.pos - pRecord2->seq.n, minEndQual);
if(pos != pRecord2->idx.pos - pRecord2->seq.n){
if(pos == 0){
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, pRecord2->qual.n, minEndQual);
}
else{
pRecord2->idx.pos = pRecord2->seq.n + pos;
}
}
}
}
} // case C
else{
// both reads reported a non-zero junction adapter: only quality trimming is applied
if(minEndQual > 0){
if(pRecord->qual.n > 0)
pRecord->idx.pos = cMatrix::trimByQuality((uchar *)pRecord->qual.s, min(pRecord->qual.n, pRecord->idx.pos), minEndQual);
if(pRecord2->qual.n > 0)
pRecord2->idx.pos = cMatrix::trimByQuality((uchar *)pRecord2->qual.s, min(pRecord2->qual.n, pRecord2->idx.pos), minEndQual);
}
}
} // not case D
} // pos >= rLen
}
}
// register the processed block; when setItemCnt() returns false this worker
// queues the TASK_WRITE for it
pRecord = &pBuffer[(startId << 1) % size2];
if(!pTaskMan->setItemCnt(startId, pRecord, nItemCnt)){
task.type = TASK_WRITE;
task.startId = startId;
task.nItemCnt = nItemCnt;
if(pTaskMan->bSingleBlock)
pTaskMan->insertTask(task);
else
pTaskMan->addTask(task);
}
continue;
}
// task.type == TASK_WRITE
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = task.nItemCnt;
// keep writing consecutive blocks for as long as getItemCnt() reports a positive count
do{
// write to file
pRecord->nCnt = 0; // reset
// NOTE(review): the next line looks truncated (text between '<' and '>' seems to
// have been lost, including the loop condition and the start of the first 'if');
// restore it from the upstream source before compiling.
for(nCnt=0; nCnttag == TAG_BLURRY){
pStats->nBlurry++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pRecord->tag == TAG_BADQUAL){
pStats->nBad++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pRecord->tag == TAG_CONTAMINANT){
pStats->nContaminant++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pRecord->tag == TAG_UNDETERMINED){
pStats->nUndetermined++;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// TAG_NORMAL
pos = pRecord->idx.pos;
pos2 = pRecord2->idx.pos;
// a pair is excluded as soon as either read is shorter than minLen;
// it counts as "empty" when either read has nothing left
if( (pos < minLen) || (pos2 < minLen) ){
if( (pos <= 0) || (pos2 <= 0) ) {
pStats->nEmpty++;
pRecord->tag = pRecord2->tag = TAG_EMPTY;
}
else {
pStats->nShort++;
pRecord->tag = pRecord2->tag = TAG_SHORT;
}
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
// the length limit is applied to the sum of both reads; with bCutTail each
// read is clipped to maxLen instead of the pair being excluded
if(pos + pos2 > maxLen2){
if(!bCutTail){
pStats->nLong++;
pRecord->tag = pRecord2->tag = TAG_LONG;
if( (fpExcl != NULL) && (fpExcl2 != NULL) ){
OutputTaggedRecord(fpExcl, pRecord);
OutputTaggedRecord(fpExcl2, pRecord2);
}
continue;
}
if(pos > maxLen) pos = maxLen;
if(pos2 > maxLen) pos2 = maxLen;
}
if(bBarcode){
// choose the output files by idx.bc; a negative value goes to the "untrim" files
if(pRecord->idx.bc < 0){
fpOut = pStats->fpUntrim.fp;
fpOut2 = pStats->fpUntrim2.fp;
}
else{
fpOut = pStats->fpOuts[pRecord->idx.bc].fp;
fpOut2 = pStats->fpOuts2[pRecord->idx.bc].fp;
pStats->incrementBarcode(pRecord->idx.bc);
}
}
rLen = pRecord->seq.n;
if( (fpMask != NULL) && (fpMask2 != NULL) ){
OutputMaskedRecord(fpMask, pRecord, 0, pos);
OutputMaskedRecord(fpMask2, pRecord2, 0, pos2);
}
// com.n > 0 selects FASTQ output, otherwise FASTA (per the inline comments below).
// When a kept length exceeds rLen, the surplus is taken from the mate's buffer,
// starting at the mate's own kept length.
// NOTE(review): the read-2 branches compare against and print rLen (length of
// read 1), not pRecord2->seq.n - confirm this is intended
if(pRecord->com.n > 0){ // fastq
if(pos <= rLen){
fprintf(fpOut, "@%s%.*s\n+\n%.*s\n", pRecord->id.s, pos, pRecord->seq.s, pos, pRecord->qual.s);
}
else{
fprintf(fpOut, "@%s%.*s%.*s\n", pRecord->id.s, rLen, pRecord->seq.s, pos - rLen, pRecord2->seq.s + pos2);
fprintf(fpOut, "+\n%.*s%.*s\n", rLen, pRecord->qual.s, pos - rLen, pRecord2->qual.s + pos2);
}
if(pos2 <= rLen){
fprintf(fpOut2, "@%s%.*s\n+\n%.*s\n", pRecord2->id.s, pos2, pRecord2->seq.s, pos2, pRecord2->qual.s);
}
else{
fprintf(fpOut2, "@%s%.*s%.*s\n", pRecord2->id.s, rLen, pRecord2->seq.s, pos2 - rLen, pRecord->seq.s + pos);
fprintf(fpOut2, "+\n%.*s%.*s\n", rLen, pRecord2->qual.s, pos2 - rLen, pRecord->qual.s + pos);
}
}
else{ // fasta
if(pos <= rLen){
fprintf(fpOut, ">%s%.*s\n", pRecord->id.s, pos, pRecord->seq.s);
}
else{
fprintf(fpOut, ">%s%.*s%.*s\n", pRecord->id.s, rLen, pRecord->seq.s, pos - rLen, pRecord2->seq.s + pos2);
}
if(pos2 <= rLen){
fprintf(fpOut2, ">%s%.*s\n", pRecord2->id.s, pos2, pRecord2->seq.s);
}
else{
fprintf(fpOut2, ">%s%.*s%.*s\n", pRecord2->id.s, rLen, pRecord2->seq.s, pos2 - rLen, pRecord->seq.s + pos);
}
}
// statistics: with barcodes the trimmed/untrimmed split follows idx.bc,
// otherwise it follows whether the pair got shorter than 2 * rLen
if(bBarcode){
if(pRecord->idx.bc < 0){ // assigned
pStats->nUntrimAvail++;
}
else{
pStats->nTrimAvail++;
}
}
else{
if(pos + pos2 < rLen + rLen) // trimmed
pStats->nTrimAvail++;
else
pStats->nUntrimAvail++;
}
// the length histogram is fed with the mean kept length of the pair
pStats->incrementCount(size_t((pos + pos2) / 2));
}
pTaskMan->decreaseCnt();
startId += task.nBlockSize;
pRecord = &pBuffer[(startId << 1) % size2];
nItemCnt = pTaskMan->getItemCnt(startId, pRecord);
}while(nItemCnt > 0);
}
return NULL;
}
/**
 * Trim a single input stream (a file, or stdin when pParameter->bStdin is set).
 *
 * Opens the input, initialises the work set, starts the worker threads and
 * waits for them. Worker 0 runs on the calling thread; workers 1..n-1 run on
 * threads created here.
 *
 * @param pParameter  parsed command-line options (input name, trim mode, ...)
 * @param pStats      statistics / output object shared by all workers
 * @return 0 on success, 1 when the input cannot be opened or the work set
 *         cannot be initialised
 */
int processFile(cParameter * pParameter, cStats * pStats)
{
	CFILE cf;
	int64 file_length;
	if(pParameter->bStdin){
		cf.fp = stdin;
		file_length = -1;
	}
	else{
		char * inFile = pParameter->input[0];
		file_length = gzsize(inFile);
		cf = gzopen(inFile, "r");
		if(cf.fp == NULL){
			fprintf(stderr, "Can not open %s for reading\n", inFile);
			return 1;
		}
	}

	cWork wk;
	if(!wk.Init(pParameter, pStats, file_length, cf.fp)){
		fprintf(stderr, "Can not allocate memory for workset\n");
		// only close what gzopen() opened; the stdin wrapper was filled in by hand
		if(!pParameter->bStdin){
			gzclose(&cf);
		}
		return 1;
	}

	mtaux_t *mt = wk.getMultiThreadingPointer();
	// select the worker routine once instead of duplicating the launch loop
	void *(*worker)(void *) = ((pParameter->trimMode & TRIM_AP) != 0) ? mt_worker_ap : mt_worker;

	// nStarted ends up as 1 + the number of threads actually created, so that
	// only those threads are joined below (joining an id that was never
	// created is undefined behaviour)
	int nStarted = 1;
	for(; nStarted < mt->n_threads; nStarted++){ // worker 0 is effectively launched by the master thread
		int rc = pthread_create(&mt->tid[nStarted], &wk.attr, worker, &mt->w[nStarted]);
		if(rc != 0){
			fprintf(stderr, "Can not create thread %d\n", nStarted);
			break;
		}
	}
	worker(&mt->w[0]);

	for(int i = 1; i < nStarted; ++i){ // waits for termination of other threads
		void *status;
		pthread_join(mt->tid[i], &status);
	}

	if(!pParameter->bStdin){
		gzclose(&cf);
	}
	return 0;
}
/**
 * Trim a pair of input files (pParameter->input[0] and input[1]).
 *
 * Opens both inputs, initialises the work set, picks the worker routine that
 * matches the trim mode, starts the worker threads and waits for them.
 * Worker 0 runs on the calling thread; workers 1..n-1 run on threads created
 * here.
 *
 * @param pParameter  parsed command-line options (input names, trim mode, ...)
 * @param pStats      statistics / output object shared by all workers
 * @return 0 on success, 1 when an input cannot be opened or the work set
 *         cannot be initialised
 */
int processPairedFiles(cParameter * pParameter, cStats * pStats)
{
	char * inFile = pParameter->input[0];
	char * inFile2 = pParameter->input[1];
	CFILE cf, cf2;
	int64 file_length = gzsize(inFile) + gzsize(inFile2);
	cf = gzopen(inFile, "r");
	cf2 = gzopen(inFile2, "r");
	if( (cf.fp == NULL) || (cf2.fp == NULL) ){
		// report every file that failed and close the one that did open
		if(cf.fp == NULL)
			fprintf(stderr, "Can not open %s for reading\n", inFile);
		else
			gzclose(&cf);
		if(cf2.fp == NULL)
			fprintf(stderr, "Can not open %s for reading\n", inFile2);
		else
			gzclose(&cf2);
		return 1;
	}

	cWork wk;
	if(!wk.Init(pParameter, pStats, file_length, cf.fp, cf2.fp)){
		fprintf(stderr, "Can not allocate memory for workset\n");
		gzclose(&cf2);
		gzclose(&cf);
		return 1;
	}

	mtaux_t *mt = wk.getMultiThreadingPointer();

	// select the worker routine once instead of duplicating the launch loop
	void *(*worker)(void *);
	if( (pParameter->trimMode & TRIM_PE) != 0 ){
		if( (pParameter->trimMode & TRIM_ANY) == TRIM_DEFAULT )
			worker = mt_worker2;
		else
			worker = mt_worker2_sep;
	}
	else if( (pParameter->trimMode & TRIM_AP) != 0 ){
		worker = mt_worker2_ap;
	}
	else{ // TRIM_MP
		worker = mt_worker2_mp;
	}

	// nStarted ends up as 1 + the number of threads actually created, so that
	// only those threads are joined below (joining an id that was never
	// created is undefined behaviour)
	int nStarted = 1;
	for(; nStarted < mt->n_threads; nStarted++){ // worker 0 is effectively launched by the master thread
		int rc = pthread_create(&mt->tid[nStarted], &wk.attr, worker, &mt->w[nStarted]);
		if(rc != 0){
			fprintf(stderr, "Can not create thread %d\n", nStarted);
			break;
		}
	}
	worker(&mt->w[0]);

	for(int i = 1; i < nStarted; ++i){ // waits for termination of other threads
		void *status;
		pthread_join(mt->tid[i], &status);
	}

	gzclose(&cf2);
	gzclose(&cf);
	return 0;
}
/**
 * Program entry point.
 *
 * Parses the command line, determines the FASTQ quality format when it is set
 * to "auto", opens the output and log files, runs the single-file or
 * paired-file pipeline and finally prints the timing, summary and histogram.
 *
 * @return 0 on success (and after printing the version); 1 after printing
 *         usage, on a bad command line or on any I/O or allocation failure;
 *         otherwise the non-zero code returned by the processing routine
 */
int main(int argc, char * argv[])
{
	cParameter para;
	cStats stats;
	char errMsg[256];

	// process the input parameters
	int iRet = para.GetOpt(argc, argv, errMsg);
	if(iRet < 0){
		// strip the directory part of argv[0] for the messages
		char * program = strrchr(argv[0], '/');
		program = (program == NULL) ? argv[0] : (program + 1);
		if(iRet == -1){
			if(para.bEnquireVersion){
				para.PrintVersion(stdout);
				return 0;
			}
			para.PrintUsage(program, stdout);
		}
		else{
			fprintf(stderr, "%s (%s): %s\n\n", program, para.version, errMsg);
			para.PrintSimpleUsage(program, stderr);
		}
		return 1;
	}

	if(para.IsAutoFastqFormat()){
		para.fastqFormat = gzformat(para.input, para.nFileCnt);
		if(para.fastqFormat == CONTRADICT_FASTQ){
			fprintf(stderr, "Error: the FASTQ quality formats of input files are different\n");
			return 1;
		}
		// quality offset: 64 for Solexa-style files, 33 otherwise
		para.baseQual = (para.fastqFormat == SOLEXA_FASTQ) ? 64 : 33;
	}

	if(!stats.initHist(&para)){
		fprintf(stderr, "Error: can not allocate memory for audit\n");
		return 1;
	}
	if(!stats.openOutputFiles(&para)){
		return 1;
	}

	// when the trimmed reads go to stdout, the log goes to stderr instead of a file
	FILE * hLog;
	if(para.bStdout){
		hLog = stderr;
	}
	else{
		hLog = fopen(para.logfile, "w");
		if(hLog == NULL){
			fprintf(stderr, "Error: can not open %s for writing\n", para.logfile);
			return 1;
		}
	}

	para.printVersion(hLog);
	para.printCommandLine(hLog);
	para.printRelatedFiles(hLog);
	para.printOpt(hLog, true);
	if(!stats.bStdout){
		para.printLogo(stdout);
		para.printVersion(stdout);
		para.printOpt(stdout);
	}

	stats.printTime("started", hLog);
	if(!stats.bStdout) stats.printTime("started", stdout);
	stats.start();

	////////////// process the input file(s)
	if(para.nFileCnt <= 1){
		iRet = processFile(&para, &stats);
	}
	else{
		iRet = processPairedFiles(&para, &stats);
	}
	if(iRet != 0){
		if(!stats.bStdout) fclose(hLog);
		return iRet;
	}
	stats.end();

	// report to the log and, unless stdout carries the reads, to stdout as well
	stats.printTime("done", hLog, 0x02);
	if(!stats.bStdout) stats.printTime("done", stdout, 0x02);
	stats.printDiffTime(hLog);
	if(!stats.bStdout) stats.printDiffTime(stdout);
	stats.printSummary(hLog);
	if(!stats.bStdout) stats.printSummary(stdout);
	stats.printHist(hLog);
	if(!stats.bStdout){
		fclose(hLog);
		fprintf(stdout, "log has been saved to \"%s\".\n", para.logfile);
	}

	if(!stats.writeMapFile(&para)){
		fprintf(stderr, "Can not write Mapping file\n");
		return 1;
	}
	return 0;
}
skewer-0.2.2/src/matrix.cpp 0000664 0000000 0000000 00000111270 12700552063 0015614 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2016 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include
#include
#include
#include
#include "matrix.h"
#include "fastq.h"
// Lookup table from a raw input byte (0..255) to its nucleotide CODE.
// Upper-case and lower-case letters map to the same code, 'U'/'u' is mapped
// to CD_T, and every byte that is not one of the listed letters is CD_NONE.
// Each row covers 16 consecutive byte values; the trailing comment is the
// high hex digit of the row.
CODE codeMap[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 0
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 1
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 2
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 3
// 0x41~0x5A, A~Z
CD_NONE, CD_A, CD_B, CD_C, CD_D, CD_NONE, CD_NONE, CD_G, CD_H, CD_NONE, CD_NONE, CD_K, CD_NONE, CD_M, CD_N, CD_NONE, // 4
CD_NONE, CD_NONE, CD_R, CD_S, CD_T, CD_T, CD_V, CD_W, CD_NONE, CD_Y, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 5
// 0x61~0x7A, a~z
CD_NONE, CD_A, CD_B, CD_C, CD_D, CD_NONE, CD_NONE, CD_G, CD_H, CD_NONE, CD_NONE, CD_K, CD_NONE, CD_M, CD_N, CD_NONE, // 6
CD_NONE, CD_NONE, CD_R, CD_S, CD_T, CD_T, CD_V, CD_W, CD_NONE, CD_Y, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 7
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 8
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // 9
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // A
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // B
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // C
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // D
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, // E
CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE, CD_NONE // F
};
// Per-byte flag table: false only for the bytes 'A', 'C', 'G', 'T', 'U' and
// their lower-case forms, true for every other byte value (ambiguity letters
// such as 'N' included). Each row covers 16 consecutive byte values; the
// trailing comment is the high hex digit of the row.
bool blurry[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // 0
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // 1
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // 2
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // 3
true,false, true,false, true, true, true,false, true, true, true, true, true, true, true, true, // 4
true, true, true, true,false,false, true, true, true, true, true, true, true, true, true, true, // 5
true,false, true,false, true, true, true,false, true, true, true, true, true, true, true, true, // 6
true, true, true, true,false,false, true, true, true, true, true, true, true, true, true, true, // 7
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // 8
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // 9
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // A
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // B
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // C
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // D
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, // E
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true // F
};
// Complement of each nucleotide CODE, indexed in the same order as the
// character[] and scoring[] tables:
//   -  A  C  G  T | R  Y  S  W  K  M | B  D  H  V | N
// S (C or G) and W (A or T) are their own complements: complementing {C,G}
// gives {G,C} and complementing {A,T} gives {T,A}. The table previously had
// these two entries exchanged (S -> W, W -> S).
CODE complement[CD_CNT] = {
CD_NONE, CD_T, CD_G, CD_C, CD_A, CD_Y, CD_R, CD_S, CD_W, CD_M, CD_K, CD_V, CD_H, CD_D, CD_B, CD_N
};
// Printable letter for each nucleotide CODE (the reverse direction of codeMap).
// Index 0 and the last index both print as 'N'.
char character[CD_CNT] = {
'N', 'A', 'C', 'G', 'T', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N'
};
// Symmetric pairwise table over nucleotide CODEs (row and column order given
// by the header comment below). 0 is used where the two codes are compatible,
// 1 where they are disjoint, fractional values for partially overlapping
// ambiguity codes, and 0.05 for any code against N (0 for N against N).
// NOTE(review): presumably a per-base mismatch weight - confirm against the callers.
double scoring[CD_CNT][CD_CNT] = {
// - A C G T | R Y S W K M | B D H V | N
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.05}, //-
{ 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0.05}, //A
{ 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0.05}, //C
{ 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0.05}, //G
{ 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0.05}, //T
{ 1, 0, 1, 0, 1, 0, 1,0.75,0.75,0.75,0.75, 0.5, 0, 0.5, 0, 0.05}, //R
{ 1, 1, 0, 1, 0, 1, 0,0.75,0.75,0.75,0.75, 0, 0.5, 0, 0.5, 0.05}, //Y
{ 1, 1, 0, 0, 1,0.75,0.75, 0, 1,0.75,0.75, 0, 0.5, 0.5, 0, 0.05}, //S
{ 1, 0, 1, 1, 0,0.75,0.75, 1, 0,0.75,0.75, 0.5, 0, 0, 0.5, 0.05}, //W
{ 1, 1, 1, 0, 0,0.75,0.75,0.75,0.75, 0, 1, 0, 0, 0.5, 0.5, 0.05}, //K
{ 1, 0, 0, 1, 1,0.75,0.75,0.75,0.75, 1, 0, 0.5, 0.5, 0, 0, 0.05}, //M
{ 1, 1, 0, 0, 0, 0.5, 0, 0, 0.5, 0, 0.5, 0, 0.4, 0.4, 0.4, 0.05}, //B
{ 1, 0, 1, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0.4, 0, 0.4, 0.4, 0.05}, //D
{ 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 0.5, 0, 0.4, 0.4, 0, 0.4, 0.05}, //H
{ 1, 0, 0, 0, 1, 0, 0.5, 0, 0.5, 0.5, 0, 0.4, 0.4, 0.4, 0, 0.05}, //V
{ 0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05, 0} //N
};
// Mismatch flags for a read character (row) against an adapter character
// (column): 0 = the pair is accepted as a match, 1 = mismatch.
// The table is not symmetric: an ambiguity code in the adapter accepts the
// read bases it covers (e.g. row A, column R is 0), while an ambiguity code
// in the read only matches adapter codes that cover it (row R, column A is 1).
// The adapter column N accepts every read character.
// cAdapter::Init/Init2 shift these flags into per-code bit masks.
uint64 chrVadp[CD_CNT][CD_CNT] = {
// adp - A C G T | R Y S W K M | B D H V | N //chr
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}, //-
{ 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0}, //A
{ 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0}, //C
{ 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0}, //G
{ 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0}, //T
{ 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0}, //R
{ 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0}, //Y
{ 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0}, //S
{ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}, //W
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0}, //K
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0}, //M
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0}, //B
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0}, //D
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}, //H
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}, //V
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0} //N
};
// Penalty bounds used by the alignment code. The three values are 2.0 apart;
// numerically they equal log10(3), log10(300) and log10(30000).
// NOTE(review): presumably derived from base-quality values - confirm.
const double MIN_PENALTY = 0.477121255;
const double MEAN_PENALTY = 2.477121255;
const double MAX_PENALTY = 4.477121255;
// tolerance defined as one tenth of the smallest penalty
const double EPSILON = (MIN_PENALTY / 10);
///////////////////////////////////////
/**
 * Insert val into the set, resolving collisions by score.
 *
 * - No equivalent element present: val is inserted and the result of the
 *   underlying set insertion is returned.
 * - An equivalent element is present and val.score is lower than its score:
 *   the stored element is kept.
 * - Otherwise the stored element is replaced by val.
 *
 * @return true whenever an equivalent element already existed; otherwise the
 *         success flag of the underlying insertion
 */
bool cElementSet::insert (const ELEMENT& val)
{
	ELEMENT_SET::iterator it = this->find(val);
	if(it == this->end()){
		// take the success flag straight from the base-class insert
		return ELEMENT_SET::insert(val).second;
	}
	if(val.score < it->score){
		return true; // the stored element wins
	}
	// erase via post-increment so 'it' stays valid and can serve as insertion hint
	this->erase(it++);
	ELEMENT_SET::insert(it, val);
	return true;
}
///////////////////////////////////////
// Default constructor: the adapter starts with zero length until Init()/Init2() fills it.
cAdapter::cAdapter()
{
	this->len = 0;
}
// Destructor: intentionally empty, no cleanup is performed here.
cAdapter::~cAdapter()
{
	// nothing to do
}
// Initialise the adapter from a sequence and build its per-code match masks.
//   seq      adapter sequence
//   sLen     length of seq; only the first MAX_ADAPTER_LEN characters are used
//   trimMode trimming mode stored on the adapter
void cAdapter::Init(char * seq, size_t sLen, TRIM_MODE trimMode)
{
int i;
// construct sequence
this->len = (int(sLen) > MAX_ADAPTER_LEN) ? MAX_ADAPTER_LEN : sLen;
gzstrncpy(sequence, seq, len);
this->trimMode = trimMode;
// construct mismatch bits
int code, code2;
uint64 bits;
// NOTE(review): the next line looks truncated (text between '<' and '>' seems to
// have been lost: the bound of the outer loop over 'code', the reset of 'bits'
// and the start of the inner loop over 'i'); restore it from the upstream source.
for(code=0; code=0; i--){
// shift in the mismatch flag of this adapter position against 'code'
code2 = codeMap[uchar(sequence[i])];
bits = (bits << 1) | chrVadp[code][code2];
}
// stored inverted: a set bit marks a matching adapter position
matchBits[code] = ~bits;
}
}
// Alternative initialisation that always sets trimMode to TRIM_TAIL.
//   seq   adapter sequence
//   sLen  length of seq; capped at MAX_ADAPTER_LEN (the head of seq is kept)
// NOTE(review): two lines of this function look truncated (see notes below),
// and the end of this function is fused with the header of the next one.
void cAdapter::Init2(char * seq, size_t sLen)
{
int i;
// construct sequence
if(int(sLen) > MAX_ADAPTER_LEN){
// seq += (sLen - MAX_ADAPTER_LEN);
// the head instead of the tail should be used
this->len = MAX_ADAPTER_LEN;
}
else{
this->len = sLen;
}
// NOTE(review): the next line looks truncated (the loop that fills 'sequence'
// and the 'this->' before trimMode seem to have been lost).
for(i=0; itrimMode = TRIM_TAIL;
// construct mismatch bits
int code, code2;
uint64 bits;
// masks for the codes below CD_BASIC_CNT, computed from highest to lowest so
// that 'bits' still holds the value for code 0 after the loop
for(code=CD_BASIC_CNT-1; code>=0; code--){
bits = 0;
for(i=int(len)-1; i>=0; i--){
code2 = codeMap[uchar(sequence[i])];
bits = (bits << 1) | chrVadp[code][code2];
}
// stored inverted: a set bit marks a matching adapter position
matchBits[code] = ~bits;
}
// NOTE(review): the next line looks truncated. It appears to contain the start of
// a loop over the remaining codes (from CD_BASIC_CNT upwards), the end of Init2
// and the header of the following function, which updates one column of the
// alignment queue. Restore it from the upstream source.
//
// What the visible body of that following function does:
//   bits = ~lbits | d0bits, so a 0 bit marks a position that is set in lbits and
//   clear in d0bits. Every such queue entry receives a penalty. In sensitive mode
//   (cMatrix::bSensitive) the entry may instead be copied from a neighbour
//   (i-1 or i+1) at cost cMatrix::dDelta, provided that neighbour's nIndel is
//   below iMaxIndel; the choice is recorded in dnbits / unbits and nIndel is
//   incremented. Entries whose score reaches dMaxPenalty get their bit cleared
//   in lbits, and trailing entries at or above dMaxPenalty are popped off the queue.
for(bits=~bits,code=CD_BASIC_CNT; code & queue, uint64 &d0bits, uint64 &lbits, uint64 &unbits, uint64 &dnbits, double &penal, double &dMaxPenalty, int &iMaxIndel)
{
int i;
double score;
uint64 bits = ~lbits | d0bits;
// NOTE(review): the next line looks truncated (the loop bound and part of the
// increment expression seem to have been lost).
for(bits>>=1,i=1; i>=1){
if((bits & 0x01) == 0){
if(cMatrix::bSensitive){
// 'score' is the mismatch score of entry i with dDelta subtracted, so it can be
// compared directly with the neighbours' scores; dDelta is added back below
// NOTE(review): 1L is only 32 bits wide on some platforms while the masks are
// uint64 - confirm the shifts below cannot reach bit 32 or higher
score = queue[i].score + (penal - cMatrix::dDelta);
if( (queue[i-1].score < score) && (queue[i-1].nIndel < iMaxIndel) ){
if( (queue[i+1].score < score) && (queue[i+1].nIndel < iMaxIndel) ){
// both neighbours are usable: take the one with the lower score
if(queue[i-1].score < queue[i+1].score){
queue[i] = queue[i-1];
dnbits |= (1L << (i-1));
}
else{
queue[i] = queue[i+1];
unbits |= (1L << (i+1));
}
}
else{
queue[i] = queue[i-1];
dnbits |= (1L << (i-1));
}
queue[i].nIndel++;
}
else{
if( (queue[i+1].score < score) && (queue[i+1].nIndel < iMaxIndel) ){
queue[i] = queue[i+1];
unbits |= (1L << (i+1));
queue[i].nIndel++;
}
else{
queue[i].score = score;
}
}
queue[i].score += cMatrix::dDelta;
}
else{ // !cMatrix::bSensitive
queue[i].score += penal;
}
if(queue[i].score >= dMaxPenalty){
lbits &= ~(1L << i);
}
}
}
// the last entry has no successor, so only the predecessor (i-1) is considered
if(queue.size() > 1){
if((bits & 0x01) == 0){
if(cMatrix::bSensitive){
if( (queue[i-1].nIndel < iMaxIndel) && (queue[i-1].score + cMatrix::dDelta < queue[i].score + penal) ){
queue[i] = queue[i-1];
dnbits |= (1L << (i-1));
queue[i].score += cMatrix::dDelta;
queue[i].nIndel++;
}
else{
queue[i].score += penal;
}
}
else{ // !cMatrix::bSensitive
queue[i].score += penal;
}
if(queue[i].score >= dMaxPenalty){
lbits &= ~(1L << i);
}
}
// drop trailing entries whose score has reached the penalty limit
for(; i>0; i--){
if(queue.back().score < dMaxPenalty) break;
queue.pop_back();
}
}
}
// Align this adapter against `read` and report candidate positions in `result`.
//
// Candidates live in a deque (`queue`) of ELEMENTs; one new entry is pushed to
// the front per read position and UPDATE_COLUMN() re-scores the others.
//  - bBestAlign == true : the single best candidate is tracked in `elem`; on
//    success `result` is cleared, `elem` is inserted and true is returned.
//  - bBestAlign == false: every acceptable candidate is inserted into `result`
//    and the return value is always false (bDetermined is never set).
// `bc` is the adapter number; it is stored as bc + 1 in element.idx.bc.
//
// NOTE(review): three lines in this function are truncated (`deque queue;`,
// the `for(i=1,score=cMatrix::dDelta; i ...` header and the
// `for(j=0; j ...` header). Template arguments, loop bounds and the
// computation of jj / mbits / penal are missing and must be recovered from the
// original source.
bool cAdapter::align(char * read, size_t rLen, uchar * qual, size_t qLen, cElementSet &result, int bc, bool bBestAlign)
{
bool bDetermined = false;
ELEMENT elem;
// penalty budget and indel budget both scale with the adapter length
double dMaxPenalty = cMatrix::dPenaltyPerErr * len + 0.001;
int iMaxIndel = ceil(cMatrix::dEpsilonIndel * len);
// minimum candidate length considered in the final scan (1 unless bBestAlign)
int minK = bBestAlign ? ((cMatrix::iMinOverlap >= (int)(len - iMaxIndel + 1)) ? (int)(len - iMaxIndel + 1) : cMatrix::iMinOverlap) : 1;
double dMu = (bc >= 0) ? cMatrix::dMu : MIN_PENALTY;
deque queue;
ELEMENT element;
double score;
uint64 legalBits = 0;
int i, j, jj;
element.idx.bc = bc + 1;
// Seed the queue with candidates that start before the read (negative pos).
if(trimMode & TRIM_HEAD){
for(i=1; i<=int(len)-minK; i++){
element.idx.pos = -i;
element.score = cMatrix::dPenaltyPerErr * i;
element.nIndel = 0;
queue.push_back(element);
legalBits = (legalBits << 1) | 1;
}
}
else{
for(i=1,score=cMatrix::dDelta; i iMaxIndel) break;
element.idx.pos = -i;
element.score = score;
element.nIndel = i;
queue.push_back(element);
legalBits = (legalBits << 1) | 1;
}
}
element.nIndel = 0;
uint64 mbits, xbits, unbits, dnbits, d0bits;
unbits = dnbits = 0L;
double penal;
// Main scan over the read (header truncated, see note above).
for(j=0; j 0) ? cMatrix::penalty[qual[jj]] : dMu);
element.idx.pos = j;
element.score = ((mbits & 0x01) == 0) ? penal : 0;
queue.push_front(element);
xbits = mbits | unbits;
dnbits <<= 1;
unbits <<= 1;
d0bits = ((dnbits + (xbits & dnbits)) ^ dnbits) | xbits;
legalBits = (legalBits << 1) | 1;
UPDATE_COLUMN(queue, d0bits, legalBits, unbits, dnbits, penal, dMaxPenalty, iMaxIndel);
dnbits &= d0bits;
unbits &= d0bits;
// A queue as long as the adapter means the oldest entry is complete.
if(queue.size() == len){
if(bBestAlign){
if(trimMode == TRIM_HEAD){
i = (queue.back().idx.pos < 0) ? (len + queue.back().idx.pos) : len;
if( !bDetermined || (i * cMatrix::dMu - queue.back().score) > elem.score * (i+1) ){
elem = queue.back();
dMaxPenalty = elem.score;
elem.score = (i * cMatrix::dMu - elem.score) / (i+1); // normalization
elem.idx.pos = rLen - 1 - j;
}
}
else{
elem = queue.back();
dMaxPenalty = elem.score + ((trimMode == TRIM_TAIL) ? EPSILON : 0);
elem.score = (len * cMatrix::dMu - elem.score) / (len + 1); // normalization
}
bDetermined = true;
// a zero-penalty hit cannot be improved on
if(dMaxPenalty == 0) break;
}
else{
elem = queue.back();
elem.score = len * cMatrix::dMu - elem.score; // normalization
result.insert(elem);
}
queue.pop_back();
}
}
// Final scan over the entries still queued, popped from the back.
if(dMaxPenalty > 0){ // not the case of "perfect match for single-end reads trimming"
if(bBestAlign){
if(trimMode & TRIM_TAIL){
dMaxPenalty = (cMatrix::dPenaltyPerErr * queue.size() + 0.001);
for(i=queue.size(); i>=minK; i--, dMaxPenalty-=cMatrix::dPenaltyPerErr){
if(dMaxPenalty <= 0) break;
if(queue.back().score < dMaxPenalty){
if(!bDetermined || ((i * cMatrix::dMu - queue.back().score) > elem.score * (i+1)) ){
elem = queue.back();
dMaxPenalty = elem.score;
elem.score = (i * cMatrix::dMu - elem.score) / (i+1); // normalization
bDetermined = true;
}
}
queue.pop_back();
}
}
else{
dMaxPenalty -= (len - queue.size()) * cMatrix::dDelta;
iMaxIndel -= (len - queue.size());
for(i=queue.size(); i>=minK; i--, dMaxPenalty-=cMatrix::dDelta, iMaxIndel--){
if( (dMaxPenalty <= 0) || (iMaxIndel < 0) ) break;
if( (queue.back().score < dMaxPenalty) && (queue.back().nIndel <= iMaxIndel) ){
if(!bDetermined || ((i * cMatrix::dMu - queue.back().score) > elem.score * (i+1)) ){
elem = queue.back();
dMaxPenalty = elem.score;
elem.score = (i * cMatrix::dMu - elem.score) / (i+1); // normalization
elem.idx.pos = -(len - i);
bDetermined = true;
}
}
queue.pop_back();
}
}
}
else{
dMaxPenalty = cMatrix::dPenaltyPerErr * queue.size() + 0.001;
for(i=queue.size(); i>=minK; i--, dMaxPenalty-=cMatrix::dPenaltyPerErr){
if(queue.back().score < dMaxPenalty){
elem = queue.back();
elem.score = i * cMatrix::dMu - elem.score; // normalization
result.insert(elem);
}
queue.pop_back();
}
}
}
// In best-alignment mode the result set holds exactly the winner.
if(bDetermined){
result.clear();
result.insert(elem);
}
return bDetermined;
}
// Prepares the barcode-related members of the adapter for a cut length iCut,
// which is first clamped to the adapter length.
// NOTE(review): the loop line below is truncated — the rest of this function
// is missing and the line runs straight into the definition of
// cMatrix::firstAdapters. Restore from the original source.
void cAdapter::initBarcode(int iCut)
{
int i, k;
if(iCut > (int)len) iCut = (int)len;
for(k=0,i=0; i cMatrix::firstAdapters;
// Definitions of cMatrix's static data members.
// NOTE(review): the container definitions below have lost their template
// arguments (e.g. `deque cMatrix::secondAdapters;`); the element types must be
// restored to match the declarations in matrix.h.
deque cMatrix::secondAdapters;
deque cMatrix::junctionAdapters;
vector cMatrix::junctionLengths;
vector< vector > cMatrix::indices;
int cMatrix::iIdxCnt = 0;
vector cMatrix::fw_masked;
vector cMatrix::rv_masked;
vector cMatrix::fw_barcodes;
vector cMatrix::rv_barcodes;
vector cMatrix::fw_primers;
vector cMatrix::rv_primers;
vector cMatrix::rowBc;
vector cMatrix::colBc;
// Defaults for the tunables.
bool cMatrix::bShareAdapter = false;
double cMatrix::dEpsilon = 0.15;
double cMatrix::dEpsilonIndel = 0.03;
// per-error penalty budget: dEpsilon times the mean penalty
double cMatrix::dPenaltyPerErr = cMatrix::dEpsilon * MEAN_PENALTY;
double cMatrix::dDelta = MAX_PENALTY;
double cMatrix::dMu = MEAN_PENALTY;
// per-quality-character penalty table, filled by the parameter initialisation code
double cMatrix::penalty[256];
bool cMatrix::bSensitive = false;
int cMatrix::iMinOverlap = 3;
///////////////////////////////////////
// Every data member of cMatrix is static, so construction and destruction
// have nothing to do.
cMatrix::cMatrix() {}
cMatrix::~cMatrix() {}
// Scores how well `seq` and `seq2` agree over `len` positions, pairing
// position i with position len-1-i of the second read's qualities.
// Returns false as soon as the accumulated penalty exceeds
// dPenaltyPerErr * len; otherwise returns true with `score` set to
// len * dMu minus the accumulated penalty.
// For len <= 0 the function returns true with score = dMu * qLen / 2
// (or 0 when qLen is 0).
// NOTE(review): the `for(int i=0; i ...` line is truncated — the loop bound and
// the computation of `penal` from the two bases are missing.
bool cMatrix::CalcRevCompScore(char * seq, char * seq2, int len, uchar * qual, uchar * qual2, size_t qLen, double &score)
{
double dMaxPenalty = dPenaltyPerErr * len;
double penal;
CODE code, code2;
if(len <= 0){
score = (qLen > 0) ? (dMu * qLen / 2) : 0.0;
// prefer to detect empty reads even if an error ratio of 0.5 is specified
return true;
}
score = 0.0;
for(int i=0; i 0.0){
if(qLen > 0){
// weight the mismatch by the smaller of the two quality-derived penalties
if(cMatrix::penalty[qual[i]] <= cMatrix::penalty[qual2[len-1-i]]){
penal *= cMatrix::penalty[qual[i]];
}
else{
penal *= cMatrix::penalty[qual2[len-1-i]];
}
}
else{
// no qualities available: use the default weight
penal *= dMu;
}
score += penal;
if(score > dMaxPenalty){
return false;
}
}
}
score = len * dMu - score; // normalization
return true;
}
// Builds a string from at most the last MAX_ADAPTER_LEN characters of `seq`.
// NOTE(review): the loop line below is truncated; the remainder of this
// function and the beginning of the following function (which fills
// cMatrix::penalty) are missing. The name suggests a reverse complement, but
// the code that would confirm it is not present here.
string cMatrix::GetRevComp(char * seq, int len)
{
char sequence[MAX_ADAPTER_LEN+1];
if(len > MAX_ADAPTER_LEN){
// keep the tail of an over-long sequence
seq += (len - MAX_ADAPTER_LEN);
len = MAX_ADAPTER_LEN;
}
for(int i=0; i 0);
// NOTE(review): the header and first statements of this function are missing
// above (presumably cMatrix::InitParameters, per the declaration in matrix.h —
// confirm).
// pre-calculate the penalties corresponding to quality values:
//   characters 0..baseQual       -> MIN_PENALTY
//   the next 39 characters       -> MIN_PENALTY + 0.1, +0.2, ... +3.9
//   all remaining characters     -> MAX_PENALTY
int chr;
for(chr=0; chr<=baseQual; chr++){
cMatrix::penalty[chr] = MIN_PENALTY;
}
int i;
for(i=1; i<40; i++,chr++){
cMatrix::penalty[chr] = MIN_PENALTY + i / 10.0;
}
for(; chr<256; chr++){
cMatrix::penalty[chr] = MAX_PENALTY;
}
cMatrix::bShareAdapter = bShareAdapter;
// start from empty adapter lists
cMatrix::firstAdapters.clear();
cMatrix::secondAdapters.clear();
cMatrix::junctionAdapters.clear();
}
void cMatrix::AddAdapter(deque & adapters, char * vector, size_t len, TRIM_MODE trimMode)
{
cAdapter adapter;
adapter.Init(vector, len, trimMode);
adapters.push_back(adapter);
}
void cMatrix::CalculateJunctionLengths()
{
deque::iterator it_adapter;
junctionLengths.push_back(0);
for(it_adapter=junctionAdapters.begin(); it_adapter!=junctionAdapters.end(); it_adapter++){
junctionLengths.push_back((*it_adapter).len);
}
}
// Resets rowBc / colBc, sizes `indices` to nRow x nCol with every cell set to
// -1, and resets iIdxCnt to 0.
// NOTE(review): the loop that follows these statements is truncated (see the
// next line of the file, which runs into the parameter list of another
// function); the part that assigns indices from bMatrix is missing. Template
// arguments on `vector` are missing as well.
void cMatrix::CalculateIndices(vector< vector > &bMatrix, int nRow, int nCol)
{
int i, j;
rowBc.clear();
colBc.clear();
indices.resize(nRow, vector(nCol, -1));
iIdxCnt = 0;
// NOTE(review): the next line fuses a truncated loop of the preceding function
// with the parameter list of what matrix.h declares as cMatrix::InitBarcodes;
// the text in between is missing.
//
// The body rebuilds the forward and reverse barcode tables. Slot 0 of each
// table is a placeholder (NULL mask, empty barcode, empty primer); slot k
// (k >= 1) describes the k-th adapter of the corresponding list.
for(i=0; i & fw_adapters, int iCutF, deque & rv_adapters, int iCutR)
{
// NOTE(review): template arguments are missing on this declaration as well.
deque::iterator it_adapter;
cAdapter * pAdapter;
fw_masked.clear(); fw_masked.push_back(NULL);
fw_barcodes.clear(); fw_barcodes.push_back(string(""));
fw_primers.clear(); fw_primers.push_back(string(""));
for(it_adapter=fw_adapters.begin(); it_adapter!=fw_adapters.end(); it_adapter++){
pAdapter = &(*it_adapter);
pAdapter->initBarcode(iCutF);
// the mask is stored as a pointer into the adapter object itself
fw_masked.push_back(pAdapter->getMasked());
fw_barcodes.push_back(string(pAdapter->getBarcode()));
fw_primers.push_back(string(pAdapter->getPrimer()));
}
rv_masked.clear(); rv_masked.push_back(NULL);
rv_barcodes.clear(); rv_barcodes.push_back(string(""));
rv_primers.clear(); rv_primers.push_back(string(""));
for(it_adapter=rv_adapters.begin(); it_adapter!=rv_adapters.end(); it_adapter++){
pAdapter = &(*it_adapter);
pAdapter->initBarcode(iCutR);
rv_masked.push_back(pAdapter->getMasked());
rv_barcodes.push_back(string(pAdapter->getBarcode()));
rv_primers.push_back(string(pAdapter->getPrimer()));
}
}
// Returns true once more than ceil(dEpsilon * len) positions of `seq` have
// been counted as "blurry"; returns false otherwise.
// NOTE(review): the loop line is truncated — the loop bound and the test that
// decides whether a position counts (and increments iBlurry) are missing.
bool cMatrix::isBlurry(char * seq, size_t len)
{
size_t u;
int iMaxBlurry = ceil(cMatrix::dEpsilon * len);
int iBlurry = 0;
for(u=0; u iMaxBlurry){
return true;
}
}
}
return false;
}
// Quality check over `quals`; an empty input always passes.
// NOTE(review): the loop line is truncated — how `total` is accumulated and the
// expression compared against minQual are missing and must be restored.
bool cMatrix::checkQualities(uchar * quals, size_t len, int minQual)
{
size_t u;
if(len == 0) return true;
int total = 0;
for(u=0; u= minQual;
}
// Returns the length left after dropping trailing entries of `quals` that are
// below minQual: the index just past the last entry with quality >= minQual,
// or 0 when no entry qualifies.
int cMatrix::trimByQuality(uchar * quals, size_t len, int minQual)
{
	int keep = (int)len;
	while( (keep > 0) && (quals[keep-1] < minQual) ){
		keep--;
	}
	return keep;
}
/**
 * Shared worker for findAdapter / findAdapter2 / findJuncAdapter.
 *
 * Aligns `read` against each adapter of `adapters` in best-alignment mode and
 * returns the INDEX of the hit with the highest score. When no adapter yields
 * a hit scoring above -1, the returned index has pos == rLen and bc == 0.
 * For a hit, bc is the adapter's position in the list plus one (set by
 * cAdapter::align).
 *
 * Fix: the three public functions were copies of one another and declared
 * their iterators / pointers as `deque::iterator` / `deque *` without template
 * arguments. The element type is cAdapter.
 */
static INDEX findBestAdapterHit(deque<cAdapter> & adapters, char * read, size_t rLen, uchar * qual, size_t qLen)
{
	cElementSet result;
	double maxScore = -1;
	INDEX index;
	index.pos = int(rLen);
	index.bc = 0;
	int i = 0;
	for(deque<cAdapter>::iterator it_adapter=adapters.begin(); it_adapter!=adapters.end(); ++it_adapter, ++i){
		// align() leaves exactly one element in `result` when it returns true
		if(it_adapter->align(read, rLen, qual, qLen, result, i)){
			if(result.begin()->score > maxScore){
				index = result.begin()->idx;
				maxScore = result.begin()->score;
			}
		}
	}
	return index;
}
// Best hit among the first-read adapters.
INDEX cMatrix::findAdapter(char * read, size_t rLen, uchar * qual, size_t qLen)
{
	return findBestAdapterHit(firstAdapters, read, rLen, qual, qLen);
}
// Best hit among the second-read adapters (the first-read list is used instead
// when bShareAdapter is set).
INDEX cMatrix::findAdapter2(char * read, size_t rLen, uchar * qual, size_t qLen)
{
	return findBestAdapterHit(bShareAdapter ? firstAdapters : secondAdapters, read, rLen, qual, qLen);
}
// Best hit among the junction adapters.
INDEX cMatrix::findJuncAdapter(char * read, size_t rLen, uchar * qual, size_t qLen)
{
	return findBestAdapterHit(junctionAdapters, read, rLen, qual, qLen);
}
bool cMatrix::findAdapterWithPE(char * read, char * read2, size_t rLen, size_t rLen2, uchar * qual, uchar * qual2, size_t qLen, size_t qLen2, INDEX &index, INDEX &index2)
{
deque::iterator it_adapter;
cAdapter * pAdapter;
cElementSet result, result2;
index.pos = int(rLen);
index.bc = -1;
index2.pos = int(rLen2);
index2.bc = -1;
int i;
for(i=0,it_adapter=firstAdapters.begin(); it_adapter!=firstAdapters.end(); it_adapter++,i++){
pAdapter = &(*it_adapter);
pAdapter->align(read, rLen, qual, qLen, result, i, false);
}
deque *pAdapters = (bShareAdapter ? &firstAdapters : &secondAdapters);
for(i=0,it_adapter=pAdapters->begin(); it_adapter!=pAdapters->end(); it_adapter++,i++){
pAdapter = &(*it_adapter);
pAdapter->align(read2, rLen2, qual2, qLen2, result2, i, false);
}
if(result.empty() && result2.empty()){
return false;
}
size_t minQLen = (qLen <= qLen2) ? qLen : qLen2;
double maxScore = -1;
double score;
cElementSet::iterator it_element, it_element2, it_ele, it_ele2;
bool bRevComplement;
int pos, pos2, cpos, iStart, iStart2;
int apos, apos2;
int bc, bc2;
bc = bc2 = 0;
apos = int(rLen);
apos2 = int(rLen2);
it_element = result.begin();
it_element2 = result2.begin();
while( true ){
pos = (it_element == result.end()) ? INT_MAX : it_element->idx.pos;
pos2 = (it_element2 == result2.end()) ? INT_MAX : it_element2->idx.pos;
if( (pos == INT_MAX) && (pos2 == INT_MAX) )
break;
if(pos <= pos2){ // partial ordering: pos < rLen && pos2 < rLen2 iff. pos2 != INT_MAX
cpos = pos;
iStart = (pos <= int(rLen2)) ? 0 : (pos - int(rLen2));
iStart2 = 0;
}
else{ // pos > pos2
cpos = pos2;
iStart = 0;
iStart2 = (pos2 <= int(rLen)) ? 0 : (pos2 - int(rLen));
}
bRevComplement = CalcRevCompScore(read + iStart, read2 + iStart2, cpos, qual + iStart, qual2 + iStart2, minQLen, score);
if(pos < pos2){
if(bRevComplement){
do{
if( (indices[it_element->idx.bc][0] >= 0) && (score + it_element->score > maxScore) ){
maxScore = score + it_element->score;
bc = it_element->idx.bc;
bc2 = 0;
apos = cpos;
apos2 = (cpos <= int(rLen2)) ? cpos : int(rLen2);
}
it_element++;
}while( (it_element != result.end()) && (it_element->idx.pos == cpos) );
}
else{
do{
it_element++;
}while( (it_element != result.end()) && (it_element->idx.pos == cpos) );
}
}
else if(pos > pos2){
if(bRevComplement){
do{
if( (indices[0][it_element2->idx.bc] >= 0) && (score + it_element2->score > maxScore) ){
maxScore = score + it_element2->score;
bc = 0;
bc2 = it_element2->idx.bc;
apos = (cpos <= int(rLen)) ? cpos : int(rLen);
apos2 = cpos;
}
it_element2++;
}while( (it_element2 != result2.end()) && (it_element2->idx.pos == cpos) );
}
else{
do{
it_element2++;
}while( (it_element2 != result2.end()) && (it_element2->idx.pos == cpos) );
}
}
else{ // ==
if(bRevComplement){
for(it_ele = it_element; (it_ele != result.end()) && (it_ele->idx.pos == cpos); it_ele++){
for(it_ele2 = it_element2; (it_ele2 != result2.end()) && (it_ele2->idx.pos == cpos); it_ele2++){
if(indices[it_ele->idx.bc][it_ele2->idx.bc] < 0)
continue;
if(score + it_ele->score + it_ele2->score <= maxScore)
continue;
maxScore = score + it_ele->score + it_ele2->score;
bc = it_ele->idx.bc;
bc2 = it_ele2->idx.bc;
apos = cpos;
apos2 = cpos;
}
}
}
do{
it_element++;
}while( (it_element != result.end()) && (it_element->idx.pos == cpos) );
do{
it_element2++;
}while( (it_element2 != result2.end()) && (it_element2->idx.pos == cpos) );
}
}
index.bc = index2.bc = indices[bc][bc2];
if(index.bc < 0){
return false;
}
if( (apos <= 0) || (apos2 <= 0) ){
index.pos = index2.pos = 0;
}
else{
index.pos = apos;
index2.pos = apos2;
}
return true;
}
// return value
// 0: forward-reverse
// 1: reverse-forward
// -1: no match
int cMatrix::findAdaptersInARead(char * read, size_t rLen, uchar * qual, size_t qLen, INDEX &index)
{
deque::iterator it_adapter;
cAdapter * pAdapter;
cElementSet result;
index.pos = int(rLen);
index.bc = -1;
int i;
size_t nLen;
double maxScore = -1;
int flag = 0;
deque::iterator it_element, it_element2;
for(i=0,it_adapter=firstAdapters.begin(); it_adapter!=firstAdapters.end(); it_adapter++,i++){
pAdapter = &(*it_adapter);
nLen = (pAdapter->len < rLen ? pAdapter->len : rLen);
if(pAdapter->align(read, nLen, qual, qLen, result, i)){
if(result.begin()->score > maxScore){
index = result.begin()->idx;
index.bc = indices[index.bc][0];
maxScore = result.begin()->score;
}
}
}
if(!bShareAdapter){
for(i=0,it_adapter=secondAdapters.begin(); it_adapter!=secondAdapters.end(); it_adapter++,i++){
pAdapter = &(*it_adapter);
nLen = (pAdapter->len < rLen ? pAdapter->len : rLen);
if(pAdapter->align(read, nLen, qual, qLen, result, i)){
if(result.begin()->score > maxScore){
index = result.begin()->idx;
index.bc = indices[0][index.bc];
maxScore = result.begin()->score;
flag = 1;
}
}
}
}
return (index.bc < 0) ? -1 : flag;
}
// return value
// 0: forward-reverse
// 1: reverse-forward
// -1: no match
int cMatrix::findAdaptersBidirectionally(char * read, size_t rLen, uchar * qual, size_t qLen,
char * read2, size_t rLen2, uchar * qual2, size_t qLen2, INDEX &index, INDEX &index2)
{
int bc = -1;
deque::iterator it_adapter;
cAdapter * pAdapter;
cElementSet result;
deque result1, result2, result3, result4;
index.pos = index2.pos = int(rLen);
index.bc = index2.bc = 0;
int i;
size_t nLen;
deque::iterator it_element, it_element2;
for(i=0,it_adapter=firstAdapters.begin(); it_adapter!=firstAdapters.end(); it_adapter++,i++){
pAdapter = &(*it_adapter);
nLen = (pAdapter->len < rLen ? pAdapter->len : rLen);
if(pAdapter->align(read, nLen, qual, qLen, result, i)){
result1.push_back(*result.begin());
}
nLen = (pAdapter->len < rLen2 ? pAdapter->len : rLen2);
if(pAdapter->align(read2, nLen, qual2, qLen2, result, i)){
result3.push_back(*result.begin());
}
}
ELEMENT eElement;
memset((void *)&eElement, 0, sizeof(ELEMENT));
double maxScore = -1;
if(bShareAdapter){
if(result1.empty() ^ result3.empty()){
if(result1.empty()){
result1.push_back(eElement);
}
else{ // result3.empty()
result3.push_back(eElement);
}
}
for(it_element=result1.begin(); it_element!=result1.end(); it_element++){
for(it_element2=result3.begin(); it_element2!=result3.end(); it_element2++){
if(indices[it_element->idx.bc][it_element2->idx.bc] < 0)
continue;
if(it_element->score + it_element2->score > maxScore){
maxScore = it_element->score + it_element2->score;
index = it_element->idx;
index2 = it_element2->idx;
bc = indices[it_element->idx.bc][it_element2->idx.bc];
}
}
}
index.bc = bc;
return (bc < 0) ? -1 : 0;
}
for(i=0,it_adapter=secondAdapters.begin(); it_adapter!=secondAdapters.end(); it_adapter++,i++){
pAdapter = &(*it_adapter);
nLen = (pAdapter->len < rLen2 ? pAdapter->len : rLen2);
if(pAdapter->align(read2, nLen, qual2, qLen2, result, i)){
result2.push_back(*result.begin());
}
nLen = (pAdapter->len < rLen ? pAdapter->len : rLen);
if(pAdapter->align(read, nLen, qual, qLen, result, i)){
result4.push_back(*result.begin());
}
}
if(result1.empty() ^ result2.empty()){
if(result1.empty()){
result1.push_back(eElement);
}
else{ // result2.empty()
result2.push_back(eElement);
}
}
for(it_element=result1.begin(); it_element!=result1.end(); it_element++){
for(it_element2=result2.begin(); it_element2!=result2.end(); it_element2++){
if(indices[it_element->idx.bc][it_element2->idx.bc] < 0)
continue;
if(it_element->score + it_element2->score > maxScore){
maxScore = it_element->score + it_element2->score;
index = it_element->idx;
index2 = it_element2->idx;
bc = indices[it_element->idx.bc][it_element2->idx.bc];
}
}
}
bool bReverse = false;
if(result3.empty() ^ result4.empty()){
if(result3.empty()){
result3.push_back(eElement);
}
else{ // result4.empty()
result4.push_back(eElement);
}
}
for(it_element=result3.begin(); it_element!=result3.end(); it_element++){
for(it_element2=result4.begin(); it_element2!=result4.end(); it_element2++){
if(indices[it_element->idx.bc][it_element2->idx.bc] < 0)
continue;
if(it_element->score + it_element2->score > maxScore){
maxScore = it_element->score + it_element2->score;
index2 = it_element->idx;
index = it_element2->idx;
bc = indices[it_element->idx.bc][it_element2->idx.bc];
bReverse = true;
}
}
}
index.bc = bc;
return (bc < 0) ? -1 : bReverse;
}
// Builds the barcode (sequence and qualities) for combined barcode index
// bcIdx. Returns false when either the forward or the reverse mask for that
// index is NULL.
// NOTE(review): the loop line below is truncated — the code that copies the
// masked positions into barcodeSeq / barcodeQual is missing, and the line runs
// into the `assert` of the following overload.
bool cMatrix::PrepareBarcode(char * barcodeSeq, int bcIdx, char * seq, int len, char * seq2, int len2, char * barcodeQual, char * qual, char * qual2)
{
assert(bcIdx >= 0);
// rowBc / colBc map the combined index back to the forward / reverse adapter slot
bool *mask = fw_masked[rowBc[bcIdx]];
bool *mask2 = rv_masked[colBc[bcIdx]];
if( (mask == NULL) || (mask2 == NULL) ){
return false;
}
int n = 0;
int i;
for(i=0; i= 0);
// NOTE(review): this is the visible remainder of a second function whose
// header was lost on the preceding line (presumably the PrepareBarcode
// overload without quality arguments declared in matrix.h — confirm). Its loop
// line is truncated too and runs into the body of the next function.
bool *mask = fw_masked[rowBc[bcIdx]];
bool *mask2 = rv_masked[colBc[bcIdx]];
// no barcode can be built when either mask is absent
if( (mask == NULL) || (mask2 == NULL) ){
return false;
}
int n = 0;
int i;
for(i=0; iidx.pos;
// NOTE(review): the header and first statements of this function were lost on
// the preceding (truncated) line; what follows is its intact remainder,
// presumably cMatrix::mergePE as declared in matrix.h — confirm.
//
// Fix in this block: the two in-place reversal loops used to start at
// i = eLen/2. For an even eLen that swaps the middle pair twice (once at
// i = eLen/2 and again at i = eLen/2 - 1), leaving it unreversed. They now
// start at eLen/2 - 1, which swaps every pair exactly once for both parities.
if(pos >= 0){
	clen = rLen - pos;
	bRevComplement = CalcRevCompScore(read + pos, read2 + pos, clen, qual + pos, qual2 + pos, qLen, score);
}
}
if(bRevComplement){ // overlap detected
	if(qLen > 0)
		combinePairSeqs(read+pos, read2+pos, clen, clen, qual+pos, qual2+pos, qLen, qLen);
	endPos = startPos + jLen;
	if(endPos + clen >= rLen){ // junction adapter locates in overlapping region
		index.pos = rLen - (endPos + clen - rLen);
	}
	else{
		eLen = rLen - (endPos + clen);
		index.pos += eLen;
		// reverse read2[endPos .. endPos+eLen) in place
		for(i=(int)(eLen/2)-1; i>=0; i--){ //reverse
			chr = read2[endPos + i];
			read2[endPos + i] = read2[endPos + eLen - 1 - i];
			read2[endPos + eLen - 1 - i] = chr;
		}
		// write the complement of the reversed stretch at startPos
		for(i=0; i<(int)eLen; i++){
			read2[startPos + i] = character[complement[codeMap[uchar(read2[endPos + i])]]];
		}
		if(qLen > 0){
			// qualities follow the bases: reverse, then move to startPos
			for(i=(int)(eLen/2)-1; i>=0; i--){ //reverse
				uchr = qual2[endPos + i];
				qual2[endPos + i] = qual2[endPos + eLen - 1 -i];
				qual2[endPos + eLen - 1 - i] = uchr;
			}
			for(i=0; i<(int)eLen; i++){
				qual2[startPos + i] = qual2[endPos + i];
			}
		}
	}
}
else{
	index.pos -= cMatrix::iMinOverlap;
}
return index;
}
// Merges two overlapping reads in place: position i of `read` is paired with
// position len-1-i of the other read; where the other read's quality is
// higher, `read`/`qual` take its quality and (if the bases differ) its base.
// When the lengths differ, the longer input is advanced so that both windows
// have equal length. Returns false, without modifying anything, when fewer
// quality values than bases are available.
// NOTE(review): the loop line is truncated — the loop bound and the
// computation of `code` / `code2` are missing.
bool cMatrix::combinePairSeqs(char * read, char * read2, int len, int len2, uchar * qual, uchar * qual2, int qLen, int qLen2)
{
CODE code, code2;
if(len != len2){
int offset;
if(len > len2){
offset = len - len2;
read += offset;
len -= offset;
qual += offset;
qLen -= offset;
}
else{
offset = len2 - len;
read2 += offset;
qual2 += offset;
qLen2 -= offset;
}
}
int minQLen = (qLen < qLen2) ? qLen : qLen2;
if(minQLen < len){
return false;
}
for(int i=0; i qual[i]){
qual[i] = qual2[len-1-i];
if(code != code2){
read[i] = character[code2];
}
}
}
return true;
}
skewer-0.2.2/src/matrix.h 0000664 0000000 0000000 00000014301 12700552063 0015256 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2016 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef _MATRIX_H
#define _MATRIX_H
#include
#include
#include
#include
#include
#include "common.h"
using namespace std;
// One alignment candidate.
typedef struct{
double score; // penalty while aligning; normalised score once reported
int nIndel;   // number of indels used by this candidate
INDEX idx;    // position (idx.pos) and adapter/barcode number (idx.bc)
}ELEMENT;
// Strict weak ordering for ELEMENTs: candidates are ordered, and considered
// equivalent, by position only (score and adapter number are ignored).
// operator() is const so that it can be invoked on the const comparison object
// held by std::set, which current standard libraries require.
class ElementComparator
{
public:
	bool operator()(const ELEMENT &elem1, const ELEMENT &elem2) const {
		return elem1.idx.pos < elem2.idx.pos;
	}
};
// Set of alignment candidates keyed by position (see ElementComparator).
// The typedef had lost its template arguments (`typedef set ELEMENT_SET;`).
typedef set<ELEMENT, ElementComparator> ELEMENT_SET;
// ELEMENT_SET whose insert() keeps the better-scoring element when two
// candidates share a position.
class cElementSet : public ELEMENT_SET
{
public:
	// Hides the base-class insert overloads; see the definition in matrix.cpp.
	bool insert(const ELEMENT& val);
};
// Nucleotide codes: "none", the four bases, then the ambiguity codes.
// CD_CNT is the total number of codes; CD_BASIC_CNT covers CD_NONE plus the
// four unambiguous bases.
typedef enum{
CD_NONE = 0,
CD_A = 1, // Adenosine
CD_C = 2, // Cytidine
CD_G = 3, // Guanosine
CD_T = 4, // Thymidine (or Uridine)
CD_R = 5, // puRine, A or G
CD_Y = 6, // pYrimidine, T or C
CD_S = 7, // Strong, G or C
CD_W = 8, // Weak, A or T
CD_K = 9, // Keto, G or T
CD_M = 10, // aMino, A or C
CD_B = 11, // not A
CD_D = 12, // not C
CD_H = 13, // not G
CD_V = 14, // not T
CD_N = 15, // any base
CD_CNT = CD_N+1,
CD_BASIC_CNT = 5
}CODE;
class cAdapter
{
char sequence[MAX_ADAPTER_LEN+1]; // for debug only
char barcode[MAX_ADAPTER_LEN+1];
char primer[MAX_ADAPTER_LEN+1];
bool masked[MAX_ADAPTER_LEN+1];
inline void UPDATE_COLUMN(deque & queue, uint64 &d0bits, uint64 &lbits, uint64 &unbits, uint64 &dnbits, double &penal, double &dMaxPenalty, int &iMaxIndel);
public:
size_t len;
TRIM_MODE trimMode;
bool bBestAlign;
uint64 matchBits[CD_CNT];
public:
cAdapter();
~cAdapter();
void Init(char * seq, size_t sLen, TRIM_MODE trimMode);
void Init2(char * seq, size_t sLen);
bool align(char * read, size_t rLen, uchar * qual, size_t qLen, cElementSet &result, int bc, bool bBestAlign=true);
public:
void initBarcode(int iCut);
char * getBarcode() { return barcode; }
char * getPrimer() { return primer; }
bool * getMasked() { return masked; }
};
///////////////////////////////////////
// Static-only holder for the adapter lists, tunables and search routines.
// NOTE(review): the container declarations below have lost their template
// arguments (`vector fw_masked`, `deque firstAdapters`,
// `vector< vector > indices`, `deque & adapters`, ...). They must be restored
// to match the element types used in matrix.cpp before this header compiles.
class cMatrix
{
// cAdapter reads the tunables below directly
friend class cAdapter;
static bool bShareAdapter;
static double dEpsilon, dEpsilonIndel;
static double dPenaltyPerErr;
static double dDelta, dMu;
static double penalty[256];
static bool bSensitive;
public:
// per-adapter barcode tables; slot 0 of each is a placeholder
static vector fw_masked;
static vector rv_masked;
static vector fw_barcodes;
static vector rv_barcodes;
static vector fw_primers;
static vector rv_primers;
// map a combined barcode index back to its forward / reverse slot
static vector rowBc;
static vector colBc;
public:
static deque firstAdapters;
static deque secondAdapters;
static deque junctionAdapters;
static vector junctionLengths;
// indices[row][col]: combined barcode index, negative when the pair is not allowed
static vector< vector > indices;
static int iIdxCnt;
static int iMinOverlap;
public:
cMatrix();
~cMatrix();
private:
static bool CalcRevCompScore(char * seq, char * seq2, int len, uchar * qual, uchar * qual2, size_t qLen, double &score);
static string GetRevComp(char * seq, int len);
public:
static void InitParameters(enum TRIM_MODE trimMode, double dEpsilon, double dEpsilonIndel, int baseQual, bool bShareAdapter);
static void AddAdapter(deque & adapters, char * vector, size_t len, TRIM_MODE trimMode);
static void CalculateJunctionLengths();
static void CalculateIndices(vector< vector > &bMatrix, int nRow, int nCol);
static void InitBarcodes(deque & fw_primers, int iCutF, deque & rv_primers, int iCutR);
static bool isBlurry(char * seq, size_t len);
static bool checkQualities(uchar * quals, size_t len, int minQual);
static int trimByQuality(uchar * quals, size_t len, int minQual);
static INDEX findAdapter(char * read, size_t rLen, uchar * qual, size_t qLen);
static INDEX findAdapter2(char * read, size_t rLen, uchar * qual, size_t qLen);
static INDEX findJuncAdapter(char * read, size_t rLen, uchar * qual, size_t qLen);
static bool findAdapterWithPE(char * read, char * read2, size_t rLen, size_t rLen2, uchar * qual, uchar * qual2, size_t qLen, size_t qLen2, INDEX &index, INDEX & index2);
static int findAdaptersBidirectionally(char * read, size_t rLen, uchar * qual, size_t qLen,
char * read2, size_t rLen2, uchar * qual2, size_t qLen2, INDEX &index, INDEX &index2);
static int findAdaptersInARead(char * read, size_t rLen, uchar * qual, size_t qLen, INDEX &index);
static bool PrepareBarcode(char * barcodeSeq, int bcIdx, char * seq, int len, char * seq2, int len2, char * barcodeQual, char * qual, char * qual2);
static bool PrepareBarcode(char * barcodeSeq, int bcIdx, char * seq, int len, char * seq2, int len2);
static INDEX mergePE(char * read, char * read2, size_t rLen, uchar * qual, uchar * qual2, size_t qLen, size_t startPos, size_t jLen);
static bool combinePairSeqs(char * read, char * read2, int len, int len2, uchar * qual, uchar * qual2, int qLen, int qLen2);
};
#endif // _MATRIX_H
skewer-0.2.2/src/parameter.cpp 0000664 0000000 0000000 00000107761 12700552063 0016302 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2016 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include
#include
#include
#include
#include
#include
#include "parameter.h"
#include "fastq.h"
using namespace std;
// Program identification strings.
const char * VERSION = "0.2.2";
const char * DATE = "April 4, 2016";
const char * AUTHOR = "Hongshan Jiang";
// Built-in adapter sequences.
// ILLUMINA_ADAPTER_PREFIX is the 13-base prefix shared by both pair adapters.
const char * ILLUMINA_ADAPTER_PREFIX = "AGATCGGAAGAGC";
// Each *_PREFIX below is the leading part of the corresponding full adapter
// (for pair 1: the part before the NNNNNN stretch).
const char * ILLUMINA_PAIR1_ADAPTER_PREFIX = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC";
const char * ILLUMINA_PAIR1_ADAPTER = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG";
const char * ILLUMINA_PAIR2_ADAPTER_PREFIX = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA";
const char * ILLUMINA_PAIR2_ADAPTER = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT";
const char * ILLUMINA_SRNA_ADAPTER = "TCGTATGCCGTCTTCTGCTTGT";
const char * ILLUMINA_JUNCTION_ADAPTER = "CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG";
// printf-style output to `stream`, optionally wrapped in an ANSI colour
// sequence. Returns the value of the underlying vfprintf call (the escape
// sequences are not counted).
// colorCode:
// -1: normal (no escape sequences are written)
// 0: black
// 1: red
// 2: green
// 3: yellow
// 4: blue
// 5: magenta
// 6: cyan
// 256: bold (flag, OR-ed onto one of the codes above)
// Codes above 7 are clamped to 7.
int color_fprintf(int colorCode, FILE *stream, const char *format, ...)
{
	const bool bBold = (colorCode & 256) != 0;
	colorCode &= ~256;
	if(colorCode > 7){
		colorCode = 7;
	}
	const bool bColored = (colorCode >= 0);

	if(bColored){
		fprintf(stream, "\033[%d;3%dm", bBold, colorCode);
	}

	va_list arglist;
	va_start(arglist, format);
	const int iRet = vfprintf(stream, format, arglist);
	va_end(arglist);

	if(bColored){
		fprintf(stream, "\033[0m");
	}
	return iRet;
}
// sprintf counterpart of color_fprintf: writes the formatted text into `str`,
// wrapped in an ANSI colour sequence unless colorCode is negative after the
// bold flag (256) is removed. Codes above 7 are clamped to 7.
// Returns the value of the underlying vsprintf call, i.e. the length of the
// formatted text without the escape sequences.
// The caller must provide a buffer large enough for the text plus both escape
// sequences.
//
// Fix: all three writes used to start at `str`, so the formatted text
// overwrote the colour prefix and the reset sequence then overwrote the text,
// leaving only "\33[0m" in the buffer. The writes are now chained.
int color_sprintf(int colorCode, char *str, const char *format, ...)
{
	va_list arglist;
	bool bBold = (colorCode & 256) != 0;
	colorCode &= ~256;
	if(colorCode > 7){
		colorCode = 7;
	}
	char *cursor = str;
	if(colorCode >= 0){
		int nPrefix = sprintf(cursor, "\33[%d;3%dm", bBold, colorCode);
		if(nPrefix > 0)
			cursor += nPrefix;
	}
	va_start(arglist, format);
	int iRet = vsprintf(cursor, format, arglist);
	va_end(arglist);
	// append the reset sequence only when the body was written successfully
	if( (colorCode >= 0) && (iRet >= 0) )
		sprintf(cursor + iRet, "\33[0m");
	return iRet;
}
void color_fprintf_sequences(int colorCode, FILE *stream, vector &sequences, char leading)
{
vector::iterator it_seq;
int i;
for(i=1,it_seq=sequences.begin(); it_seq!=sequences.end(); it_seq++,i++){
fprintf(stream, "%c%02d:\t", leading, i);
color_fprintf(colorCode, stream, "%s\n", (*it_seq).c_str());
}
}
void color_fprintf_sequences(int colorCode, FILE *stream, vector &sequences, vector &names)
{
assert(sequences.size() + 1 == names.size());
vector::iterator it_seq;
int i;
for(i=1,it_seq=sequences.begin(); it_seq!=sequences.end(); it_seq++,i++){
fprintf(stream, "%s:\t", names[i].c_str());
color_fprintf(colorCode, stream, "%s\n", (*it_seq).c_str());
}
}
///////////////////////////////////////
// Default constructor: initialises every option member to its built-in
// default and points input[] at the internal file-name buffers.
cParameter::cParameter()
{
	// program identity and raw command line
	version = VERSION;
	argv = NULL;
	argc = 0;

	// input file names: input[k] points at arr[k], both start out empty
	input[0] = arr[0];
	input[1] = arr[1];
	input[1][0] = '\0';
	input[0][0] = '\0';
	nFileCnt = 0;
	bStdin = false;

	// output naming and format
	logfile[0] = '\0';
	pDecorate = "";
	outputFormat = COMPRESS_NONE;
	bStdout = false;
	bWriteMasked = false;
	bWriteExcluded = false;

	// FASTQ quality encoding
	fastqFormat = UNKNOWN_FASTQ;
	bAutoFormat = true;
	baseQual = 33;

	// adapters and trimming mode
	trimMode = TRIM_DEFAULT;
	bShareAdapter = false;
	bXFile = false;
	bYFile = false;
	bJFile = false;
	bBarcode = false;
	bRedistribute = false;

	// clipping
	bClip = false;
	bQiime = false;
	iCutF = 0;
	iCutR = 0;
	bCutTail = false;
	bFillWithNs = false;

	// tolerance
	epsilon = 0.1;
	delta = 0.03;
	minK = 5;

	// filtering
	bFilterNs = false;
	bFilterUndetermined = false;
	minLen = 18;
	maxLen = 0;
	minAverageQual = 0;
	minEndQual = 0;

	// miscellaneous
	nThreads = 1;
	bQuiet = false;
	bEnquireVersion = false;
}
// Locate the extension separator of a path.
//
// Returns a pointer to the last '.' that appears after the last '/' in `str`.
// When the final path component has no '.', the returned pointer addresses
// the terminating NUL of `str`, so a caller can always truncate or append at
// the returned position.
//
// The previous do/while form post-incremented past the terminator, so the
// "no dot" result pointed one byte beyond the string and anything a caller
// wrote there was hidden behind the NUL.
char * cParameter::occOfLastDot (char * str)
{
	char * lastDot = NULL;
	char * p = str;
	for(; *p != '\0'; p++){
		if(*p == '.'){
			lastDot = p;
		}
		else if(*p == '/'){
			// a dot inside a directory name is not an extension
			lastDot = NULL;
		}
	}
	// here p addresses the terminating NUL
	return (lastDot != NULL) ? lastDot : p;
}
// Tell whether `str` names a directory, i.e. whether it is non-empty and its
// final character is '/'.
bool cParameter::IsDirectorySpecified (char * str)
{
	const size_t len = strlen(str);
	if(len == 0){
		return false;
	}
	return str[len - 1] == '/';
}
int cParameter::ReadMatrix(const char * fileName)
{
char * line = NULL;
size_t alloc;
FILE * fp = fopen(fileName, "r");
if(fp == NULL)
return -1;
int iRet = 0;
int len, iRow, iCol;
char * pch;
bool bval;
bool bAdd1stCol = false;
iRow = 0;
while( (len = getline(&line, &alloc, fp)) > 0 ){
line[--len] = '\0';
if( (len > 0) && (line[len-1] == '\r') ){
line[--len] = '\0';
}
iCol = 0;
pch = strtok(line, "\t");
if(line[0] == '#'){
if(iRow > 0){
iRet = -2;
break;
}
while(pch != NULL){
if(iCol > 0){
if( (iCol == 1) && (strcmp(pch, "%") != 0) ){
colNames.push_back("%");
bAdd1stCol = true;
}
colNames.push_back(pch);
}
pch = strtok(NULL, "\t");
iCol++;
}
iRow++;
continue;
}
if(iRow == 1){
if( (pch == NULL) || (strcmp(pch, "%") != 0) ){
rowNames.push_back("%");
bMatrix.push_back(vector(colNames.size(),false));
}
}
vector bvec;
if(bAdd1stCol)
bvec.push_back(false);
while(pch != NULL){
if(iCol == 0){
rowNames.push_back(pch);
}
else{
bval = (atoi(pch) > 0);
bvec.push_back(bval);
}
pch = strtok(NULL, "\t");
iCol++;
}
bMatrix.push_back(bvec);
iRow++;
}
if(line != NULL)
free(line);
fclose(fp);
if( (rowNames.size() > 0) && (colNames.size() > 0) ){
bMatrix[0][0] = false;
}
return iRet;
}
int cParameter::ReadFasta(const char * fileName, vector & sequences)
{
char * line = NULL;
size_t alloc;
FILE * fp = fopen(fileName, "r");
if(fp == NULL)
return -1;
int iRet = 0;
int len;
int no = 0;
string seq;
while( (len = getline(&line, &alloc, fp)) > 0 ){
if(line[0] == '>'){
if(no > 0){
if(seq.length() == 0){
iRet = -2;
break;
}
if((int)seq.length() > MAX_ADAPTER_LEN){
if( (trimMode & TRIM_ANY) == TRIM_HEAD )
seq.assign(seq.substr(seq.length() - MAX_ADAPTER_LEN, string::npos));
else
seq.assign(seq.substr(0, MAX_ADAPTER_LEN));
}
sequences.push_back(seq);
seq.clear();
}
if(int(sequences.size()) == MAX_ADAPTER_CNT){
fprintf(stderr, "\rWarning: only uses the first %d adapter sequences in \"%s\"\n", MAX_ADAPTER_CNT, fileName);
break;
}
no++;
continue;
}
if(no == 0){
iRet = -2;
break;
}
line[--len] = '\0';
if( (len > 0) && (line[len-1] == '\r') ){
line[--len] = '\0';
}
seq.append(line);
}
if(seq.length() > 0){
if((int)seq.length() > MAX_ADAPTER_LEN){
if( (trimMode & TRIM_ANY) == TRIM_HEAD )
seq.assign(seq.substr(seq.length() - MAX_ADAPTER_LEN, string::npos));
else
seq.assign(seq.substr(0, MAX_ADAPTER_LEN));
}
sequences.push_back(seq);
}
if(line != NULL)
free(line);
fclose(fp);
return iRet;
}
///////////////////////////////////////////
// public subroutines
// Report whether the FASTQ quality format is still set to automatic
// detection (the value of bAutoFormat).
bool cParameter::IsAutoFastqFormat()
{
	const bool autoDetect = bAutoFormat;
	return autoDetect;
}
// Write the tab-indented version, author and last-update lines to `fp`.
void cParameter::PrintVersion(FILE * fp)
{
	fprintf(fp,
		"\tskewer version: %s\n"
		"\tAuthor: %s\n"
		"\tLast update: %s\n",
		VERSION, AUTHOR, DATE);
}
// Write the full help text to `fp`: banner, usage synopsis, the option list
// grouped by topic, and example command lines. `program` is substituted for
// the executable name in the synopsis and in the examples.
// NOTE(review): several help strings look as if their argument placeholders
// were lost (e.g. "-x Adapter sequence/file", "-c, --cut ,") -- confirm
// against the original source before relying on this text.
// NOTE(review): the tolerance help says the reciprocal is used when the value
// is "> or = 2" and mentions -e, while GetOpt() in this file applies the
// reciprocal only for values > 2 and only for -r and -d -- confirm which is
// intended.
void cParameter::PrintUsage(char * program, FILE * fp)
{
fprintf(fp, "Skewer (A fast and accurate adapter trimmer for paired-end reads)\n");
fprintf(fp, "Version %s (updated in %s), Author: %s\n\n", VERSION, DATE, AUTHOR);
fprintf(fp, "USAGE: %s [options] [paired-reads.fastq]\n", program);
fprintf(fp, " or %s [options] - (for input from STDIN)\n\n", program);
fprintf(fp, "OPTIONS (ranges in brackets, defaults in parentheses):\n");
fprintf(fp, " Adapter:\n");
fprintf(fp, " -x Adapter sequence/file (%s)\n", ILLUMINA_PAIR1_ADAPTER_PREFIX);
fprintf(fp, " -y Adapter sequence/file for pair-end reads (%s),\n", ILLUMINA_PAIR2_ADAPTER_PREFIX);
fprintf(fp, " implied by -x if -x is the only one specified explicitly.\n");
fprintf(fp, " -M, --matrix File indicates valid adapter pairing (all-ones matrix).\n");
fprintf(fp, " -j Junction adapter sequence/file for Nextera Mate Pair reads (%s)\n", ILLUMINA_JUNCTION_ADAPTER);
fprintf(fp, " -m, --mode trimming mode; 1) single-end -- head: 5' end; tail: 3' end; any: anywhere (tail)\n");
fprintf(fp, " 2) paired-end -- pe: paired-end; mp: mate-pair; ap: amplicon (pe)\n");
fprintf(fp, " -b, --barcode Demultiplex reads according to adapters/primers (no)\n");
fprintf(fp, " Tolerance:\n");
fprintf(fp, " -r Maximum allowed error rate (normalized #errors / length of aligned region) [0, 0.5], (0.1)\n");
fprintf(fp, " -d Maximum allowed indel error rate [0, r], (0.03)\n");
fprintf(fp, " reciprocal is used for -r, -e and -d when num > or = 2\n");
fprintf(fp, " -k Minimum overlap length for adapter detection [1, inf);\n");
fprintf(fp, " (max(1, int(4-10*r)) for single-end; (/2) for mate-pair)\n");
fprintf(fp, " Clipping:\n");
fprintf(fp, " -c, --cut , Hard clip off the 5' leading bases as the barcodes in amplicon mode; (no)\n");
fprintf(fp, " -e, --cut3 Hard clip off the 3' tailing bases if the read length is greater than\n");
fprintf(fp, " the maximum read length specified by -L; (no)\n");
fprintf(fp, " Filtering:\n");
fprintf(fp, " -q, --end-quality Trim 3' end until specified or higher quality reached; (0)\n");
fprintf(fp, " -Q, --mean-quality The lowest mean quality value allowed before trimming; (0)\n");
fprintf(fp, " -l, --min The minimum read length allowed after trimming; (18)\n");
fprintf(fp, " -L, --max The maximum read length allowed after trimming; (no limit)\n");
fprintf(fp, " -n Whether to filter out highly degenerative (many Ns) reads; (no)\n");
fprintf(fp, " -u Whether to filter out undetermined mate-pair reads; (no)\n");
fprintf(fp, " -N, --fillNs Whether to replace trimmed bases with Ns (has no effect with 'b' or '-m mp'); (no)\n");
fprintf(fp, " Input/Output:\n");
fprintf(fp, " -f, --format Format of FASTQ quality value: sanger|solexa|auto; (auto)\n");
fprintf(fp, " -o, --output Base name of output file; ('.trimmed')\n");
fprintf(fp, " -z, --compress Compress output in GZIP format (no)\n");
fprintf(fp, " -1, --stdout Redirect output to STDOUT, suppressing -b, -o, and -z options (no)\n");
fprintf(fp, " --qiime Prepare the \"barcodes.fastq\" and \"mapping_file.txt\" for processing with QIIME; (default: no)\n");
fprintf(fp, " --quiet No progress update (not quiet)\n");
fprintf(fp, " -A, --masked-output Write output file(s) for trimmed reads (trimmed bases converted to lower case) (no)\n");
fprintf(fp, " -X, --excluded-output Write output file(s) for excluded reads (no)\n");
fprintf(fp, " Miscellaneous:\n");
fprintf(fp, " -i, --intelligent For mate-pair mode, whether to redistribute reads based on junction information; (no)\n");
fprintf(fp, " -t, --threads Number of concurrent threads [1, 32]; (1)\n");
fprintf(fp, "\nEXAMPLES:\n");
fprintf(fp, " %s -Q 9 -t 2 -x adapters.fa sample.fastq -o trimmed\n", program);
// NOTE(review): ILLUMINA_ADAPTER_PREFIX is not defined in the visible part of
// this file -- confirm it exists elsewhere.
fprintf(fp, " %s -x %s -q 3 sample-pair1.fq.gz sample-pair2.fq.gz\n", program, ILLUMINA_ADAPTER_PREFIX);
fprintf(fp, " %s -x %s -l 16 -L 30 -d 0 srna.fastq\n", program, ILLUMINA_SRNA_ADAPTER);
fprintf(fp, " %s -m mp -i lmp-pair1.fastq lmp-pair2.fastq\n", program);
fprintf(fp, " %s -m ap --cut 0,6 --qiime -x forward-primers.fa -y reverse-primers.fa mix-pair1.fastq mix-pair2.fastq\n", program);
}
// Write the two-line short usage hint (synopsis plus a pointer to --help) to
// `fp`, substituting `program` for the executable name in both lines.
void cParameter::PrintSimpleUsage(char * program, FILE * fp)
{
	fprintf(fp,
		"Usage: %s [options] [file2]\n"
		"Try `%s --help' for more information.\n",
		program, program);
}
// Write the command line (argv[0] followed by the remaining arguments) to
// `fp`; nothing is written when argc <= 0.
// NOTE(review): this span is damaged in the text at hand. The loop header
// below is cut off, and what follows up to the closing brace appears to be
// the tail of a different function that lists input and output file names
// (the class declares printRelatedFiles(), which has no other visible
// definition in this file). Restore both functions from the original source
// before changing any code here.
void cParameter::printCommandLine(FILE * fp)
{
int i;
if(argc <= 0) return;
fprintf(fp, "COMMAND LINE:\t%s", argv[0]);
for(i=1; i 0)
fprintf(fp, "; ");
fprintf(fp, "%s", output[i].c_str());
}
fprintf(fp, "\n");
if(!untrimmed.empty()){
fprintf(fp, "un%s:\t%s\n", pDecorate, untrimmed.c_str());
}
}
}
else{
// two-file case: each output entry is printed as a "file1, file2" pair
fprintf(fp, "Input file:\t%s\n", input[0]);
fprintf(fp, "Paired file:\t%s\n", input[1]);
fprintf(fp, "%s:\t", pDecorate);
for(i=0; i 0)
fprintf(fp, "; ");
fprintf(fp, "%s, %s", output[i].c_str(), output2[i].c_str());
}
fprintf(fp, "\n");
if(!untrimmed.empty()){
// NOTE(review): the single-file branch above labels this line "un%s"; here
// the label is plain "%s" -- confirm whether the "un" prefix is missing.
fprintf(fp, "%s:\t%s, %s\n", pDecorate, untrimmed.c_str(), untrimmed2.c_str());
}
if(bQiime){
fprintf(fp, "barcode file:\t%s\n", barcodes.c_str());
fprintf(fp, "mapping file:\t%s\n", mapfile.c_str());
}
}
}
// Write the one-line "skewer v<version> [<date>]" banner to `fp`.
void cParameter::printVersion(FILE *fp)
{
	const char * const banner = "skewer v%s [%s]\n";
	fprintf(fp, banner, VERSION, DATE);
}
// Write the five-line ASCII-art logo to `fp`, preceded by an empty line when
// bLeadingRtn is true.
void cParameter::printLogo(FILE *fp, bool bLeadingRtn)
{
	// The rows contain no '%' characters, so fputs() emits exactly what
	// fprintf() would.
	static const char * const logoRows[] = {
		".--. .-.\n",
		": .--': :.-.\n",
		"`. `. : `'.' .--. .-..-..-. .--. .--.\n",
		"_`, :: . `.' '_.': `; `; :' '_.': ..'\n",
		"`.__.':_;:_;`.__.'`.__.__.'`.__.':_;\n"
	};
	const size_t rowCnt = sizeof(logoRows) / sizeof(logoRows[0]);

	if(bLeadingRtn){
		fputs("\n", fp);
	}
	for(size_t r = 0; r < rowCnt; r++){
		fputs(logoRows[r], fp);
	}
}
// Print `label` followed by the minimum overlap length, writing "inf" when
// the value is INT_MAX.
static void printMinOverlap(FILE * fp, const char * label, int overlap)
{
	fputs(label, fp);
	if(overlap == INT_MAX)
		fprintf(fp, "inf\n");
	else
		fprintf(fp, "%d\n", overlap);
}

// Write a summary of the parameters in effect to `fp`, preceded by an empty
// line when bLeadingRtn is true. Adapter sequences are coloured (colour 3)
// only when `fp` is stdout.
void cParameter::printOpt(FILE * fp, bool bLeadingRtn)
{
	const int color = (fp == stdout) ? 3 : -1;
	const bool bPaired = (nFileCnt == 2);
	const bool bMatePair = (trimMode == TRIM_MP);

	if(bLeadingRtn){
		fprintf(fp, "\n");
	}
	fprintf(fp, "Parameters used:\n");

	// adapter
	const char * endInfo = "3' end";
	if( (trimMode & TRIM_ANY) == TRIM_HEAD ){
		endInfo = "5' end";
	}
	if(!bXFile){
		fprintf(fp, "-- %s adapter sequence (-x):", endInfo);
		color_fprintf(color, fp, "\t%s\n", x_str.c_str());
	}
	else{
		fprintf(fp, "-- %s adapter sequences in file (-x):", endInfo);
		fprintf(fp, "\t%s\n", x_str.c_str());
		color_fprintf_sequences(color, fp, adapters, rowNames);
	}
	if(bPaired && !bShareAdapter){
		if(!bYFile){
			fprintf(fp, "-- paired %s adapter sequence (-y):", endInfo);
			color_fprintf(color, fp, "\t%s\n", y_str.c_str());
		}
		else{
			fprintf(fp, "-- paired %s adapter sequences in file (-y):", endInfo);
			fprintf(fp, "\t%s\n", y_str.c_str());
			color_fprintf_sequences(color, fp, adapters2, colNames);
		}
	}
	if(bPaired && bMatePair){
		if(!bJFile){
			fprintf(fp, "-- junction adapter sequence (-j):");
			color_fprintf(color, fp, "\t%s\n", j_str.c_str());
		}
		else{
			fprintf(fp, "-- junction adapter sequences in file (-j):");
			fprintf(fp, "\t%s\n", j_str.c_str());
			color_fprintf_sequences(color, fp, juncAdapters, 'J');
		}
	}

	// penalty
	fprintf(fp, "-- maximum error ratio allowed (-r):\t%.3f\n", epsilon);
	fprintf(fp, "-- maximum indel error ratio allowed (-d):\t%.3f\n", delta);

	// filtering
	if(minAverageQual > 0){
		fprintf(fp, "-- mean quality threshold (-Q):\t\t%d\n", minAverageQual);
	}
	if(minEndQual > 0){
		fprintf(fp, "-- end quality threshold (-q):\t\t%d\n", minEndQual);
	}
	fprintf(fp, "-- minimum read length allowed after trimming (-l):\t%d\n", minLen);
	if(maxLen > 0){
		fprintf(fp, "-- maximum read length for output (-L):\t%d\n", maxLen);
	}

	// input
	fprintf(fp, "-- file format (-f):\t\t%s %s\n", FASTQ_FORMAT_NAME[fastqFormat], (bAutoFormat ? "(auto detected)" : ""));

	// misc: pick the -k label that applies, if any
	const char * overlapLabel = NULL;
	if(nFileCnt < 2){
		overlapLabel = "-- minimum overlap length for adapter detection (-k):\t";
	}
	else if(bMatePair){
		overlapLabel = "-- minimum overlap length for junction adapter detection (-k):\t";
	}
	else if( (trimMode != TRIM_PE) && !(trimMode & TRIM_AP) ){
		overlapLabel = "-- minimum overlap length for adapter detection (-k):\t";
	}
	if(overlapLabel != NULL){
		printMinOverlap(fp, overlapLabel, minK);
	}
	if( (nFileCnt >= 2) && bMatePair && bRedistribute ){
		fprintf(fp, "-- redistribute reads based on junction information (-i):\tyes\n");
	}
	if(nThreads > 1){
		fprintf(fp, "-- number of concurrent threads (-t):\t%d\n", nThreads);
	}
}
// Parse the command line into this object and derive everything that depends
// on it: trimming mode, adapter lists, tolerance values, and the names of the
// log and output files.
//
// argc/argv : the program's command line (also stored in this->argc/argv).
// errMsg    : caller-supplied buffer that receives a message when the
//             function fails; written with sprintf(), so it must be large
//             enough for the message plus any echoed argument.
//
// Returns 0 on success and a negative value on failure:
//   -1 : an option without its own case was given (-h, or -v, which also
//        sets bEnquireVersion);
//   -2 : an error whose message is already in errMsg;
//   -3 : an invalid option argument (errMsg is filled in after the loop).
//
// NOTE(review): three spans of this function are cut off in the text at hand
// (the option loop header, the pairing-matrix construction, and the barcode
// output naming); they are marked below and must be restored from the
// original source before any code change is attempted.
int cParameter::GetOpt(int argc, char *argv[], char * errMsg)
{
// Short options; a ':' after a letter means the option takes an argument.
// '#' and '*' are internal codes reachable only via --qiime and --quiet.
const char *options = "x:y:j:m:r:d:q:l:L:M:nuf:bc:e#o:z1Q:k:t:i*vhAXN";
OPTION_ITEM longOptions[] = {
{"barcode", 'b'},
{"mode", 'm'},
{"end-quality", 'q'},
{"mean-quality", 'Q'},
{"min", 'l'},
{"max", 'L'},
{"matrix", 'M'},
{"output", 'o'},
{"threads", 't'},
{"format", 'f'},
{"stdout", '1'},
{"compress", 'z'},
{"cut", 'c'}, // hard clip for clipping 6bp or 8bp tags from amplicon reads
// example: --cut 0,6 for cutting leading 6 bp from read matches reverse primer
{"cut3", 'e'},
{"qiime", '#'},
{"intelligent", 'i'},
{"quiet", '*'},
{"version", 'v'},
{"help", 'h'},
{"masked-output", 'A'},
{"excluded-output", 'X'},
{"fillNs", 'N'}
};
char basename[MAX_PATH+1+100];
char trimmed[MAX_PATH+1+100];
this->argc = argc;
this->argv = argv;
// bSetX..bSetK record which options were given explicitly
bool bSetX, bSetY, bSetM, bSetJ, bSetO, bSetL, bSetR, bSetD, bSetK;
bSetX = bSetY = bSetM = bSetJ = bSetO = bSetL = bSetR = bSetD = bSetK = false;
int iRet = 0;
int i, j;
char chr;
const char * str;
// NOTE(review): the next line is truncated in this text -- the loop header,
// the handling of non-option arguments and the long-option lookup are missing.
for(i=1; i= int(sizeof(longOptions)/sizeof(OPTION_ITEM))){
sprintf(errMsg, "No such an option --%s", str);
iRet = -2;
break;
}
}
else if(chr == '\0'){
// a bare "-" selects standard input
bStdin = true;
continue;
}
str = strchr(options, chr);
if(str == NULL){
sprintf(errMsg, "No such an option -%c", chr);
iRet = -2;
break;
}
if(str[1] == ':'){ // has an argument
if(++i >= argc){
sprintf(errMsg, "-%c needs an argument", chr);
iRet = -2;
break;
}
}
switch(chr){
// -x/-y/-j: the argument is treated as a file name when it contains a '.',
// otherwise as a literal sequence
case 'x':
x_str.assign(argv[i]);
str = strchr(x_str.c_str(), '.');
bXFile = (str != NULL);
bSetX = true;
break;
case 'y':
y_str.assign(argv[i]);
str = strchr(y_str.c_str(), '.');
bYFile = (str != NULL);
bSetY = true;
break;
case 'j':
j_str.assign(argv[i]);
str = strchr(j_str.c_str(), '.');
bJFile = (str != NULL);
bSetJ = true;
break;
case 'm':
if(strcasecmp(argv[i], "head") == 0){
trimMode = TRIM_HEAD;
}
else if(strcasecmp(argv[i], "any") == 0){
trimMode = TRIM_ANY;
}
else if(strcasecmp(argv[i], "mp") == 0){
trimMode = TRIM_MP;
}
else if(strcasecmp(argv[i], "ap") == 0){
trimMode = TRIM_AP;
}
else if(strcasecmp(argv[i], "tail") == 0){
trimMode = TRIM_TAIL;
}
else if(strcasecmp(argv[i], "pe") == 0){
trimMode = TRIM_PE;
}
else{
iRet = -3;
}
break;
case 'M':
m_str.assign(argv[i]);
bSetM = true;
break;
case 'r':
if( (argv[i][0] < '0' || argv[i][0] > '9') && argv[i][0] != '.' ){
iRet = -3;
break;
}
epsilon = atof(argv[i]);
// values above 2 are taken as a reciprocal; the rest is capped at 0.5
if(epsilon < 0) epsilon = 0;
else if(epsilon > 2) epsilon = 1 / epsilon;
else if(epsilon > 0.5) epsilon = 0.5;
bSetR = true;
break;
case 'd':
if( (argv[i][0] < '0' || argv[i][0] > '9') && argv[i][0] != '.' ){
iRet = -3;
break;
}
delta = atof(argv[i]);
// values above 2 are taken as a reciprocal; capped by epsilon further below
if(delta < 0) delta = 0;
else if(delta > 2) delta = 1 / delta;
bSetD = true;
break;
case 'q':
if(argv[i][0] < '0' || argv[i][0] > '9'){
iRet = -3;
break;
}
minEndQual = atoi(argv[i]);
break;
case 'Q':
if(argv[i][0] < '0' || argv[i][0] > '9'){
iRet = -3;
break;
}
minAverageQual = atoi(argv[i]);
break;
case 'l':
if(argv[i][0] < '0' || argv[i][0] > '9'){
iRet = -3;
break;
}
minLen = atoi(argv[i]);
if(minLen < 0) minLen = 0;
break;
case 'L':
if(argv[i][0] < '0' || argv[i][0] > '9'){
iRet = -3;
break;
}
maxLen = atoi(argv[i]);
bSetL = true;
break;
case 'n':
bFilterNs = true;
break;
case 'u':
bFilterUndetermined = true;
break;
case 'f':
// "auto" keeps the defaults set by the constructor
if(strcasecmp(argv[i], "auto") == 0){
break;
}
bAutoFormat = false;
if(strcasecmp(argv[i], "sanger") == 0){
fastqFormat = SANGER_FASTQ;
baseQual = 33;
}
else if(strcasecmp(argv[i], "solexa") == 0){
fastqFormat = SOLEXA_FASTQ;
baseQual = 64;
}
else{
sprintf(errMsg, "unknown format '%s', please select either 'sanger' or 'solexa'", argv[i]);
iRet = -2;
}
break;
case 'o':
gzstrncpy(basename, argv[i], MAX_PATH);
bSetO = true;
break;
case 'z':
outputFormat = COMPRESS_GZ;
break;
case 'c':
{
// argument has the form "<forward>,<reverse>"
// NOTE(review): `line` comes from strdup() and is not freed on any path in
// this block -- looks like a small leak; confirm.
char * line = strdup(argv[i]);
char * fnum = strtok(line, ",");
char * rnum = strtok(NULL, ",");
if( (fnum == NULL) || (rnum == NULL) ){
sprintf(errMsg, "invalid parameter '--cut %s'", argv[i]);
iRet = -2;
break;
}
iCutF = atoi(fnum);
iCutR = atoi(rnum);
if(iCutF < 0) iCutF = 0;
if(iCutR < 0) iCutR = 0;
if( ((iCutF + iCutR) == 0) || ((iCutF + iCutR) > 24) ){
sprintf(errMsg, "invalid parameter: \"--cut %s\", the combined length should be in the range of (0, 24]", argv[i]);
iRet = -2;
break;
}
}
bClip = true;
break;
case 'e':
bCutTail = true;
break;
case '1':
bStdout = true;
break;
case '*':
bQuiet = true;
break;
case 'k':
// "inf" is stored as INT_MAX; anything below 1 becomes 1
minK = (strcasecmp(argv[i], "inf") == 0) ? INT_MAX : atoi(argv[i]);
if(minK < 1) minK = 1;
bSetK = true;
break;
case 't':
if(argv[i][0] < '0' || argv[i][0] > '9'){
iRet = -3;
break;
}
nThreads = atoi(argv[i]);
// clamp to [1, 32]
if(nThreads < 1) nThreads = 1;
else if(nThreads > 32) nThreads = 32;
break;
case 'b':
bBarcode = true;
break;
case '#':
bQiime = true;
break;
case 'i':
bRedistribute = true;
break;
case 'A':
bWriteMasked = true;
break;
case 'X':
bWriteExcluded = true;
break;
case 'N':
bFillWithNs = true;
break;
default:
// reached for -v and -h, which are in `options` but have no case of their own
iRet = -1;
bEnquireVersion |= (chr == 'v');
break;
}
if(iRet < 0) break;
}
if(iRet < 0){
if(iRet == -3){
// chr and argv[i] still refer to the option that failed
sprintf(errMsg, "Invalid argument of -%c: %s", chr, argv[i]);
}
return iRet;
}
// input and output
if(bQiime){
if(!bClip){
sprintf(errMsg, "--qiime option should only be used with --cut option\n");
return -2;
}
// --qiime silently switches -b off
if(bBarcode){
bBarcode = false;
}
}
if(nFileCnt == 0){
if(!bStdin){
sprintf(errMsg, "No input file specified");
return -2;
}
}
else{
if(bStdin){
sprintf(errMsg, "STDIN can not be specified with other input files");
return -2;
}
}
if(bStdout){
if(nFileCnt == 2){
sprintf(errMsg, "STDOUT can not be specified for output paired files");
return -2;
}
if(bBarcode){
sprintf(errMsg, "STDOUT can not be used for demultiplexing (-b)" );
return -2;
}
if(bSetO){
sprintf(errMsg, "STDOUT can not be used with -o");
return -2;
}
if(outputFormat != COMPRESS_NONE){
sprintf(errMsg, "STDOUT can not be used for compressing (-z)");
return -2;
}
}
// trimming mode
if( trimMode == TRIM_AP ){ // for amplicon, paired-end or single end
trimMode = TRIM_MODE(trimMode | TRIM_HEAD);
if( bSetK && (minK != INT_MAX) ){
sprintf(errMsg, "-k should be set to \"inf\" in amplicon mode\n");
return -2;
}
// amplicon mode uses a larger indel tolerance unless -d was given
if(!bSetD)
delta = 0.1;
}
else{
if(bClip){
sprintf(errMsg, "-c option should only be used in amplicon mode\n");
return -2;
}
if(nFileCnt < 2){
// single-end: keep only the head/tail/any bits, defaulting to tail
trimMode = TRIM_MODE(trimMode & TRIM_ANY);
if( trimMode == TRIM_DEFAULT ){
trimMode = TRIM_TAIL;
}
}
else{ // nFileCnt == 2
if(trimMode != TRIM_MP){
trimMode = TRIM_MODE(trimMode | TRIM_PE);
}
}
}
// adapters
if(bSetX){ // specified by command
if(bXFile){
int iReadRet = ReadFasta(x_str.c_str(), adapters);
if(iReadRet < 0){
if(iReadRet == -1)
sprintf(errMsg, "Can not read adapter sequences from FASTA file \"%s\"", x_str.c_str());
else
sprintf(errMsg, "\"%s\" is not a valid FASTA file", x_str.c_str());
return -2;
}
}
else{
// literal sequence: shorten to MAX_ADAPTER_LEN, keeping the tail in head
// mode and the head otherwise
if(int(x_str.length()) > MAX_ADAPTER_LEN){
string tmpString;
if( (trimMode & TRIM_ANY) == TRIM_HEAD )
tmpString.assign(x_str.c_str() + x_str.length() - MAX_ADAPTER_LEN);
else
tmpString.assign(x_str.c_str(), 0, MAX_ADAPTER_LEN);
x_str.assign(tmpString);
}
adapters.push_back(x_str);
}
}
else{ // default
x_str.assign(ILLUMINA_PAIR1_ADAPTER_PREFIX);
adapters.push_back(x_str);
}
if(bSetY){ // specified by command
if(bYFile){
int iReadRet = ReadFasta(y_str.c_str(), adapters2);
if(iReadRet < 0){
if(iReadRet == -1)
sprintf(errMsg, "Can not read adapter sequences from FASTA file \"%s\"", y_str.c_str());
else
sprintf(errMsg, "\"%s\" is not a valid FASTA file", y_str.c_str());
return -2;
}
}
else{
if(int(y_str.length()) > MAX_ADAPTER_LEN){
string tmpString;
if( (trimMode & TRIM_ANY) == TRIM_HEAD )
tmpString.assign(y_str.c_str() + y_str.length() - MAX_ADAPTER_LEN);
else
tmpString.assign(y_str.c_str(), 0, MAX_ADAPTER_LEN);
y_str.assign(tmpString);
}
adapters2.push_back(y_str);
}
}
else{ // default
// -x given without -y: both reads share the -x adapters
if(bSetX){
bShareAdapter = true;
}
else{
y_str.assign(ILLUMINA_PAIR2_ADAPTER_PREFIX);
adapters2.push_back(y_str);
}
}
if(nFileCnt == 2){
if(trimMode == TRIM_MP){
if(bSetJ){ // specified by command
// NOTE(review): this tests bXFile, but the flag set for the -j argument is
// bJFile -- looks like a copy/paste slip; confirm.
if(bXFile){
if(ReadFasta(j_str.c_str(), juncAdapters) < 0){
sprintf(errMsg, "Can not read adapter sequences from FASTA file \"%s\"", j_str.c_str());
return -2;
}
}
else{
if(int(j_str.length()) > MAX_ADAPTER_LEN){
string tmpString;
if( (trimMode & TRIM_ANY) == TRIM_HEAD )
tmpString.assign(j_str.c_str() + j_str.length() - MAX_ADAPTER_LEN);
else
tmpString.assign(j_str.c_str(), 0, MAX_ADAPTER_LEN);
j_str.assign(tmpString);
}
juncAdapters.push_back(j_str);
}
}
else{ // default
j_str.assign(ILLUMINA_JUNCTION_ADAPTER);
juncAdapters.push_back(j_str);
}
}
}
// NOTE(review): from here to the closing braces before "if(!bSetK)" the text
// is truncated in several places (loop headers and bodies are missing); this
// part builds rowNames, colNames and bMatrix.
if(nFileCnt < 2){
if(bSetM){
sprintf(errMsg, "-M should not be specified for single-end reads\n");
return -2;
}
char buffer[MAX_PATH];
rowNames.push_back("%");
if(int(adapters.size()) > 26){
for(i=0; i *pAdapter = (bShareAdapter ? &adapters : &adapters2);
colNames.push_back("%");
for(j=0; jsize()); j++){
sprintf(buffer, "%02d", (j+1));
colNames.push_back(buffer);
}
vector bvec, bvec2;
bvec.push_back(false);
bvec2.push_back(true);
for(j=0; j 26){
for(i=0; i *pAdapter = (bShareAdapter ? &adapters : &adapters2);
colNames.push_back("%");
for(j=0; jsize()); j++){
sprintf(buffer, "%02d", (j+1));
colNames.push_back(buffer);
}
vector bvec;
bvec.push_back(false);
for(j=0; j(colNames.size(), false) : bvec);
for(j=0; j(colNames.size(), true));
}
}
}
// default -k: unlimited in amplicon mode, half the junction adapter length in
// mate-pair mode, otherwise max(int(4 - 10 * epsilon), 1)
if(!bSetK){
minK = (trimMode & TRIM_AP) ? INT_MAX :
(trimMode == TRIM_MP) ? (j_str.length() / 2) : max(int(4 - 10 * epsilon), 1);
}
// penalty
// keep delta <= epsilon: an explicit -r wins, otherwise epsilon is raised
if(bSetR){
if(delta > epsilon)
delta = epsilon;
}
else{
if(delta > epsilon)
epsilon = delta;
}
// filtering
if(bSetL){
if(maxLen < minLen){
maxLen = minLen;
}
}
// prepare output files
if(bStdout){
return iRet;
}
pDecorate = (bBarcode ? "assigned" : "trimmed");
// From here on `trimmed` holds the base name for trimmed/assigned output and
// `basename` the base name for the "un..." counterpart.
char * end;
if(bSetO){
end = strrchr(basename, '/');
strcpy(trimmed, basename);
if( (end != NULL) && (end[1] == '\0') ){
// -o names a directory (trailing '/'): create it and put the files inside
// NOTE(review): the -o argument is concatenated unquoted into a shell
// command -- spaces or shell metacharacters in the path would be
// interpreted by the shell; confirm whether this needs hardening.
string command;
command.assign("mkdir -p " + string(basename));
if(system(command.c_str()) != 0){
sprintf(errMsg, "Can not create directory \"%s\"", basename);
return -2;
}
if(bQiime){
barcodes.assign(string(trimmed) + string("barcodes.fastq"));
mapfile.assign(string(trimmed) + string("mapping_file.txt"));
}
sprintf(trimmed + (end - basename) + 1, "%s", pDecorate);
sprintf(end+1, "un%s", pDecorate);
}
else{
if(strcmp(basename, ".") == 0){
// "-o ." : files go to the current directory without a prefix
sprintf(trimmed, "%s", pDecorate);
sprintf(basename, "un%s", pDecorate);
if(bQiime){
barcodes.assign("barcodes.fastq");
mapfile.assign("mapping_file.txt");
}
}
else{
// -o names a prefix: append "-<decorate>" / "-un<decorate>"
if(bQiime){
barcodes.assign(string(trimmed) + string("-barcodes.fastq"));
mapfile.assign(string(trimmed) + string("-mapping_file.txt"));
}
strcat(trimmed, "-"); strcat(trimmed, pDecorate);
strcat(basename, "-un"); strcat(basename, pDecorate);
}
}
}
else{
if(bStdin){
sprintf(trimmed, "%s", pDecorate);
sprintf(basename, "un%s", pDecorate);
if(bQiime){
barcodes.assign("barcodes.fastq");
mapfile.assign("mapping_file.txt");
}
}
else{
// no -o: derive the names from the first input file, cutting it at the
// position returned by occOfLastDot()
gzstrncpy(trimmed, input[0], MAX_PATH);
end = occOfLastDot(trimmed);
end[0] = '\0';
strcpy(basename, trimmed);
if(bQiime){
barcodes.assign(string(trimmed) + string("-barcodes.fastq"));
mapfile.assign(string(trimmed) + string("-mapping_file.txt"));
}
sprintf(end, "-%s", pDecorate);
sprintf(basename + (end - trimmed), "-un%s", pDecorate);
}
}
sprintf(logfile, "%s.log", trimmed);
string fileName, fileName2;
if(bBarcode){
char buffer[MAX_PATH];
if(nFileCnt >= 2){
// NOTE(review): the next line is truncated in this text -- the per-barcode
// output naming and the start of the non-barcode branch are missing.
for(i=0; i= 2){
fileName.assign(string(trimmed) + string("-pair1.fastq"));
fileName2.assign(string(trimmed) + string("-pair2.fastq"));
if(outputFormat == COMPRESS_GZ){
fileName += string(".gz");
fileName2 += string(".gz");
}
output.push_back(fileName);
output2.push_back(fileName2);
if(trimMode & TRIM_AP){
untrimmed.assign(string(basename) + string("-pair1.fastq"));
untrimmed2.assign(string(basename) + string("-pair2.fastq"));
if(outputFormat == COMPRESS_GZ){
untrimmed += string(".gz");
untrimmed2 += string(".gz");
}
}
}
else{
fileName.assign(string(trimmed) + string(".fastq"));
if(outputFormat == COMPRESS_GZ){
fileName += string(".gz");
}
output.push_back(fileName);
if(trimMode & TRIM_AP){
untrimmed.assign(string(basename) + string(".fastq"));
if(outputFormat == COMPRESS_GZ){
untrimmed += string(".gz");
}
}
}
if (bWriteMasked) {
// NOTE(review): both "-masked-pair1" and "-masked-pair2" names are generated
// here regardless of nFileCnt -- confirm this is intended for single-end input.
string maskedFileName, maskedFileName2;
maskedFileName.assign(string(trimmed) + string("-masked-pair1.fastq"));
maskedFileName2.assign(string(trimmed) + string("-masked-pair2.fastq"));
if(outputFormat == COMPRESS_GZ){
maskedFileName += string(".gz");
maskedFileName2 += string(".gz");
}
masked.push_back(maskedFileName);
masked2.push_back(maskedFileName2);
}
}
if(bWriteExcluded) {
// excluded-read files are named from `basename`, i.e. they carry the
// "un<decorate>" prefix
if(nFileCnt >= 2){
string excludedFileName, excludedFileName2;
excludedFileName.assign(string(basename) + string("-excluded-pair1.fastq"));
excludedFileName2.assign(string(basename) + string("-excluded-pair2.fastq"));
if(outputFormat == COMPRESS_GZ){
excludedFileName += string(".gz");
excludedFileName2 += string(".gz");
}
excluded.push_back(excludedFileName);
excluded2.push_back(excludedFileName2);
}
else {
string excludedFileName;
excludedFileName.assign(string(basename) + string("-excluded.fastq"));
if(outputFormat == COMPRESS_GZ){
excludedFileName += string(".gz");
}
excluded.push_back(excludedFileName);
}
}
return iRet;
}
skewer-0.2.2/src/parameter.h 0000664 0000000 0000000 00000010435 12700552063 0015736 0 ustar 00root root 0000000 0000000 /**********************************************************************
* Skewer - a fast and accurate adapter trimming tool
* using the bit-masked k-difference matching algorithm
* Copyright (c) 2013-2014 by Hongshan Jiang
* hongshan.jiang@gmail.com
*
* If you use this program, please cite the paper:
* Jiang, H., Lei, R., Ding, S.W. and Zhu, S. (2014) Skewer: a fast and
* accurate adapter trimmer for next-generation sequencing paired-end reads.
* BMC Bioinformatics, 15, 182.
* http://www.biomedcentral.com/1471-2105/15/182
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef _PARAMETER_H
#define _PARAMETER_H
#include
#include
#include "common.h"
#include "fastq.h"
using namespace std;
// One entry of the long-option table: maps a long option name (without the
// leading "--") to the single-character option code it stands for.
struct OPTION_ITEM
{
	const char * name; // long option name
	char chr;          // equivalent short option character
};
// Compression applied to output files.
enum COMPRESS_FORMAT
{
	COMPRESS_NONE = 0, // plain text
	COMPRESS_GZ = 1,   // gzip
	COMPRESS_BZ2 = 2   // bzip2
};
///////////////////////////////////////
// Holds every command-line option of the program together with the values
// derived from them (adapter lists, pairing matrix, output file names).
// GetOpt() fills the object in; the print* methods report its contents.
// NOTE(review): the template arguments of every `vector` member are missing
// in this text -- restore them from the original source.
class cParameter
{
private:
// storage for the input file names; input[0] and input[1] point here
char arr[2][MAX_PATH+1];
// copy of the command line as passed to GetOpt()
int argc;
char ** argv;
// true when the -x / -y / -j argument was taken as a file name
bool bXFile;
bool bYFile;
bool bJFile;
// true while the FASTQ quality format is left on automatic detection
bool bAutoFormat;
public:
const char * version;
// raw arguments of -x, -y, -M and -j
string x_str;
string y_str;
string m_str;
string j_str;
// adapter sequences for the first read (-x) and for the paired read (-y)
vector adapters;
vector adapters2;
// adapter pairing matrix with its row and column labels
vector< vector > bMatrix;
vector rowNames;
vector colNames;
// junction adapter sequences (-j)
vector juncAdapters;
char * input[2];
// These are the output fastq files for the
// trimmed reads that pass filters
vector output;
vector output2;
// These are the output fastq files for only
// the reads that passed filters AND were trimmed.
// The full reads appear in the file with the
// trimmed bases in lower case rather than removed.
vector masked;
vector masked2;
bool bWriteMasked;
// These files contain the reads that were excluded
// from the output because they failed filters.
vector excluded;
vector excluded2;
bool bWriteExcluded;
vector barcodeNames;
// QIIME output files (--qiime)
string barcodes;
string mapfile;
// output files for untrimmed reads
string untrimmed;
string untrimmed2;
char logfile[MAX_PATH+1+100];
// "assigned" when demultiplexing (-b), otherwise "trimmed"; set by GetOpt()
const char * pDecorate;
TRIM_MODE trimMode;
// true when -x was given without -y, so both reads use `adapters`
bool bShareAdapter;
bool bBarcode;
bool bClip;
bool bQiime;
bool bFilterNs;
bool bFilterUndetermined;
bool bRedistribute;
bool bStdin;
bool bStdout;
bool bQuiet;
bool bEnquireVersion;
int nFileCnt;
enum FASTQ_FORMAT fastqFormat;
int baseQual;
COMPRESS_FORMAT outputFormat;
// maximum error rate (-r) and maximum indel error rate (-d)
double epsilon, delta;
int minLen, maxLen, minAverageQual, minEndQual, nThreads;
// minimum overlap length (-k); INT_MAX stands for "inf"
int minK;
// number of leading bases to clip, taken from "--cut <F>,<R>"
int iCutF, iCutR;
bool bCutTail;
bool bFillWithNs;
private:
char * occOfLastDot(char * str);
bool IsDirectorySpecified (char * str);
int ReadMatrix(const char * fileName);
int ReadFasta(const char * fileName, vector & sequences);
public:
cParameter();
bool IsAutoFastqFormat();
void PrintVersion(FILE * fp);
void PrintUsage(char * program, FILE *fp);
void PrintSimpleUsage(char * program, FILE *fp);
void printCommandLine(FILE *fp);
void printRelatedFiles(FILE *fp);
void printVersion(FILE *fp);
void printLogo(FILE *fp, bool bLeadingRtn=false);
void printOpt(FILE *fp, bool bLeadingRtn=false);
// parses the command line; returns 0 on success, a negative value on failure
int GetOpt(int argc, char *argv[], char * errMsg);
};
// printf-style helpers with C linkage that take a colour code before the
// usual stream/buffer and format arguments.
extern "C" int color_fprintf(int colorCode, FILE *stream, const char *format, ...);
extern "C" int color_sprintf(int colorCode, char *str, const char *format, ...);
#endif // _PARAMETER_H