soap2.20/0000755000105300011350000000000011370172123011135 5ustar yuchangrdsoap2.20/BWTAln.c0000644000105300011350000006667511231711561012415 0ustar yuchangrd#include "BWTAln.h" unsigned int REVBWTForwardSearch(const unsigned char *convertedkey, const unsigned int keylength, const BWT *rev_bwt, unsigned int *resultsaindexleft, unsigned int *resultsaindexright, unsigned int *rev_resultsaindexleft, unsigned int *rev_resultsaindexright) { unsigned int sacount=0; unsigned int rev_startsaindex, rev_endsaindex; unsigned int startsaindex, endsaindex; unsigned int pos = 1; int i; unsigned int c = convertedkey[0]; unsigned int occcount_start[4]; unsigned int occcount_end[4]; unsigned int occcount[4]; rev_startsaindex = rev_bwt->cumulativeFreq[convertedkey[0]]+1; rev_endsaindex = rev_bwt->cumulativeFreq[convertedkey[0]+1]; startsaindex = rev_bwt->cumulativeFreq[convertedkey[0]]+1; endsaindex = rev_bwt->cumulativeFreq[convertedkey[0]+1]; while (pos < keylength && startsaindex <= endsaindex) { c = convertedkey[pos]; BWTAllOccValue(rev_bwt,rev_startsaindex,occcount_start); BWTAllOccValue(rev_bwt,rev_endsaindex + 1,occcount_end); rev_startsaindex = rev_bwt->cumulativeFreq[c] + occcount_start[c] + 1; rev_endsaindex = rev_bwt->cumulativeFreq[c] + occcount_end[c]; occcount[3]=0; for (i=2;i>=0;i--) { occcount[i]=occcount[i+1]+occcount_end[i+1]-occcount_start[i+1]; } endsaindex = endsaindex - occcount[c]; startsaindex = endsaindex - (rev_endsaindex-rev_startsaindex); pos++; } *resultsaindexleft = startsaindex; *resultsaindexright = endsaindex; *rev_resultsaindexleft = rev_startsaindex; *rev_resultsaindexright = rev_endsaindex; sacount+=endsaindex-startsaindex+1; // number of occurrence = endsaindex - startsaindex + 1 return sacount; } unsigned int REVBWTContForwardSearch(const unsigned char *convertedkey, const unsigned int start, const unsigned int len, const BWT *rev_bwt, unsigned int *sal, unsigned int *sar, unsigned int *rev_sal, unsigned int *rev_sar) { unsigned int sacount=0; 
unsigned int pos = start; unsigned char c; unsigned int occcount_start[4]; unsigned int occcount_end[4]; unsigned int occcount[4]; int k; while (pos < start+len && *sal <= *sar) { c = convertedkey[pos]; BWTAllOccValue(rev_bwt,*rev_sal,occcount_start); BWTAllOccValue(rev_bwt,*rev_sar + 1,occcount_end); *rev_sal = rev_bwt->cumulativeFreq[c] + occcount_start[c] + 1; *rev_sar = rev_bwt->cumulativeFreq[c] + occcount_end[c]; occcount[3]=0; for (k=2;k>=0;k--) { occcount[k]=occcount[k+1]+occcount_end[k+1]-occcount_start[k+1]; } *sar = *sar - occcount[c]; *sal = *sar - (*rev_sar-*rev_sal); pos++; } sacount+=*sar-*sal+1; return sacount; } unsigned int BWTContBackwardSearch(const unsigned char *convertedkey, const unsigned int start, const unsigned int len, const BWT *bwt, unsigned int *sal, unsigned int *sar) { unsigned int sacount=0; unsigned int pos = len; unsigned char c; if (*sal > *sar) { return 0; } while (pos > 0 && *sal <= *sar) { c = convertedkey[pos-1]; *sal = bwt->cumulativeFreq[c] + BWTOccValue(bwt, *sal, c) + 1; *sar = bwt->cumulativeFreq[c] + BWTOccValue(bwt, *sar + 1, c); pos--; } sacount+=*sar-*sal+1; return sacount; } unsigned int BWTBackward1Error(const unsigned char *querypattern, const BWTOPT *bo, BWT *bwt, unsigned int start, unsigned int len, unsigned int pl, unsigned int pr, unsigned int info, HITTABLE *hits) { unsigned int mk_l=1,mk_r=0; unsigned int occcount_pstart[4]; unsigned int occcount_pend[4]; unsigned int sacount=0; int hitcount = 0; unsigned char c; unsigned char ec; int i; //printf("bwtbackward1error %u %u\n",pl,pr); // v--start v----start+len // querypattern = xxxxxxxxxxxxxxxxx[xxxxxxxxxxxxxx]xxxxxx // <------- search direction // querypattern[start+len-1], querypattern[start+len-2]...querypattern[start] // for i=0 to len-1, // append querypatter[start+len-1-i]! 
for (i=0;(icumulativeFreq[ec] + occcount_pstart[ec] + 1; mk_r = bwt->cumulativeFreq[ec] + occcount_pend[ec]; if (BWTContBackwardSearch(querypattern,start,len-i-1,bwt,&mk_l,&mk_r)) { hitcount += OCCProcess(mk_l,mk_r, bo, info, hits); sacount+=mk_r-mk_l+1; } } c = querypattern[start+len-1-i]; pl = bwt->cumulativeFreq[c] + occcount_pstart[c] + 1; pr = bwt->cumulativeFreq[c] + occcount_pend[c]; } return hitcount; } unsigned int REVBWTForward1Error(const unsigned char *querypattern, const BWTOPT *bo, BWT * bwt,BWT * rev_bwt, unsigned int start,unsigned int len,unsigned int pl,unsigned int pr,unsigned int rev_pl,unsigned int rev_pr, unsigned int info, HITTABLE *hits) { unsigned int mk_l=1,mk_r=0,rev_mk_l,rev_mk_r; unsigned int occcount_pstart[4]; unsigned int occcount_pend[4]; unsigned int occcountp[4]; unsigned int sacount=0; int hitcount = 0; unsigned char c; unsigned char ec; unsigned int i; int k; const int coord = (info>>24&1)?(bo->seqLen-bo->alnLen):0; for (i=0;(i=0;k--) { occcountp[k]=occcountp[k+1]+occcount_pend[k+1]-occcount_pstart[k+1]; } //forward manner for (ec=0;ec<4;ec++) { if (querypattern[start+i]==ec) continue; info &= 0xffff000; info |= ((ec&3)<<8|((start+i+coord)&0xff))&0xfff; mk_l=pl; mk_r=pr; rev_mk_l=rev_pl; rev_mk_r=rev_pr; unsigned int pos = i+1; rev_mk_l = rev_bwt->cumulativeFreq[ec] + occcount_pstart[ec] + 1; rev_mk_r = rev_bwt->cumulativeFreq[ec] + occcount_pend[ec]; mk_r = mk_r - occcountp[ec]; mk_l = mk_r - (rev_mk_r-rev_mk_l); if (REVBWTContForwardSearch(querypattern,start+pos,len-i-1,rev_bwt,&mk_l,&mk_r,&rev_mk_l,&rev_mk_r)) { // printf("%d\t", start+i); // printf("%d\n", ec); hitcount+= OCCProcess(mk_l,mk_r, bo, info, hits); //return mk_l, mk_r sacount+=mk_r-mk_l+1; } } c = querypattern[start+i]; rev_pl = rev_bwt->cumulativeFreq[c] + occcount_pstart[c] + 1; rev_pr = rev_bwt->cumulativeFreq[c] + occcount_pend[c]; pr = pr - occcountp[c]; pl = pr - (rev_pr-rev_pl); } return hitcount; } int BWTExactMatching(const unsigned char *convertedKey, 
const BWTOPT *bo, int chain, BWT *bwt, LOOKUPTABLE *lookup, HITTABLE *hits){ if(convertedKey == NULL) return 0; const unsigned int keyLength = bo->alnLen; LOOKUPTABLE lookupTable; lookupTable.tableSize = lookup->tableSize; lookupTable.table = lookup->table; unsigned int l, r; unsigned int i; int hitcount = 0; // fprintf(stdout, "BWTExactMatching\n"); unsigned int info = (chain&1) << 24 ; unsigned long long packedPattern = 0; // printf("tablesize: %d, keyLength: %d\n", lookupTable.tableSize, keyLength); for (i = 0; i 0 && l <= r; --i) { unsigned char c = convertedKey[i-1]; l = bwt->cumulativeFreq[c] + BWTOccValue(bwt, l, c) + 1; r = bwt->cumulativeFreq[c] + BWTOccValue(bwt, r + 1, c); } if (l<=r && hits->n < bo->cutoff) { // fprintf(stderr, "occ find\n"); hitcount += OCCProcess(l, r, bo, info, hits); return hitcount; } return 0; } int BWT1ErrorMatching(const unsigned char * convertedKey, const BWTOPT *bo, const int chain, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HITTABLE *hits) { if(convertedKey == NULL) return 0; LOOKUPTABLE lookupTable, rev_lookupTable; lookupTable.tableSize = lookup->tableSize; rev_lookupTable.tableSize = rev_lookup->tableSize; lookupTable.table = lookup->table; rev_lookupTable.table = rev_lookup->table; // unsigned int cutoff = bo->cutoff; unsigned int keyLength = bo->alnLen; unsigned int forwardDepth = bo->h; unsigned int info = (1<<25)|((chain&1)<<24); unsigned int l, r; unsigned int rev_l, rev_r; unsigned int i; int hitcount = 0; unsigned backwardDepth = keyLength - forwardDepth; // fprintf(stdout, "BWT1misMatching\n"); //1. 
Backward Case //============================================== // look-up the last characters (backward) unsigned long long packedPattern = 0; for (i = 0; i cumulativeFreq[c] + BWTOccValue(bwt, l, c) + 1; r = bwt->cumulativeFreq[c] + BWTOccValue(bwt, r + 1, c); } // error in the forward depth section of the query pattern hitcount += BWTBackward1Error(convertedKey, bo, bwt, 0, forwardDepth, l,r, info, hits); // fprintf(stdout, "saCount1: %u\n", saCount); // if(hits->n >= bo->cutoff) return hitcount; //2.Forward Case //============================================== unsigned int occCount_start[4]; unsigned int occCount_end[4]; unsigned int occCount[4]; unsigned long long l_packedPattern = 0; unsigned long long r_packedPattern = 0; unsigned long long rev_packedPattern = 0; // look-up the first characters for (i = 0; i cumulativeFreq[c] + occCount_start[c] + 1; rev_r = rev_bwt->cumulativeFreq[c] + occCount_end[c]; occCount[3]=0; int k; for (k=2;k>=0;k--) { occCount[k]=occCount[k+1]+occCount_end[k+1]-occCount_start[k+1]; } r = r - occCount[c]; l = r - (rev_r-rev_l); } // error in the forward depth section of the query pattern hitcount+=REVBWTForward1Error(convertedKey, bo, bwt, rev_bwt, forwardDepth, backwardDepth, l, r, rev_l, rev_r, info, hits); // fprintf(stdout, "saCount: %u\n", saCount); return hitcount; } int BWT2ErrorMatching(const unsigned char *convertedKey, const BWTOPT *bo, const int chain, BWT * bwt, BWT * rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HITTABLE *hits) { if(convertedKey == NULL) return 0; LOOKUPTABLE lookupTable, rev_lookupTable; lookupTable.tableSize = lookup->tableSize; rev_lookupTable.tableSize = rev_lookup->tableSize; lookupTable.table = lookup->table; rev_lookupTable.table = rev_lookup->table; unsigned int keyLength = bo->alnLen; // fprintf(stderr, "keyLength %u\n", keyLength); unsigned int sizeX = bo->x; unsigned int sizeY = bo->y; unsigned int cutoff = bo->cutoff; unsigned int info = (2<<25)|(chain<<24); unsigned int l, r; 
unsigned int rev_l, rev_r; unsigned int i; unsigned char ec; unsigned int sizeZ = keyLength - sizeX - sizeY; unsigned int occCount_pstart[4]; unsigned int occCount_pend[4]; unsigned int occCountp[4]; unsigned int occCount_start[4]; unsigned int occCount_end[4]; unsigned int occCount[4]; unsigned long long packedPattern = 0; unsigned long long l_packedPattern = 0; unsigned long long r_packedPattern = 0; unsigned long long rev_packedPattern = 0; unsigned long long rev_l_packedPattern = 0; unsigned long long rev_r_packedPattern = 0; unsigned long long mask; unsigned long long ALLONE = (1<<(lookupTable.tableSize*2))-1; unsigned char c; int hitcount = 0; const int coord = (info>>24&1)?(bo->seqLen-bo->alnLen):0; // fprintf(stdout, "BWT2misMatching\n"); //Separate into 4 cases according to the documentation. //============================================== //Case A Backward Search //1. cellZ //2. 2-mismatch cellX+Y //============================================== // look-up the last characters (backward) for (i = 0; i cumulativeFreq[c] + BWTOccValue(bwt, l, c) + 1; r = bwt->cumulativeFreq[c] + BWTOccValue(bwt, r + 1, c); } // 2 errors in cellX+Y for (i=sizeX+sizeY-1;(i>0 && l<=r);i--) { BWTAllOccValue(bwt,l,occCount_pstart); BWTAllOccValue(bwt,r + 1,occCount_pend); //Backward Manner for (ec=0;ec<4;ec++) { if (convertedKey[i]==ec) continue; unsigned int mk_l=l; unsigned int mk_r=r; info &= 0x7000000; info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; mk_l = bwt->cumulativeFreq[ec] + occCount_pstart[ec] + 1; mk_r = bwt->cumulativeFreq[ec] + occCount_pend[ec]; //return mk_l, mk_r if (mk_l <= mk_r) { //r_count+=mk_r-mk_l+1; hitcount+=BWTBackward1Error(convertedKey, bo, bwt, 0, i, mk_l,mk_r, info, hits); if(hits->n >= cutoff) return hitcount; } } c = convertedKey[i]; l = bwt->cumulativeFreq[c] + occCount_pstart[c] + 1; r = bwt->cumulativeFreq[c] + occCount_pend[c]; } // if(hits->n >= cutoff) return hitcount; // printf("case A %d\n", saCount); //Case B Forward Search //1. 
cellX+Y //2. 2-mismatch cellZ //============================================== packedPattern = 0; l_packedPattern = 0; r_packedPattern = 0; rev_packedPattern = 0; rev_l_packedPattern = 0; rev_r_packedPattern = 0; // look-up the first characters for (i = 0; i cumulativeFreq[c] + occCount_start[c] + 1; rev_r = rev_bwt->cumulativeFreq[c] + occCount_end[c]; occCount[3]=0; int k; for (k=2;k>=0;k--) { occCount[k]=occCount[k+1]+occCount_end[k+1]-occCount_start[k+1]; } r = r - occCount[c]; l = r - (rev_r-rev_l); } //2 error in cellZ // fprintf(stdout, "find errr\n"); // fprintf(stdout, "SizeX+Y: %d\n" // "keylen: %d\n" // "l: %d, r: %d\n", sizeX+sizeY, keyLength, l, r); for (i=sizeX+sizeY;(i=0;k--) { occCountp[k]=occCountp[k+1]+occCount_pend[k+1]-occCount_pstart[k+1]; } //Forward Manner for (ec=0;ec<4;ec++) { if (convertedKey[i]==ec) continue; // fprintf(stdout, "%d\n", i); info &= 0x7000000; info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; unsigned int mk_l=l; unsigned int mk_r=r; unsigned int rev_mk_l=rev_l; unsigned int rev_mk_r=rev_r; rev_mk_l = rev_bwt->cumulativeFreq[ec] + occCount_pstart[ec] + 1; rev_mk_r = rev_bwt->cumulativeFreq[ec] + occCount_pend[ec]; mk_r = mk_r - occCountp[ec]; mk_l = mk_r - (rev_mk_r-rev_mk_l); //return mk_l, mk_r if (mk_l <= mk_r) { //r_count+=mk_r-mk_l+1; //2-nd Error Matching in sub cellZ Range // fprintf(stdout, "%d--%d--%d\n,",convertedKey[i], i, ec); hitcount+=REVBWTForward1Error(convertedKey, bo, bwt, rev_bwt, i+1, keyLength-i-1, mk_l,mk_r,rev_mk_l,rev_mk_r, info, hits); if(hits->n >= cutoff) return hitcount; //saCount+=forward1Error(p,bwt,rev_bwt, i+1,cellX+cellY+cellZ-i-1,mk_l,mk_r,rev_mk_l,rev_mk_r); // fprintf(stdout, "\nsaCoutn:%d\n",saCount); } } c = convertedKey[i]; rev_l = rev_bwt->cumulativeFreq[c] + occCount_pstart[c] + 1; rev_r = rev_bwt->cumulativeFreq[c] + occCount_pend[c]; r = r - occCountp[c]; l = r - (rev_r-rev_l); } // if(hits->n >= cutoff)return hitcount; //*/ // printf("case B %d\n", saCount); // //Case C //1. 
cellX (forward) //2. 1-mismatch cellY (forward) //3. 1-mismatch cellZ (forward) //============================================== packedPattern = 0; l_packedPattern = 0; r_packedPattern = 0; rev_packedPattern = 0; rev_l_packedPattern = 0; rev_r_packedPattern = 0; for (i = 0; i cumulativeFreq[c] + occCount_start[c] + 1; rev_r = rev_bwt->cumulativeFreq[c] + occCount_end[c]; int k; occCount[3]=0; for (k=2;k>=0;k--) { occCount[k]=occCount[k+1]+occCount_end[k+1]-occCount_start[k+1]; } r = r - occCount[c]; l = r - (rev_r-rev_l); pos++; } if (l <= r) { //2-nd Error Matching in cellZ Range hitcount+=REVBWTForward1Error(convertedKey, bo, bwt, rev_bwt, sizeX+sizeY, sizeZ, l,r, rev_l,rev_r, info, hits); if(hits->n >= cutoff) return hitcount; } } } //*/ if(hits->n >= cutoff)return hitcount; l = packedPattern ? lookupTable.table[packedPattern-1]+1 : 1; r = lookupTable.table[packedPattern]; rev_l = rev_packedPattern ? rev_lookupTable.table[rev_packedPattern-1]+1 : 1; rev_r = rev_lookupTable.table[rev_packedPattern]; //For error happen outside lookup range.. 
for (i=lookupTable.tableSize;(i=0;k--) { occCountp[k]=occCountp[k+1]+occCount_pend[k+1]-occCount_pstart[k+1]; } //Forward Manner for (ec=0;ec<4;ec++) { if ((convertedKey[i]&0x3) ==ec) continue; info &= 0x7000000; info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; unsigned int mk_l=l; unsigned int mk_r=r; unsigned int rev_mk_l=rev_l; unsigned int rev_mk_r=rev_r; unsigned int pos = i+1; rev_mk_l = rev_bwt->cumulativeFreq[ec] + occCount_pstart[ec] + 1; rev_mk_r = rev_bwt->cumulativeFreq[ec] + occCount_pend[ec]; mk_r = mk_r - occCountp[ec]; mk_l = mk_r - (rev_mk_r-rev_mk_l); while (pos < sizeX+sizeY && rev_mk_l <= rev_mk_r) { c = convertedKey[pos] & 0x3; BWTAllOccValue(rev_bwt,rev_mk_l,occCount_start); BWTAllOccValue(rev_bwt,rev_mk_r + 1,occCount_end); rev_mk_l = rev_bwt->cumulativeFreq[c] + occCount_start[c] + 1; rev_mk_r = rev_bwt->cumulativeFreq[c] + occCount_end[c]; int k; occCount[3]=0; for (k=2;k>=0;k--) { occCount[k]=occCount[k+1]+occCount_end[k+1]-occCount_start[k+1]; } mk_r = mk_r - occCount[c]; mk_l = mk_r - (rev_mk_r-rev_mk_l); pos++; } //return mk_l, mk_r if (mk_l <= mk_r) { //2-nd Error Matching in cellZ Range hitcount+=REVBWTForward1Error(convertedKey, bo, bwt, rev_bwt, sizeX+sizeY, sizeZ, mk_l,mk_r, rev_mk_l,rev_mk_r, info, hits); // if(hits->n >= cutoff) return hitcount; } } c = convertedKey[i]; rev_l = rev_bwt->cumulativeFreq[c] + occCount_pstart[c] + 1; rev_r = rev_bwt->cumulativeFreq[c] + occCount_pend[c]; r = r - occCountp[c]; l = r - (rev_r-rev_l); } if(hits->n >= cutoff)return hitcount; //*/ // printf("case C %d\n", saCount); /* //Case D //1. cellY (forward) //2. 1-mismatch cellZ (forward) //3. 
1-mismatch cellX (backward) //============================================== packedPattern = 0; l_packedPattern = 0; r_packedPattern = 0; rev_packedPattern = 0; rev_l_packedPattern = 0; rev_r_packedPattern = 0; for (i = 0; i cumulativeFreq[c] + occCount_start[c] + 1; rev_r = rev_bwt->cumulativeFreq[c] + occCount_end[c]; int k; occCount[3]=0; for (k=2;k>=0;k--) { occCount[k]=occCount[k+1]+occCount_end[k+1]-occCount_start[k+1]; } r = r - occCount[c]; l = r - (rev_r-rev_l); pos++; } if (l <= r) { //2-nd Error Matching in cellX Range hitcount+=BWTBackward1Error(convertedKey,bo, bwt, 0, sizeX, l,r, info, hits); } if (hits->n >= cutoff) return hitcount; } } /// l = packedPattern ? lookupTable.table[packedPattern-1]+1 : 1; r = lookupTable.table[packedPattern]; rev_l = rev_packedPattern ? rev_lookupTable.table[rev_packedPattern-1]+1 : 1; rev_r = rev_lookupTable.table[rev_packedPattern]; //For error happen outside lookup range.. for (i=sizeX+lookupTable.tableSize;(i=0;k--) { occCountp[k]=occCountp[k+1]+occCount_pend[k+1]-occCount_pstart[k+1]; } //Forward Manner for (ec=0;ec<4;ec++) { if (convertedKey[i]==ec) continue; info &= 0x7000000; info |= ((((ec&0x3)<<8)|(i&0xff))&0x3ff)<<12; unsigned int mk_l=l; unsigned int mk_r=r; unsigned int rev_mk_l=rev_l; unsigned int rev_mk_r=rev_r; unsigned int pos = i+1; rev_mk_l = rev_bwt->cumulativeFreq[ec] + occCount_pstart[ec] + 1; rev_mk_r = rev_bwt->cumulativeFreq[ec] + occCount_pend[ec]; mk_r = mk_r - occCountp[ec]; mk_l = mk_r - (rev_mk_r-rev_mk_l); while (pos < keyLength && rev_mk_l <= rev_mk_r) { c = convertedKey[pos]; BWTAllOccValue(rev_bwt,rev_mk_l,occCount_start); BWTAllOccValue(rev_bwt,rev_mk_r + 1,occCount_end); rev_mk_l = rev_bwt->cumulativeFreq[c] + occCount_start[c] + 1; rev_mk_r = rev_bwt->cumulativeFreq[c] + occCount_end[c]; int k; occCount[3]=0; for (k=2;k>=0;k--) { occCount[k]=occCount[k+1]+occCount_end[k+1]-occCount_start[k+1]; } mk_r = mk_r - occCount[c]; mk_l = mk_r - (rev_mk_r-rev_mk_l); pos++; } //return mk_l, mk_r 
/*
 * POSCMP
 *
 * Three-way comparator over two `unsigned int` values passed by address
 * (the qsort/bsearch callback shape).
 *
 * Returns a negative value when *a < *b, zero when equal, and a positive
 * value when *a > *b.
 *
 * The previous implementation returned the unsigned difference converted to
 * int, which reports the wrong sign whenever the two values differ by more
 * than INT_MAX (e.g. 1 vs 0xFFFFFFF0). Comparing explicitly avoids that.
 */
static inline int POSCMP(const void *a, const void *b) {
    const unsigned int lhs = *(const unsigned int *)a;
    const unsigned int rhs = *(const unsigned int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
*/ #include #include #include #include #include #include "BWT.h" #include "MiscUtilities.h" #include "DNACount.h" #include "TextConverter.h" #include "MemManager.h" #include "r250.h" #include "HSP.h" // static functions static INLINE unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, const unsigned int character); static INLINE void BWTAllOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, unsigned int* __restrict occValueExplicit); static INLINE unsigned int BWTSaIndexToChar(const BWT *bwt, const unsigned int saIndex); static INLINE unsigned int BWTGetWordPackedText(const unsigned int *packedText, const unsigned int index, const unsigned int shift, const unsigned int numOfBit); static INLINE void BWTPrefetchOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit); static INLINE void BWTPrefetchBWT(const BWT *bwt, const unsigned int index); int SaIndexGroupDPHitOrder1(const void *saIndexGroup, const int index1, const int index2); int SaIndexGroupDPHitOrder2(const void *saIndexGroup, const int index1, const int index2); static INLINE unsigned int BWTSaIndexToChar(const BWT *bwt, const unsigned int saIndex) { return (saIndex > bwt->cumulativeFreq[1]) + (saIndex > bwt->cumulativeFreq[2]) + (saIndex > bwt->cumulativeFreq[3]); } BWT *BWTCreate(MMPool *mmPool, const unsigned int textLength, unsigned int *decodeTable) { BWT *bwt; bwt = MMPoolDispatch(mmPool, sizeof(BWT)); bwt->textLength = 0; bwt->inverseSa = 0; bwt->cumulativeFreq = MMPoolDispatch(mmPool, (ALPHABET_SIZE + 1) * sizeof(unsigned int)); initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; bwt->saValueOnBoundary = NULL; // Generate decode tables if (decodeTable == NULL) { bwt->decodeTable = MMPoolDispatch(mmPool, DNA_OCC_CNT_TABLE_SIZE_IN_WORD * sizeof(unsigned int)); GenerateDNAOccCountTable(bwt->decodeTable); } else { bwt->decodeTable = decodeTable; } bwt->occMajorSizeInWord = 
BWTOccValueMajorSizeInWord(textLength); bwt->occValueMajor = MMPoolDispatch(mmPool, bwt->occMajorSizeInWord * sizeof(unsigned int)); bwt->occSizeInWord = 0; bwt->occValue = NULL; bwt->saInterval = ALL_ONE_MASK; bwt->saValueSize = 0; bwt->saValue = NULL; bwt->inverseSaInterval = ALL_ONE_MASK; bwt->inverseSaSize = 0; bwt->inverseSa = NULL; return bwt; } BWT *BWTLoad(MMPool *mmPool, const char *bwtCodeFileName, const char *occValueFileName, const char *saValueFileName, const char *inverseSaFileName, const char *saIndexRangeFileName, unsigned int *decodeTable) { unsigned int i; FILE *bwtCodeFile, *occValueFile, *saValueFile = NULL, *inverseSaFile = NULL, *saIndexRangeFile = NULL; BWT *bwt; unsigned int tmp; unsigned int bwtCodeLengthInFile; unsigned int numOfSaIndexRange; bwtCodeFile = (FILE*)fopen64(bwtCodeFileName, "rb"); if (bwtCodeFile == NULL) { fprintf(stderr, "BWTLoad() : cannot open bwtCodeFile!\n"); exit(1); } occValueFile = (FILE*)fopen64(occValueFileName, "rb"); if (occValueFile == NULL) { fprintf(stderr, "BWTLoad() : cannot open occValueFile!\n"); exit(1); } if (saValueFileName != NULL && saValueFileName[0] != '\0' && saValueFileName[0] != '-') { saValueFile = (FILE*)fopen64(saValueFileName, "rb"); if (saValueFile == NULL) { fprintf(stderr, "BWTLoad() : cannot open saValueFile!\n"); exit(1); } } if (inverseSaFileName != NULL && inverseSaFileName[0] != '\0' && inverseSaFileName[0] != '-') { inverseSaFile = (FILE*)fopen64(inverseSaFileName, "rb"); if (inverseSaFile == NULL) { fprintf(stderr, "BWTLoad() : cannot open inverseSaFile!\n"); exit(1); } } if (saIndexRangeFileName != NULL && saIndexRangeFileName[0] != '\0' && saIndexRangeFileName[0] != '-') { saIndexRangeFile = (FILE*)fopen64(saIndexRangeFileName, "rb"); if (saIndexRangeFile == NULL) { fprintf(stderr, "BWTLoad() : cannot open saIndexRangeFile!\n"); exit(1); } } bwt = MMPoolDispatch(mmPool, sizeof(BWT)); fread(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtCodeFile); bwt->cumulativeFreq = 
MMPoolDispatch(mmPool, (ALPHABET_SIZE + 1) * sizeof(unsigned int)); bwt->cumulativeFreq[0] = 0; fread(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtCodeFile); bwt->textLength = bwt->cumulativeFreq[ALPHABET_SIZE]; fread(&tmp, sizeof(unsigned int), 1, occValueFile); if (tmp != bwt->inverseSa0) { fprintf(stderr, "BWTLoad(): OccValue inverseSa0 not match!\n"); exit(1); } for (i=1; i<=ALPHABET_SIZE; i++) { fread(&tmp, sizeof(unsigned int), 1, occValueFile); if (tmp != bwt->cumulativeFreq[i]) { fprintf(stderr, "BWTLoad(): OccValue cumulativeFreq not match!\n"); exit(1); } } bwt->bwtSizeInWord = BWTResidentSizeInWord(bwt->textLength) + WORD_BETWEEN_OCC / 2; // + 8 words so that the 128 bits before and after an explicit occ are in the same aligned 64 byte bwtCodeLengthInFile = BWTFileSizeInWord(bwt->textLength); bwt->bwtCode = MMUnitAllocate(bwt->bwtSizeInWord * sizeof(unsigned int)); fread(bwt->bwtCode, sizeof(unsigned int), bwtCodeLengthInFile, bwtCodeFile); fclose(bwtCodeFile); BWTClearTrailingBwtCode(bwt); bwt->occSizeInWord = BWTOccValueMinorSizeInWord(bwt->textLength) ; bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(bwt->textLength); bwt->occValue = MMUnitAllocate(bwt->occSizeInWord * sizeof(unsigned int)); fread(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile); bwt->occValueMajor = MMUnitAllocate(bwt->occMajorSizeInWord * sizeof(unsigned int)); fread(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile); fclose(occValueFile); if (decodeTable == NULL) { bwt->decodeTable = MMUnitAllocate(DNA_OCC_CNT_TABLE_SIZE_IN_WORD * sizeof(unsigned int)); GenerateDNAOccCountTable(bwt->decodeTable); bwt->decodeTableGenerated = TRUE; } else { bwt->decodeTable = decodeTable; bwt->decodeTableGenerated = FALSE; } bwt->saValueOnBoundary = NULL; if (saValueFile == NULL) { bwt->saInterval = ALL_ONE_MASK; bwt->saValueSize = 0; bwt->saValue = NULL; } else { fread(&tmp, sizeof(unsigned int), 1, saValueFile); if (tmp != 
bwt->inverseSa0) { fprintf(stderr, "BWTLoad(): SaValue inverseSa0 not match!\n"); exit(1); } for (i=1; i<=ALPHABET_SIZE; i++) { fread(&tmp, sizeof(unsigned int), 1, saValueFile); if (tmp != bwt->cumulativeFreq[i]) { fprintf(stderr, "BWTLoad(): SaValue cumulativeFreq not match!\n"); exit(1); } } fread(&bwt->saInterval, sizeof(unsigned int), 1, saValueFile); bwt->saValueSize = (bwt->textLength + bwt->saInterval) / bwt->saInterval * sizeof(unsigned int); bwt->saValue = MMUnitAllocate(bwt->saValueSize); fread(bwt->saValue, 1, bwt->saValueSize, saValueFile); bwt->saValue[0] = (unsigned int)-1; // Special handling for bwt fclose(saValueFile); BWTGenerateSaValueOnBoundary(mmPool, bwt); } if (inverseSaFile == NULL) { bwt->inverseSaInterval = ALL_ONE_MASK; bwt->inverseSaSize = 0; bwt->inverseSa = NULL; } else { fread(&tmp, sizeof(unsigned int), 1, inverseSaFile); if (tmp != bwt->inverseSa0) { fprintf(stderr, "BWTLoad(): InverseSaValue inverseSa0 not match!\n"); exit(1); } for (i=1; i<=ALPHABET_SIZE; i++) { fread(&tmp, sizeof(unsigned int), 1, inverseSaFile); if (tmp != bwt->cumulativeFreq[i]) { fprintf(stderr, "BWTLoad(): InverseSaValue cumulativeFreq not match!\n"); exit(1); } } fread(&bwt->inverseSaInterval, sizeof(unsigned int), 1, inverseSaFile); bwt->inverseSaSize = (bwt->textLength + bwt->inverseSaInterval) / bwt->inverseSaInterval * sizeof(unsigned int); bwt->inverseSa = MMUnitAllocate(bwt->inverseSaSize); fread(bwt->inverseSa, 1, bwt->inverseSaSize, inverseSaFile); fclose(inverseSaFile); } // Load Sa index range if (saIndexRangeFile == NULL) { bwt->saIndexRange = NULL; bwt->saIndexRangeNumOfChar = 0; bwt->saIndexRangeSize = 0; } else { fread(&tmp, sizeof(unsigned int), 1, saIndexRangeFile); if (tmp != bwt->inverseSa0) { fprintf(stderr, "BWTLoad(): SaIndex inverseSa0 not match!\n"); exit(1); } for (i=1; i<=ALPHABET_SIZE; i++) { fread(&tmp, sizeof(unsigned int), 1, saIndexRangeFile); if (tmp != bwt->cumulativeFreq[i]) { fprintf(stderr, "BWTLoad(): SaIndex 
cumulativeFreq not match!\n"); exit(1); } } fread(&bwt->saIndexRangeNumOfChar, sizeof(unsigned int), 1, saIndexRangeFile); numOfSaIndexRange = 1 << (bwt->saIndexRangeNumOfChar * 2); // 4^saIndexRangeNumOfChar bwt->saIndexRange = MMUnitAllocate(numOfSaIndexRange * sizeof(SaIndexRange)); fread(bwt->saIndexRange, sizeof(SaIndexRange), numOfSaIndexRange, saIndexRangeFile); bwt->saIndexRangeSize = numOfSaIndexRange * sizeof(SaIndexRange); fclose(saIndexRangeFile); } return bwt; } void BWTFree(MMPool *mmPool, BWT *bwt) { MMPoolReturn(mmPool, bwt->cumulativeFreq, ALPHABET_SIZE * sizeof(unsigned int)); MMUnitFree(bwt->bwtCode, bwt->bwtSizeInWord * sizeof(unsigned int)); if (bwt->occValue != NULL) { MMUnitFree(bwt->occValue, bwt->occSizeInWord * sizeof(unsigned int)); } if (bwt->occValueMajor != NULL) { MMUnitFree(bwt->occValueMajor, bwt->occMajorSizeInWord * sizeof(unsigned int)); } if (bwt->saValue != NULL) { MMUnitFree(bwt->saValue, bwt->saValueSize); } if (bwt->inverseSa != NULL) { MMUnitFree(bwt->inverseSa, bwt->inverseSaSize); } if (bwt->decodeTableGenerated == TRUE) { MMUnitFree(bwt->decodeTable, DNA_OCC_CNT_TABLE_SIZE_IN_WORD * sizeof(unsigned int)); } if (bwt->saIndexRange != NULL) { MMUnitFree(bwt->saIndexRange, bwt->saIndexRangeSize); } if (bwt->saValueOnBoundary != NULL) { MMPoolReturn(mmPool, bwt->saValueOnBoundary, sizeof(unsigned int) * 2 * ALPHABET_SIZE); } MMPoolReturn(mmPool, bwt, sizeof(BWT)); } /* void BWTPrintMemoryUsage(const BWT *bwt, FILE *output, const unsigned int packedDNASize) { unsigned int totalMemorySize; fprintf(output, "BWT code size : %u\n", bwt->bwtSizeInWord * sizeof(unsigned int)); fprintf(output, "Occ value size : %u\n", (bwt->occSizeInWord + bwt->occMajorSizeInWord) * sizeof(unsigned int)); if (bwt->saValueSize > 0) { fprintf(output, "SA value size : %u\n", bwt->saValueSize); } if (bwt->inverseSaSize > 0) { fprintf(output, "Inverse SA size : %u\n", bwt->inverseSaSize); } if (bwt->saIndexRange > 0) { fprintf(output, "SA index rangee : 
%u\n", bwt->saIndexRangeSize); } if (packedDNASize > 0) { fprintf(output, "Packed DNA size : %u\n", packedDNASize); } totalMemorySize = (bwt->bwtSizeInWord + bwt->occSizeInWord + bwt->occMajorSizeInWord) * sizeof(unsigned int) + bwt->saValueSize + bwt->inverseSaSize + bwt->saIndexRangeSize + packedDNASize; fprintf(output, "Total memory : %u\n", totalMemorySize); fprintf(output, "Bit per char : %.2f\n", (float)totalMemorySize / ((float)bwt->textLength / BITS_IN_BYTE)); } //*/ void BWTGenerateSaValueOnBoundary(MMPool *mmPool, BWT *bwt) { unsigned int i; if (bwt->saValueOnBoundary == NULL) { bwt->saValueOnBoundary = MMPoolDispatch(mmPool, sizeof(unsigned int) * 2 * ALPHABET_SIZE); } for (i=0; isaValueOnBoundary[i * 2 + 1] = BWTSaValue(bwt, bwt->cumulativeFreq[i + 1]); if (bwt->cumulativeFreq[i] < bwt->textLength) { bwt->saValueOnBoundary[i * 2] = BWTSaValue(bwt, bwt->cumulativeFreq[i] + 1); } else { bwt->saValueOnBoundary[i * 2] = bwt->saValueOnBoundary[i * 2 + 1]; } } } // Ordering of index1 and index2 is not important; this module will handle the ordering // index1 and index2 can be on the same aligned 128 bit region or can be on adjacant aligned 128 bit region // If index1 and index2 are in the same aligned 128 bit region, one of them must be on the boundary // These requirements are to reduce the no. 
// of branches in the program flow
//
// Returns the number of 2-bit BWT codes equal to `character` that lie between
// index1 and index2 (in either order) in bwt->bwtCode.  At most two aligned
// 128-bit groups are inspected: the one containing the smaller index and the
// one starting at the last aligned boundary at or below the larger index.
unsigned int BWTDecode(const BWT *bwt, const unsigned int index1, const unsigned int index2, const unsigned int character) {

    unsigned int numChar1, numChar2, minIndex, maxIndex, minIndex128, maxIndex128;
    unsigned int r;

    // Per-word thresholds compared against numChar1 / numChar2 to build the counting masks
    const static unsigned int ALIGN_16 partitionOne1[4]  = { 47, 31, 15, 0 };
    const static unsigned int ALIGN_16 partitionOne2[4]  = { 0, 15, 31, 47 };
    const static unsigned int ALIGN_16 partitionZero1[4] = { 63, 47, 31, 15 };
    const static unsigned int ALIGN_16 partitionZero2[4] = { 15, 31, 47, 63 };

    // SSE registers
    __m128i r1e, r2e;
    __m128i mcl;
    __m128i m0, m1;
    __m128i r1a, r1b, r1c;
    __m128i r2a, r2b, r2c;

    // Sort index1 and index2
    // (branch-free: r is index1 - index2 when index1 < index2, otherwise 0)
    r = (index1 - index2) & -(index1 < index2);
    minIndex = index2 + r;
    maxIndex = index1 - r;

    // Locate 128 bit boundary
    minIndex128 = lastAlignedBoundary(minIndex, CHAR_PER_128);
    maxIndex128 = lastAlignedBoundary(maxIndex - (maxIndex - minIndex > CHAR_PER_128), CHAR_PER_128);

    // Determine no.of characters to count
    numChar1 = maxIndex128 - minIndex;      // counted in the first 128-bit group
    numChar2 = maxIndex - maxIndex128;      // counted in the second 128-bit group

    // Load encoding into register here in the hope of hiding some memory latency
    r1e = _mm_load_si128((__m128i *)(bwt->bwtCode + minIndex128 / CHAR_PER_WORD));  // Load encoding into register
    r2e = _mm_load_si128((__m128i *)(bwt->bwtCode + maxIndex128 / CHAR_PER_WORD));  // Load encoding into register

    // Set character extraction masks
    // 0xFFFFFFFF + 1 wraps to 0, so a mask is all-zero when the character bit is 1 and all-one when it is 0;
    // XOR-ing the encoding with it leaves a 1 wherever the encoded bit equals the character bit
    m0 = _mm_set1_epi32(0xFFFFFFFF + (character & 1));  // Character selection mask for even bits
    m1 = _mm_set1_epi32(0xFFFFFFFF + (character >> 1)); // Character selection mask for odd bits
    mcl = _mm_set1_epi32(0x55555555);                   // Set bit-clearing mask to 0x55555555....(alternate 1-bit)

    // Set counting mask for 2 x 128 bits

    r1a = _mm_set1_epi32(numChar1);     // Load number of characters into register
    r2a = _mm_set1_epi32(numChar2);     // Load number of characters into register

    r1b = _mm_load_si128((__m128i*)partitionOne1);  // Load partition into register
    r2b = _mm_load_si128((__m128i*)partitionOne2);  // Load partition into register

    r1c = _mm_load_si128((__m128i*)partitionZero1); // Load partition into register
    r2c = _mm_load_si128((__m128i*)partitionZero2); // Load partition into register

    r1b = _mm_cmpgt_epi32(r1a, r1b);    // Compare to generate 4x32 bit mask; the word with counting boundary is all ones
    r2b = _mm_cmpgt_epi32(r2a, r2b);    // Compare to generate 4x32 bit mask; the word with counting boundary is all ones

    r1c = _mm_cmpgt_epi32(r1a, r1c);    // Compare to generate 4x32 bit mask; the word with counting boundary is all zeros
    r2c = _mm_cmpgt_epi32(r2a, r2c);    // Compare to generate 4x32 bit mask; the word with counting boundary is all zeros

    r1b = _mm_srli_epi32(r1b, (16 - numChar1 % 16) * 2);    // Shift bits so that all word conform to the requirement of counting the word with counting boundary
    r2b = _mm_slli_epi32(r2b, (16 - numChar2 % 16) * 2);    // Shift bits so that all word conform to the requirement of counting the word with counting boundary

    r1c = _mm_or_si128(r1b, r1c);   // Combine two masks
    r2c = _mm_or_si128(r2b, r2c);   // Combine two masks

    r1c = _mm_and_si128(r1c, mcl);  // Combine with bit-clearing mask (now = 0x55555555....)
    r2c = _mm_and_si128(r2c, mcl);  // Combine with bit-clearing mask (now = 0x55555555....)

    // Start counting; encoding has been loaded into register earlier

    r1b = _mm_srli_epi32(r1e, 1);   // Shift encoding to right by 1 bit
    r2b = _mm_srli_epi32(r2e, 1);   // Shift encoding to right by 1 bit

    r1a = _mm_xor_si128(r1e, m0);   // Check even-bits with mask
    r2a = _mm_xor_si128(r2e, m0);   // Check even-bits with mask

    r1b = _mm_xor_si128(r1b, m1);   // Check odd-bits with mask
    r2b = _mm_xor_si128(r2b, m1);   // Check odd-bits with mask

    r1a = _mm_and_si128(r1a, r1b);  // Combine even and odd bits
    r2a = _mm_and_si128(r2a, r2b);  // Combine even and odd bits

    r1a = _mm_and_si128(r1a, r1c);  // Combine with counting mask, which has been combined with bit-clearing mask of 0x55555555....
    r2a = _mm_and_si128(r2a, r2c);  // Combine with counting mask, which has been combined with bit-clearing mask of 0x55555555....

    // Combine 2 x 128 bits and continue counting
    // (from here on this is a bit-parallel population count of the surviving marker bits)

    r1a = _mm_add_epi32(r1a, r2a);      // Combine 2 x 128 bits by adding them together

    mcl = _mm_set1_epi32(0x33333333);   // Set bit-clearing mask to 0x33333333....(alternate 2-bits)

    r1b = _mm_srli_epi32(r1a, 2);       // Shift intermediate result to right by 2 bit
    r1a = _mm_and_si128(r1a, mcl);      // Clear alternate 2-bits of intermediate result by combining with bit-clearing mask (now = 0x33333333....)
    r1b = _mm_and_si128(r1b, mcl);      // Clear alternate 2-bits of shifted intermediate result by combining with bit-clearing mask (now = 0x33333333....)
    r1a = _mm_add_epi32(r1a, r1b);      // Combine shifted and non-shifted intermediate results by adding them together

    mcl = _mm_set1_epi32(0x0F0F0F0F);   // Set bit-clearing mask to 0x0F0F0F0F....(alternate 4-bits)
    m0 = _mm_setzero_si128();           // Set an all-zero mask

    r1b = _mm_srli_epi32(r1a, 4);       // Shift intermediate result to right by 4 bit
    r1a = _mm_add_epi32(r1a, r1b);      // Combine shifted and non-shifted intermediate results by adding them together
    r1a = _mm_and_si128(r1a, mcl);      // Clear alternate 4-bits of intermediate result by combining with bit-clearing mask (now = 0x0F0F0F0F....)

    r1a = _mm_sad_epu8(r1a, m0);        // Treating the 128 bit as 16 x 8 bit; summing up the 1st 8 x 8 bit into 1st 64-bit and 2nd 8 x 8 bit into 2nd 64-bit

    return _mm_extract_epi16(r1a, 0) + _mm_extract_epi16(r1a, 4);   // Extract and return result from register

}

// Ordering of index1 and index2 is not important; this module will handle the ordering
// index1 and index2 can be on the same aligned 128 bit region or can be on adjacent aligned 128 bit region
// If index1 and index2 are in the same aligned 128 bit region, one of them must be on the boundary
// These requirements are to reduce the no.
// of branches in the program flow
//
// Counts, for every character code at once, the 2-bit BWT codes lying between
// index1 and index2 (in either order).  Codes 1, 2 and 3 are counted with SSE2
// and written to occValue[1..3]; occValue[0] is the remainder of the range.
void BWTDecodeAll(const BWT *bwt, const unsigned int index1, const unsigned int index2, unsigned int* __restrict occValue) {

    unsigned int numChar1, numChar2, minIndex, maxIndex, minIndex128, maxIndex128;
    unsigned int r;

    // Per-word thresholds compared against numChar1 / numChar2 to build the counting masks
    const static unsigned int ALIGN_16 partitionOne1[4]  = { 47, 31, 15, 0 };
    const static unsigned int ALIGN_16 partitionOne2[4]  = { 0, 15, 31, 47 };
    const static unsigned int ALIGN_16 partitionZero1[4] = { 63, 47, 31, 15 };
    const static unsigned int ALIGN_16 partitionZero2[4] = { 15, 31, 47, 63 };

    // SSE registers
    __m128i r1e, r2e;
    __m128i mcl;
    __m128i rc, rg, rt;
    __m128i ra1, ra2;
    __m128i rc1, rc2;
    __m128i rg1, rg2;
    __m128i rt1, rt2;

    // Sort index1 and index2
    // (branch-free: r is index1 - index2 when index1 < index2, otherwise 0)
    r = (index1 - index2) & -(index1 < index2);
    minIndex = index2 + r;
    maxIndex = index1 - r;

    // Locate 128 bit boundary
    minIndex128 = lastAlignedBoundary(minIndex, CHAR_PER_128);
    maxIndex128 = lastAlignedBoundary(maxIndex - (maxIndex - minIndex > CHAR_PER_128), CHAR_PER_128);

    // Determine no.of characters to count
    numChar1 = maxIndex128 - minIndex;
    numChar2 = maxIndex - maxIndex128;

    // Load encoding into register here in the hope of hiding some memory latency
    r1e = _mm_load_si128((__m128i *)(bwt->bwtCode + minIndex128 / CHAR_PER_WORD));  // Load encoding into register
    r2e = _mm_load_si128((__m128i *)(bwt->bwtCode + maxIndex128 / CHAR_PER_WORD));  // Load encoding into register

    // Set character extraction masks
    mcl = _mm_set1_epi32(0x55555555);   // Set bit-clearing mask to 0x55555555....(alternate 1-bit)

    // Set counting mask for 2 x 128 bits

    ra1 = _mm_set1_epi32(numChar1);     // Load number of characters into register
    ra2 = _mm_set1_epi32(numChar2);     // Load number of characters into register

    rc1 = _mm_load_si128((__m128i*)partitionOne1);  // Load partition into register
    rc2 = _mm_load_si128((__m128i*)partitionOne2);  // Load partition into register

    rg1 = _mm_load_si128((__m128i*)partitionZero1); // Load partition into register
    rg2 = _mm_load_si128((__m128i*)partitionZero2); // Load partition into register

    rc1 = _mm_cmpgt_epi32(ra1, rc1);    // Compare to generate 4x32 bit mask; the word with counting boundary is all ones
    rc2 = _mm_cmpgt_epi32(ra2, rc2);    // Compare to generate 4x32 bit mask; the word with counting boundary is all ones

    rg1 = _mm_cmpgt_epi32(ra1, rg1);    // Compare to generate 4x32 bit mask; the word with counting boundary is all zeros
    rg2 = _mm_cmpgt_epi32(ra2, rg2);    // Compare to generate 4x32 bit mask; the word with counting boundary is all zeros

    rc1 = _mm_srli_epi32(rc1, (16 - numChar1 % 16) * 2);    // Shift bits so that all word conform to the requirement of counting the word with counting boundary
    rc2 = _mm_slli_epi32(rc2, (16 - numChar2 % 16) * 2);    // Shift bits so that all word conform to the requirement of counting the word with counting boundary

    ra1 = _mm_or_si128(rc1, rg1);   // Combine two masks
    ra2 = _mm_or_si128(rc2, rg2);   // Combine two masks

    // Start counting; encoding has been loaded into register earlier
    r1e = _mm_and_si128(r1e, ra1);  // Combine encoding with counting mask
    r2e = _mm_and_si128(r2e, ra2);  // Combine encoding with counting mask

    // ra1, ra2, rc1, rc2, rg1, rg2, rt1, rt2 all retired

    // Shift and combine with character selection mask

    ra1 = _mm_srli_epi32(r1e, 1);   // Shift encoding to right by 1 bit
    ra2 = _mm_srli_epi32(r2e, 1);   // Shift encoding to right by 1 bit

    rt1 = _mm_and_si128(r1e, mcl);  // Check even-bits = '1'
    rt2 = _mm_and_si128(r2e, mcl);  // Check even-bits = '1'

    rg1 = _mm_and_si128(ra1, mcl);  // Check odd-bits = '1'
    rg2 = _mm_and_si128(ra2, mcl);  // Check odd-bits = '1'

    rc1 = _mm_andnot_si128(r1e, mcl);   // Check even-bits = '0'
    rc2 = _mm_andnot_si128(r2e, mcl);   // Check even-bits = '0'

    ra1 = _mm_andnot_si128(ra1, mcl);   // Check odd-bits = '0'
    ra2 = _mm_andnot_si128(ra2, mcl);   // Check odd-bits = '0'

    // r1e, r2e retired

    // Count for 'c' 'g' 't'
    // (code 1 = odd bit 0 / even bit 1, code 2 = odd bit 1 / even bit 0, code 3 = both bits 1)

    r1e = _mm_and_si128(ra1, rt1);  // Combine even and odd bits
    r2e = _mm_and_si128(ra2, rt2);  // Combine even and odd bits
    ra1 = _mm_and_si128(rg1, rc1);  // Combine even and odd bits
    ra2 = _mm_and_si128(rg2, rc2);  // Combine even and odd bits
    rc1 = _mm_and_si128(rg1, rt1);  // Combine even and odd bits
    rc2 = _mm_and_si128(rg2, rt2);  // Combine even and odd bits

    rc = _mm_add_epi32(r1e, r2e);   // Combine 2 x 128 bits by adding them together
    rg = _mm_add_epi32(ra1, ra2);   // Combine 2 x 128 bits by adding them together
    rt = _mm_add_epi32(rc1, rc2);   // Combine 2 x 128 bits by adding them together

    // All except rc, rg, rt retired

    // Continue counting rc, rg, rt

    mcl = _mm_set1_epi32(0x33333333);   // Set bit-clearing mask to 0x33333333....(alternate 2-bits)

    rc1 = _mm_srli_epi32(rc, 2);    // Shift intermediate result to right by 2 bit
    rg1 = _mm_srli_epi32(rg, 2);    // Shift intermediate result to right by 2 bit
    rt1 = _mm_srli_epi32(rt, 2);    // Shift intermediate result to right by 2 bit

    rc2 = _mm_and_si128(rc, mcl);   // Clear alternate 2-bits of intermediate result by combining with bit-clearing mask (now = 0x33333333....)
    rg2 = _mm_and_si128(rg, mcl);   // Clear alternate 2-bits of intermediate result by combining with bit-clearing mask (now = 0x33333333....)
    rt2 = _mm_and_si128(rt, mcl);   // Clear alternate 2-bits of intermediate result by combining with bit-clearing mask (now = 0x33333333....)

    rc1 = _mm_and_si128(rc1, mcl);  // Clear alternate 2-bits of shifted intermediate result by combining with bit-clearing mask (now = 0x33333333....)
    rg1 = _mm_and_si128(rg1, mcl);  // Clear alternate 2-bits of shifted intermediate result by combining with bit-clearing mask (now = 0x33333333....)
    rt1 = _mm_and_si128(rt1, mcl);  // Clear alternate 2-bits of shifted intermediate result by combining with bit-clearing mask (now = 0x33333333....)

    rc = _mm_add_epi32(rc1, rc2);   // Combine shifted and non-shifted intermediate results by adding them together
    rg = _mm_add_epi32(rg1, rg2);   // Combine shifted and non-shifted intermediate results by adding them together
    rt = _mm_add_epi32(rt1, rt2);   // Combine shifted and non-shifted intermediate results by adding them together

    mcl = _mm_set1_epi32(0x0F0F0F0F);   // Set bit-clearing mask to 0x0F0F0F0F....(alternate 4-bits)
    r1e = _mm_setzero_si128();          // Set an all-zero mask

    rc1 = _mm_srli_epi32(rc, 4);    // Shift intermediate result to right by 4 bit
    rg1 = _mm_srli_epi32(rg, 4);    // Shift intermediate result to right by 4 bit
    rt1 = _mm_srli_epi32(rt, 4);    // Shift intermediate result to right by 4 bit

    rc2 = _mm_add_epi32(rc, rc1);   // Combine shifted and non-shifted intermediate results by adding them together
    rg2 = _mm_add_epi32(rg, rg1);   // Combine shifted and non-shifted intermediate results by adding them together
    rt2 = _mm_add_epi32(rt, rt1);   // Combine shifted and non-shifted intermediate results by adding them together

    rc = _mm_and_si128(rc2, mcl);   // Clear alternate 4-bits of intermediate result by combining with bit-clearing mask (now = 0x0F0F0F0F....)
    rg = _mm_and_si128(rg2, mcl);   // Clear alternate 4-bits of intermediate result by combining with bit-clearing mask (now = 0x0F0F0F0F....)
    rt = _mm_and_si128(rt2, mcl);   // Clear alternate 4-bits of intermediate result by combining with bit-clearing mask (now = 0x0F0F0F0F....)

    rc = _mm_sad_epu8(rc, r1e);     // Treating the 128 bit as 16 x 8 bit; summing up the 1st 8 x 8 bit into 1st 64-bit and 2nd 8 x 8 bit into 2nd 64-bit
    rg = _mm_sad_epu8(rg, r1e);     // Treating the 128 bit as 16 x 8 bit; summing up the 1st 8 x 8 bit into 1st 64-bit and 2nd 8 x 8 bit into 2nd 64-bit
    rt = _mm_sad_epu8(rt, r1e);     // Treating the 128 bit as 16 x 8 bit; summing up the 1st 8 x 8 bit into 1st 64-bit and 2nd 8 x 8 bit into 2nd 64-bit

    occValue[1] = _mm_extract_epi16(rc, 0) + _mm_extract_epi16(rc, 4);  // Extract result from register and store into variable
    occValue[2] = _mm_extract_epi16(rg, 0) + _mm_extract_epi16(rg, 4);  // Extract result from register and store into variable
    occValue[3] = _mm_extract_epi16(rt, 0) + _mm_extract_epi16(rt, 4);  // Extract result from register and store into variable
    // Code 0 is whatever is left of the range once codes 1..3 are accounted for
    occValue[0] = maxIndex - minIndex - occValue[1] - occValue[2] - occValue[3];

}

// Returns the occurrence count of `character` at position `index` of the BWT.
// The count is taken from the explicitly stored value whose position (a multiple
// of OCC_INTERVAL) is chosen by rounding `index`, then corrected by decoding the
// characters between that position and `index`: added when the stored position
// is below `index`, subtracted when it is above.
unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {

    unsigned int occValue, decodeValue;
    unsigned int occExplicitIndex, occIndex;
    unsigned int r;

    // $ is supposed to be positioned at inverseSa0 but it is not encoded
    // therefore index is subtracted by 1 for adjustment
    index -= (index > bwt->inverseSa0);

#ifdef DEBUG
    if (index > bwt->textLength) {
        fprintf(stderr, "BWTOccValue() : index > textLength!\n");
        exit(1);
    }
#endif

    occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL;   // Bidirectional encoding
    occIndex = occExplicitIndex * OCC_INTERVAL;

    //_mm_prefetch((char*)(memory + address[j+1]), _MM_HINT_NTA);

    occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);

#ifdef DEBUG
    if (occValue > occIndex) {
        fprintf(stderr, "BWTOccValue() : occValueExplicit > occIndex!\n");
        exit(1);
    }
#endif

    if (occIndex != index) {
        decodeValue = BWTDecode(bwt, occIndex, index, character);
        // r is all ones when the stored position lies above index, so exactly one of the two terms below is non-zero
        r = -(occIndex > index);
        return occValue + (decodeValue & ~r) - (decodeValue & r);
    } else {
        return occValue;
    }

}

void BWTOccValueTwoIndex(const BWT *bwt, unsigned int index1,
unsigned int index2, const unsigned int character, unsigned int* __restrict occValue) { unsigned int decodeValue, tempExplicit1, tempExplicit2, tempOccValue1, tempOccValue2; unsigned int occExplicitIndex1, occIndex1; unsigned int occExplicitIndex2, occIndex2; unsigned int r; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment index1 -= (index1 > bwt->inverseSa0); index2 -= (index2 > bwt->inverseSa0); #ifdef DEBUG if (index1 > bwt->textLength) { fprintf(stderr, "BWTOccValueTwoIndex() : index1 > textLength!\n"); exit(1); } if (index2 > bwt->textLength) { fprintf(stderr, "BWTOccValueTwoIndex() : index2 > textLength!\n"); exit(1); } #endif // Pre-fetch memory to be accessed BWTPrefetchBWT(bwt, index1); BWTPrefetchBWT(bwt, index2); occExplicitIndex1 = (index1 + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex1 = occExplicitIndex1 * OCC_INTERVAL; occExplicitIndex2 = (index2 + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex2 = occExplicitIndex2 * OCC_INTERVAL; // Pre-fetch memory to be accessed BWTPrefetchOccValueExplicit(bwt, occExplicitIndex1); BWTPrefetchOccValueExplicit(bwt, occExplicitIndex2); if (occIndex1 != index1) { decodeValue = BWTDecode(bwt, occIndex1, index1, character); r = -(occIndex1 > index1); tempOccValue1 = (decodeValue & ~r) - (decodeValue & r); } else { tempOccValue1 = 0; } if (occIndex2 != index2) { decodeValue = BWTDecode(bwt, occIndex2, index2, character); r = -(occIndex2 > index2); tempOccValue2 = (decodeValue & ~r) - (decodeValue & r); } else { tempOccValue2 = 0; } tempExplicit1 = BWTOccValueExplicit(bwt, occExplicitIndex1, character); tempExplicit2 = BWTOccValueExplicit(bwt, occExplicitIndex2, character); #ifdef DEBUG if (tempExplicit1 > occIndex1) { fprintf(stderr, "BWTOccValueTwoIndex() : occValueExplicit1 > occIndex1!\n"); exit(1); } if (tempExplicit2 > occIndex2) { fprintf(stderr, "BWTOccValueTwoIndex() : occValueExplicit2 
> occIndex2!\n"); exit(1); } #endif occValue[0] = tempOccValue1 + tempExplicit1; occValue[1] = tempOccValue2 + tempExplicit2; } void BWTAllOccValue(const BWT *bwt, unsigned int index, unsigned int* __restrict occValue) { unsigned int occExplicitIndex, occIndex; unsigned int ALIGN_16 tempOccValue[ALPHABET_SIZE]; unsigned int r; // SSE registers __m128i rtov, rov, rc, t1, t2; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment index -= (index > bwt->inverseSa0); #ifdef DEBUG if (index > bwt->textLength) { fprintf(stderr, "BWTOccValue() : index > textLength!\n"); exit(1); } #endif occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex = occExplicitIndex * OCC_INTERVAL; BWTAllOccValueExplicit(bwt, occExplicitIndex, occValue); if (occIndex != index) { BWTDecodeAll(bwt, occIndex, index, tempOccValue); // The following code add tempOccvalue to occValue if index > occIndex and subtract tempOccValue from occValue if occIndex > index r = -(occIndex > index); rc = _mm_set1_epi32(r); // Set rc = r r r r rtov = _mm_load_si128((__m128i*)tempOccValue); rov = _mm_load_si128((__m128i*)occValue); t1 = _mm_andnot_si128(rc, rtov); t2 = _mm_and_si128(rc, rtov); rov = _mm_add_epi32(rov, t1); rov = _mm_sub_epi32(rov, t2); _mm_store_si128((__m128i*)occValue, rov); } else { return; } } void BWTAllOccValueTwoIndex(const BWT *bwt, unsigned int index1, unsigned int index2, unsigned int* __restrict occValue1, unsigned int* __restrict occValue2) { unsigned int occExplicitIndex1, occIndex1; unsigned int occExplicitIndex2, occIndex2; unsigned int ALIGN_16 tempOccValue1[ALPHABET_SIZE]; unsigned int ALIGN_16 tempOccValue2[ALPHABET_SIZE]; unsigned int r; // SSE registers __m128i rtov, rc, t1, t2, o1, o2; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment index1 -= (index1 > bwt->inverseSa0); index2 -= (index2 > 
bwt->inverseSa0); #ifdef DEBUG if (index1 > index2) { fprintf(stderr, "BWTAllOccValueTwoIndex() : index1 > index2!\n"); exit(1); } if (index2 > bwt->textLength) { fprintf(stderr, "BWTAllOccValueTwoIndex() : index2 > textLength!\n"); exit(1); } #endif // Pre-fetch memory to be accessed BWTPrefetchBWT(bwt, index1); BWTPrefetchBWT(bwt, index2); occExplicitIndex1 = (index1 + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex1 = occExplicitIndex1 * OCC_INTERVAL; occExplicitIndex2 = (index2 + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex2 = occExplicitIndex2 * OCC_INTERVAL; // Pre-fetch memory to be accessed BWTPrefetchOccValueExplicit(bwt, occExplicitIndex1); BWTPrefetchOccValueExplicit(bwt, occExplicitIndex2); if (occIndex1 != index1) { BWTDecodeAll(bwt, occIndex1, index1, tempOccValue1); // The following code add tempOccvalue to occValue if index > occIndex and subtract tempOccValue from occValue if occIndex > index r = -(occIndex1 > index1); rtov = _mm_load_si128((__m128i*)tempOccValue1); rc = _mm_set1_epi32(r); // Set rc = r r r r t1 = _mm_andnot_si128(rc, rtov); t2 = _mm_and_si128(rc, rtov); o1 = _mm_sub_epi32(t1, t2); } else { o1 = _mm_setzero_si128(); } /* if (occIndex1 != index1) { if (occIndex1 < index1) { ForwardDNAAllOccCount(bwt->bwtCode + occIndex1 / CHAR_PER_WORD, index1 - occIndex1, tempOccValue, bwt->decodeTable); occValue1[0] += tempOccValue[0]; occValue1[1] += tempOccValue[1]; occValue1[2] += tempOccValue[2]; occValue1[3] += tempOccValue[3]; } else { BackwardDNAAllOccCount(bwt->bwtCode + occIndex1 / CHAR_PER_WORD, occIndex1 - index1, tempOccValue, bwt->decodeTable); occValue1[0] -= tempOccValue[0]; occValue1[1] -= tempOccValue[1]; occValue1[2] -= tempOccValue[2]; occValue1[3] -= tempOccValue[3]; } } */ if (occIndex2 != index2) { BWTDecodeAll(bwt, occIndex2, index2, tempOccValue2); // The following code add tempOccvalue to occValue if index > occIndex and subtract tempOccValue from occValue if occIndex > 
index r = -(occIndex1 > index2); rc = _mm_set1_epi32(r); // Set rc = r r r r rtov = _mm_load_si128((__m128i*)tempOccValue2); t1 = _mm_andnot_si128(rc, rtov); t2 = _mm_and_si128(rc, rtov); o2 = _mm_sub_epi32(t1, t2); } else { o2 = _mm_setzero_si128(); } BWTAllOccValueExplicit(bwt, occExplicitIndex1, occValue1); BWTAllOccValueExplicit(bwt, occExplicitIndex2, occValue2); t1 = _mm_load_si128((__m128i*)occValue1); t2 = _mm_load_si128((__m128i*)occValue2); t1 = _mm_add_epi32(t1, o1); t2 = _mm_add_epi32(t2, o2); _mm_store_si128((__m128i*)occValue1, t1); _mm_store_si128((__m128i*)occValue2, t2); /* if (occIndex2 != index2) { if (occIndex2 < index2) { ForwardDNAAllOccCount(bwt->bwtCode + occIndex2 / CHAR_PER_WORD, index2 - occIndex2, tempOccValue, bwt->decodeTable); occValue2[0] += tempOccValue[0]; occValue2[1] += tempOccValue[1]; occValue2[2] += tempOccValue[2]; occValue2[3] += tempOccValue[3]; } else { BackwardDNAAllOccCount(bwt->bwtCode + occIndex2 / CHAR_PER_WORD, occIndex2 - index2, tempOccValue, bwt->decodeTable); occValue2[0] -= tempOccValue[0]; occValue2[1] -= tempOccValue[1]; occValue2[2] -= tempOccValue[2]; occValue2[3] -= tempOccValue[3]; } } */ } unsigned int BWTOccValueOnSpot(const BWT *bwt, unsigned int index, unsigned int* __restrict character) { unsigned int occExplicitIndex, occIndex; unsigned int occValue, decodeValue; unsigned int r; // The bwt character before index will be returned and the count will be up to that bwt character #ifdef DEBUG if (index == bwt->inverseSa0 + 1) { fprintf(stderr, "BWTOccValueOnSpot(): index = inverseSa0 + 1!\n"); exit(1); } if (index > bwt->textLength + 1) { fprintf(stderr, "BWTOccValueOnSpot() : index > textLength!\n"); exit(1); } if (index == 0) { fprintf(stderr, "BWTOccValueOnSpot() : index = 0!\n"); exit(1); } #endif // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is incremented for adjustment index -= (index > bwt->inverseSa0); // Bidirectional encoding occExplicitIndex = (index 
+ OCC_INTERVAL / 2 - 1) / OCC_INTERVAL;
    occIndex = occExplicitIndex * OCC_INTERVAL;

    // Extract the BWT character at position index - 1: shift it to the top of its word, then down to the lowest bits
    *character = bwt->bwtCode[(index - 1) / CHAR_PER_WORD] << (((index - 1) % CHAR_PER_WORD) * BIT_PER_CHAR) >> (BITS_IN_WORD - BIT_PER_CHAR);
    occValue = BWTOccValueExplicit(bwt, occExplicitIndex, *character);

    if (occIndex != index) {
        decodeValue = BWTDecode(bwt, occIndex, index, *character);
        // r is all ones when the stored position lies above index, so exactly one of the two terms below is non-zero
        r = -(occIndex > index);
        return occValue + (decodeValue & ~r) - (decodeValue & r);
    } else {
        return occValue;
    }

}

// Returns the position in the encoded BWT of the searchOccValue-th occurrence
// of `character` (positions count encoded characters from 0).
// A binary search over the explicitly stored counts finds the last stored
// position whose count is below searchOccValue; the following OCC_INTERVAL
// characters are then scanned one by one.  Prints an error and exits if the
// occurrence is not found within that interval.
unsigned int BWTSearchOccValue(const BWT *bwt, const unsigned int character, const unsigned int searchOccValue) {

    unsigned int occValue;
    unsigned int i,j;
    unsigned int c;
    unsigned int bwtPos;
    unsigned int occExplicitIndexLeft, occExplicitIndexRight, occExplicitIndexMiddle;

#ifdef DEBUG
    if (searchOccValue == 0 || searchOccValue > bwt->textLength) {
        fprintf(stderr, "BWTSearchOccValue() : searchOccValue out of bound!\n");
        exit(1);
    }
#endif

    // Search Occurrence value

    occExplicitIndexLeft = 0;
    occExplicitIndexRight = (bwt->textLength + OCC_INTERVAL - 1) / OCC_INTERVAL;

    // Invariant: stored count at Left < searchOccValue <= stored count at Right
    while (occExplicitIndexLeft + 1 < occExplicitIndexRight) {
        occExplicitIndexMiddle = average(occExplicitIndexLeft, occExplicitIndexRight);
        if (searchOccValue > BWTOccValueExplicit(bwt, occExplicitIndexMiddle, character)) {
            occExplicitIndexLeft = occExplicitIndexMiddle;
        } else {
            occExplicitIndexRight = occExplicitIndexMiddle;
        }
    }

    // Not tuned for DNA
    occValue = BWTOccValueExplicit(bwt, occExplicitIndexLeft, character);
    bwtPos = occExplicitIndexLeft * OCC_INTERVAL / CHAR_PER_WORD;

    for (i=0; i < OCC_INTERVAL / CHAR_PER_WORD; i++) {
        c = bwt->bwtCode[bwtPos + i];
        for (j=0; j < CHAR_PER_WORD && occValue < searchOccValue; j++) {
            // The current character sits in the top BIT_PER_CHAR bits of c
            if (c >> (BITS_IN_WORD - BIT_PER_CHAR) == character) {
                occValue++;
                if (occValue >= searchOccValue) {
                    return occExplicitIndexLeft * OCC_INTERVAL + i * CHAR_PER_WORD + j;
                }
            }
            c <<= BIT_PER_CHAR;
        }
    }

    fprintf(stderr, "BWTSearchOccValue() : unexpected error!\n");
    exit(1);

}

static INLINE unsigned int
BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, const unsigned int character) { unsigned int occIndexMajor; unsigned int compareMask, shift, mask; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; compareMask = (-(occIndexExplicit % OCC_VALUE_PER_WORD == 0)); shift = 16 & compareMask; mask = 0x0000FFFF | compareMask; return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + ((bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> shift) & mask); } static INLINE void BWTAllOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, unsigned int* __restrict occValueExplicit) { unsigned int occIndexMajor; unsigned int compareMask, shift, mask; __m128i v1, v2, m; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; compareMask = (-(occIndexExplicit % OCC_VALUE_PER_WORD == 0)); shift = 16 & compareMask; mask = 0x0000FFFF | compareMask; v2 = _mm_load_si128((__m128i *)(bwt->occValue + occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE)); v1 = _mm_load_si128((__m128i *)(bwt->occValueMajor + occIndexMajor * ALPHABET_SIZE)); m = _mm_set1_epi32(mask); v2 = _mm_srli_epi32(v2, shift); v2 = _mm_and_si128(v2, m); v1 = _mm_add_epi32(v1, v2); _mm_store_si128((__m128i*)occValueExplicit, v1); } static INLINE void BWTPrefetchOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit) { unsigned int occIndexMajor; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; _mm_prefetch((char*)(bwt->occValueMajor + occIndexMajor * ALPHABET_SIZE), _MM_HINT_T0); _mm_prefetch((char*)(bwt->occValue + occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE), _MM_HINT_NTA); } static INLINE void BWTPrefetchBWT(const BWT *bwt, const unsigned int index) { _mm_prefetch((char*)(bwt->bwtCode + index / CHAR_PER_WORD), _MM_HINT_NTA); } unsigned int BWTResidentSizeInWord(const unsigned int numChar) { unsigned int numCharRoundUpToOccInterval; // The $ in BWT at the 
position of inverseSa0 is not encoded numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; } unsigned int BWTFileSizeInWord(const unsigned int numChar) { // The $ in BWT at the position of inverseSa0 is not encoded return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; } unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar) { unsigned int numOfOccValue; numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; } unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar) { unsigned int numOfOccValue; unsigned int numOfOccIntervalPerMajor; numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; } void BWTClearTrailingBwtCode(BWT *bwt) { unsigned int bwtResidentSizeInWord; unsigned int wordIndex, offset; unsigned int i; bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); wordIndex = bwt->textLength / CHAR_PER_WORD; offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; if (offset > 0) { bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); } else { if (wordIndex < bwtResidentSizeInWord) { bwt->bwtCode[wordIndex] = 0; } } for (i=wordIndex+1; ibwtCode[i] = 0; } } unsigned int BWTPsiMinusValue(const BWT *bwt, const unsigned int index) { unsigned int c; unsigned int occValue; #ifdef DEBUG if (index > bwt->textLength) { fprintf(stderr, "BWTPsiMinusValue() : index out of range!\n"); exit(1); } #endif if (index != bwt->inverseSa0) { occValue = BWTOccValueOnSpot(bwt, index + 1, &c); occValue += bwt->cumulativeFreq[c]; return 
occValue; } else { return 0; } } unsigned int BWTPsiPlusValue(const BWT *bwt, const unsigned int index) { unsigned int c; unsigned int psiPlusValue; #ifdef DEBUG if (index > bwt->textLength) { fprintf(stderr, "BWTPsiPlusValue() : index out of range!\n"); exit(1); } #endif if (index == 0) { return bwt->inverseSa0; } // Find the BWT of PSI+ c = (index > bwt->cumulativeFreq[1]) + (index > bwt->cumulativeFreq[2]) + (index > bwt->cumulativeFreq[3]); psiPlusValue = BWTSearchOccValue(bwt, c, index - bwt->cumulativeFreq[c]); if (psiPlusValue >= bwt->inverseSa0) { psiPlusValue++; } return psiPlusValue; } unsigned int BWTSaValue(const BWT *bwt, unsigned int saIndex) { unsigned int saValueSkipped = 0; #ifdef DEBUG if (saIndex > bwt->textLength) { fprintf(stderr, "BWTSaValue() : Index out of range!\n"); exit(1); } if (bwt->saValue == NULL) { fprintf(stderr, "BWTSaValue() : Explicit SA value is not loaded!\n"); exit(1); } #endif while (saIndex % bwt->saInterval != 0) { saValueSkipped++; saIndex = BWTPsiMinusValue(bwt, saIndex); } #ifdef DEBUG if (bwt->saValue[saIndex/bwt->saInterval] + saValueSkipped > bwt->textLength) { fprintf(stderr, "BWTSaValue() : saValue out of range!\n"); exit(1); } #endif // SA[0] stores -1 although it should be textLength // PsiMinusValue returns 0 on inverseSa0 return bwt->saValue[saIndex/bwt->saInterval] + saValueSkipped; } unsigned int BWTInverseSa(const BWT *bwt, unsigned int saValue) { unsigned int i; unsigned int saIndex; unsigned int inverseSaExplicitIndex; unsigned int saValueToSkip; #ifdef DEBUG if (saValue > bwt->textLength) { fprintf(stderr, "BWTInverseSa() : Index out of range!\n"); exit(1); } if (bwt->inverseSa == NULL) { fprintf(stderr, "BWTInverseSa() : Explicit inverse SA is not loaded!\n"); exit(1); } #endif inverseSaExplicitIndex = (saValue + bwt->inverseSaInterval - 1) / bwt->inverseSaInterval; if (inverseSaExplicitIndex * bwt->inverseSaInterval > bwt->textLength) { saIndex = 0; saValueToSkip = bwt->textLength - saValue; } else { 
saIndex = bwt->inverseSa[inverseSaExplicitIndex]; saValueToSkip = inverseSaExplicitIndex * bwt->inverseSaInterval - saValue; } for (i=0; i 0) { // packedText should be allocated with at least 1 Word buffer initialized to zero text = (packedText[index] << shift) | (packedText[index + 1] >> (BITS_IN_WORD - shift)); } else { text = packedText[index]; } if (numOfBit < BITS_IN_WORD) { // Fill unused bit with zero text &= mask[numOfBit]; } return text; } int BWTForwardSearch(const unsigned int *packedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int *packedText) { unsigned int startSaIndex, endSaIndex, saIndexMiddle; unsigned int saExplicitIndexLeft, saExplicitIndexRight, saExplicitIndexMiddle; unsigned int saValue; unsigned int firstChar; unsigned int index, shift; unsigned int packedKeyLength, keyLengthInBit; unsigned int llcp, rlcp, mlcp, maxlcp; unsigned int p = 0; // to avoid compiler warning only if (keyLength % CHAR_PER_WORD == 0) { packedKeyLength = keyLength / CHAR_PER_WORD; keyLengthInBit = packedKeyLength * BITS_IN_WORD; } else { packedKeyLength = keyLength / CHAR_PER_WORD + 1; keyLengthInBit = (keyLength / CHAR_PER_WORD) * BITS_IN_WORD + (keyLength % CHAR_PER_WORD) * BIT_PER_CHAR; } // Get the SA index initial range by retrieving cumulative frequency firstChar = packedKey[0] >> (BITS_IN_WORD - BIT_PER_CHAR); startSaIndex = bwt->cumulativeFreq[firstChar] + 1; endSaIndex = bwt->cumulativeFreq[firstChar + 1]; if (startSaIndex > endSaIndex) { // The first character of search pattern does not exists in text return 0; } // Find lcp for left boundary saValue = bwt->saValueOnBoundary[firstChar * 2]; // Pre-calculated // restriction for positions near the end of text maxlcp = min(packedKeyLength, (bwt->textLength - saValue + CHAR_PER_WORD - 1) / CHAR_PER_WORD); shift = BIT_PER_CHAR * (saValue % CHAR_PER_WORD); index = saValue / CHAR_PER_WORD; llcp = 0; while (llcp < maxlcp && packedKey[llcp] == BWTGetWordPackedText(packedText, index + llcp, shift, 
keyLengthInBit - llcp * BITS_IN_WORD)) { llcp++; } if ((saValue + keyLength > bwt->textLength) && llcp == maxlcp) { llcp--; } if (llcp == packedKeyLength) { return 1; } // Find lcp for right boundary saValue = bwt->saValueOnBoundary[firstChar * 2 + 1]; // Pre-calculated // restriction for positions near the end of text maxlcp = min(packedKeyLength, (bwt->textLength - saValue + CHAR_PER_WORD - 1) / CHAR_PER_WORD); shift = BIT_PER_CHAR * (saValue % CHAR_PER_WORD); index = saValue / CHAR_PER_WORD; rlcp = 0; while (rlcp < maxlcp && packedKey[rlcp] == BWTGetWordPackedText(packedText, index + rlcp, shift, keyLengthInBit - rlcp * BITS_IN_WORD)) { rlcp++; } if ((saValue + keyLength > bwt->textLength) && rlcp == maxlcp) { rlcp--; } if (rlcp == packedKeyLength) { return 1; } // Locate in SA index explicitly stored saExplicitIndexLeft = startSaIndex / bwt->saInterval; saExplicitIndexRight = (endSaIndex - 1) / bwt->saInterval + 1; // loop until two adjacent SA explicit index is found while (saExplicitIndexLeft + 1 < saExplicitIndexRight) { saExplicitIndexMiddle = average(saExplicitIndexLeft, saExplicitIndexRight); saValue = bwt->saValue[saExplicitIndexMiddle]; shift = BIT_PER_CHAR * (saValue % CHAR_PER_WORD); index = saValue / CHAR_PER_WORD; // Try to increase mlcp mlcp = min(llcp, rlcp); // mlcp = the characters (in unit of 16 for DNA) matched so far // restriction for positions near the end of text maxlcp = min(packedKeyLength, (bwt->textLength - saValue + CHAR_PER_WORD - 1) / CHAR_PER_WORD); while (mlcp < maxlcp) { p = BWTGetWordPackedText(packedText, index + mlcp, shift, keyLengthInBit - mlcp * BITS_IN_WORD); if (packedKey[mlcp] != p) { break; } mlcp++; } if ((saValue + keyLength <= bwt->textLength) || mlcp != maxlcp) { if (mlcp == packedKeyLength) { return 1; } if (packedKey[mlcp] > p) { llcp = mlcp; saExplicitIndexLeft = saExplicitIndexMiddle; } else { rlcp = mlcp; saExplicitIndexRight = saExplicitIndexMiddle; } } else { if (packedKey[mlcp-1] >= p) { llcp = mlcp - 1; 
saExplicitIndexLeft = saExplicitIndexMiddle; } else { rlcp = mlcp - 1; saExplicitIndexRight = saExplicitIndexMiddle; } } } // Two adjacent SA explicit index is found, convert back to SA index if (saExplicitIndexLeft == startSaIndex / bwt->saInterval) { startSaIndex = bwt->cumulativeFreq[firstChar] + 1; } else { startSaIndex = saExplicitIndexLeft * bwt->saInterval; } if (saExplicitIndexRight == (endSaIndex - 1) / bwt->saInterval + 1) { endSaIndex = bwt->cumulativeFreq[firstChar + 1]; } else { endSaIndex = saExplicitIndexRight * bwt->saInterval; } // binary search by decoding bwt while (startSaIndex < endSaIndex) { saIndexMiddle = average(startSaIndex, endSaIndex); saValue = BWTSaValue(bwt, saIndexMiddle); shift = BIT_PER_CHAR * (saValue % CHAR_PER_WORD); index = saValue / CHAR_PER_WORD; // Try to increase mlcp mlcp = min(llcp, rlcp); // mlcp = the characters (in unit of 16 for DNA) matched so far // restriction for positions near the end of text maxlcp = min(packedKeyLength, (bwt->textLength - saValue + CHAR_PER_WORD - 1) / CHAR_PER_WORD); while (mlcp < maxlcp) { p = BWTGetWordPackedText(packedText, index + mlcp, shift, keyLengthInBit - mlcp * BITS_IN_WORD); if (packedKey[mlcp] != p) { break; } mlcp++; } if ((saValue + keyLength <= bwt->textLength) || mlcp != maxlcp) { if (mlcp == packedKeyLength) { return 1; } if (packedKey[mlcp] > p) { llcp = mlcp; startSaIndex = saIndexMiddle + 1; } else { rlcp = mlcp; endSaIndex = saIndexMiddle; } } else { if (packedKey[mlcp-1] >= p) { llcp = mlcp - 1; startSaIndex = saIndexMiddle + 1; } else { rlcp = mlcp - 1; endSaIndex = saIndexMiddle; } } } // no match found return 0; } soap2.20/DNACount.c0000644000105300011350000007755711164534250012745 0ustar yuchangrd/* DNACount.c DNA Count This module contains DNA occurrence counting functions. The DNA must be in word-packed format. Copyright (C) 2004, Wong Chi Kwong. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "DNACount.h" #include "MiscUtilities.h" void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable) { unsigned int i, j, c, t; for (i=0; i>= 2; } } } unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; #ifdef DEBUG if (index >= 256) { fprintf(stderr, "ForwardDNAOccCount() : index >= 256!\n"); exit(1); } #endif wordToCount = index / 16; charToCount = index - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; } if (charToCount > 0) { c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } return (sum >> (character * 8)) & 0x000000FF; } unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const 
unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; #ifdef DEBUG if (index >= 256) { fprintf(stderr, "ForwardDNAOccCount() : index >= 256!\n"); exit(1); } #endif wordToCount = index / 16; charToCount = index - wordToCount * 16; dna -= wordToCount + 1; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } return (sum >> (character * 8)) & 0x000000FF; } void ForwardDNAAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; #ifdef DEBUG if (index >= 256) { fprintf(stderr, "ForwardDNAOccCount() : index >= 256!\n"); exit(1); } #endif wordToCount = index / 16; charToCount = index - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; } if (charToCount > 0) { c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } occCount[0] = sum & 0x000000FF; sum >>= 8; occCount[1] = sum & 0x000000FF; sum >>= 8; occCount[2] = sum & 0x000000FF; 
sum >>= 8; occCount[3] = sum; } void BackwardDNAAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; #ifdef DEBUG if (index >= 256) { fprintf(stderr, "ForwardDNAOccCount() : index >= 256!\n"); exit(1); } #endif wordToCount = index / 16; charToCount = index - wordToCount * 16; dna -= wordToCount + 1; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } occCount[0] = sum & 0x000000FF; sum >>= 8; occCount[1] = sum & 0x000000FF; sum >>= 8; occCount[2] = sum & 0x000000FF; sum >>= 8; occCount[3] = sum; } unsigned int Forward1OccCount(const unsigned int* bitVector, const unsigned int index, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[32] = { 0x00000000, 0x80000000, 0xC0000000, 0xE0000000, 0xF0000000, 0xF8000000, 0xFC000000, 0xFE000000, 0xFF000000, 0xFF800000, 0xFFC00000, 0xFFE00000, 0xFFF00000, 0xFFF80000, 0xFFFC0000, 0xFFFE0000, 0xFFFF0000, 0xFFFF8000, 0xFFFFC000, 0xFFFFE000, 0xFFFFF000, 0xFFFFF800, 0xFFFFFC00, 0xFFFFFE00, 0xFFFFFF00, 0xFFFFFF80, 0xFFFFFFC0, 0xFFFFFFE0, 0xFFFFFFF0, 0xFFFFFFF8, 0xFFFFFFFC, 0xFFFFFFFE}; unsigned int wordToCount, bitToCount; unsigned int i, c; unsigned int sum = 0; unsigned int numberOf1; #ifdef DEBUG if (index >= 256) { fprintf(stderr, "Forward1OccCount() : index >= 256!\n"); exit(1); } #endif wordToCount = index / 32; bitToCount = index - wordToCount 
* 32; for (i=0; i> 16]; sum += dnaDecodeTable[bitVector[i] & 0x0000FFFF]; } if (bitToCount > 0) { c = bitVector[i] & truncateRightMask[bitToCount]; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0x0000FFFF]; } sum >>= 8; numberOf1 = sum & 0x000000FF; sum >>= 8; numberOf1 = sum & 0x000000FF; sum >>= 8; numberOf1 = sum * 2; return numberOf1; } unsigned int Backward1OccCount(const unsigned int* bitVector, const unsigned int index, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[32] = { 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000F, 0x0000001F, 0x0000003F, 0x0000007F, 0x000000FF, 0x000001FF, 0x000003FF, 0x000007FF, 0x00000FFF, 0x00001FFF, 0x00003FFF, 0x00007FFF, 0x0000FFFF, 0x0001FFFF, 0x0003FFFF, 0x0007FFFF, 0x000FFFFF, 0x001FFFFF, 0x003FFFFF, 0x007FFFFF, 0x00FFFFFF, 0x01FFFFFF, 0x03FFFFFF, 0x07FFFFFF, 0x0FFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF}; unsigned int wordToCount, bitToCount; unsigned int i, c; unsigned int sum = 0; unsigned int numberOf1; #ifdef DEBUG if (index >= 256) { fprintf(stderr, "ForwardDNAOccCount() : index >= 256!\n"); exit(1); } #endif wordToCount = index / 32; bitToCount = index - wordToCount * 32; bitVector -= wordToCount + 1; if (bitToCount > 0) { c = *bitVector & truncateLeftMask[bitToCount]; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; } for (i=0; i> 16]; sum += dnaDecodeTable[*bitVector & 0x0000FFFF]; } sum >>= 8; numberOf1 = sum & 0x000000FF; sum >>= 8; numberOf1 = sum & 0x000000FF; sum >>= 8; numberOf1 = sum * 2; return numberOf1; } unsigned int ForwardDNAOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int iteration, 
wordToCount, charToCount; unsigned int i, j, c; unsigned int sum; unsigned int occCount = 0; iteration = index / 256; wordToCount = (index - iteration * 256) / 16; charToCount = index - iteration * 256 - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { occCount += (sum >> (character * 8)) & 0x000000FF; } else { // only some or all of the 3 bits are on // in reality, only one of the four cases are possible if (sum == 0x00000100) { if (character == 0) { occCount += 256; } } else if (sum == 0x00010000) { if (character == 1) { occCount += 256; } } else if (sum == 0x01000000) { if (character == 2) { occCount += 256; } } else if (sum == 0x00000000) { if (character == 3) { occCount += 256; } } else { fprintf(stderr, "ForwardDNAOccCountNoLimit(): DNA occ sum exception!\n"); exit(1); } } } sum = 0; for (j=0; j> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (charToCount > 0) { c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } occCount += (sum >> (character * 8)) & 0x000000FF; return occCount; } unsigned int BackwardDNAOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum = 0; unsigned int occCount; dna -= index / 16 + 1; iteration = index / 256; wordToCount = (index - iteration * 256) / 16; charToCount = index - iteration * 256 - wordToCount * 16; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; 
// increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } for (j=0; j> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } occCount = (sum >> (character * 8)) & 0x000000FF; for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { occCount += (sum >> (character * 8)) & 0x000000FF; } else { // only some or all of the 3 bits are on // in reality, only one of the four cases are possible if (sum == 0x00000100) { if (character == 0) { occCount += 256; } } else if (sum == 0x00010000) { if (character == 1) { occCount += 256; } } else if (sum == 0x01000000) { if (character == 2) { occCount += 256; } } else if (sum == 0x00000000) { if (character == 3) { occCount += 256; } } else { fprintf(stderr, "BackwardDNAOccCountNoLimit(): DNA occ sum exception!\n"); exit(1); } } } return occCount; } void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum; occCount[0] = 0; occCount[1] = 0; occCount[2] = 0; occCount[3] = 0; iteration = index / 256; wordToCount = (index - iteration * 256) / 16; charToCount = index - iteration * 256 - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } else { // only some or all of the 3 bits are on // in reality, only one of the four cases are 
possible if (sum == 0x00000100) { occCount[0] += 256; } else if (sum == 0x00010000) { occCount[1] += 256; } else if (sum == 0x01000000) { occCount[2] += 256; } else if (sum == 0x00000000) { occCount[3] += 256; } else { fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); exit(1); } } } sum = 0; for (j=0; j> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (charToCount > 0) { c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } void BackwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum; dna -= index / 16 + 1; iteration = index / 256; wordToCount = (index - iteration * 256) / 16; charToCount = index - iteration * 256 - wordToCount * 16; sum = 0; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } for (j=0; j> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } occCount[0] = sum & 0x000000FF; sum >>= 8; occCount[1] = sum & 0x000000FF; sum >>= 8; occCount[2] = sum & 0x000000FF; sum >>= 8; occCount[3] = sum; for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { 
occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } else { // only some or all of the 3 bits are on // in reality, only one of the four cases are possible if (sum == 0x00000100) { occCount[0] += 256; } else if (sum == 0x00010000) { occCount[1] += 256; } else if (sum == 0x01000000) { occCount[2] += 256; } else if (sum == 0x00000000) { occCount[3] += 256; } else { fprintf(stderr, "BackwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); exit(1); } } } } void GenerateDNA_NOccCountTable(unsigned int *dnaDecodeTable) { unsigned int i, j, c, t; for (i=0; i>= 3; } } } unsigned int ForwardDNA_NOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[10] = { 0x00000000, 0xE0000000, 0xFC000000, 0xFF800000, 0xFFF00000, 0xFFFE0000, 0xFFFFC000, 0xFFFFF800, 0xFFFFFF00, 0xFFFFFFE0}; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; unsigned int occCount; #ifdef DEBUG if (index > 250) { fprintf(stderr, "ForwardDNA_NOccCount() : index > 250!\n"); exit(1); } #endif wordToCount = index / 10; charToCount = index - wordToCount * 10; for (i=0; i> 17]; sum += dnaDecodeTable[(dna[i] >> 2) & 0x00007FFF]; } if (charToCount > 0) { c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 10 - charToCount } if (character != 4) { occCount = (sum >> (character * 8)) & 0x000000FF; } else { occCount = index; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum; } return occCount; } unsigned int BackwardDNA_NOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const 
unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[10] = { 0x00000000, 0x0000001C, 0x000000FC, 0x000007FC, 0x00003FFC, 0x0001FFFC, 0x000FFFFC, 0x007FFFFC, 0x03FFFFFC, 0x1FFFFFFC}; unsigned int wordToCount, charToCount; unsigned int j, c; unsigned int sum = 0; unsigned int occCount; #ifdef DEBUG if (index > 250) { fprintf(stderr, "BackwardDNA_NOccCount() : index >= 250!\n"); exit(1); } #endif wordToCount = index / 10; charToCount = index - wordToCount * 10; dna -= wordToCount + 1; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 10 - charToCount } for (j=0; j> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; } if (character != 4) { occCount = (sum >> (character * 8)) & 0x000000FF; } else { occCount = index; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum; } #ifdef DEBUG if (occCount > index + 1) { fprintf(stderr, "BackwardDNA_NOccCount() : occCount > index + 1!\n"); exit(1); } #endif return occCount; } void ForwardDNA_NAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[10] = { 0x00000000, 0xE0000000, 0xFC000000, 0xFF800000, 0xFFF00000, 0xFFFE0000, 0xFFFFC000, 0xFFFFF800, 0xFFFFFF00, 0xFFFFFFE0}; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; #ifdef DEBUG if (index > 250) { fprintf(stderr, "ForwardDNA_NAllOccCount() : index >= 250!\n"); exit(1); } #endif wordToCount = index / 10; charToCount = index - wordToCount * 10; for (i=0; i> 17]; sum += dnaDecodeTable[(dna[i] >> 2) & 0x00007FFF]; } if (charToCount > 0) { c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 10 - charToCount; 
sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 10 - charToCount } occCount[0] = sum & 0x000000FF; sum >>= 8; occCount[1] = sum & 0x000000FF; sum >>= 8; occCount[2] = sum & 0x000000FF; sum >>= 8; occCount[3] = sum; } void BackwardDNA_NAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[10] = { 0x00000000, 0x0000001C, 0x000000FC, 0x000007FC, 0x00003FFC, 0x0001FFFC, 0x000FFFFC, 0x007FFFFC, 0x03FFFFFC, 0x1FFFFFFC}; unsigned int wordToCount, charToCount; unsigned int j, c; unsigned int sum = 0; #ifdef DEBUG if (index > 250) { fprintf(stderr, "BackwardDNA_NAllOccCount() : index >= 250!\n"); exit(1); } #endif wordToCount = index / 10; charToCount = index - wordToCount * 10; dna -= wordToCount + 1; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 16 - charToCount } for (j=0; j> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; } occCount[0] = sum & 0x000000FF; sum >>= 8; occCount[1] = sum & 0x000000FF; sum >>= 8; occCount[2] = sum & 0x000000FF; sum >>= 8; occCount[3] = sum; } unsigned int ForwardDNA_NOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[10] = { 0x00000000, 0xE0000000, 0xFC000000, 0xFF800000, 0xFFF00000, 0xFFFE0000, 0xFFFFC000, 0xFFFFF800, 0xFFFFFF00, 0xFFFFFFE0}; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum; unsigned int occCount = 0; iteration = index / 250; wordToCount = (index - iteration * 250) / 10; charToCount = index - iteration * 250 - wordToCount * 10; for (i=0; i> 17]; sum += 
dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; dna++; } if (character != 4) { occCount += (sum >> (character * 8)) & 0x000000FF; } else { occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum; } } sum = 0; for (j=0; j> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; dna++; } if (charToCount > 0) { c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 10 - charToCount } if (character != 4) { occCount += (sum >> (character * 8)) & 0x000000FF; } else { occCount += index; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum; } return occCount; } unsigned int BackwardDNA_NOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[10] = { 0x00000000, 0x0000001C, 0x000000FC, 0x000007FC, 0x00003FFC, 0x0001FFFC, 0x000FFFFC, 0x007FFFFC, 0x03FFFFFC, 0x1FFFFFFC}; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum = 0; unsigned int occCount = 0; dna -= index / 10 + 1; iteration = index / 250; wordToCount = (index - iteration * 250) / 10; charToCount = index - iteration * 250 - wordToCount * 10; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 10 - charToCount } for (j=0; j> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; } if (character != 4) { occCount = (sum >> (character * 8)) & 0x000000FF; } else { occCount = index; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 
8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum; } for (i=0; i> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; } if (character != 4) { occCount += (sum >> (character * 8)) & 0x000000FF; } else { occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum & 0x000000FF; sum >>= 8; occCount -= sum; } } return occCount; } void ForwardDNA_NAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[10] = { 0x00000000, 0xE0000000, 0xFC000000, 0xFF800000, 0xFFF00000, 0xFFFE0000, 0xFFFFC000, 0xFFFFF800, 0xFFFFFF00, 0xFFFFFFE0}; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum; occCount[0] = 0; occCount[1] = 0; occCount[2] = 0; occCount[3] = 0; iteration = index / 250; wordToCount = (index - iteration * 250) / 10; charToCount = index - iteration * 250 - wordToCount * 10; for (i=0; i> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; dna++; } occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } sum = 0; for (j=0; j> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; dna++; } if (charToCount > 0) { c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 10 - charToCount } occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } void BackwardDNA_NAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[10] = { 0x00000000, 0x0000001C, 0x000000FC, 0x000007FC, 0x00003FFC, 
0x0001FFFC, 0x000FFFFC, 0x007FFFFC, 0x03FFFFFC, 0x1FFFFFFC}; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum = 0; dna -= index / 10 + 1; iteration = index / 250; wordToCount = (index - iteration * 250) / 10; charToCount = index - iteration * 250 - wordToCount * 10; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 10 - charToCount; sum += dnaDecodeTable[c >> 17]; sum += dnaDecodeTable[(c >> 2) & 0x00007FFF]; sum += charToCount - 10; // decrease count of 'a' by 16 - charToCount } for (j=0; j> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; } occCount[0] = sum & 0x000000FF; sum >>= 8; occCount[1] = sum & 0x000000FF; sum >>= 8; occCount[2] = sum & 0x000000FF; sum >>= 8; occCount[3] = sum; for (i=0; i> 17]; sum += dnaDecodeTable[(*dna >> 2) & 0x00007FFF]; } occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } } unsigned int ForwardOccCount(const unsigned int* packed, const unsigned int index, const unsigned int character, const unsigned int alphabetSize) { unsigned int wordToCount, charToCount; unsigned int bitPerChar, charPerWord; unsigned int i, j, c; unsigned int occCount = 0; bitPerChar = ceilLog2(alphabetSize); charPerWord = BITS_IN_WORD / bitPerChar; wordToCount = index / charPerWord; charToCount = index - wordToCount * charPerWord; for (i=0; i> (BITS_IN_WORD - bitPerChar) == character) { occCount++; } c <<= bitPerChar; } } if (charToCount > 0) { c = packed[i]; for (j=0; j> (BITS_IN_WORD - bitPerChar) == character) { occCount++; } c <<= bitPerChar; } } return occCount; } unsigned int BackwardOccCount(const unsigned int* packed, const unsigned int index, const unsigned int character, const unsigned int alphabetSize) { unsigned int wordToCount, charToCount; unsigned int bitPerChar, charPerWord; unsigned int i, j, c; unsigned int occCount = 0; bitPerChar = 
ceilLog2(alphabetSize); charPerWord = BITS_IN_WORD / bitPerChar; wordToCount = index / charPerWord; charToCount = index - wordToCount * charPerWord; packed -= wordToCount + 1; if (charToCount > 0) { c = *packed << (bitPerChar * (charPerWord - charToCount)); for (j=0; j> (BITS_IN_WORD - bitPerChar) == character) { occCount++; } c <<= bitPerChar; } } for (i=1; i<=wordToCount; i++) { packed++; c = *packed; for (j=0; j> (BITS_IN_WORD - bitPerChar) == character) { occCount++; } c <<= bitPerChar; } } return occCount; } void ForwardAllOccCount(const unsigned int* packed, const unsigned int index, const unsigned int alphabetSize, unsigned int* occCount) { unsigned int wordToCount, charToCount; unsigned int bitPerChar, charPerWord; unsigned int i, j, c; bitPerChar = ceilLog2(alphabetSize); charPerWord = BITS_IN_WORD / bitPerChar; wordToCount = index / charPerWord; charToCount = index - wordToCount * charPerWord; for (i=0; i> (BITS_IN_WORD - bitPerChar)]++; c <<= bitPerChar; } } if (charToCount > 0) { c = packed[i]; for (j=0; j> (BITS_IN_WORD - bitPerChar)]++; c <<= bitPerChar; } } } void BackwardAllOccCount(const unsigned int* packed, const unsigned int index, const unsigned int alphabetSize, unsigned int* occCount) { unsigned int wordToCount, charToCount; unsigned int bitPerChar, charPerWord; unsigned int i, j, c; bitPerChar = ceilLog2(alphabetSize); charPerWord = BITS_IN_WORD / bitPerChar; wordToCount = index / charPerWord; charToCount = index - wordToCount * charPerWord; packed -= wordToCount + 1; if (charToCount > 0) { c = *packed << (bitPerChar * (charPerWord - charToCount)); for (j=0; j> (BITS_IN_WORD - bitPerChar)]++; c <<= bitPerChar; } } for (i=1; i<=wordToCount; i++) { packed++; c = *packed; for (j=0; j> (BITS_IN_WORD - bitPerChar)]++; c <<= bitPerChar; } } } soap2.20/extratools.c0000644000105300011350000001656311167575332013535 0ustar yuchangrd#include "extratools.h" //This file includes the implementations of all the extra tools adding to all steps //e.g. 
Look Up Table // Hash Table // All things like those void LoadLookupTable(LOOKUPTABLE * lookupTable, const char * fileName, const int tableSize) { (*lookupTable).tableSize = tableSize; unsigned long long NR_TOP = 1 << (tableSize * 2); (*lookupTable).table = malloc(sizeof(unsigned) * NR_TOP); int fin = open(fileName, O_RDONLY); unsigned step = 1048576; unsigned int i; for (i = 0; i < NR_TOP; i += step) { read(fin, (*lookupTable).table + i, step * sizeof(*(*lookupTable).table)); } close(fin); } unsigned int LookupSafe(LOOKUPTABLE lookupTable, BWT * bwt, unsigned long long lKey, unsigned long long rKey, unsigned int *l, unsigned int *r) { *l = lKey ? lookupTable.table[lKey-1]+1 : 1; *r = lookupTable.table[rKey]; if (*l == bwt->inverseSa0) { (*l)++; } return *r-*l+1; } unsigned int retrieveSA=0,retrieveHASH=0; double textPositionTime, textPositionTimeTotal = 0; unsigned int writeQIndex; double getTextPositionTime() {return textPositionTimeTotal;} unsigned int getSARetrieved() {return retrieveSA;} unsigned int getHASHRetrieved() {return retrieveHASH;} void FreeLookupTable(LOOKUPTABLE * lookupTable) { free((*lookupTable).table); } void LoadHashTable(HASHTABLE * hashTable, const char * fileName) { unsigned int ttlOccurrence=0; unsigned int ttlItem=0; FILE *inFile; if(!(inFile = fopen(fileName, "r"))) return; fread((unsigned int *)&((*hashTable).tableSize),sizeof(unsigned int),1,inFile); fread((unsigned int *)&((*hashTable).a),sizeof(unsigned int),1,inFile); fread((unsigned int *)&((*hashTable).b),sizeof(unsigned int),1,inFile); fread((unsigned int *)&((*hashTable).prime),sizeof(unsigned int),1,inFile); fread((unsigned int *)&ttlItem,sizeof(unsigned int),1,inFile); fread((unsigned int *)&ttlOccurrence,sizeof(unsigned int),1,inFile); //printf("Initializing the hash table..(n=%u)\n",(*hashTable).tableSize); (*hashTable).table = (HASHCELL*) malloc(sizeof(HASHCELL)*((*hashTable).tableSize)); (*hashTable).itemList = (HASHITEM*) malloc(sizeof(HASHITEM)*ttlItem); 
(*hashTable).occList = (OCC*) malloc(sizeof(OCC)*ttlOccurrence); //printf("Initialized the hash table..\n"); unsigned int i; for (i=0;i<((*hashTable).tableSize);i++) { char mk; fread((char *) &mk,1,1,inFile); if (mk==0) { //Empty cell (*hashTable).table[i].index=0; (*hashTable).table[i].count=0; } else { fread((unsigned int *)&((*hashTable).table[i].index),sizeof(unsigned int),1,inFile); fread((unsigned int *)&((*hashTable).table[i].count),sizeof(unsigned int),1,inFile); } } for (i=0;i=0 && hashedIndex<(*hashTable).tableSize) { unsigned int i=0; while (i=count) return NULL; return &((*hashTable).itemList[index+i]); } return NULL; } void RegisterDecoder(BWT * bwt,HASHTABLE * hashTable) { occBwt=bwt; occHashtable=hashTable; occCollected=0; retrieveSA=0; retrieveHASH=0; textPositionTimeTotal = 0; writeQIndex = 0; //occCollector = malloc(sizeof (unsigned int) * 1024*1024); } unsigned int allOne = 0; unsigned int OCCSection=0; inline int CalMismatch(const char *seq, const unsigned int *ref, const unsigned int occPosCord, const unsigned int seqLen, const unsigned int dnaLength){ unsigned int i, l; int match = 0; // fprintf(stderr, "%u\t%u\t%u\n", occPosCord, seqLen, dnaLength); // fprintf(stderr, "%u\n", ref[0]); for(i =0, l=occPosCord; i < seqLen && l < dnaLength; ++i, ++l){ // fprintf(stderr, "%d,%u ", i, l); if(!(((*(seq+i))&0x3) ^ ((((*(ref+(l>>4)))>>(((~l)&0xf)<<1)))&0x3))) ++match; } // fprintf(stderr, "match %d\n", match); return (seqLen-match); } #include int OCCProcess(const unsigned int l,const unsigned int r, const BWTOPT *bo, const unsigned int info, HITTABLE *hits) { #if TRUE // fprintf(stderr, "OCC Process, n occ %d\n", r-l+1); #endif const unsigned int cutoff = bo->cutoff; if(hits->n >= bo->cutoff) return 0; unsigned int n = hits->n; HITITEM *hit = hits->itemList+n; ChrBlock *blockList = bo->blockList; const unsigned int nblock = bo->nblock; const unsigned int seqLen = bo->seqLen; const unsigned int alnLen = bo->alnLen; const unsigned int extLen = 
seqLen-alnLen; const unsigned int max_mm = bo->max_mm+(info>>25 & 0x7); const unsigned int *pacRef = bo->pacRef; const unsigned int dnaLength = bo->dnaLen; const unsigned int strain = (info>>24)&1; char *seq = strain?bo->rc:bo->fw; // for(i=0; in; hits->n = n; return inc; } else { //Hash HASHITEM *item = HashFind(occHashtable,l,r); if (item != NULL) { unsigned int k; for (k=0;kr-item->l+1 && n < cutoff;k++) { occ_pos = occHashtable->occList[item->occIndex+k]; HitInc(n); } int inc = n - hits->n; hits->n = n; return inc; } else { unsigned int k; for (k=l; k<=r && n < cutoff; k++) { occ_pos = BWTSaValue(occBwt,k); HitInc(n); } int inc = n - hits->n; hits->n = n; return inc; } } } void registerTPFile(FILE * filePtr,unsigned int searchMode) { textPositionFile=filePtr; fwrite(&searchMode,sizeof(unsigned int),1,textPositionFile); allOne=(1U<<31)-1; allOne<<=1; allOne+=1; } void registerQIndex(unsigned int index) { writeQIndex=index; OCCSection=0; } void registerQSection() { if (writeQIndex==0) { fwrite(&allOne,sizeof(unsigned int),1,textPositionFile); } else { OCCSection++; } } soap2.20/HSP.c0000644000105300011350000002514011164534250011741 0ustar yuchangrd/* HSP.c BWTBlastn functions This module contains miscellaneous BWTBlastn functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include "MemManager.h" #include #include #include #include #include "TextConverter.h" #include "MiscUtilities.h" #include "r250.h" #include "HSP.h" extern double stat_expectationValue; void HSPFillCharMap(unsigned char charMap[255]) { int i; for (i=0; i<255; i++) { charMap[i] = nonMatchDnaCharIndex; } for (i=0; i<16; i++) { charMap[(int)dnaChar[i]] = (unsigned char)i; charMap[(int)dnaChar[i] - 'A' + 'a'] = (unsigned char)i; } } void HSPFillComplementMap(unsigned char complementMap[255]) { int i; for (i=0; i<255; i++) { complementMap[i] = nonMatchDnaCharIndex; } for (i=0; i<16; i++) { complementMap[(int)dnaComplement[i]] = (unsigned int)i; complementMap[(int)dnaComplement[i] - 'A' + 'a'] = (unsigned int)i; } } HSP *HSPLoad (MMPool *mmPool, const char *PackedDNAFileName, const char *AnnotationFileName) { HSP *hsp; int i; FILE *annotationFile = NULL; hsp = MMPoolDispatch(mmPool, sizeof(HSP)); // Load packed DNA if (PackedDNAFileName != NULL && PackedDNAFileName[0] != '\0' && PackedDNAFileName[0] != '-') { hsp->packedDNA = DNALoadPacked(PackedDNAFileName, &hsp->dnaLength, TRUE); } else { hsp->packedDNA = NULL; hsp->dnaLength = 0; } // Load Annotation // fprintf(stderr, "%s\n", AnnotationFileName); if (AnnotationFileName != NULL && (annotationFile = fopen(AnnotationFileName, "r"))){ unsigned int numOfChar = 0; int FASTARandomSeed =0; int chrNum =0; fscanf(annotationFile, "%u\t%d\t%d\n", &numOfChar, &chrNum, &FASTARandomSeed); hsp->chrNum = chrNum; hsp->chrName = MMUnitAllocate((chrNum+1)*sizeof(char *)); for (i=0; ichrNum; ++i){ unsigned int nameLen = 0; fscanf(annotationFile, "%u\t", &nameLen); hsp->chrName[i] = MMUnitAllocate((nameLen+1) * sizeof(char)); fscanf(annotationFile, "%s\n", hsp->chrName[i]); } fscanf(annotationFile, "%d\n", &(hsp->numOfBlock)); hsp->blockList = MMUnitAllocate(((hsp->numOfBlock)+1) * sizeof(ChrBlock)); for(i=0;inumOfBlock;++i){ ChrBlock *p = hsp->blockList+i; int chrID, blockStart, blockEnd,ori; chrID = blockStart = blockEnd = 
ori = 0; fscanf(annotationFile, "%d\t%u\t%u\t%u\n", &(p->chrID), &(p->blockStart), &(p->blockEnd), &(p->ori)); } } hsp->dnaLength = hsp->blockList[hsp->numOfBlock-1].blockEnd; #ifdef DEBUG int j = hsp->numOfBlock; fprintf(stderr, "%d\t%d\t%u\t%u\n", hsp->blockList[j-1].chrID, hsp->blockList[j-1].ori, hsp->blockList[j-1].blockStart, hsp->blockList[j-1].blockEnd); #endif fclose(annotationFile); return hsp; } void HSPFree(MMPool *mmPool, HSP *hsp) { if (hsp->packedDNA != NULL) { DNAFreePacked(hsp->packedDNA, hsp->dnaLength); } int i; for(i=0; ichrNum; ++i){ MMUnitFree(hsp->chrName[i], (strlen(hsp->chrName[i])+1)*sizeof(char)); } MMUnitFree(hsp->chrName,(hsp->chrNum+1)* sizeof(char *)); MMUnitFree(hsp->blockList, (hsp->numOfBlock+1) * sizeof(ChrBlock)); MMPoolReturn(mmPool, hsp, sizeof(hsp)); } unsigned int HSPParseFASTAToPacked(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName, const unsigned int FASTARandomSeed, const int maskLowerCase) { FILE *FASTAFile, *annotationFile, *packedDNAFile, *ambiguityFile; NewAnnotation *chrAnnotation; int chrAnnAllocated = 256; int blockAllocated = 256; char c; int chrNum, blockNum; unsigned int i, l; int nCount; unsigned int chrLen, usefulCharNum, numCharInBuffer, totalNumChar; unsigned char charMap[255]; char *chrSeq, *p; unsigned int chrAllocated = 65536; unsigned char buffer[PACKED_BUFFER_SIZE]; unsigned char packedBuffer[PACKED_BUFFER_SIZE / 4]; chrLen = usefulCharNum = numCharInBuffer = totalNumChar = chrNum = blockNum = i = l = nCount = 0; FASTAFile = (FILE*)fopen64(FASTAFileName, "r"); if (FASTAFile == NULL) { fprintf(stderr, "ParseFASTToPacked() : Cannot open FASTAFileName!\n"); exit(1); } annotationFile = (FILE*)fopen64(annotationFileName, "w"); if (annotationFile == NULL) { fprintf(stderr, "ParseFASTToPacked() : Cannot open annotationFileName!\n"); exit(1); } packedDNAFile = (FILE*)fopen64(packedDNAFileName, "wb"); if (packedDNAFile == NULL) { 
fprintf(stderr, "ParseFASTToPacked() : Cannot open packedDNAFileName!\n"); exit(1); } ambiguityFile = (FILE*)fopen64(ambiguityFileName, "w"); if (ambiguityFile == NULL) { fprintf(stderr, "ParseFASTToPacked() : Cannot open ambiguityFileName!\n"); exit(1); } HSPFillCharMap(charMap); c = (char)getc(FASTAFile); if (c != '>') { fprintf(stderr, "ParseFASTToPacked() : FASTA file does not begin with '>'!\n"); exit(1); } chrAnnotation = (NewAnnotation *)malloc(sizeof(NewAnnotation)*chrAnnAllocated); chrSeq = (char*)malloc(sizeof(char)*chrAllocated); chrNum = blockNum = usefulCharNum = numCharInBuffer = 0; while(!feof(FASTAFile)){ if (feof(FASTAFile)) break; if (chrNum == chrAnnAllocated){ chrAnnAllocated <<= 1; chrAnnotation = (NewAnnotation *)realloc(chrAnnotation, sizeof(NewAnnotation)*chrAnnAllocated); // printf("%d\n", chrNum); } l=0; c = (char)getc(FASTAFile); while(!feof(FASTAFile) && c!='\t' && c!=' ' && c!='\n' && l='a' && c<='z'){ c+='A'-'a'; } if (chrLen >= chrAllocated){ chrAllocated <<= 1; chrSeq = (char*)realloc(chrSeq, sizeof(char)*chrAllocated); } *(chrSeq+chrLen) = c; chrLen += 1; } c=(char)getc(FASTAFile); } if (chrLen < 75) continue; //* i=0; p=chrSeq; while (ambiguityCount[charMap[(int)*p]] == 1 && i++ != chrLen) p++; if (i == chrLen) { blockNum = 1; chrAnnotation[chrNum].blockInChr = (ChrBlock *)malloc(sizeof(ChrBlock)*blockNum); chrAnnotation[chrNum].chrStart = usefulCharNum; chrAnnotation[chrNum].blockNum = blockNum; chrAnnotation[chrNum].blockInChr[0].blockStart = usefulCharNum; chrAnnotation[chrNum].blockInChr[0].ori = 0; usefulCharNum += chrLen; chrAnnotation[chrNum].chrEnd = usefulCharNum-1; chrAnnotation[chrNum].blockInChr[0].blockEnd = usefulCharNum-1; i=0; while(i= PACKED_BUFFER_SIZE) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE); fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile); numCharInBuffer = 0; } buffer[numCharInBuffer++] = chrSeq[i++]; } } else { i=0; p = chrSeq; while 
(ambiguityCount[charMap[(int)*p]]!=1 && ++i!=chrLen) p++; if (i<10) { i = 0; p = chrSeq;} blockNum = 1; chrAnnotation[chrNum].blockInChr = (ChrBlock *)malloc(sizeof(ChrBlock)*blockAllocated); chrAnnotation[chrNum].chrStart = usefulCharNum; chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i; chrAnnotation[chrNum].blockInChr[blockNum-1].blockStart = usefulCharNum; int len=0; while (i= PACKED_BUFFER_SIZE) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE); fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile); numCharInBuffer = 0; } buffer[numCharInBuffer++] = *p++; i++; usefulCharNum++; len++; }else{ nCount = 0; while((ambiguityCount[charMap[(int)*p]]!=1) && i= PACKED_BUFFER_SIZE) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE); fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile); numCharInBuffer = 0; } buffer[numCharInBuffer++] = 'G'; usefulCharNum++; len++; } while(--nCount>0); } else { if (i 0) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, numCharInBuffer); fwrite(packedBuffer, 1, (numCharInBuffer + 3) / 4, packedDNAFile); numCharInBuffer = 0; } if (totalNumChar % 4 == 0) { c = 0; fwrite(&c, 1, 1, packedDNAFile); } c = (char)(totalNumChar % 4); fwrite(&c, 1, 1, packedDNAFile); fclose(packedDNAFile); fprintf(annotationFile, "%u\t%d\t%d\n", totalNumChar, chrNum, FASTARandomSeed); int j=0; int total = 0; for (i=0;i #include #include "kstring.h" int ksprintf(kstring_t *s, const char *fmt, ...) 
{ va_list ap; int l; va_start(ap, fmt); l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); va_end(ap); if (l + 1 > s->m - s->l) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); va_start(ap, fmt); l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); } va_end(ap); s->l += l; return l; } #ifdef KSTRING_MAIN #include int main() { kstring_t *s; s = (kstring_t*)calloc(1, sizeof(kstring_t)); ksprintf(s, "abcdefg: %d", 100); printf("%s\n", s->s); free(s); return 0; } #endif soap2.20/MemManager.c0000644000105300011350000007454711164534250013337 0ustar yuchangrd/* MemManager.c Memory Manager This module provides memory management functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #ifndef _WIN32 #include #endif #include "MiscUtilities.h" #include "MemManager.h" MMMaster mmMaster; void *MMMalloc(const unsigned int memSize) { void *address; address = MEMALIGN(memSize, MAX_ALIGN); if (address == NULL) { fprintf(stderr, "MMMalloc() : cannot allocate memory!\n"); exit(1); } return address; } void MMFree(void *address) { FREEALIGN(address); } void MMMasterInitialize(const unsigned int maxNumberOfPools, const unsigned int maxNumberOfBulks, const int traceUnitByteAllocation, FILE* unitByteTraceFile) { unsigned int i; mmMaster.maxTotalByteAllocated = 0; mmMaster.maxTotalByteDispatched = 0; mmMaster.currentUnitByteAllocated = 0; mmMaster.maxUnitByteAllocated = 0; mmMaster.maxNumberOfBulks = maxNumberOfBulks; mmMaster.maxNumberOfPools = maxNumberOfPools; if (maxNumberOfBulks > 0) { mmMaster.mmBulk = MEMALIGN(sizeof(MMBulk*) * maxNumberOfBulks, MAX_ALIGN); for (i=0; i 0) { mmMaster.mmPool = MEMALIGN(sizeof(MMPool*) * maxNumberOfPools, MAX_ALIGN); for (i=0; i mmMaster.maxTotalByteAllocated) { return currentTotalByteAllocated; } else { return mmMaster.maxTotalByteAllocated; } } unsigned int MMMasterMaxTotalByteDispatched() { unsigned int currentTotalByteDispatched ; currentTotalByteDispatched = MMMasterCurrentTotalByteDispatched(); if (currentTotalByteDispatched > mmMaster.maxTotalByteDispatched) { return currentTotalByteDispatched; } else { return mmMaster.maxTotalByteDispatched; } } void MMMasterSetMaxTotalByteAllocated() { unsigned int currentTotalByteAllocated; currentTotalByteAllocated = MMMasterCurrentTotalByteAllocated(); if (currentTotalByteAllocated > mmMaster.maxTotalByteAllocated) { mmMaster.maxTotalByteAllocated = currentTotalByteAllocated; } } void MMMasterSetMaxTotalByteDispatched() { unsigned int currentTotalByteDispatched; currentTotalByteDispatched = MMMasterCurrentTotalByteDispatched(); if (currentTotalByteDispatched > mmMaster.maxTotalByteDispatched) { mmMaster.maxTotalByteDispatched = currentTotalByteDispatched; } } void 
MMMasterPrintReport(FILE *output, const unsigned int withUnitDetails, const unsigned int withPoolDetails, const unsigned int withBulkDetails) { unsigned int i; fprintf(output, "Maximum amount of memory allocated: %u\n", MMMasterMaxTotalByteAllocated()); fprintf(output, "Maximum amount of memory dispatched: %u\n", MMMasterMaxTotalByteDispatched()); if (withUnitDetails) { fprintf(output, "\n"); MMUnitPrintReport(output); } if (withPoolDetails) { for (i=0; i mmMaster.maxUnitByteAllocated) { mmMaster.maxUnitByteAllocated = mmMaster.currentUnitByteAllocated; } mmMaster.currentUnitByteAllocated -= memSize; if (mmMaster.traceUnitByteAllocation) { fprintf(mmMaster.unitByteTraceFile, "MMUnitFree : %u\n", memSize); } } unsigned int MMUnitCurrentByteAllocated() { return mmMaster.currentUnitByteAllocated; } unsigned int MMUnitMaxByteAllocated() { if (mmMaster.currentUnitByteAllocated > mmMaster.maxUnitByteAllocated) { return mmMaster.currentUnitByteAllocated; } else { return mmMaster.maxUnitByteAllocated; } } void MMUnitPrintReport(FILE *output) { fprintf(output, "Maximum amount of unit memory allocated: %u\n", MMUnitMaxByteAllocated()); fprintf(output, "Amount of memory unit memory currently allocated: %u\n", MMUnitCurrentByteAllocated()); } MMPool *MMPoolCreate(const unsigned int poolSize) { MMPool *mmPool; unsigned int i; #ifdef DEBUG if (poolSize < sizeof(MMPool)) { fprintf(stderr, "MMPoolCreate() : poolSize < MMPool!\n"); exit(1); } #endif if (poolSize / MAX_ALIGN * MAX_ALIGN != poolSize) { fprintf(stderr, "MMPoolCreate() : poolSize must be multiple of MAX_ALIGN (%d)!\n", MAX_ALIGN); // Otherwise temp memory is not properly aligned exit(1); } mmPool = MEMALIGN(poolSize, MAX_ALIGN); if (mmPool == NULL) { fprintf(stderr, "MMPoolCreate() : cannot allocate memory!\n"); exit(1); } mmPool->poolSize = poolSize; mmPool->poolByteDispatched = sizeof(MMPool); mmPool->poolByteSpillover = 0; mmPool->firstSpillOverAddress = NULL; mmPool->currentTempByteDispatched = 0; 
mmPool->currentTempByteSpillover = 0; mmPool->maxTotalByteDispatched = 0; for (i=0; i maxNumberOfPools!\n"); exit(1); } unsigned int MMPoolIsActive(const MMPool *mmPool) { return ((mmPool->firstSpillOverAddress) != (void*)mmPool); } void MMPoolSetInactive(MMPool *mmPool) { if (mmPool->firstSpillOverAddress != NULL) { fprintf(stderr, "MMPoolSetInactive() : spillover memory not freed yet!\n"); exit(1); } mmPool->firstSpillOverAddress = (void*)mmPool; } unsigned int MMPoolCurrentTotalByteAllocated(const MMPool *mmPool) { return mmPool->poolSize + mmPool->poolByteSpillover + mmPool->currentTempByteSpillover; } unsigned int MMPoolCurrentTotalByteDispatched(const MMPool *mmPool) { return mmPool->poolByteDispatched + mmPool->currentTempByteDispatched; } unsigned int MMPoolMaxTotalByteDispatched(const MMPool *mmPool) { unsigned int currentTotalByteDispatched; currentTotalByteDispatched = MMPoolCurrentTotalByteDispatched(mmPool); if (currentTotalByteDispatched > mmPool->maxTotalByteDispatched) { return currentTotalByteDispatched; } else { return mmPool->maxTotalByteDispatched; } } unsigned int MMPoolByteAvailable(const MMPool *mmPool) { if (mmPool->poolSize > mmPool->poolByteDispatched + MAX_ALIGN) { return (mmPool->poolSize - mmPool->poolByteDispatched + MAX_ALIGN - 1) / MAX_ALIGN * MAX_ALIGN; } else { return 0; } } MMPool *MMPoolFree(MMPool *mmPool) { MMPool *dummyMMPool; unsigned int i; void *temp1, *temp2; #ifdef DEBUG if (mmPool == NULL) { fprintf(stderr, "MMPoolFree(): mmPool = NULL!\n"); exit(1); } #endif #ifdef RECORD_GRAND_TOTAL MMMasterSetMaxTotalByteAllocated(); MMMasterSetMaxTotalByteDispatched(); #endif dummyMMPool = MEMALIGN(sizeof(MMPool), MAX_ALIGN); if (dummyMMPool == NULL) { fprintf(stderr, "MMPoolFree() : cannot allocate memory!\n"); exit(1); } // Free spillover memory temp1 = mmPool->firstSpillOverAddress; while (temp1 != NULL) { temp2 = *((void**)temp1); FREEALIGN(temp1); temp1 = temp2; } mmPool->firstSpillOverAddress = NULL; 
dummyMMPool->poolByteDispatched = mmPool->poolByteDispatched; dummyMMPool->poolByteSpillover = mmPool->poolByteSpillover; dummyMMPool->currentTempByteDispatched = mmPool->currentTempByteDispatched; dummyMMPool->currentTempByteSpillover = mmPool->currentTempByteSpillover; dummyMMPool->firstSpillOverAddress = mmPool->firstSpillOverAddress; dummyMMPool->maxTotalByteDispatched = mmPool->maxTotalByteDispatched; dummyMMPool->poolSize = mmPool->poolSize; MMPoolSetInactive(dummyMMPool); // Update master directory for (i=0; ifirstSpillOverAddress; while (temp1 != NULL) { temp2 = *((void**)temp1); FREEALIGN(temp1); temp1 = temp2; } mmPool->poolByteDispatched = sizeof(MMPool); mmPool->poolByteSpillover = 0; mmPool->currentTempByteDispatched = 0; mmPool->currentTempByteSpillover = 0; mmPool->firstSpillOverAddress = NULL; mmPool->maxTotalByteDispatched = 0; } void MMPoolDestory(MMPool *mmPool) { unsigned int i; MMPool *temp; #ifdef DEBUG if (mmPool == NULL) { fprintf(stderr, "MMPoolDestory(): mmPool = NULL!\n"); exit(1); } #endif if (MMPoolIsActive(mmPool)) { temp = MMPoolFree(mmPool); } else { temp = mmPool; } // Update master directory for (i=0; ipoolByteDispatched - mmPool->poolByteSpillover + mmPool->currentTempByteDispatched - mmPool->currentTempByteSpillover; nextPoolMemoryOffset = mmPool->poolByteDispatched - mmPool->poolByteSpillover; // Calculate the number of byte to skip in order to align the memory dispatched align = 1 << (BITS_IN_WORD - leadingZero(memSize - 1)); if (align > MAX_ALIGN) { align = MAX_ALIGN; } if (align < MIN_ALIGN) { align = MIN_ALIGN; } skipForAlign = nextAlignedBoundary(nextPoolMemoryOffset, align) - nextPoolMemoryOffset; if (totalPoolMemoryUsed + memSize + skipForAlign <= mmPool->poolSize) { temp = (void**)(((char*)mmPool) + nextPoolMemoryOffset + skipForAlign); mmPool->poolByteDispatched += memSize + skipForAlign; return temp; } else { // Spillover // Allocate for linked list pointer as well temp = MEMALIGN(memSize + MAX_ALIGN, MAX_ALIGN); // 
spillover memory is always aligned to MAX_ALIGN if (temp == NULL) { fprintf(stderr, "MMPoolDispatch(): cannot allocate memory!\n"); exit(1); } // Add spillover memory to linked list *temp = mmPool->firstSpillOverAddress; mmPool->firstSpillOverAddress = temp; mmPool->poolByteSpillover += memSize + MAX_ALIGN; mmPool->poolByteDispatched += memSize + MAX_ALIGN; return (char*)temp + MAX_ALIGN; } } unsigned int MMPoolDispatchOffset(MMPool *mmPool, const unsigned int memSize) { unsigned int totalPoolMemoryUsed, nextPoolMemoryOffset; unsigned int align, skipForAlign; if (mmPool == NULL) { fprintf(stderr, "MMPoolDispatchOffset(): mmPool == NULL!\n"); exit(1); } if (memSize == 0) { fprintf(stderr, "MMPoolDispatchOffset(): memSize = 0!\n"); exit(1); } totalPoolMemoryUsed = mmPool->poolByteDispatched - mmPool->poolByteSpillover + mmPool->currentTempByteDispatched - mmPool->currentTempByteSpillover; nextPoolMemoryOffset = mmPool->poolByteDispatched - mmPool->poolByteSpillover; // Calculate the number of byte to skip in order to align the memory dispatched align = 1 << (BITS_IN_WORD - leadingZero(memSize - 1)); if (align > MAX_ALIGN) { align = MAX_ALIGN; } if (align < MIN_ALIGN) { align = MIN_ALIGN; } skipForAlign = nextAlignedBoundary(nextPoolMemoryOffset, align) - nextPoolMemoryOffset; if (totalPoolMemoryUsed + memSize + skipForAlign > mmPool->poolSize) { fprintf(stderr, "MMPoolDispatchOffset(): Not enough memory in memory pool!\n"); exit(1); } mmPool->poolByteDispatched += memSize + skipForAlign; return nextPoolMemoryOffset + skipForAlign; } void MMPoolReturn(MMPool *mmPool, void *address, const unsigned int memSize) { if (mmPool == NULL) { MMUnitFree(address, memSize); } } void MMPoolPrintReport(MMPool *mmPool, FILE *output) { fprintf(output, "Pool Size : %u\n", mmPool->poolSize); fprintf(output, " Dispatched : %u\n", mmPool->poolByteDispatched); fprintf(output, " - Spillover : %u\n", mmPool->poolByteSpillover); fprintf(output, "Maximum amount of memory dispatched including 
temp memory : %u\n", MMPoolMaxTotalByteDispatched(mmPool)); } void *MMTempDispatch(MMPool *mmPool, const unsigned int memSize) { void **temp; unsigned int totalPoolMemoryUsed, nextTempMemoryOffset; unsigned int alignedMemSize; void **pointerToLastSpilloverAddress; if (mmPool == NULL) { return MMUnitAllocate(memSize); } if (memSize == 0) { fprintf(stderr, "MMTempDispatch(): memSize = 0!\n"); exit(1); } alignedMemSize = nextAlignedBoundary(memSize, MAX_ALIGN); // temp memory is always aligned to MAX_ALIGN totalPoolMemoryUsed = mmPool->poolByteDispatched - mmPool->poolByteSpillover + mmPool->currentTempByteDispatched - mmPool->currentTempByteSpillover; nextTempMemoryOffset = mmPool->currentTempByteDispatched - mmPool->currentTempByteSpillover; if (totalPoolMemoryUsed + alignedMemSize <= mmPool->poolSize) { temp = (void**)(((char*)mmPool) + mmPool->poolSize - nextTempMemoryOffset - alignedMemSize); mmPool->currentTempByteDispatched += alignedMemSize; return temp; } else { // Spillover // Locate the last spillover memory pointerToLastSpilloverAddress = &(mmPool->firstSpillOverAddress); temp = (void**)(*pointerToLastSpilloverAddress); while (temp != NULL) { pointerToLastSpilloverAddress = temp; temp = (void**)*pointerToLastSpilloverAddress; } // Allocate for linked list pointer as well temp = MEMALIGN(memSize + MAX_ALIGN, MAX_ALIGN); if (temp == NULL) { fprintf(stderr, "MMTempDispatch(): cannot allocate memory!\n"); exit(1); } *pointerToLastSpilloverAddress = temp; *temp = NULL; mmPool->currentTempByteDispatched += memSize + MAX_ALIGN; mmPool->currentTempByteSpillover += memSize + MAX_ALIGN; return (char*)temp + MAX_ALIGN; } } void MMTempReturn(MMPool *mmPool, void *address, const unsigned int memSize) { void **temp; unsigned int alignedMemSize; void **pointerToLastButOneSpillover; void *spilloverPointerAddress; if (mmPool == NULL) { MMUnitFree(address, memSize); } else { alignedMemSize = nextAlignedBoundary(memSize, MAX_ALIGN); if (address >= (void*)mmPool && address <= 
(void*)((char*)mmPool + mmPool->poolSize)) { // No need to record the global level max memory dispatched/allocated // because memory pool is allocated as a whole and fluctuation across pools should not be counted if (mmPool->poolByteDispatched + mmPool->currentTempByteDispatched > mmPool->maxTotalByteDispatched) { mmPool->maxTotalByteDispatched = mmPool->poolByteDispatched + mmPool->currentTempByteDispatched; } mmPool->currentTempByteDispatched -= alignedMemSize; } else { #ifdef RECORD_GRAND_TOTAL MMMasterSetMaxTotalByteAllocated(); MMMasterSetMaxTotalByteDispatched(); #endif // Spillover spilloverPointerAddress = (void*)((char*)address - MAX_ALIGN); // MAX_ALIGN no. of bytes preceding temp address // Locate the last spillover memory pointerToLastButOneSpillover = &(mmPool->firstSpillOverAddress); temp = (void**)(*pointerToLastButOneSpillover); while (*temp != NULL) { pointerToLastButOneSpillover = temp; temp = (void**)*pointerToLastButOneSpillover; } if (*pointerToLastButOneSpillover != spilloverPointerAddress) { fprintf(stderr, "MMTempReturn(): address != lastSpilloverAddress! 
Last allocated temp memory must be freed first\n"); exit(1); } FREEALIGN(spilloverPointerAddress); *pointerToLastButOneSpillover = NULL; if (mmPool->poolByteDispatched + mmPool->currentTempByteDispatched > mmPool->maxTotalByteDispatched) { mmPool->maxTotalByteDispatched = mmPool->poolByteDispatched + mmPool->currentTempByteDispatched; } mmPool->currentTempByteDispatched -= memSize + MAX_ALIGN; mmPool->currentTempByteSpillover -= memSize + MAX_ALIGN; } } } void MMTempPrintReport(MMPool *mmPool, FILE *output) { MMPoolPrintReport(mmPool, output); } MMBulk *MMBulkCreate(MMPool *mmPool, const unsigned int itemSize, const unsigned int itemPerAllocationInPowerOf2, const unsigned int boundaryCushionSize, const unsigned int directorySize) { unsigned int i; MMBulk *mmBulk; #ifdef DEBUG if (itemSize == 0) { fprintf(stderr, "MMBulkCreate() : itemSize = 0!\n"); exit(1); } if (itemPerAllocationInPowerOf2 >= BITS_IN_WORD) { fprintf(stderr, "MMBulkCreate() : itemPerAllocationInPowerOf2 >= BITS_IN_WORD!\n"); exit(1); } #endif if (mmPool == NULL) { mmBulk = MMUnitAllocate(sizeof(MMBulk)); } else { mmBulk = MMPoolDispatch(mmPool, sizeof(MMBulk)); } mmBulk->itemSize = itemSize; mmBulk->itemPerAllocationInPowerOf2 = itemPerAllocationInPowerOf2; mmBulk->boundaryCushionSize = boundaryCushionSize; mmBulk->indexMask = truncateLeft(ALL_ONE_MASK, BITS_IN_WORD - itemPerAllocationInPowerOf2); mmBulk->currentDirectoryEntry = 0; mmBulk->nextUnusedItem = 0; mmBulk->directorySize = directorySize; if (mmPool == NULL) { mmBulk->directory = MMUnitAllocate(sizeof(unsigned char*) * directorySize); } else { mmBulk->directory = MMPoolDispatch(mmPool, sizeof(unsigned char*) * directorySize); } //Allocate memory for the first directory entry mmBulk->directory[0] = MEMALIGN(boundaryCushionSize * 2 + (itemSize << itemPerAllocationInPowerOf2), MAX_ALIGN); if (mmBulk->directory[0] == NULL) { fprintf(stderr, "MMBulkCreate() : cannot allocate memory!\n"); exit(1); } //Advance the address by boundaryCushionSize 
mmBulk->directory[0] += boundaryCushionSize; for (i=0; i maxNumberOfBulk!\n"); exit(1); } unsigned int MMBulkIsActive(const MMBulk *mmBulk) { return (mmBulk->directory != (void*)mmBulk); } void MMBulkSetInactive(MMBulk *mmBulk) { if (mmBulk->directory != NULL) { } mmBulk->directory = (void*)mmBulk; } unsigned int MMBulkByteAllocated(const MMBulk *mmBulk) { return (mmBulk->currentDirectoryEntry + 1) * (mmBulk->boundaryCushionSize * 2 + (mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2)); } unsigned int MMBulkByteDispatched(const MMBulk *mmBulk) { return (mmBulk->currentDirectoryEntry) * (mmBulk->boundaryCushionSize * 2 + (mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2)) + mmBulk->boundaryCushionSize * 2 + mmBulk->itemSize * mmBulk->nextUnusedItem; } unsigned int MMBulkUnitDispatched(const MMBulk *mmBulk) { return mmBulk->currentDirectoryEntry * (1 << mmBulk->itemPerAllocationInPowerOf2) + mmBulk->nextUnusedItem; } void MMBulkFree(MMBulk *mmBulk) { unsigned int i; #ifdef RECORD_GRAND_TOTAL MMMasterSetMaxTotalByteAllocated(); MMMasterSetMaxTotalByteDispatched(); #endif for (i=0; i<=mmBulk->currentDirectoryEntry; i++) { FREEALIGN(mmBulk->directory[i] - mmBulk->boundaryCushionSize); } if (MMBulkFindPoolUsed(mmBulk) == NULL) { MMUnitFree(mmBulk->directory, sizeof(unsigned char*) * mmBulk->directorySize); } mmBulk->directory = NULL; MMBulkSetInactive(mmBulk); } void MMBulkDestory(MMBulk *mmBulk) { unsigned int i; MMBulk *temp; #ifdef DEBUG if (mmBulk == NULL) { fprintf(stderr, "MMBulkDestory(): mmBulk = NULL!\n"); exit(1); } #endif if (MMBulkIsActive(mmBulk)) { MMBulkFree(mmBulk); } temp = mmBulk; // Update master directory for (i=0; inextUnusedItem >> mmBulk->itemPerAllocationInPowerOf2) { mmBulk->currentDirectoryEntry++; if (mmBulk->currentDirectoryEntry >= mmBulk->directorySize) { fprintf(stderr, "MMBulkDispatch() : memory directory size overflow!\n"); exit(1); } //Allocate memory for the next directory entry 
mmBulk->directory[mmBulk->currentDirectoryEntry] = MEMALIGN(mmBulk->boundaryCushionSize * 2 + (mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2), MAX_ALIGN); if (mmBulk->directory[mmBulk->currentDirectoryEntry] == NULL) { fprintf(stderr, "MMBulkDispatch() : cannot allocate memory!\n"); exit(1); } //Advance the address by boundaryCushionSize mmBulk->directory[mmBulk->currentDirectoryEntry] += mmBulk->boundaryCushionSize; mmBulk->nextUnusedItem = 0; } return ((mmBulk->currentDirectoryEntry << mmBulk->itemPerAllocationInPowerOf2) | mmBulk->nextUnusedItem++); } void *MMBulkAddress(const MMBulk *mmBulk, const unsigned int index) { #ifdef DEBUG if (index >= (((mmBulk->currentDirectoryEntry+1) << mmBulk->itemPerAllocationInPowerOf2) | mmBulk->nextUnusedItem)) { fprintf(stderr, "MMBulkAddress() : index out of range!\n"); exit(1); } #endif return &(mmBulk->directory[index >> mmBulk->itemPerAllocationInPowerOf2][(index & mmBulk->indexMask) * mmBulk->itemSize]); } MMPool *MMBulkFindPoolUsed(const MMBulk *mmBulk) { unsigned int i; void *temp; for (i=0; i= (void*)mmMaster.mmPool[i] && (void*)mmBulk <= (void*)((char*)mmMaster.mmPool[i] + mmMaster.mmPool[i]->poolSize)) { return mmMaster.mmPool[i]; } temp = mmMaster.mmPool[i]->firstSpillOverAddress; while (temp != NULL) { if ((void*)((char*)temp + sizeof(void*)) == (void*)mmBulk) { return mmMaster.mmPool[i]; } temp = *((void**)temp); } } } return NULL; } void MMBulkPrintReport(MMBulk *mmBulk, FILE *output){ fprintf(output, "Memory allocated : %u\n", MMBulkByteAllocated(mmBulk)); fprintf(output, "Memory dispatched : %u\n", MMBulkByteDispatched(mmBulk)); } void MMBulkSave(MMBulk *mmBulk, FILE *output) { unsigned int i; fwrite(&mmBulk->itemSize, sizeof(unsigned int), 1, output); fwrite(&mmBulk->itemPerAllocationInPowerOf2, sizeof(unsigned int), 1, output); fwrite(&mmBulk->boundaryCushionSize, sizeof(unsigned int), 1, output); fwrite(&mmBulk->currentDirectoryEntry, sizeof(unsigned int), 1, output); 
fwrite(&mmBulk->nextUnusedItem, sizeof(unsigned int), 1, output); fwrite(&mmBulk->directorySize, sizeof(unsigned int), 1, output); for (i=0; icurrentDirectoryEntry; i++) { fwrite(mmBulk->directory[i], mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2, 1, output); } if (mmBulk->nextUnusedItem > 0) { fwrite(mmBulk->directory[i], mmBulk->itemSize * mmBulk->nextUnusedItem, 1, output); } } MMBulk *MMBulkLoad(MMPool *mmPool, FILE *input) { unsigned int i; MMBulk *mmBulk; mmBulk = MMPoolDispatch(mmPool, sizeof(MMBulk)); fread(&mmBulk->itemSize, sizeof(unsigned int), 1, input); fread(&mmBulk->itemPerAllocationInPowerOf2, sizeof(unsigned int), 1, input); fread(&mmBulk->boundaryCushionSize, sizeof(unsigned int), 1, input); fread(&mmBulk->currentDirectoryEntry, sizeof(unsigned int), 1, input); fread(&mmBulk->nextUnusedItem, sizeof(unsigned int), 1, input); fread(&mmBulk->directorySize, sizeof(unsigned int), 1, input); mmBulk->indexMask = truncateLeft(ALL_ONE_MASK, BITS_IN_WORD - mmBulk->itemPerAllocationInPowerOf2); mmBulk->directory = MMPoolDispatch(mmPool, sizeof(unsigned char*) * mmBulk->directorySize); for (i=0; icurrentDirectoryEntry; i++) { mmBulk->directory[i] = MEMALIGN(mmBulk->boundaryCushionSize * 2 + (mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2), MAX_ALIGN); if (mmBulk->directory[i] == NULL) { fprintf(stderr, "MMBulkLoad() : cannot allocate memory!\n"); exit(1); } //Advance the address by boundaryCushionSize mmBulk->directory[i] += mmBulk->boundaryCushionSize; fread(mmBulk->directory[i], mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2, 1, input); } mmBulk->directory[i] = MEMALIGN(mmBulk->boundaryCushionSize * 2 + (mmBulk->itemSize << mmBulk->itemPerAllocationInPowerOf2), MAX_ALIGN); if (mmBulk->directory[i] == NULL) { fprintf(stderr, "MMBulkLoad() : cannot allocate memory!\n"); exit(1); } //Advance the address by boundaryCushionSize mmBulk->directory[i] += mmBulk->boundaryCushionSize; if (mmBulk->nextUnusedItem > 0) { 
fread(mmBulk->directory[i], mmBulk->itemSize * mmBulk->nextUnusedItem, 1, input); } for (i=0; i maxNumberOfBulk!\n"); exit(1); } soap2.20/MiscUtilities.c0000644000105300011350000022110111164534250014071 0ustar yuchangrd/* MiscUtilities.c Miscellaneous Utilities This module contains miscellaneous utility functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include "MiscUtilities.h" // static functions static int DustWo(const int len, const unsigned char *s, int *beg, int *end, const int wo); static void DustWo1(const int len, const unsigned char *s, const int ivv, const int wo, int *mv, int *iv, int *jv); void Dust(const unsigned int patternLength, unsigned char *pattern, const unsigned int cutoff, const unsigned int window, const unsigned int word) { int i, j, l; int from ,to; int a, b, v; int len; int level; int win, win2; int wo; // Set default parameters if (cutoff == 0) { level = 20; } else { level = (int)cutoff; } if (window == 0) { win = 64; } else { win = (int)window; } if (word == 0) { wo = 3; } else { wo = (int)word; } win2 = win / 2; len = (int)patternLength; from = 0; to = -1; for (i=0; i < len; i += win2) { from -= win2; to -= win2; l = (len > i+win) ? 
win : len-i; v = DustWo(l, pattern+i, &a, &b, wo); for (j = from; j <= to; j++) { if (i+j>=0 && i+j= 'A' && pattern[i+j] <= 'Z') { pattern[i+j] += 'a' - 'A'; } } } if (v > level) { for (j = a; j <= b && j < win2; j++) { if (i+j>=0 && i+j= 'A' && pattern[i+j] <= 'Z') { pattern[i+j] += 'a' - 'A'; } } } from = j; to = b; } else { from = 0; to = -1; } } } /* DustWo: scores one window of len characters starting at s. It calls DustWo1 once for every start offset i in [0, len - wo + 1) and keeps the best result found. Returns the highest score (mv) and reports the winning span through *beg = iv and *end = iv + jv. When len - wo + 1 is negative it reports the whole window (*beg = 0, *end = len - 1) and returns 0. */ static int DustWo(const int len, const unsigned char *s, int *beg, int *end, const int wo) { int i, l1; int mv, iv, jv; l1 = len - wo + 1; if (l1 < 0) { *beg = 0; *end = len - 1; return 0; } mv = 0; iv = 0; jv = 0; for (i=0; i < l1; i++) { DustWo1(len-i, s+i, i, wo, &mv, &iv, &jv); } *beg = iv; *end = iv + jv; return mv; } /* DustWo1: scans s[0..len) and builds a word code from 5 bits per character for 'A'..'Z' (masked to 15 bits by n1); any other character resets the run length i to 0. Once the run length reaches wo, a word that was already seen adds its previous count to sum and gives v = 10 * sum / j; when v exceeds *mv the function stores v in *mv, ivv in *iv and j in *jv. counts[] and iis[] have static storage; iis[] lists the codes seen during this call so that a code's counts[] entry is zeroed the first time it appears in the call. */ static void DustWo1(const int len, const unsigned char *s, const int ivv, const int wo, int *mv, int *iv, int *jv) { int i, ii, j, v, t, n, n1, sum; static int counts[32*32*32]; static int iis[32*32*32]; int js, nis; n = 32 * 32 * 32; n1 = n - 1; nis = 0; i = 0; ii = 0; sum = 0; v = 0; for (j=0; j < len; j++, s++) { ii <<= 5; if (*s >= 'A' && *s <= 'Z') { ii |= *s - 'A'; } else { // Ignoring lower case i = 0; continue; } ii &= n1; i++; if (i >= wo) { for (js=0; js < nis && iis[js] != ii; js++) ; if (js == nis) { iis[nis] = ii; counts[ii] = 0; nis++; } if ((t = counts[ii]) > 0) { sum += t; v = 10 * sum / j; if (*mv < v) { *mv = v; *iv = ivv; *jv = j; } } counts[ii]++; } } } /* LimitCodeGenerateCodeTable: fills the codeValue[d][k] and codeLength[d][k] tables for domain sizes d = 2..limit; d = 2 is seeded with the 1-bit codes 0 and 1. NOTE(review): the text breaks off inside a for-header further down, so part of the per-domain loop is missing from this copy. */ void LimitCodeGenerateCodeTable(const unsigned int limit, unsigned int** codeValue, unsigned int** codeLength) { unsigned int i, j; unsigned int code, c; unsigned int domainSize; unsigned int gammaCodeLength; unsigned int bitToExpand; unsigned int expandBitPosition; #ifdef DEBUG if (limit <= 1) { fprintf(stderr, "LimitCodeGenerateCodeTable(): Limit <= 1!\n"); exit(1); } #endif domainSize = 2; codeLength[domainSize][1] = 1; codeLength[domainSize][2] = 1; codeValue[domainSize][1] = 0; codeValue[domainSize][2] = 1; // First determine number of bit domainSize++; while (domainSize <= limit) { // copy from domainSize - 1 for (i=1; i0; 
i--) { if (codeLength[domainSize][i] == bitToExpand) { expandBitPosition = i; break; } } // Increase the number of bit at expandBitPosition and assign the same number of bit to the next code codeLength[domainSize][expandBitPosition]++; codeLength[domainSize][domainSize] = codeLength[domainSize][expandBitPosition]; // Assign code value codeValue[domainSize][1] = 0; // 1 always take '0' as code code = 0; for (i=2; i<=domainSize; i++) { for (j=1; j> (codeLength[domainSize][i] - codeLength[domainSize][j]); if (c == codeValue[domainSize][j]) { code++; // code conflict } else { break; // no conflict, proceed to check next number } } } // all preceding numbers checked codeValue[domainSize][i] = code; code++; } domainSize++; } } /* QSortUnsignedIntOrder: comparison callback that views data as an array of unsigned int and compares the element at index1 with the element at index2. Returns 1 when the first is larger, -1 when it is smaller and 0 when the two are equal. */ int QSortUnsignedIntOrder(const void *data, const int index1, const int index2) { if (*((unsigned int*)data + index1) != *((unsigned int*)data + index2)) { if (*((unsigned int*)data + index1) > *((unsigned int*)data + index2)) { return 1; } else { return -1; } } else { return 0; } } /* QSortSwap: takes the array base, an element width (dataWidth) and two element indices, and declares a one-byte temporary. NOTE(review): the text breaks off inside the for-header below and resumes in the middle of a later routine that calls QSortComp and QSortSwap, so the swap loop and the start of that routine are missing from this copy. */ static void QSortSwap(void* __restrict data, const int dataWidth, const int index1, const int index2) { int k; char temp; for (k=0; klowIndex && QSortComp(data, j - 1, j) > 0; j--) { QSortSwap(data, dataWidth, j - 1, j); } } break; } else { // Choose pivot as median of the lowest, middle, and highest data; sort the three data midIndex = average(lowIndex, highIndex); if (QSortComp(data, lowIndex, midIndex) > 0) { QSortSwap(data, dataWidth, lowIndex, midIndex); } if (QSortComp(data, lowIndex, highIndex) > 0) { QSortSwap(data, dataWidth, lowIndex, highIndex); } if (QSortComp(data, midIndex, highIndex) > 0) { QSortSwap(data, dataWidth, midIndex, highIndex); } // Move partition key to the 2nd entry QSortSwap(data, dataWidth, midIndex, lowIndex + 1); midIndex = lowIndex + 1; // Partition data numberOfEqualKey = 0; lowPartitionIndex = lowIndex + 2; highPartitionIndex = highIndex - 1; for (;;) { // keys that are equal to the partition key is sorted into the low partition while 
(lowPartitionIndex <= highPartitionIndex) { c = QSortComp(data, lowPartitionIndex, midIndex); numberOfEqualKey += (c == 0); if (c > 0) { break; } lowPartitionIndex++; } while (lowPartitionIndex < highPartitionIndex) { c = QSortComp(data, midIndex, highPartitionIndex); numberOfEqualKey += (c == 0); if (c >= 0) { break; } highPartitionIndex--; } if (lowPartitionIndex < highPartitionIndex) { QSortSwap(data, dataWidth, lowPartitionIndex, highPartitionIndex); //if (highPartitionIndex == midIndex) { // // partition key has been moved // midIndex = lowPartitionIndex; //} lowPartitionIndex++; highPartitionIndex--; } else { break; } } // Adjust the partition index highPartitionIndex = lowPartitionIndex; lowPartitionIndex--; // move the partition key to end of low partition QSortSwap(data, dataWidth, midIndex, lowPartitionIndex); if (highIndex - lowIndex + SMALL_ARRAY_SIZE > EQUAL_KEY_THRESHOLD * numberOfEqualKey) { } else { // Many keys equals to the partition key; separate the equal key data from the lower partition midIndex = lowIndex; for (;;) { while (midIndex < lowPartitionIndex && QSortComp(data, midIndex, lowPartitionIndex) < 0) { midIndex++; } while (midIndex < lowPartitionIndex && QSortComp(data, lowPartitionIndex, lowPartitionIndex - 1) == 0) { lowPartitionIndex--; } if (midIndex >= lowPartitionIndex) { break; } QSortSwap(data, dataWidth, midIndex, lowPartitionIndex - 1); midIndex++; lowPartitionIndex--; } } if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { // put the larger partition to stack lowStack[stackDepth] = lowIndex; highStack[stackDepth] = lowPartitionIndex - 1; stackDepth++; // sort the smaller partition first lowIndex = highPartitionIndex; } else { // put the larger partition to stack lowStack[stackDepth] = highPartitionIndex; highStack[stackDepth] = highIndex; stackDepth++; if (lowPartitionIndex > lowIndex) { // sort the smaller partition first highIndex = lowPartitionIndex - 1; } else { break; } } } } // Pop a range from stack if 
(stackDepth > 0) {
            stackDepth--;
            lowIndex = lowStack[stackDepth];
            highIndex = highStack[stackDepth];
            continue;
        } else {
            break;
        }
    }

}

// checkDuplicate
// Reports whether input[] contains a repeated value inside [minValue, maxValue].
//   input     : values to examine
//   numItem   : item count (presumably the loop bound; the loop header is
//               truncated in this copy - confirm against the original)
//   minValue,
//   maxValue  : inclusive range of values that are tracked; values outside
//               the range are ignored
//   text      : prefix for the diagnostic message; NULL selects "checkDuplicate()"
// Returns FALSE after printing a message to stderr when a duplicate is seen,
// TRUE otherwise.
unsigned int checkDuplicate(int *input, const unsigned int numItem, const int minValue, const int maxValue, char* text) {

    unsigned int *present;
    unsigned int i;
    char defaultText[17] = "checkDuplicate()";

    if (text == NULL) {
        text = defaultText;
    }

    // One slot per value in the range, all initialised to 0.
    // NOTE(review): the malloc() result is used without a NULL check.
    present = malloc((maxValue - minValue + 1) * sizeof(unsigned int));
    initializeVAL(present, maxValue - minValue + 1, 0);

    // NOTE(review): text is missing between "i" and "= minValue" below
    // (the rest of the for-header and the start of the range test).
    for (i=0; i= minValue && input[i] <= maxValue) {
            if (present[input[i] - minValue] > 0) {
                fprintf(stderr, "%s : Item %u and %u contains duplicate value of %d\n", text, present[input[i] - minValue], i, input[i]);
                free(present);
                return FALSE;
            }
            // The slot stores the item index. NOTE(review): index 0 is stored
            // as 0, which the "> 0" test above cannot tell apart from an
            // empty slot, so a repeat of item 0's value is not reported.
            present[input[i] - minValue] = i;
        }
    }

    free(present);
    return TRUE;

}

// leadingZero
// Returns the number of zero bits above the most significant set bit of
// input, using a 256-entry table on the highest non-zero byte.
// An input of 0 yields 32 (24 + table[0]).
// The masks assume a 32-bit unsigned int.
unsigned int leadingZero(const unsigned int input) {

    unsigned int l;
    // leadingZero8bit[b] = number of leading zero bits in the 8-bit value b
    const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
                                                      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
                                                      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                                                      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                                                      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                                      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                                      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                                      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

    // Locate the highest non-zero byte, then add the zero bits of the
    // bytes above it to the table value for that byte.
    if (input & 0xFFFF0000) {
        if (input & 0xFF000000) {
            l = leadingZero8bit[input >> 24];
        } else {
            l = 8 + leadingZero8bit[input >> 16];
        }
    } else {
        if (input & 0x0000FF00) {
            l = 16 + leadingZero8bit[input >> 8];
        } else {
            l = 24 + leadingZero8bit[input];
        }
    }
    return l;

}

// ceilLog2
// Returns 0 for input <= 1; otherwise BITS_IN_WORD minus the leading-zero
// count of (input - 1). BITS_IN_WORD is defined outside this file section
// (presumably 32 - confirm).
unsigned int ceilLog2(const unsigned int input) {

    if (input <= 1) {
        return 0;
    }

    return BITS_IN_WORD - leadingZero(input - 1);

}

// floorLog2
// Returns 0 for input <= 1; otherwise BITS_IN_WORD - leadingZero(input) - 1,
// i.e. the bit position of the most significant set bit when BITS_IN_WORD
// equals the width handled by leadingZero().
unsigned int floorLog2(const unsigned int input) {

    if (input <= 1) {
        return 0;
    }

    return BITS_IN_WORD - leadingZero(input) - 1;

}

unsigned int power(const
unsigned int base, const unsigned int power) {

    unsigned int i;
    unsigned int result = 1;

    // NOTE(review): text is missing after "i" in the for-header below. What
    // follows the gap writes '1'/'0' characters into output[], inserts a
    // space after every bitGroup bits, and NUL-terminates the string, so it
    // appears to belong to a different routine whose header was lost.
    for (i=0; i> i)) >> (BITS_IN_WORD - i - 1)) {
            output[j] = '1';
        } else {
            output[j] = '0';
        }
        j++;
        if (bitGroup > 0 && bitGroup < BITS_IN_WORD) {
            if ((i+1) % bitGroup == 0) {
                output[j] = ' ';
                j++;
            }
        }
    }
    output[j] = '\0';

}

// getRandomSeed
// Returns a seed taken from the current calendar time as reported by time().
// When time_t is wider than unsigned int the value is first reduced
// modulo 0xFFFFFFFF; otherwise it is converted directly.
unsigned int getRandomSeed() {

    time_t timer;
    time(&timer);

    if (sizeof(time_t) > sizeof(unsigned int)) {
        return (unsigned int)(timer % 0xFFFFFFFF);
    } else {
        return (unsigned int)(timer);
    }
}

// ConvertBytePackedDNAToWordPacked
// Currently does nothing: the whole body is commented out, so neither input
// nor textLength is read and output is never written.
// NOTE(review): the disabled code below also has text missing inside its
// last for-header.
void ConvertBytePackedDNAToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int textLength) {
/*
    unsigned int i, j, k;
    unsigned int c;
    unsigned int bitPerBytePackedChar;
    unsigned int bitPerWordPackedChar;
    unsigned int charPerWord;
    unsigned int charPerByte;
    unsigned int bytePerIteration;
    unsigned int byteProcessed = 0;
    unsigned int wordProcessed = 0;
    unsigned int mask, shift;

    unsigned int buffer[BITS_IN_WORD];
    unsigned char tempChar[4];

    bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
    bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
    charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
    charPerWord = BITS_IN_WORD / bitPerWordPackedChar;

    bytePerIteration = charPerWord / charPerByte;
    mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
    shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;

    while ((wordProcessed + 1) * CHAR_PER_WORD < textLength) {
        memcpy(tempChar, input[wordProcessed], 4);
        output[wordProcessed] = tempChar[0] << 24 | tempChar[1] << 16 | tempChar[2] << 8 | tempChar[3];
        wordProcessed++;
    }

    k = 0;
    for (i=0; i < (textLength - wordProcessed * CHAR_PER_WORD - 1) / CHAR_PER_BYTE + 1; i++) {
        c = (unsigned int)input[byteProcessed] << shift;
        for (j=0; j> bitPerWordPackedChar * i;
    }
    output[wordProcessed] = c;
*/
}

// reverseBit
// Reverses the bit order of x by exchanging progressively larger groups:
// adjacent bits, then bit pairs, and so on. The statement list continues
// on the following source line.
unsigned int reverseBit(unsigned int x)
{
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));   // swap neighbouring bits
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));   // swap 2-bit pairs
    x = (((x &
0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4)); x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8)); return((x >> 16) | (x << 16)); } void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) { unsigned int i; for (i=0; i 0, // are guaranteed to be cleared as 0 // The bits in the resulting ending word are undefined if the resulting bit offset = 0 // The remaining words (to make up the last 4 word multiple) are undefined void bitCopyNoDestOffset(unsigned int *destinationAddress, const unsigned int *sourceAddress, int sourceBitOffset, int copyLengthInBit) { unsigned int i; unsigned int rightShift; unsigned int copyLeftBuffer[4], copyRightBuffer[4]; unsigned int copyWordLength, copyWordLengthRoundTo4; #ifdef DEBUG if (copyLengthInBit == 0) { fprintf(stderr, "bitCopyNoDestOffset() : copyLengthInBit = 0!\n"); exit(1); } #endif copyWordLength = (copyLengthInBit + BITS_IN_WORD_MINUS_1) / BITS_IN_WORD; if (sourceBitOffset == 0) { memcpy(destinationAddress, sourceAddress, copyWordLength * 4); } else { rightShift = BITS_IN_WORD - sourceBitOffset; copyWordLengthRoundTo4 = (copyWordLength + 3) & FOUR_MULTIPLE_MASK; for (i=0; i> rightShift; copyRightBuffer[1] = sourceAddress[i + 2] >> rightShift; copyRightBuffer[2] = sourceAddress[i + 3] >> rightShift; copyRightBuffer[3] = sourceAddress[i + 4] >> rightShift; destinationAddress[i + 0] = copyLeftBuffer[0] | copyRightBuffer[0]; destinationAddress[i + 1] = copyLeftBuffer[1] | copyRightBuffer[1]; destinationAddress[i + 2] = copyLeftBuffer[2] | copyRightBuffer[2]; destinationAddress[i + 3] = copyLeftBuffer[3] | copyRightBuffer[3]; } } if (copyLengthInBit % BITS_IN_WORD > 0) { destinationAddress[copyWordLength - 1] = truncateRight(destinationAddress[copyWordLength - 1], BITS_IN_WORD - (copyLengthInBit % BITS_IN_WORD)); } } void bitCopyDestWordOffsetOnly(unsigned int *destinationAddress, unsigned int destinationWordOffset, const unsigned int *sourceAddress, unsigned int sourceBitOffset, 
unsigned int copyLengthInBit) { unsigned int i; unsigned int rightShift; unsigned int copyLeftBuffer[4], copyRightBuffer[4]; unsigned int copyWordLength, copyWordLengthRoundTo4, wordToNext4WordBoundary; unsigned int *destAddr; const unsigned int *srcAddr; #ifdef DEBUG if (copyLengthInBit == 0) { fprintf(stderr, "bitCopyDestWordOffsetOnly() : copyLengthInBit = 0!\n"); exit(1); } #endif copyWordLength = (copyLengthInBit + BITS_IN_WORD_MINUS_1) / BITS_IN_WORD; destAddr = destinationAddress + destinationWordOffset; srcAddr = sourceAddress; wordToNext4WordBoundary = (FOUR_MULTIPLE_MASK - destinationWordOffset) % 4; if (sourceBitOffset == 0) { memcpy(destAddr, srcAddr, copyWordLength * 4); } else { rightShift = BITS_IN_WORD - sourceBitOffset; for (i=0; i> rightShift); } destAddr += wordToNext4WordBoundary; srcAddr += wordToNext4WordBoundary; copyWordLengthRoundTo4 = (copyWordLength - wordToNext4WordBoundary + 3) & FOUR_MULTIPLE_MASK; for (i=0; i> rightShift; copyRightBuffer[1] = srcAddr[i + 2] >> rightShift; copyRightBuffer[2] = srcAddr[i + 3] >> rightShift; copyRightBuffer[3] = srcAddr[i + 4] >> rightShift; destAddr[i + 0] = copyLeftBuffer[0] | copyRightBuffer[0]; destAddr[i + 1] = copyLeftBuffer[1] | copyRightBuffer[1]; destAddr[i + 2] = copyLeftBuffer[2] | copyRightBuffer[2]; destAddr[i + 3] = copyLeftBuffer[3] | copyRightBuffer[3]; } } if (copyLengthInBit % BITS_IN_WORD > 0) { destinationAddress[copyWordLength - 1] = truncateRight(destinationAddress[copyWordLength - 1], BITS_IN_WORD - (copyLengthInBit % BITS_IN_WORD)); } } // The remaining bits in destinationAddress, if destinationBitOffset > 0, must be cleared as 0 unsigned int bitCopy(unsigned int *destinationAddress, int destinationWordOffset, int destinationBitOffset, const unsigned int *sourceAddress, int sourceBitOffset, int copyLengthInBit) { unsigned int i; unsigned int rightShift; unsigned int copyLeftBuffer[4], copyRightBuffer[4]; unsigned int copyWordLength, copyWordLengthRoundTo4, wordToNext4WordBoundary; 
unsigned int *destAddr; const unsigned int *srcAddr; #ifdef DEBUG if (copyLengthInBit == 0) { fprintf(stderr, "bitCopy() : copyLengthInBit = 0!\n"); exit(1); } #endif destAddr = destinationAddress + destinationWordOffset; srcAddr = sourceAddress; if (destinationBitOffset > 0) { destAddr[0] = destAddr[0] | (srcAddr[0] << sourceBitOffset >> destinationBitOffset); if (destinationBitOffset < sourceBitOffset) { destAddr[0] = destAddr[0] | (srcAddr[1] >> destinationBitOffset >> (BITS_IN_WORD - sourceBitOffset)); } if (copyLengthInBit > BITS_IN_WORD - destinationBitOffset) { destAddr++; srcAddr += (sourceBitOffset + BITS_IN_WORD - destinationBitOffset) / BITS_IN_WORD; sourceBitOffset = (sourceBitOffset + BITS_IN_WORD - destinationBitOffset) % BITS_IN_WORD; copyLengthInBit -= BITS_IN_WORD - destinationBitOffset; destinationWordOffset++; } else { if ((destinationBitOffset + copyLengthInBit) % BITS_IN_WORD > 0) { destAddr[0] = truncateRight(destAddr[0], BITS_IN_WORD - destinationBitOffset - copyLengthInBit); } return 0; } } copyWordLength = (copyLengthInBit + BITS_IN_WORD_MINUS_1) / BITS_IN_WORD; if (sourceBitOffset == 0) { memcpy(destAddr, srcAddr, copyWordLength * 4); } else { wordToNext4WordBoundary = (FOUR_MULTIPLE_MASK - destinationWordOffset) % 4; rightShift = BITS_IN_WORD - sourceBitOffset; for (i=0; i> rightShift); } if (wordToNext4WordBoundary >= copyWordLength) { if (copyLengthInBit % BITS_IN_WORD > 0) { destAddr[copyWordLength - 1] = truncateRight(destAddr[copyWordLength - 1], BITS_IN_WORD - (copyLengthInBit % BITS_IN_WORD)); } return 0; } destAddr += wordToNext4WordBoundary; srcAddr += wordToNext4WordBoundary; copyWordLength -= wordToNext4WordBoundary; copyWordLengthRoundTo4 = (copyWordLength + 3) & FOUR_MULTIPLE_MASK; for (i=0; i> rightShift; copyRightBuffer[1] = srcAddr[i + 2] >> rightShift; copyRightBuffer[2] = srcAddr[i + 3] >> rightShift; copyRightBuffer[3] = srcAddr[i + 4] >> rightShift; destAddr[i + 0] = copyLeftBuffer[0] | copyRightBuffer[0]; destAddr[i + 
1] = copyLeftBuffer[1] | copyRightBuffer[1]; destAddr[i + 2] = copyLeftBuffer[2] | copyRightBuffer[2]; destAddr[i + 3] = copyLeftBuffer[3] | copyRightBuffer[3]; } } if (copyLengthInBit % BITS_IN_WORD > 0) { destAddr[copyWordLength - 1] = truncateRight(destAddr[copyWordLength - 1], BITS_IN_WORD - (copyLengthInBit % BITS_IN_WORD)); } return 0; } // return a prime number >= number unsigned int nextPrime(const unsigned int number) { // the smallest prime larger than 2^16 is 65537, which is the 6543th prime number static const unsigned int prime[6543] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 
1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129, 2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267, 2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423, 2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543, 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657, 2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, 2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903, 2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221, 3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413, 3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527, 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607, 3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, 3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907, 3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057, 4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211, 4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409, 4421, 4423, 4441, 4447, 4451, 4457, 
4463, 4481, 4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621, 4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, 4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937, 4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011, 5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087, 5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167, 5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233, 5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351, 5381, 5387, 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443, 5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521, 5527, 5531, 5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653, 5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711, 5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791, 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, 5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939, 5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073, 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133, 6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211, 6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271, 6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359, 6361, 6367, 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473, 6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571, 6577, 6581, 6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701, 6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779, 6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833, 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, 6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997, 7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121, 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207, 7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253, 7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349, 7351, 7369, 7393, 
7411, 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487, 7489, 7499, 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561, 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643, 7649, 7669, 7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757, 7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853, 7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919, 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009, 8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111, 8117, 8123, 8147, 8161, 8167, 8171, 8179, 8191, 8209, 8219, 8221, 8231, 8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291, 8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369, 8377, 8387, 8389, 8419, 8423, 8429, 8431, 8443, 8447, 8461, 8467, 8501, 8513, 8521, 8527, 8537, 8539, 8543, 8563, 8573, 8581, 8597, 8599, 8609, 8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677, 8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731, 8737, 8741, 8747, 8753, 8761, 8779, 8783, 8803, 8807, 8819, 8821, 8831, 8837, 8839, 8849, 8861, 8863, 8867, 8887, 8893, 8923, 8929, 8933, 8941, 8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011, 9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091, 9103, 9109, 9127, 9133, 9137, 9151, 9157, 9161, 9173, 9181, 9187, 9199, 9203, 9209, 9221, 9227, 9239, 9241, 9257, 9277, 9281, 9283, 9293, 9311, 9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377, 9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433, 9437, 9439, 9461, 9463, 9467, 9473, 9479, 9491, 9497, 9511, 9521, 9533, 9539, 9547, 9551, 9587, 9601, 9613, 9619, 9623, 9629, 9631, 9643, 9649, 9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733, 9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791, 9803, 9811, 9817, 9829, 9833, 9839, 9851, 9857, 9859, 9871, 9883, 9887, 9901, 9907, 9923, 9929, 9931, 9941, 9949, 9967, 9973, 10007, 10009, 10037, 10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099, 10103, 10111, 10133, 10139, 10141, 10151, 10159, 10163, 10169, 10177, 10181, 10193, 10211, 10223, 10243, 10247, 10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303, 10313, 10321, 
10331, 10333, 10337, 10343, 10357, 10369, 10391, 10399, 10427, 10429, 10433, 10453, 10457, 10459, 10463, 10477, 10487, 10499, 10501, 10513, 10529, 10531, 10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627, 10631, 10639, 10651, 10657, 10663, 10667, 10687, 10691, 10709, 10711, 10723, 10729, 10733, 10739, 10753, 10771, 10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859, 10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937, 10939, 10949, 10957, 10973, 10979, 10987, 10993, 11003, 11027, 11047, 11057, 11059, 11069, 11071, 11083, 11087, 11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161, 11171, 11173, 11177, 11197, 11213, 11239, 11243, 11251, 11257, 11261, 11273, 11279, 11287, 11299, 11311, 11317, 11321, 11329, 11351, 11353, 11369, 11383, 11393, 11399, 11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483, 11489, 11491, 11497, 11503, 11519, 11527, 11549, 11551, 11579, 11587, 11593, 11597, 11617, 11621, 11633, 11657, 11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731, 11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813, 11821, 11827, 11831, 11833, 11839, 11863, 11867, 11887, 11897, 11903, 11909, 11923, 11927, 11933, 11939, 11941, 11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011, 12037, 12041, 12043, 12049, 12071, 12073, 12097, 12101, 12107, 12109, 12113, 12119, 12143, 12149, 12157, 12161, 12163, 12197, 12203, 12211, 12227, 12239, 12241, 12251, 12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323, 12329, 12343, 12347, 12373, 12377, 12379, 12391, 12401, 12409, 12413, 12421, 12433, 12437, 12451, 12457, 12473, 12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527, 12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589, 12601, 12611, 12613, 12619, 12637, 12641, 12647, 12653, 12659, 12671, 12689, 12697, 12703, 12713, 12721, 12739, 12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821, 12823, 12829, 12841, 12853, 12889, 12893, 12899, 12907, 12911, 12917, 12919, 12923, 12941, 12953, 12959, 12967, 12973, 12979, 12983, 13001, 13003, 13007, 13009, 
13033, 13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109, 13121, 13127, 13147, 13151, 13159, 13163, 13171, 13177, 13183, 13187, 13217, 13219, 13229, 13241, 13249, 13259, 13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337, 13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421, 13441, 13451, 13457, 13463, 13469, 13477, 13487, 13499, 13513, 13523, 13537, 13553, 13567, 13577, 13591, 13597, 13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681, 13687, 13691, 13693, 13697, 13709, 13711, 13721, 13723, 13729, 13751, 13757, 13759, 13763, 13781, 13789, 13799, 13807, 13829, 13831, 13841, 13859, 13873, 13877, 13879, 13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933, 13963, 13967, 13997, 13999, 14009, 14011, 14029, 14033, 14051, 14057, 14071, 14081, 14083, 14087, 14107, 14143, 14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221, 14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323, 14327, 14341, 14347, 14369, 14387, 14389, 14401, 14407, 14411, 14419, 14423, 14431, 14437, 14447, 14449, 14461, 14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549, 14551, 14557, 14561, 14563, 14591, 14593, 14621, 14627, 14629, 14633, 14639, 14653, 14657, 14669, 14683, 14699, 14713, 14717, 14723, 14731, 14737, 14741, 14747, 14753, 14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821, 14827, 14831, 14843, 14851, 14867, 14869, 14879, 14887, 14891, 14897, 14923, 14929, 14939, 14947, 14951, 14957, 14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073, 15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137, 15139, 15149, 15161, 15173, 15187, 15193, 15199, 15217, 15227, 15233, 15241, 15259, 15263, 15269, 15271, 15277, 15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331, 15349, 15359, 15361, 15373, 15377, 15383, 15391, 15401, 15413, 15427, 15439, 15443, 15451, 15461, 15467, 15473, 15493, 15497, 15511, 15527, 15541, 15551, 15559, 15569, 15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643, 15647, 15649, 15661, 15667, 15671, 15679, 15683, 15727, 15731, 15733, 15737, 15739, 
15749, 15761, 15767, 15773, 15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859, 15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919, 15923, 15937, 15959, 15971, 15973, 15991, 16001, 16007, 16033, 16057, 16061, 16063, 16067, 16069, 16073, 16087, 16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183, 16187, 16189, 16193, 16217, 16223, 16229, 16231, 16249, 16253, 16267, 16273, 16301, 16319, 16333, 16339, 16349, 16361, 16363, 16369, 16381, 16411, 16417, 16421, 16427, 16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493, 16519, 16529, 16547, 16553, 16561, 16567, 16573, 16603, 16607, 16619, 16631, 16633, 16649, 16651, 16657, 16661, 16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747, 16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843, 16871, 16879, 16883, 16889, 16901, 16903, 16921, 16927, 16931, 16937, 16943, 16963, 16979, 16981, 16987, 16993, 17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053, 17077, 17093, 17099, 17107, 17117, 17123, 17137, 17159, 17167, 17183, 17189, 17191, 17203, 17207, 17209, 17231, 17239, 17257, 17291, 17293, 17299, 17317, 17321, 17327, 17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389, 17393, 17401, 17417, 17419, 17431, 17443, 17449, 17467, 17471, 17477, 17483, 17489, 17491, 17497, 17509, 17519, 17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599, 17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683, 17707, 17713, 17729, 17737, 17747, 17749, 17761, 17783, 17789, 17791, 17807, 17827, 17837, 17839, 17851, 17863, 17881, 17891, 17903, 17909, 17911, 17921, 17923, 17929, 17939, 17957, 17959, 17971, 17977, 17981, 17987, 17989, 18013, 18041, 18043, 18047, 18049, 18059, 18061, 18077, 18089, 18097, 18119, 18121, 18127, 18131, 18133, 18143, 18149, 18169, 18181, 18191, 18199, 18211, 18217, 18223, 18229, 18233, 18251, 18253, 18257, 18269, 18287, 18289, 18301, 18307, 18311, 18313, 18329, 18341, 18353, 18367, 18371, 18379, 18397, 18401, 18413, 18427, 18433, 18439, 18443, 18451, 18457, 18461, 18481, 18493, 18503, 18517, 18521, 
18523, 18539, 18541, 18553, 18583, 18587, 18593, 18617, 18637, 18661, 18671, 18679, 18691, 18701, 18713, 18719, 18731, 18743, 18749, 18757, 18773, 18787, 18793, 18797, 18803, 18839, 18859, 18869, 18899, 18911, 18913, 18917, 18919, 18947, 18959, 18973, 18979, 19001, 19009, 19013, 19031, 19037, 19051, 19069, 19073, 19079, 19081, 19087, 19121, 19139, 19141, 19157, 19163, 19181, 19183, 19207, 19211, 19213, 19219, 19231, 19237, 19249, 19259, 19267, 19273, 19289, 19301, 19309, 19319, 19333, 19373, 19379, 19381, 19387, 19391, 19403, 19417, 19421, 19423, 19427, 19429, 19433, 19441, 19447, 19457, 19463, 19469, 19471, 19477, 19483, 19489, 19501, 19507, 19531, 19541, 19543, 19553, 19559, 19571, 19577, 19583, 19597, 19603, 19609, 19661, 19681, 19687, 19697, 19699, 19709, 19717, 19727, 19739, 19751, 19753, 19759, 19763, 19777, 19793, 19801, 19813, 19819, 19841, 19843, 19853, 19861, 19867, 19889, 19891, 19913, 19919, 19927, 19937, 19949, 19961, 19963, 19973, 19979, 19991, 19993, 19997, 20011, 20021, 20023, 20029, 20047, 20051, 20063, 20071, 20089, 20101, 20107, 20113, 20117, 20123, 20129, 20143, 20147, 20149, 20161, 20173, 20177, 20183, 20201, 20219, 20231, 20233, 20249, 20261, 20269, 20287, 20297, 20323, 20327, 20333, 20341, 20347, 20353, 20357, 20359, 20369, 20389, 20393, 20399, 20407, 20411, 20431, 20441, 20443, 20477, 20479, 20483, 20507, 20509, 20521, 20533, 20543, 20549, 20551, 20563, 20593, 20599, 20611, 20627, 20639, 20641, 20663, 20681, 20693, 20707, 20717, 20719, 20731, 20743, 20747, 20749, 20753, 20759, 20771, 20773, 20789, 20807, 20809, 20849, 20857, 20873, 20879, 20887, 20897, 20899, 20903, 20921, 20929, 20939, 20947, 20959, 20963, 20981, 20983, 21001, 21011, 21013, 21017, 21019, 21023, 21031, 21059, 21061, 21067, 21089, 21101, 21107, 21121, 21139, 21143, 21149, 21157, 21163, 21169, 21179, 21187, 21191, 21193, 21211, 21221, 21227, 21247, 21269, 21277, 21283, 21313, 21317, 21319, 21323, 21341, 21347, 21377, 21379, 21383, 21391, 21397, 21401, 21407, 21419, 21433, 
21467, 21481, 21487, 21491, 21493, 21499, 21503, 21517, 21521, 21523, 21529, 21557, 21559, 21563, 21569, 21577, 21587, 21589, 21599, 21601, 21611, 21613, 21617, 21647, 21649, 21661, 21673, 21683, 21701, 21713, 21727, 21737, 21739, 21751, 21757, 21767, 21773, 21787, 21799, 21803, 21817, 21821, 21839, 21841, 21851, 21859, 21863, 21871, 21881, 21893, 21911, 21929, 21937, 21943, 21961, 21977, 21991, 21997, 22003, 22013, 22027, 22031, 22037, 22039, 22051, 22063, 22067, 22073, 22079, 22091, 22093, 22109, 22111, 22123, 22129, 22133, 22147, 22153, 22157, 22159, 22171, 22189, 22193, 22229, 22247, 22259, 22271, 22273, 22277, 22279, 22283, 22291, 22303, 22307, 22343, 22349, 22367, 22369, 22381, 22391, 22397, 22409, 22433, 22441, 22447, 22453, 22469, 22481, 22483, 22501, 22511, 22531, 22541, 22543, 22549, 22567, 22571, 22573, 22613, 22619, 22621, 22637, 22639, 22643, 22651, 22669, 22679, 22691, 22697, 22699, 22709, 22717, 22721, 22727, 22739, 22741, 22751, 22769, 22777, 22783, 22787, 22807, 22811, 22817, 22853, 22859, 22861, 22871, 22877, 22901, 22907, 22921, 22937, 22943, 22961, 22963, 22973, 22993, 23003, 23011, 23017, 23021, 23027, 23029, 23039, 23041, 23053, 23057, 23059, 23063, 23071, 23081, 23087, 23099, 23117, 23131, 23143, 23159, 23167, 23173, 23189, 23197, 23201, 23203, 23209, 23227, 23251, 23269, 23279, 23291, 23293, 23297, 23311, 23321, 23327, 23333, 23339, 23357, 23369, 23371, 23399, 23417, 23431, 23447, 23459, 23473, 23497, 23509, 23531, 23537, 23539, 23549, 23557, 23561, 23563, 23567, 23581, 23593, 23599, 23603, 23609, 23623, 23627, 23629, 23633, 23663, 23669, 23671, 23677, 23687, 23689, 23719, 23741, 23743, 23747, 23753, 23761, 23767, 23773, 23789, 23801, 23813, 23819, 23827, 23831, 23833, 23857, 23869, 23873, 23879, 23887, 23893, 23899, 23909, 23911, 23917, 23929, 23957, 23971, 23977, 23981, 23993, 24001, 24007, 24019, 24023, 24029, 24043, 24049, 24061, 24071, 24077, 24083, 24091, 24097, 24103, 24107, 24109, 24113, 24121, 24133, 24137, 24151, 24169, 24179, 
24181, 24197, 24203, 24223, 24229, 24239, 24247, 24251, 24281, 24317, 24329, 24337, 24359, 24371, 24373, 24379, 24391, 24407, 24413, 24419, 24421, 24439, 24443, 24469, 24473, 24481, 24499, 24509, 24517, 24527, 24533, 24547, 24551, 24571, 24593, 24611, 24623, 24631, 24659, 24671, 24677, 24683, 24691, 24697, 24709, 24733, 24749, 24763, 24767, 24781, 24793, 24799, 24809, 24821, 24841, 24847, 24851, 24859, 24877, 24889, 24907, 24917, 24919, 24923, 24943, 24953, 24967, 24971, 24977, 24979, 24989, 25013, 25031, 25033, 25037, 25057, 25073, 25087, 25097, 25111, 25117, 25121, 25127, 25147, 25153, 25163, 25169, 25171, 25183, 25189, 25219, 25229, 25237, 25243, 25247, 25253, 25261, 25301, 25303, 25307, 25309, 25321, 25339, 25343, 25349, 25357, 25367, 25373, 25391, 25409, 25411, 25423, 25439, 25447, 25453, 25457, 25463, 25469, 25471, 25523, 25537, 25541, 25561, 25577, 25579, 25583, 25589, 25601, 25603, 25609, 25621, 25633, 25639, 25643, 25657, 25667, 25673, 25679, 25693, 25703, 25717, 25733, 25741, 25747, 25759, 25763, 25771, 25793, 25799, 25801, 25819, 25841, 25847, 25849, 25867, 25873, 25889, 25903, 25913, 25919, 25931, 25933, 25939, 25943, 25951, 25969, 25981, 25997, 25999, 26003, 26017, 26021, 26029, 26041, 26053, 26083, 26099, 26107, 26111, 26113, 26119, 26141, 26153, 26161, 26171, 26177, 26183, 26189, 26203, 26209, 26227, 26237, 26249, 26251, 26261, 26263, 26267, 26293, 26297, 26309, 26317, 26321, 26339, 26347, 26357, 26371, 26387, 26393, 26399, 26407, 26417, 26423, 26431, 26437, 26449, 26459, 26479, 26489, 26497, 26501, 26513, 26539, 26557, 26561, 26573, 26591, 26597, 26627, 26633, 26641, 26647, 26669, 26681, 26683, 26687, 26693, 26699, 26701, 26711, 26713, 26717, 26723, 26729, 26731, 26737, 26759, 26777, 26783, 26801, 26813, 26821, 26833, 26839, 26849, 26861, 26863, 26879, 26881, 26891, 26893, 26903, 26921, 26927, 26947, 26951, 26953, 26959, 26981, 26987, 26993, 27011, 27017, 27031, 27043, 27059, 27061, 27067, 27073, 27077, 27091, 27103, 27107, 27109, 27127, 27143, 
27179, 27191, 27197, 27211, 27239, 27241, 27253, 27259, 27271, 27277, 27281, 27283, 27299, 27329, 27337, 27361, 27367, 27397, 27407, 27409, 27427, 27431, 27437, 27449, 27457, 27479, 27481, 27487, 27509, 27527, 27529, 27539, 27541, 27551, 27581, 27583, 27611, 27617, 27631, 27647, 27653, 27673, 27689, 27691, 27697, 27701, 27733, 27737, 27739, 27743, 27749, 27751, 27763, 27767, 27773, 27779, 27791, 27793, 27799, 27803, 27809, 27817, 27823, 27827, 27847, 27851, 27883, 27893, 27901, 27917, 27919, 27941, 27943, 27947, 27953, 27961, 27967, 27983, 27997, 28001, 28019, 28027, 28031, 28051, 28057, 28069, 28081, 28087, 28097, 28099, 28109, 28111, 28123, 28151, 28163, 28181, 28183, 28201, 28211, 28219, 28229, 28277, 28279, 28283, 28289, 28297, 28307, 28309, 28319, 28349, 28351, 28387, 28393, 28403, 28409, 28411, 28429, 28433, 28439, 28447, 28463, 28477, 28493, 28499, 28513, 28517, 28537, 28541, 28547, 28549, 28559, 28571, 28573, 28579, 28591, 28597, 28603, 28607, 28619, 28621, 28627, 28631, 28643, 28649, 28657, 28661, 28663, 28669, 28687, 28697, 28703, 28711, 28723, 28729, 28751, 28753, 28759, 28771, 28789, 28793, 28807, 28813, 28817, 28837, 28843, 28859, 28867, 28871, 28879, 28901, 28909, 28921, 28927, 28933, 28949, 28961, 28979, 29009, 29017, 29021, 29023, 29027, 29033, 29059, 29063, 29077, 29101, 29123, 29129, 29131, 29137, 29147, 29153, 29167, 29173, 29179, 29191, 29201, 29207, 29209, 29221, 29231, 29243, 29251, 29269, 29287, 29297, 29303, 29311, 29327, 29333, 29339, 29347, 29363, 29383, 29387, 29389, 29399, 29401, 29411, 29423, 29429, 29437, 29443, 29453, 29473, 29483, 29501, 29527, 29531, 29537, 29567, 29569, 29573, 29581, 29587, 29599, 29611, 29629, 29633, 29641, 29663, 29669, 29671, 29683, 29717, 29723, 29741, 29753, 29759, 29761, 29789, 29803, 29819, 29833, 29837, 29851, 29863, 29867, 29873, 29879, 29881, 29917, 29921, 29927, 29947, 29959, 29983, 29989, 30011, 30013, 30029, 30047, 30059, 30071, 30089, 30091, 30097, 30103, 30109, 30113, 30119, 30133, 30137, 30139, 
30161, 30169, 30181, 30187, 30197, 30203, 30211, 30223, 30241, 30253, 30259, 30269, 30271, 30293, 30307, 30313, 30319, 30323, 30341, 30347, 30367, 30389, 30391, 30403, 30427, 30431, 30449, 30467, 30469, 30491, 30493, 30497, 30509, 30517, 30529, 30539, 30553, 30557, 30559, 30577, 30593, 30631, 30637, 30643, 30649, 30661, 30671, 30677, 30689, 30697, 30703, 30707, 30713, 30727, 30757, 30763, 30773, 30781, 30803, 30809, 30817, 30829, 30839, 30841, 30851, 30853, 30859, 30869, 30871, 30881, 30893, 30911, 30931, 30937, 30941, 30949, 30971, 30977, 30983, 31013, 31019, 31033, 31039, 31051, 31063, 31069, 31079, 31081, 31091, 31121, 31123, 31139, 31147, 31151, 31153, 31159, 31177, 31181, 31183, 31189, 31193, 31219, 31223, 31231, 31237, 31247, 31249, 31253, 31259, 31267, 31271, 31277, 31307, 31319, 31321, 31327, 31333, 31337, 31357, 31379, 31387, 31391, 31393, 31397, 31469, 31477, 31481, 31489, 31511, 31513, 31517, 31531, 31541, 31543, 31547, 31567, 31573, 31583, 31601, 31607, 31627, 31643, 31649, 31657, 31663, 31667, 31687, 31699, 31721, 31723, 31727, 31729, 31741, 31751, 31769, 31771, 31793, 31799, 31817, 31847, 31849, 31859, 31873, 31883, 31891, 31907, 31957, 31963, 31973, 31981, 31991, 32003, 32009, 32027, 32029, 32051, 32057, 32059, 32063, 32069, 32077, 32083, 32089, 32099, 32117, 32119, 32141, 32143, 32159, 32173, 32183, 32189, 32191, 32203, 32213, 32233, 32237, 32251, 32257, 32261, 32297, 32299, 32303, 32309, 32321, 32323, 32327, 32341, 32353, 32359, 32363, 32369, 32371, 32377, 32381, 32401, 32411, 32413, 32423, 32429, 32441, 32443, 32467, 32479, 32491, 32497, 32503, 32507, 32531, 32533, 32537, 32561, 32563, 32569, 32573, 32579, 32587, 32603, 32609, 32611, 32621, 32633, 32647, 32653, 32687, 32693, 32707, 32713, 32717, 32719, 32749, 32771, 32779, 32783, 32789, 32797, 32801, 32803, 32831, 32833, 32839, 32843, 32869, 32887, 32909, 32911, 32917, 32933, 32939, 32941, 32957, 32969, 32971, 32983, 32987, 32993, 32999, 33013, 33023, 33029, 33037, 33049, 33053, 33071, 33073, 
33083, 33091, 33107, 33113, 33119, 33149, 33151, 33161, 33179, 33181, 33191, 33199, 33203, 33211, 33223, 33247, 33287, 33289, 33301, 33311, 33317, 33329, 33331, 33343, 33347, 33349, 33353, 33359, 33377, 33391, 33403, 33409, 33413, 33427, 33457, 33461, 33469, 33479, 33487, 33493, 33503, 33521, 33529, 33533, 33547, 33563, 33569, 33577, 33581, 33587, 33589, 33599, 33601, 33613, 33617, 33619, 33623, 33629, 33637, 33641, 33647, 33679, 33703, 33713, 33721, 33739, 33749, 33751, 33757, 33767, 33769, 33773, 33791, 33797, 33809, 33811, 33827, 33829, 33851, 33857, 33863, 33871, 33889, 33893, 33911, 33923, 33931, 33937, 33941, 33961, 33967, 33997, 34019, 34031, 34033, 34039, 34057, 34061, 34123, 34127, 34129, 34141, 34147, 34157, 34159, 34171, 34183, 34211, 34213, 34217, 34231, 34253, 34259, 34261, 34267, 34273, 34283, 34297, 34301, 34303, 34313, 34319, 34327, 34337, 34351, 34361, 34367, 34369, 34381, 34403, 34421, 34429, 34439, 34457, 34469, 34471, 34483, 34487, 34499, 34501, 34511, 34513, 34519, 34537, 34543, 34549, 34583, 34589, 34591, 34603, 34607, 34613, 34631, 34649, 34651, 34667, 34673, 34679, 34687, 34693, 34703, 34721, 34729, 34739, 34747, 34757, 34759, 34763, 34781, 34807, 34819, 34841, 34843, 34847, 34849, 34871, 34877, 34883, 34897, 34913, 34919, 34939, 34949, 34961, 34963, 34981, 35023, 35027, 35051, 35053, 35059, 35069, 35081, 35083, 35089, 35099, 35107, 35111, 35117, 35129, 35141, 35149, 35153, 35159, 35171, 35201, 35221, 35227, 35251, 35257, 35267, 35279, 35281, 35291, 35311, 35317, 35323, 35327, 35339, 35353, 35363, 35381, 35393, 35401, 35407, 35419, 35423, 35437, 35447, 35449, 35461, 35491, 35507, 35509, 35521, 35527, 35531, 35533, 35537, 35543, 35569, 35573, 35591, 35593, 35597, 35603, 35617, 35671, 35677, 35729, 35731, 35747, 35753, 35759, 35771, 35797, 35801, 35803, 35809, 35831, 35837, 35839, 35851, 35863, 35869, 35879, 35897, 35899, 35911, 35923, 35933, 35951, 35963, 35969, 35977, 35983, 35993, 35999, 36007, 36011, 36013, 36017, 36037, 36061, 36067, 
36073, 36083, 36097, 36107, 36109, 36131, 36137, 36151, 36161, 36187, 36191, 36209, 36217, 36229, 36241, 36251, 36263, 36269, 36277, 36293, 36299, 36307, 36313, 36319, 36341, 36343, 36353, 36373, 36383, 36389, 36433, 36451, 36457, 36467, 36469, 36473, 36479, 36493, 36497, 36523, 36527, 36529, 36541, 36551, 36559, 36563, 36571, 36583, 36587, 36599, 36607, 36629, 36637, 36643, 36653, 36671, 36677, 36683, 36691, 36697, 36709, 36713, 36721, 36739, 36749, 36761, 36767, 36779, 36781, 36787, 36791, 36793, 36809, 36821, 36833, 36847, 36857, 36871, 36877, 36887, 36899, 36901, 36913, 36919, 36923, 36929, 36931, 36943, 36947, 36973, 36979, 36997, 37003, 37013, 37019, 37021, 37039, 37049, 37057, 37061, 37087, 37097, 37117, 37123, 37139, 37159, 37171, 37181, 37189, 37199, 37201, 37217, 37223, 37243, 37253, 37273, 37277, 37307, 37309, 37313, 37321, 37337, 37339, 37357, 37361, 37363, 37369, 37379, 37397, 37409, 37423, 37441, 37447, 37463, 37483, 37489, 37493, 37501, 37507, 37511, 37517, 37529, 37537, 37547, 37549, 37561, 37567, 37571, 37573, 37579, 37589, 37591, 37607, 37619, 37633, 37643, 37649, 37657, 37663, 37691, 37693, 37699, 37717, 37747, 37781, 37783, 37799, 37811, 37813, 37831, 37847, 37853, 37861, 37871, 37879, 37889, 37897, 37907, 37951, 37957, 37963, 37967, 37987, 37991, 37993, 37997, 38011, 38039, 38047, 38053, 38069, 38083, 38113, 38119, 38149, 38153, 38167, 38177, 38183, 38189, 38197, 38201, 38219, 38231, 38237, 38239, 38261, 38273, 38281, 38287, 38299, 38303, 38317, 38321, 38327, 38329, 38333, 38351, 38371, 38377, 38393, 38431, 38447, 38449, 38453, 38459, 38461, 38501, 38543, 38557, 38561, 38567, 38569, 38593, 38603, 38609, 38611, 38629, 38639, 38651, 38653, 38669, 38671, 38677, 38693, 38699, 38707, 38711, 38713, 38723, 38729, 38737, 38747, 38749, 38767, 38783, 38791, 38803, 38821, 38833, 38839, 38851, 38861, 38867, 38873, 38891, 38903, 38917, 38921, 38923, 38933, 38953, 38959, 38971, 38977, 38993, 39019, 39023, 39041, 39043, 39047, 39079, 39089, 39097, 39103, 
39107, 39113, 39119, 39133, 39139, 39157, 39161, 39163, 39181, 39191, 39199, 39209, 39217, 39227, 39229, 39233, 39239, 39241, 39251, 39293, 39301, 39313, 39317, 39323, 39341, 39343, 39359, 39367, 39371, 39373, 39383, 39397, 39409, 39419, 39439, 39443, 39451, 39461, 39499, 39503, 39509, 39511, 39521, 39541, 39551, 39563, 39569, 39581, 39607, 39619, 39623, 39631, 39659, 39667, 39671, 39679, 39703, 39709, 39719, 39727, 39733, 39749, 39761, 39769, 39779, 39791, 39799, 39821, 39827, 39829, 39839, 39841, 39847, 39857, 39863, 39869, 39877, 39883, 39887, 39901, 39929, 39937, 39953, 39971, 39979, 39983, 39989, 40009, 40013, 40031, 40037, 40039, 40063, 40087, 40093, 40099, 40111, 40123, 40127, 40129, 40151, 40153, 40163, 40169, 40177, 40189, 40193, 40213, 40231, 40237, 40241, 40253, 40277, 40283, 40289, 40343, 40351, 40357, 40361, 40387, 40423, 40427, 40429, 40433, 40459, 40471, 40483, 40487, 40493, 40499, 40507, 40519, 40529, 40531, 40543, 40559, 40577, 40583, 40591, 40597, 40609, 40627, 40637, 40639, 40693, 40697, 40699, 40709, 40739, 40751, 40759, 40763, 40771, 40787, 40801, 40813, 40819, 40823, 40829, 40841, 40847, 40849, 40853, 40867, 40879, 40883, 40897, 40903, 40927, 40933, 40939, 40949, 40961, 40973, 40993, 41011, 41017, 41023, 41039, 41047, 41051, 41057, 41077, 41081, 41113, 41117, 41131, 41141, 41143, 41149, 41161, 41177, 41179, 41183, 41189, 41201, 41203, 41213, 41221, 41227, 41231, 41233, 41243, 41257, 41263, 41269, 41281, 41299, 41333, 41341, 41351, 41357, 41381, 41387, 41389, 41399, 41411, 41413, 41443, 41453, 41467, 41479, 41491, 41507, 41513, 41519, 41521, 41539, 41543, 41549, 41579, 41593, 41597, 41603, 41609, 41611, 41617, 41621, 41627, 41641, 41647, 41651, 41659, 41669, 41681, 41687, 41719, 41729, 41737, 41759, 41761, 41771, 41777, 41801, 41809, 41813, 41843, 41849, 41851, 41863, 41879, 41887, 41893, 41897, 41903, 41911, 41927, 41941, 41947, 41953, 41957, 41959, 41969, 41981, 41983, 41999, 42013, 42017, 42019, 42023, 42043, 42061, 42071, 42073, 42083, 
42089, 42101, 42131, 42139, 42157, 42169, 42179, 42181, 42187, 42193, 42197, 42209, 42221, 42223, 42227, 42239, 42257, 42281, 42283, 42293, 42299, 42307, 42323, 42331, 42337, 42349, 42359, 42373, 42379, 42391, 42397, 42403, 42407, 42409, 42433, 42437, 42443, 42451, 42457, 42461, 42463, 42467, 42473, 42487, 42491, 42499, 42509, 42533, 42557, 42569, 42571, 42577, 42589, 42611, 42641, 42643, 42649, 42667, 42677, 42683, 42689, 42697, 42701, 42703, 42709, 42719, 42727, 42737, 42743, 42751, 42767, 42773, 42787, 42793, 42797, 42821, 42829, 42839, 42841, 42853, 42859, 42863, 42899, 42901, 42923, 42929, 42937, 42943, 42953, 42961, 42967, 42979, 42989, 43003, 43013, 43019, 43037, 43049, 43051, 43063, 43067, 43093, 43103, 43117, 43133, 43151, 43159, 43177, 43189, 43201, 43207, 43223, 43237, 43261, 43271, 43283, 43291, 43313, 43319, 43321, 43331, 43391, 43397, 43399, 43403, 43411, 43427, 43441, 43451, 43457, 43481, 43487, 43499, 43517, 43541, 43543, 43573, 43577, 43579, 43591, 43597, 43607, 43609, 43613, 43627, 43633, 43649, 43651, 43661, 43669, 43691, 43711, 43717, 43721, 43753, 43759, 43777, 43781, 43783, 43787, 43789, 43793, 43801, 43853, 43867, 43889, 43891, 43913, 43933, 43943, 43951, 43961, 43963, 43969, 43973, 43987, 43991, 43997, 44017, 44021, 44027, 44029, 44041, 44053, 44059, 44071, 44087, 44089, 44101, 44111, 44119, 44123, 44129, 44131, 44159, 44171, 44179, 44189, 44201, 44203, 44207, 44221, 44249, 44257, 44263, 44267, 44269, 44273, 44279, 44281, 44293, 44351, 44357, 44371, 44381, 44383, 44389, 44417, 44449, 44453, 44483, 44491, 44497, 44501, 44507, 44519, 44531, 44533, 44537, 44543, 44549, 44563, 44579, 44587, 44617, 44621, 44623, 44633, 44641, 44647, 44651, 44657, 44683, 44687, 44699, 44701, 44711, 44729, 44741, 44753, 44771, 44773, 44777, 44789, 44797, 44809, 44819, 44839, 44843, 44851, 44867, 44879, 44887, 44893, 44909, 44917, 44927, 44939, 44953, 44959, 44963, 44971, 44983, 44987, 45007, 45013, 45053, 45061, 45077, 45083, 45119, 45121, 45127, 45131, 45137, 
45139, 45161, 45179, 45181, 45191, 45197, 45233, 45247, 45259, 45263, 45281, 45289, 45293, 45307, 45317, 45319, 45329, 45337, 45341, 45343, 45361, 45377, 45389, 45403, 45413, 45427, 45433, 45439, 45481, 45491, 45497, 45503, 45523, 45533, 45541, 45553, 45557, 45569, 45587, 45589, 45599, 45613, 45631, 45641, 45659, 45667, 45673, 45677, 45691, 45697, 45707, 45737, 45751, 45757, 45763, 45767, 45779, 45817, 45821, 45823, 45827, 45833, 45841, 45853, 45863, 45869, 45887, 45893, 45943, 45949, 45953, 45959, 45971, 45979, 45989, 46021, 46027, 46049, 46051, 46061, 46073, 46091, 46093, 46099, 46103, 46133, 46141, 46147, 46153, 46171, 46181, 46183, 46187, 46199, 46219, 46229, 46237, 46261, 46271, 46273, 46279, 46301, 46307, 46309, 46327, 46337, 46349, 46351, 46381, 46399, 46411, 46439, 46441, 46447, 46451, 46457, 46471, 46477, 46489, 46499, 46507, 46511, 46523, 46549, 46559, 46567, 46573, 46589, 46591, 46601, 46619, 46633, 46639, 46643, 46649, 46663, 46679, 46681, 46687, 46691, 46703, 46723, 46727, 46747, 46751, 46757, 46769, 46771, 46807, 46811, 46817, 46819, 46829, 46831, 46853, 46861, 46867, 46877, 46889, 46901, 46919, 46933, 46957, 46993, 46997, 47017, 47041, 47051, 47057, 47059, 47087, 47093, 47111, 47119, 47123, 47129, 47137, 47143, 47147, 47149, 47161, 47189, 47207, 47221, 47237, 47251, 47269, 47279, 47287, 47293, 47297, 47303, 47309, 47317, 47339, 47351, 47353, 47363, 47381, 47387, 47389, 47407, 47417, 47419, 47431, 47441, 47459, 47491, 47497, 47501, 47507, 47513, 47521, 47527, 47533, 47543, 47563, 47569, 47581, 47591, 47599, 47609, 47623, 47629, 47639, 47653, 47657, 47659, 47681, 47699, 47701, 47711, 47713, 47717, 47737, 47741, 47743, 47777, 47779, 47791, 47797, 47807, 47809, 47819, 47837, 47843, 47857, 47869, 47881, 47903, 47911, 47917, 47933, 47939, 47947, 47951, 47963, 47969, 47977, 47981, 48017, 48023, 48029, 48049, 48073, 48079, 48091, 48109, 48119, 48121, 48131, 48157, 48163, 48179, 48187, 48193, 48197, 48221, 48239, 48247, 48259, 48271, 48281, 48299, 48311, 
48313, 48337, 48341, 48353, 48371, 48383, 48397, 48407, 48409, 48413, 48437, 48449, 48463, 48473, 48479, 48481, 48487, 48491, 48497, 48523, 48527, 48533, 48539, 48541, 48563, 48571, 48589, 48593, 48611, 48619, 48623, 48647, 48649, 48661, 48673, 48677, 48679, 48731, 48733, 48751, 48757, 48761, 48767, 48779, 48781, 48787, 48799, 48809, 48817, 48821, 48823, 48847, 48857, 48859, 48869, 48871, 48883, 48889, 48907, 48947, 48953, 48973, 48989, 48991, 49003, 49009, 49019, 49031, 49033, 49037, 49043, 49057, 49069, 49081, 49103, 49109, 49117, 49121, 49123, 49139, 49157, 49169, 49171, 49177, 49193, 49199, 49201, 49207, 49211, 49223, 49253, 49261, 49277, 49279, 49297, 49307, 49331, 49333, 49339, 49363, 49367, 49369, 49391, 49393, 49409, 49411, 49417, 49429, 49433, 49451, 49459, 49463, 49477, 49481, 49499, 49523, 49529, 49531, 49537, 49547, 49549, 49559, 49597, 49603, 49613, 49627, 49633, 49639, 49663, 49667, 49669, 49681, 49697, 49711, 49727, 49739, 49741, 49747, 49757, 49783, 49787, 49789, 49801, 49807, 49811, 49823, 49831, 49843, 49853, 49871, 49877, 49891, 49919, 49921, 49927, 49937, 49939, 49943, 49957, 49991, 49993, 49999, 50021, 50023, 50033, 50047, 50051, 50053, 50069, 50077, 50087, 50093, 50101, 50111, 50119, 50123, 50129, 50131, 50147, 50153, 50159, 50177, 50207, 50221, 50227, 50231, 50261, 50263, 50273, 50287, 50291, 50311, 50321, 50329, 50333, 50341, 50359, 50363, 50377, 50383, 50387, 50411, 50417, 50423, 50441, 50459, 50461, 50497, 50503, 50513, 50527, 50539, 50543, 50549, 50551, 50581, 50587, 50591, 50593, 50599, 50627, 50647, 50651, 50671, 50683, 50707, 50723, 50741, 50753, 50767, 50773, 50777, 50789, 50821, 50833, 50839, 50849, 50857, 50867, 50873, 50891, 50893, 50909, 50923, 50929, 50951, 50957, 50969, 50971, 50989, 50993, 51001, 51031, 51043, 51047, 51059, 51061, 51071, 51109, 51131, 51133, 51137, 51151, 51157, 51169, 51193, 51197, 51199, 51203, 51217, 51229, 51239, 51241, 51257, 51263, 51283, 51287, 51307, 51329, 51341, 51343, 51347, 51349, 51361, 51383, 
51407, 51413, 51419, 51421, 51427, 51431, 51437, 51439, 51449, 51461, 51473, 51479, 51481, 51487, 51503, 51511, 51517, 51521, 51539, 51551, 51563, 51577, 51581, 51593, 51599, 51607, 51613, 51631, 51637, 51647, 51659, 51673, 51679, 51683, 51691, 51713, 51719, 51721, 51749, 51767, 51769, 51787, 51797, 51803, 51817, 51827, 51829, 51839, 51853, 51859, 51869, 51871, 51893, 51899, 51907, 51913, 51929, 51941, 51949, 51971, 51973, 51977, 51991, 52009, 52021, 52027, 52051, 52057, 52067, 52069, 52081, 52103, 52121, 52127, 52147, 52153, 52163, 52177, 52181, 52183, 52189, 52201, 52223, 52237, 52249, 52253, 52259, 52267, 52289, 52291, 52301, 52313, 52321, 52361, 52363, 52369, 52379, 52387, 52391, 52433, 52453, 52457, 52489, 52501, 52511, 52517, 52529, 52541, 52543, 52553, 52561, 52567, 52571, 52579, 52583, 52609, 52627, 52631, 52639, 52667, 52673, 52691, 52697, 52709, 52711, 52721, 52727, 52733, 52747, 52757, 52769, 52783, 52807, 52813, 52817, 52837, 52859, 52861, 52879, 52883, 52889, 52901, 52903, 52919, 52937, 52951, 52957, 52963, 52967, 52973, 52981, 52999, 53003, 53017, 53047, 53051, 53069, 53077, 53087, 53089, 53093, 53101, 53113, 53117, 53129, 53147, 53149, 53161, 53171, 53173, 53189, 53197, 53201, 53231, 53233, 53239, 53267, 53269, 53279, 53281, 53299, 53309, 53323, 53327, 53353, 53359, 53377, 53381, 53401, 53407, 53411, 53419, 53437, 53441, 53453, 53479, 53503, 53507, 53527, 53549, 53551, 53569, 53591, 53593, 53597, 53609, 53611, 53617, 53623, 53629, 53633, 53639, 53653, 53657, 53681, 53693, 53699, 53717, 53719, 53731, 53759, 53773, 53777, 53783, 53791, 53813, 53819, 53831, 53849, 53857, 53861, 53881, 53887, 53891, 53897, 53899, 53917, 53923, 53927, 53939, 53951, 53959, 53987, 53993, 54001, 54011, 54013, 54037, 54049, 54059, 54083, 54091, 54101, 54121, 54133, 54139, 54151, 54163, 54167, 54181, 54193, 54217, 54251, 54269, 54277, 54287, 54293, 54311, 54319, 54323, 54331, 54347, 54361, 54367, 54371, 54377, 54401, 54403, 54409, 54413, 54419, 54421, 54437, 54443, 54449, 
54469, 54493, 54497, 54499, 54503, 54517, 54521, 54539, 54541, 54547, 54559, 54563, 54577, 54581, 54583, 54601, 54617, 54623, 54629, 54631, 54647, 54667, 54673, 54679, 54709, 54713, 54721, 54727, 54751, 54767, 54773, 54779, 54787, 54799, 54829, 54833, 54851, 54869, 54877, 54881, 54907, 54917, 54919, 54941, 54949, 54959, 54973, 54979, 54983, 55001, 55009, 55021, 55049, 55051, 55057, 55061, 55073, 55079, 55103, 55109, 55117, 55127, 55147, 55163, 55171, 55201, 55207, 55213, 55217, 55219, 55229, 55243, 55249, 55259, 55291, 55313, 55331, 55333, 55337, 55339, 55343, 55351, 55373, 55381, 55399, 55411, 55439, 55441, 55457, 55469, 55487, 55501, 55511, 55529, 55541, 55547, 55579, 55589, 55603, 55609, 55619, 55621, 55631, 55633, 55639, 55661, 55663, 55667, 55673, 55681, 55691, 55697, 55711, 55717, 55721, 55733, 55763, 55787, 55793, 55799, 55807, 55813, 55817, 55819, 55823, 55829, 55837, 55843, 55849, 55871, 55889, 55897, 55901, 55903, 55921, 55927, 55931, 55933, 55949, 55967, 55987, 55997, 56003, 56009, 56039, 56041, 56053, 56081, 56087, 56093, 56099, 56101, 56113, 56123, 56131, 56149, 56167, 56171, 56179, 56197, 56207, 56209, 56237, 56239, 56249, 56263, 56267, 56269, 56299, 56311, 56333, 56359, 56369, 56377, 56383, 56393, 56401, 56417, 56431, 56437, 56443, 56453, 56467, 56473, 56477, 56479, 56489, 56501, 56503, 56509, 56519, 56527, 56531, 56533, 56543, 56569, 56591, 56597, 56599, 56611, 56629, 56633, 56659, 56663, 56671, 56681, 56687, 56701, 56711, 56713, 56731, 56737, 56747, 56767, 56773, 56779, 56783, 56807, 56809, 56813, 56821, 56827, 56843, 56857, 56873, 56891, 56893, 56897, 56909, 56911, 56921, 56923, 56929, 56941, 56951, 56957, 56963, 56983, 56989, 56993, 56999, 57037, 57041, 57047, 57059, 57073, 57077, 57089, 57097, 57107, 57119, 57131, 57139, 57143, 57149, 57163, 57173, 57179, 57191, 57193, 57203, 57221, 57223, 57241, 57251, 57259, 57269, 57271, 57283, 57287, 57301, 57329, 57331, 57347, 57349, 57367, 57373, 57383, 57389, 57397, 57413, 57427, 57457, 57467, 57487, 
57493, 57503, 57527, 57529, 57557, 57559, 57571, 57587, 57593, 57601, 57637, 57641, 57649, 57653, 57667, 57679, 57689, 57697, 57709, 57713, 57719, 57727, 57731, 57737, 57751, 57773, 57781, 57787, 57791, 57793, 57803, 57809, 57829, 57839, 57847, 57853, 57859, 57881, 57899, 57901, 57917, 57923, 57943, 57947, 57973, 57977, 57991, 58013, 58027, 58031, 58043, 58049, 58057, 58061, 58067, 58073, 58099, 58109, 58111, 58129, 58147, 58151, 58153, 58169, 58171, 58189, 58193, 58199, 58207, 58211, 58217, 58229, 58231, 58237, 58243, 58271, 58309, 58313, 58321, 58337, 58363, 58367, 58369, 58379, 58391, 58393, 58403, 58411, 58417, 58427, 58439, 58441, 58451, 58453, 58477, 58481, 58511, 58537, 58543, 58549, 58567, 58573, 58579, 58601, 58603, 58613, 58631, 58657, 58661, 58679, 58687, 58693, 58699, 58711, 58727, 58733, 58741, 58757, 58763, 58771, 58787, 58789, 58831, 58889, 58897, 58901, 58907, 58909, 58913, 58921, 58937, 58943, 58963, 58967, 58979, 58991, 58997, 59009, 59011, 59021, 59023, 59029, 59051, 59053, 59063, 59069, 59077, 59083, 59093, 59107, 59113, 59119, 59123, 59141, 59149, 59159, 59167, 59183, 59197, 59207, 59209, 59219, 59221, 59233, 59239, 59243, 59263, 59273, 59281, 59333, 59341, 59351, 59357, 59359, 59369, 59377, 59387, 59393, 59399, 59407, 59417, 59419, 59441, 59443, 59447, 59453, 59467, 59471, 59473, 59497, 59509, 59513, 59539, 59557, 59561, 59567, 59581, 59611, 59617, 59621, 59627, 59629, 59651, 59659, 59663, 59669, 59671, 59693, 59699, 59707, 59723, 59729, 59743, 59747, 59753, 59771, 59779, 59791, 59797, 59809, 59833, 59863, 59879, 59887, 59921, 59929, 59951, 59957, 59971, 59981, 59999, 60013, 60017, 60029, 60037, 60041, 60077, 60083, 60089, 60091, 60101, 60103, 60107, 60127, 60133, 60139, 60149, 60161, 60167, 60169, 60209, 60217, 60223, 60251, 60257, 60259, 60271, 60289, 60293, 60317, 60331, 60337, 60343, 60353, 60373, 60383, 60397, 60413, 60427, 60443, 60449, 60457, 60493, 60497, 60509, 60521, 60527, 60539, 60589, 60601, 60607, 60611, 60617, 60623, 60631, 
60637, 60647, 60649, 60659, 60661, 60679, 60689, 60703, 60719, 60727, 60733, 60737, 60757, 60761, 60763, 60773, 60779, 60793, 60811, 60821, 60859, 60869, 60887, 60889, 60899, 60901, 60913, 60917, 60919, 60923, 60937, 60943, 60953, 60961, 61001, 61007, 61027, 61031, 61043, 61051, 61057, 61091, 61099, 61121, 61129, 61141, 61151, 61153, 61169, 61211, 61223, 61231, 61253, 61261, 61283, 61291, 61297, 61331, 61333, 61339, 61343, 61357, 61363, 61379, 61381, 61403, 61409, 61417, 61441, 61463, 61469, 61471, 61483, 61487, 61493, 61507, 61511, 61519, 61543, 61547, 61553, 61559, 61561, 61583, 61603, 61609, 61613, 61627, 61631, 61637, 61643, 61651, 61657, 61667, 61673, 61681, 61687, 61703, 61717, 61723, 61729, 61751, 61757, 61781, 61813, 61819, 61837, 61843, 61861, 61871, 61879, 61909, 61927, 61933, 61949, 61961, 61967, 61979, 61981, 61987, 61991, 62003, 62011, 62017, 62039, 62047, 62053, 62057, 62071, 62081, 62099, 62119, 62129, 62131, 62137, 62141, 62143, 62171, 62189, 62191, 62201, 62207, 62213, 62219, 62233, 62273, 62297, 62299, 62303, 62311, 62323, 62327, 62347, 62351, 62383, 62401, 62417, 62423, 62459, 62467, 62473, 62477, 62483, 62497, 62501, 62507, 62533, 62539, 62549, 62563, 62581, 62591, 62597, 62603, 62617, 62627, 62633, 62639, 62653, 62659, 62683, 62687, 62701, 62723, 62731, 62743, 62753, 62761, 62773, 62791, 62801, 62819, 62827, 62851, 62861, 62869, 62873, 62897, 62903, 62921, 62927, 62929, 62939, 62969, 62971, 62981, 62983, 62987, 62989, 63029, 63031, 63059, 63067, 63073, 63079, 63097, 63103, 63113, 63127, 63131, 63149, 63179, 63197, 63199, 63211, 63241, 63247, 63277, 63281, 63299, 63311, 63313, 63317, 63331, 63337, 63347, 63353, 63361, 63367, 63377, 63389, 63391, 63397, 63409, 63419, 63421, 63439, 63443, 63463, 63467, 63473, 63487, 63493, 63499, 63521, 63527, 63533, 63541, 63559, 63577, 63587, 63589, 63599, 63601, 63607, 63611, 63617, 63629, 63647, 63649, 63659, 63667, 63671, 63689, 63691, 63697, 63703, 63709, 63719, 63727, 63737, 63743, 63761, 63773, 63781, 
63793, 63799, 63803, 63809, 63823, 63839, 63841, 63853, 63857, 63863, 63901, 63907, 63913, 63929, 63949, 63977, 63997, 64007, 64013, 64019, 64033, 64037, 64063, 64067, 64081, 64091, 64109, 64123, 64151, 64153, 64157, 64171, 64187, 64189, 64217, 64223, 64231, 64237, 64271, 64279, 64283, 64301, 64303, 64319, 64327, 64333, 64373, 64381, 64399, 64403, 64433, 64439, 64451, 64453, 64483, 64489, 64499, 64513, 64553, 64567, 64577, 64579, 64591, 64601, 64609, 64613, 64621, 64627, 64633, 64661, 64663, 64667, 64679, 64693, 64709, 64717, 64747, 64763, 64781, 64783, 64793, 64811, 64817, 64849, 64853, 64871, 64877, 64879, 64891, 64901, 64919, 64921, 64927, 64937, 64951, 64969, 64997, 65003, 65011, 65027, 65029, 65033, 65053, 65063, 65071, 65089, 65099, 65101, 65111, 65119, 65123, 65129, 65141, 65147, 65167, 65171, 65173, 65179, 65183, 65203, 65213, 65239, 65257, 65267, 65269, 65287, 65293, 65309, 65323, 65327, 65353, 65357, 65371, 65381, 65393, 65407, 65413, 65419, 65423, 65437, 65447, 65449, 65479, 65497, 65519, 65521, 65537}; unsigned int i; unsigned int nextPrime; unsigned int nextSqrootIndex; if (number <= 65537) { for (i=0; i<6543; i++) { if (prime[i] >= number) { return prime[i]; } } } else { if (number > 4294967291UL) { // this is the largest 32 bit prime if (number > 4294967293UL) { return 4294967295UL; // 4294967295 = 3*5*17*257*65537 } else { return 4294967293UL; // 4294967293 = 9241*464773 } } if (number % 2 == 0) { nextPrime = number + 1; } else { nextPrime = number; } for (nextSqrootIndex=54; nextSqrootIndex<6542; nextSqrootIndex++) { // the 54th prime is 251; 251*251 = 63001 < 65538 // the 55th prime is 257; 257*257 = 66049 > 65538 if (prime[nextSqrootIndex] * prime[nextSqrootIndex] > nextPrime) { break; } } i = 1; while (TRUE) { while (i> 1) & 0x55555555); x = (((x >> 2) & 0x33333333) + (x & 0x33333333)); x = (((x >> 4) + x) & 0x0f0f0f0f); x += (x >> 8); x += (x >> 16); return(x & 0x0000003f); } soap2.20/Makefile0000644000105300011350000000321411231713573012603 
0ustar yuchangrdSHELL = /bin/sh PROG = soap DEBUG = NO PROFILE = NO PTHREADS = YES CC = gcc DEBUG_FLAGS = -g3 -Wall -O2 PROFILE_FLAGS = -fprofile-arcs -ftest-coverage -pg RELEASE_FLAGS = -msse3 -O3 -static -funroll-loops -maccumulate-outgoing-args -fomit-frame-pointer STATIC_FLAGS = -static DFLAGS = -DMAKE_TIME=\""`date`"\" LIBS = -lm #TARBALL_EXCLUDE = "*.(o,gz,zip)" #ZIP_EXCLUDE = *.o *.gz *.zip ifeq (YES, $(DEBUG)) CFLAGS = $(DEBUG_FLAGS) $(STATIC_FLAGS) DFLAGS += -DDEBUG # PTHREADS = NO else CFLAGS = $(RELEASE_FLAGS) $(STATIC_FLAGS) endif ifeq (YES, $(PTHREADS)) LIBS += -lpthread DFLAGS += -DPTHREADS endif ifeq (YES, $(PROFILE)) DFLAGS += $(PROFILE_FLAGS) endif OBJ = SeqIO.o MiscUtilities.o MemManager.o TextConverter.o r250.o DNACount.o HSP.o Timing.o BWT.o extratools.o soapio.o BWTAln.o Match.o PairMatch.o stdaln.o kstring.o .SUFFIX: .SUFFIX: .c .o .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $< -o $@ all: $(PROG) $(PROG): $(OBJ) soap.o $(CC) $(CFLAGS) $(DFLAGS) $(OBJ) soap.o -o $@ $(LIBS) SeqIO.o:SeqIO.h r250.o: r250.h DNACount.o:DNACount.h HSP.o:HSP.h MiscUtilities.o:MiscUtilities.h MemManager.o:MemManager.h TextConverter.o:TextConverter.h extratools.o:extratools.h BWT.h MiscUtilities.h MemManager.h TextConverter.h Timing.h HSP.h kstring.h soapio.o:soapio.h SeqIO.h BWT.o:BWT.h BWTAln.o:BWTAln.h BWT.h Match.o:Match.h BWTAln.h soapio.h PairMatch.o:Match.h BWTAln.h stdaln.h MiscUtilities.o:MiscUtilities.h MemManager.o:MemManager.h TextConverter.o:TextConverter.h stdaln.o:stdaln.h kstring.o:kstring.h clean: rm -f *.o $(PROG) soap2.20/PairMatch.c0000644000105300011350000004310111232000420013133 0ustar yuchangrd/* * ============================================================================= * * Filename: PariMatch.c * * Description: * * Revision: none * Compiler: gcc 4.3.2 or aboAve * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * 
============================================================================= */ #include "Match.h" inline int CheckIns(HITITEM *p, HITITEM *q, PEAUX *o) { int strain1 = (p->info >> 24)&1; int strain2 = (q->info >> 24)&1; if(p->chr != q->chr || strain1 == strain2) return FALSE; else if(o->FR) { if(!strain1 && q->pos-p->pos+o->len >= o->min_ins && q->pos-p->pos+o->len <= o->max_ins) return TRUE; else if(strain1 && p->pos-q->pos+o->len >= o->min_ins && p->pos-q->pos+o->len <= o->max_ins) return TRUE; else{ return FALSE; }; } else if(!o->FR){ if(strain1 && q->pos-p->pos >= o->min_ins && q->pos-p->pos+o->len <= o->max_ins) return TRUE; else if(!strain1 && p->pos-q->pos >= o->min_ins && p->pos-q->pos+o->len <= o->max_ins) return TRUE; else{ return FALSE; } } return TRUE; } int HITCMP(const void *a, const void *b){ if ((*(HITITEM *)a).chr != (*(HITITEM *)b).chr){ return (*(HITITEM *)a).chr - (*(HITITEM *)b).chr; } else return (*(HITITEM *)a).pos - (*(HITITEM *)b).pos; } int GenPair(HITTABLE **hitse, PEAUX *po, HITTABLE **hitpe) { if(!hitse[0]->n || !hitse[1]->n) return 0; HITITEM *p, *q; p = hitse[0]->itemList; q = hitse[1]->itemList; const int cutoff = po->cutoff; if(hitse[0]->n == 1 && hitse[1]->n == 1 ){ if(CheckIns(p, q, po)){ HITCPY(hitpe[0]->itemList+hitpe[0]->n, p); HITCPY(hitpe[1]->itemList+hitpe[1]->n, q); hitpe[0]->n++; hitpe[1]->n++; return 1; }else{ return 0; } }else{ if(hitse[0]->n > 1)qsort(hitse[0]->itemList, hitse[0]->n, sizeof(HITITEM), HITCMP); if(hitse[1]->n > 1)qsort(hitse[1]->itemList, hitse[1]->n, sizeof(HITITEM), HITCMP); int n = hitpe[0]->n; // fprintf(stderr, "%d\n", n); p=hitse[0]->itemList; q = hitse[1]->itemList; while (p!=hitse[0]->itemList+hitse[0]->n) { while(p!=hitse[0]->itemList+hitse[0]->n && p->chrchr)p++; while(q!=hitse[1]->itemList+hitse[1]->n && p->chr>q->chr)q++; if (p==(hitse[0]->itemList+hitse[0]->n) || q==(hitse[1]->itemList+hitse[1]->n)) return n ; while (p->chr==q->chr && q!=(hitse[1]->itemList+hitse[1]->n)){ if (CheckIns(p, q, 
po)) { HITCPY(hitpe[0]->itemList+n, p); HITCPY(hitpe[1]->itemList+n, q); n++; hitpe[0]->n++; hitpe[1]->n++; if(n >= cutoff){ hitpe[0]->n = hitpe[1]->n = n; return n; } } q++; } p++; q = hitse[1]->itemList; } hitpe[0]->n = hitpe[1]->n = n; return n; } } #if 1 unsigned short *SWRescue(const ALNSEQ *alnSeq, const BWTOPT *bo, const PEAUX *po, const int rescue, HITTABLE **hitse, HITTABLE **hitpe, int *nc, int *n_rescue){ HITITEM *hitf = hitse[rescue^1]->itemList; int nfound = hitse[rescue^1]->n; const unsigned int *pacRef = bo->pacRef; const unsigned int dnaLen = bo->dnaLen; char *seq; ChrBlock *blockList = bo->blockList; int minIns, maxIns, len, keyLength, n; unsigned int occPos, beg; keyLength = alnSeq->len; minIns = po->min_ins; maxIns = po->max_ins; len = maxIns-minIns+3*keyLength; occPos = beg = 0; AlnParam ap = aln_param_bwa; path_t *path, *p; int i, path_len, n_cigar; path_len = n_cigar = 0; cigar_t * cigar = NULL; path = (path_t *)calloc((len+keyLength), sizeof(path_t)); unsigned char *refSeq = (unsigned char *)calloc(len, sizeof(unsigned char)); int SWCutoffX, SWCutoffY; SWCutoffX = SWCutoffY = bo->min_len < keyLength ? bo->min_len : (keyLength < 17 ? keyLength : 17); HITITEM *peItem1, *peItem2; peItem1 = hitpe[rescue^1]->itemList; peItem2 = hitpe[rescue]->itemList; int mm = 10; int n_mm, n_gapo, n_gape, gap_beg, ed_dist; n_mm = n_gapo = n_gape = gap_beg = 0; ed_dist = keyLength; unsigned short tmp_cigar[16]; // fprintf(stderr, "%d\n", nfound); for (i = 0; i < nfound; ++i) { occPos = (hitf+i)->occ_pos; if((((hitf+i)->info>>24)&0x7) > mm) continue; mm = (hitf+i)->info>>24&0x7; n = (hitf+i)->blockid; if(po->FR ^ (hitf+i)->strain) { beg = occPos + minIns - keyLength; if(beg + len >= (blockList + n)->blockEnd) continue; seq = (hitf+i)->strain ? alnSeq->seq : alnSeq->rc; } else { beg = occPos - maxIns - keyLength; if(beg < (blockList + n)->blockStart) continue; seq = (hitf + i)->strain ? 
alnSeq->seq : alnSeq->rc; } { unsigned char *p = refSeq; unsigned int j, l; for(j=beg, l=0; l>4)))>>(((~j)&0xf)<<1))&0x3); } // fprintf(stderr, "%d n_cigar %d\n", i, n_cigar); if (n_cigar) {free(cigar); n_cigar = 0;} // fprintf(stderr, "%d n_cigar %d\n", i, n_cigar); aln_local_core(refSeq, len, (unsigned char *)seq, keyLength, &ap, path, &path_len, 1); cigar = aln_path2cigar(path, path_len, &n_cigar); int k, x, y; x = y = k = 0; for (k = 0, x = y = 0; k < n_cigar; ++k) { unsigned short c = cigar[k]; if (c>>14 == FROM_M) x += c&0x3fff, y += c&0x3fff; else if (c>>14 == FROM_D) x += c&0x3fff; else y += c&0x3fff; } if (x < SWCutoffX && y < SWCutoffY) continue; { // update cigar and coordinate; SWCutoffX = x; SWCutoffY = y; int start, end; p = path + path_len - 1; beg += (p->i? p->i : 1) - 1; start = (p->j? p->j : 1) - 1; end = path->j; cigar = (unsigned short*)realloc(cigar, 2 * (n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, 2 * (n_cigar)); cigar[0] = 3<<14 | start; ++(n_cigar); } if (end < keyLength) { cigar[n_cigar] = 3<<14 | (keyLength - end); ++(n_cigar); } } n_mm = n_gapo = n_gape = gap_beg = 0; int indel = 3; { p = path + path_len - 1; x = p->i? p->i - 1 : 0; y = p->j? 
p->j - 1 : 0; int l=0; for (k = 0; k < n_cigar; ++k) { unsigned short c = cigar[k]; if (c>>14 == FROM_M) { for (l = 0; l < (c&0x3fff); ++l) if (refSeq[x+l] < 4 && seq[y+l] < 4 && refSeq[x+l] != seq[y+l]) ++n_mm; x += c&0x3fff, y += c&0x3fff; } else if (c>>14 == FROM_D) { indel = 3; gap_beg = y; x += c&0x3fff; ++n_gapo; n_gape += (c&0x3fff) - 1; } else if (c>>14 == FROM_I){ indel = 4; gap_beg = y; y += c&0x3fff; ++n_gapo; n_gape += (c&0x3fff) - 1; } } if(n_mm >= bo->max_mm || n_gapo > 1 || n_gape + n_gapo > bo->gap_len) continue; if (!n_gapo) indel=0; } *n_rescue += 1; if (n_gape + n_gapo + n_mm < ed_dist) {// update pe hit hitpe[rescue^1]->n = hitpe[rescue]->n = 1; HITCPY(peItem1, hitf+i); peItem2->chr = peItem1->chr; peItem2->pos = beg-occPos+(hitf+i)->pos; peItem2->occ_pos = beg; peItem2->strain = 1 ^ peItem1->strain; peItem2->n_mm = n_mm; peItem2->n_gapo = n_gapo; peItem2->n_gape = n_gape; peItem2->info = 0; peItem2->info |= ((indel<<25) | ((gap_beg&0xff)<<12) | ((n_gape+1)&0xff)); peItem2->gap_beg = gap_beg; peItem2->n_cigar = n_cigar; ed_dist = n_gape + n_gapo + n_mm; for(k=0; k < n_cigar; ++k)tmp_cigar[k] = cigar[k]; } } if (*n_rescue) { if (n_cigar < peItem2->n_cigar) cigar = (unsigned short *) calloc (peItem2->n_cigar, sizeof(unsigned short)); for(i = 0; i < peItem2->n_cigar; ++i) cigar[i] = tmp_cigar[i]; *nc = peItem2->n_cigar; } free(path); free(refSeq); return cigar; } #endif void PEAlnCore(int tid, MULTISEQ *mseqs, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HSP *hsp,const SOAPOPT *opt) { int i; ALNSEQ *alnSeq[2]; HITTABLE *hitse[2], *hitpe[2]; hitse[0] = (HITTABLE *)malloc(sizeof(HITTABLE)); hitse[1] = (HITTABLE *)malloc(sizeof(HITTABLE)); hitpe[0] = (HITTABLE *)malloc(sizeof(HITTABLE)); hitpe[1] = (HITTABLE *)malloc(sizeof(HITTABLE)); hitse[0]->itemList = (HITITEM *) malloc (sizeof(HITITEM) * MAX_ALN); hitse[1]->itemList = (HITITEM *) malloc (sizeof(HITITEM) * MAX_ALN); hitpe[0]->itemList = (HITITEM *) malloc (sizeof(HITITEM) 
* (MAX_ALN+1)); hitpe[1]->itemList = (HITITEM *) malloc (sizeof(HITITEM) * (MAX_ALN+1)); const int multiTotal = mseqs->n; PEAUX pe_aux; BWTOPT boA, boB; int mode, cutoff, ns, seedLen, rr; mode = opt->mode; cutoff = opt->cutoff; ns = opt->ns; seedLen = opt->aln_len; rr = opt->rr; boB.nblock = boA.nblock = hsp->numOfBlock;boB.blockList = boA.blockList = hsp->blockList; boB.cutoff=boA.cutoff = MAX_ALN; boB.gap_len = boA.gap_len = opt->gap_len; boB.gap_fb = boA.gap_fb = opt->gap_fb; boB.max_mm = boA.max_mm = opt->max_mm; boB.pacRef = boA.pacRef = hsp->packedDNA; boB.dnaLen = boA.dnaLen = hsp->dnaLength; boA.min_len = boB.min_len = opt->min_len; boA.h = boA.x = boA.y = boB.h = boB.x = boB.y = 0; pe_aux.min_ins = opt->min_ins; pe_aux.max_ins = opt->max_ins; pe_aux.FR = opt->FR; pe_aux.len = 0; pe_aux.cutoff = MAX_ALN; int x = 0; int se, pe, non; se=pe=non=0; double swBeg, swTime; swBeg = swTime = 0; // int swRun=0; int nRescue = 0; for(i=0; i < multiTotal; i += 2){ // fprintf(stderr, "%d\n", i); #ifdef PTHREADS if (opt->nthreads > 1) { pthread_mutex_lock(&lock); ALNSEQ *p = mseqs->seqList+i; if (p->tid < 0) { int j; int pend = multiTotal-i; for (j = 0; j < pend && j < NSEQ_PER_THREAD; j+=2){ (p+j)->tid = (p+j+1)->tid = tid; } } else if (p->tid != tid) { pthread_mutex_unlock(&lock); continue; } pthread_mutex_unlock(&lock); } #endif alnSeq[0] = mseqs->seqList+i; alnSeq[1] = mseqs->seqList+i+1; hitse[0]->n = hitse[1]->n = hitpe[0]->n = hitpe[1]->n = 0; if(alnSeq[0]->ns <= ns || alnSeq[1]->ns <= ns){ int nc = 0; nRescue = 0; x+=2; int ah0, ah1, ah2, bh0, bh1, bh2, ah3, bh3; boA.seqLen = boA.alnLen = alnSeq[0]->len; boB.seqLen = boB.alnLen = alnSeq[1]->len; unsigned int extLen = 0; pe_aux.len = alnSeq[0]->len; boA.fw = alnSeq[0]->seq; boA.rc = alnSeq[0]->rc; boB.fw = alnSeq[1]->seq; boB.rc = alnSeq[1]->rc; ALIGN: ah0 = ah1 = ah2 = ah3 = bh0 = bh1 = bh2 = bh3 = 0; boA.h = boA.alnLen>>1; boA.x = boA.y = boA.alnLen>=39?boA.alnLen/3:(boA.alnLen>=32 && boA.alnLen<39)?10:7; boB.h = 
boB.alnLen>>1; boB.x = boB.y = boB.alnLen>=39?boB.alnLen/3:(boB.alnLen>=32 && boB.alnLen<39)?10:7; switch (mode) { case 5: case 4: cutoff = opt->cutoff; case 0: ah0 = BWTExactMatching((unsigned char*)alnSeq[0]->seq, &boA, FORWARD, bwt, lookup, hitse[0]); ah0 += BWTExactMatching((unsigned char*)alnSeq[0]->rc+extLen, &boA, REVERSE, bwt, lookup, hitse[0]); bh0 = BWTExactMatching((unsigned char*)alnSeq[1]->seq, &boB, FORWARD, bwt, lookup, hitse[1]); bh0 += BWTExactMatching((unsigned char*)alnSeq[1]->rc+extLen, &boB, REVERSE, bwt, lookup, hitse[1]); if (ah0 && bh0) { GenPair(hitse, &pe_aux, hitpe);} if (hitpe[0]->n >= cutoff || mode == 0) break; case 1: ah1 = BWT1ErrorMatching((unsigned char*)alnSeq[0]->seq, &boA, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); ah1 += BWT1ErrorMatching((unsigned char*)alnSeq[0]->rc+extLen, &boA, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); bh1 = BWT1ErrorMatching((unsigned char*)alnSeq[1]->seq, &boB, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); bh1 += BWT1ErrorMatching((unsigned char*)alnSeq[1]->rc+extLen, &boB, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); if (ah1 || bh1) { GenPair(hitse, &pe_aux, hitpe);} if (hitpe[0]->n >= cutoff || mode == 1) break; case 2: ah2 = BWT2ErrorMatching((unsigned char*)alnSeq[0]->seq, &boA, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); ah2 += BWT2ErrorMatching((unsigned char*)alnSeq[0]->rc+extLen, &boA, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); bh2 = BWT2ErrorMatching((unsigned char*)alnSeq[1]->seq, &boB, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); bh2 += BWT2ErrorMatching((unsigned char*)alnSeq[1]->rc+extLen, &boB, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); if (ah2 || bh2){ GenPair(hitse, &pe_aux, hitpe);} if (hitpe[0]->n >= cutoff || mode == 4 || mode == 2) break; } if (seedLenn && !hitse[1]->n && (seedLenlen-seedLen; boB.extLen = alnSeq[1]->len-seedLen; if (alnSeq[0]->len < seedLen || alnSeq[1]->len n && !hitse[0]->n && seedLenlen 
< seedLen){ fprintf(stderr, "read_len shorter than seed_len%d. Continue\n", seedLen); goto OUTPUT; } boA.alnLen = seedLen; boA.extLen = alnSeq[0]->len - seedLen; boA.h = boA.alnLen>>1; boA.x = boA.y = boA.alnLen>=39?boA.alnLen/3:(boA.alnLen>=32 && boA.alnLen<39)?10:7; ah0 = BWTExactMatching((unsigned char*)alnSeq[0]->seq, &boA, FORWARD, bwt, lookup, hitse[0]); ah0 += BWTExactMatching((unsigned char*)alnSeq[0]->rc+boA.extLen, &boA, REVERSE, bwt, lookup, hitse[0]); if (ah0 && GenPair(hitse, &pe_aux, hitpe)) goto OUTPUT; ah1 = BWT1ErrorMatching((unsigned char*)alnSeq[0]->seq, &boA, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); ah1 += BWT1ErrorMatching((unsigned char*)alnSeq[0]->rc+boA.extLen, &boA, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); if (ah1 && GenPair(hitse, &pe_aux, hitpe))goto OUTPUT; ah2 = BWT2ErrorMatching((unsigned char*)alnSeq[0]->seq, &boA, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); ah2 += BWT2ErrorMatching((unsigned char*)alnSeq[0]->rc+boA.extLen, &boA, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[0]); if (ah2 && GenPair(hitse, &pe_aux, hitpe))goto OUTPUT; } else if (!hitpe[1]->n && !hitse[1]->n && seedLen len < seedLen){ fprintf(stderr, "read_len shorter than seed_len%d. 
Continue\n", seedLen); goto OUTPUT; } boB.alnLen = seedLen; boB.extLen = alnSeq[1]->len - seedLen; boB.h = boB.alnLen>>1; boB.x = boB.y = boB.alnLen>=39?boB.alnLen/3:(boB.alnLen>=32 && boB.alnLen<39)?10:7; bh0 = BWTExactMatching((unsigned char*)alnSeq[1]->seq, &boB, FORWARD, bwt, lookup, hitse[1]); bh0 += BWTExactMatching((unsigned char*)alnSeq[1]->rc+boB.extLen, &boB, REVERSE, bwt, lookup, hitse[1]); if(bh0 && GenPair(hitse, &pe_aux, hitpe)) goto OUTPUT; bh1 = BWT1ErrorMatching((unsigned char*)alnSeq[1]->seq, &boB, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); bh1 += BWT1ErrorMatching((unsigned char*)alnSeq[1]->rc+boB.extLen, &boB, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); if(bh1 && GenPair(hitse, &pe_aux, hitpe)) goto OUTPUT; bh2 = BWT2ErrorMatching((unsigned char*)alnSeq[1]->seq, &boB, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); bh2 += BWT2ErrorMatching((unsigned char*)alnSeq[1]->rc+boB.extLen, &boB, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hitse[1]); if(bh2 && GenPair(hitse, &pe_aux, hitpe)) goto OUTPUT; } } unsigned short * cigar = NULL; // if (hitse[1]->n && !hitse[0]->n && boA.gap_len){ /* gap goto sw */ if (hitse[1]->n && !hitse[0]->n && (boA.min_len < alnSeq[0]->len || boA.gap_len)){ // swBeg = setStartTime(); cigar = SWRescue(alnSeq[0], &boA, &pe_aux, 0, hitse, hitpe, &nc, &nRescue); // swTime += getElapsedTime(swBeg); // swRun++; goto OUTPUT; // } else if (!hitse[1]->n && hitse[0]->n && boA.gap_len) { /* gap goto sw */ } else if (!hitse[1]->n && hitse[0]->n && (boB.min_len < alnSeq[1]->len || boA.gap_len)) { // swBeg = setStartTime(); cigar = SWRescue(alnSeq[1], &boB, &pe_aux, 1, hitse, hitpe, &nc, &nRescue); // swTime += getElapsedTime(swBeg); // swRun++; goto OUTPUT; } OUTPUT: if (hitpe[0]->n && hitpe[1]->n){ pe+=2; // assert(hitpe[0]->n==hitpe[1]->n); // printf("site: %d\n", hitpe[0]->n); int site = (hitpe[0]->n == 1)?0:rand()%hitpe[0]->n; alnSeq[0]->flag = alnSeq[1]->flag = 0x3; // printf("site: %d\n", site); // 
printf("out:%d\t%d\n", hitpe[0]->itemList[site].n_cigar, nc); PickupHit(alnSeq[0], rr, &site, hitpe[0], hsp->packedDNA, hsp->dnaLength, cigar); // printf("out:%d\n", hitpe[1]->itemList[site].n_cigar); PickupHit(alnSeq[1], rr, &site, hitpe[1], hsp->packedDNA, hsp->dnaLength, cigar); if (nRescue) alnSeq[0]->nhits = alnSeq[1]->nhits = nRescue; } else { int site = 0; if (hitse[0]->n && hitse[1]->n) { se+=2; site = hitse[0]->n == 1?0:rand()%hitse[0]->n; PickupHit(alnSeq[0], rr, &site, hitse[0], hsp->packedDNA, hsp->dnaLength, cigar); site = hitse[1]->n == 1?0:rand()%hitse[1]->n; PickupHit(alnSeq[1], rr, &site, hitse[1], hsp->packedDNA, hsp->dnaLength, cigar); alnSeq[0]->flag = alnSeq[1]->flag = 1; } else if(!hitse[1]->n && hitse[0]->n) { se++; site = hitse[0]->n == 1?0:rand()%hitse[0]->n; PickupHit(alnSeq[0], rr, &site, hitse[0], hsp->packedDNA, hsp->dnaLength, cigar); alnSeq[1]->flag |= 0x8; non++; } else if(!hitse[0]->n && hitse[1]->n) { se++; site = hitse[1]->n == 1?0:rand()%hitse[1]->n; PickupHit(alnSeq[1], rr, &site, hitse[1], hsp->packedDNA, hsp->dnaLength, cigar); alnSeq[0]->flag |= 0x8; non++; } else { non+=2; alnSeq[0]->flag |= 0x12; alnSeq[1]->flag |= 0x12; alnSeq[0]->report = 0; alnSeq[1]->report = 0; } } if(nc) { free(cigar); nc = 0; } } else { non+=2; alnSeq[0]->flag |= 0x12; alnSeq[1]->flag |= 0x12; alnSeq[0]->report = 0; alnSeq[1]->report = 0; } } free(hitse[0]->itemList); free(hitse[1]->itemList); free(hitpe[0]->itemList); free(hitpe[1]->itemList); free(hitse[0]);free(hitse[1]);free(hitpe[0]);free(hitpe[1]); } soap2.20/r250.c0000644000105300011350000000525111164534250012000 0ustar yuchangrd/* r250.c the r250 uniform random number algorithm Kirkpatrick, S., and E. Stoll, 1981; "A Very Fast Shift-Register Sequence Random Number Generator", Journal of Computational Physics, V.40 also: see W.L. 
Maier, DDJ May 1991 */ #include #include "r250.h" // static functions static unsigned int randlcg(); #define MSB 0x80000000L #define ALL_BITS 0xffffffffL #define HALF_RANGE 0x40000000L #define STEP 7 #define BITS 32 static unsigned int r250_buffer[ 250 ]; static int r250_index; static unsigned int randlcg(int sd) /* returns a random unsigned integer */ { static int quotient1 = LONG_MAX / 16807L; static int remainder1 = LONG_MAX % 16807L; if ( sd <= quotient1 ) sd = (sd * 16807L) % LONG_MAX; else { int high_part = sd / quotient1; int low_part = sd % quotient1; int test = 16807L * low_part - remainder1 * high_part; if ( test > 0 ) sd = test; else sd = test + LONG_MAX; } return sd; } void r250_init(int sd) { int j, k; unsigned int mask, msb; r250_index = 0; for (j = 0; j < 250; j++) /* fill r250 buffer with BITS-1 bit values */ sd = r250_buffer[j] = randlcg(sd); for (j = 0; j < 250; j++) /* set some MSBs to 1 */ if ( (sd=randlcg(sd)) > HALF_RANGE ) r250_buffer[j] |= MSB; msb = MSB; /* turn on diagonal bit */ mask = ALL_BITS; /* turn off the leftmost bits */ for (j = 0; j < BITS; j++) { k = STEP * j + 3; /* select a word to operate on */ r250_buffer[k] &= mask; /* turn off bits left of the diagonal */ r250_buffer[k] |= msb; /* turn on the diagonal bit */ mask >>= 1; msb >>= 1; } } unsigned int r250() /* returns a random unsigned integer */ { register int j; register unsigned int new_rand; if ( r250_index >= 147 ) j = r250_index - 147; /* wrap pointer around */ else j = r250_index + 103; new_rand = r250_buffer[ r250_index ] ^ r250_buffer[ j ]; r250_buffer[ r250_index ] = new_rand; if ( r250_index >= 249 ) /* increment pointer for next time */ r250_index = 0; else r250_index++; return new_rand; } double dr250() /* returns a random double in range 0..1 */ { register int j; register unsigned int new_rand; if ( r250_index >= 147 ) j = r250_index - 147; /* wrap pointer around */ else j = r250_index + 103; new_rand = r250_buffer[ r250_index ] ^ r250_buffer[ j ]; r250_buffer[ 
r250_index ] = new_rand; if ( r250_index >= 249 ) /* increment pointer for next time */ r250_index = 0; else r250_index++; return (double)new_rand / ALL_BITS; } soap2.20/SeqIO.c0000644000105300011350000001101411207570767012275 0ustar yuchangrd/* * ============================================================================= * * Filename: SeqIO.c * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #include "SeqIO.h" extern unsigned char charMap[256]; extern unsigned char complementMap[256]; extern const char ambiguityCount[16]; int CheckFast (int fd) { char c; if (read(fd, &c, 1)>0) { lseek(fd, -1, SEEK_CUR); if (c == '>') return FASTA; else if (c == '@') return FASTQ; else { fprintf(stderr, "File Error: unrecognized file\n"); close(fd); exit(EXIT_FAILURE); } } return FASTA; } int fasta(FILE *fp, seq_t *seq, const int CONV){ int l, max, ns; int c; char *p, *q; l = ns = 0; max=seq->max; while (!feof(fp) && getc(fp)!= (int)'>'); if (feof(fp)) return -1; p = seq->name; while ((c= getc(fp)) != ' ' && c != '\r' && c != '\t' && c != '\n' && ++l < MAX_NAME_LEN) *p++ = c; /*parse RG ID for SAM if(o->SAM){ if(p[l-1] == '1' && p[l-2] == '/') {r->RG_ID = 1; l-=2;} else if(p[l-1] == '2' && p[l-2] == '/'){r->RG_ID = 2; l-=2;} else {seq->RG_ID = 1;} } //*/ *p = '\0'; while (c != '\n') c = (char) getc(fp); if (feof(fp)) { fprintf(stderr, "\nFile Error: unexpected feof\n"); exit(EXIT_FAILURE); } l = 0; p=seq->seq; q=seq->rc; while ((c = getc(fp)) != '>' && !feof(fp)) { if (c != '\n' && c != '\r') { if (l >= max) { max += QUERY_LEN; seq->seq = (char *)realloc(seq->seq, sizeof(char)* max); seq->rc = (char *)realloc(seq->rc, sizeof(char)* max); seq->qual = (char *)realloc(seq->qual, sizeof(char) * max); p = seq->seq + l; q = seq->rc + l; } if 
(ambiguityCount[charMap[c]] != 1) { *p++ = charMap['G']; *q++ = complementMap['G']; ns++; } else { *p++ = charMap[c]; *q++ = complementMap[c]; } seq->qual[l] = 'h'; l++; } } seq->qual[l] = *p = *q = '\0'; seq->l = l; seq->max = max; seq->ns = ns; if (c == '>') ungetc(c,fp); return l; } /* ----- end of function fasta ----- */ int fastq (FILE *fp, seq_t *seq, const int CONV) { #ifdef DEBUG // fprintf(stderr, "get read\n"); #endif int l, max, ns; int c; char *p; l = ns = 0; max = seq->max; while (!feof(fp) && getc(fp)!= '@'); if (feof(fp)) return -1; l = 0; p=seq->name; while ((c = getc(fp)) != '\t' && c != ' ' && c != '\n' && c != '\r' && l++ < MAX_NAME_LEN) *p++ = c; *p = '\0'; // fprintf(stderr, "%s\n", seq->name); /* RG ID for SAM if(o->SAM){ if(p[l-1] == '1' && p[l-2] == '/') {r->RG_ID = 1; l-=2;} else if(p[l-1] == '1' && p[l-2] == '/'){r->RG_ID = 2; l-=2;} else {r->RG_ID = 1;} } //*/ while (c != '\n') c = getc(fp); if (feof(fp)) { fprintf(stderr, "\nFile Error: unexpected feof\n"); exit(EXIT_FAILURE); } l = 0; // p = seq->seq; q = seq->rc; while ((c = getc(fp)) != '+' && !feof(fp)) { if (c != '\n' && c != '\r') { if (l >= max) { max += QUERY_LEN; seq->seq = (char *)realloc(seq->seq, sizeof(char)*max); seq->rc = (char *)realloc(seq->rc, sizeof(char)* max); seq->qual = (char *)realloc(seq->qual, sizeof(char)*max); // fprintf(stderr, "%d\n", max); // p = seq->seq + l; // q = seq->rc + l; } // fprintf(stdout, "%c", c); if(ambiguityCount[charMap[c]] == 1){ seq->seq[l] = charMap[c]; seq->rc[l++] = complementMap[c]; }else{ seq->seq[l] = charMap['G']; seq->rc[l++] = complementMap['G']; ns++; } } } // *p = '\0'; *q = '\0'; // fprintf(stderr, "\n"); // for(j=0; jseq[j]); // fprintf(stderr, "\n"); seq->l = l; // fprintf(stdout, "\n"); while (!feof(fp) && (c= getc(fp))!= '\n'); if (feof(fp)) { fprintf(stderr, "\nFile Error: unexpected feof\n"); return 0; } l = 0; p = seq->qual; while ((c = (char) getc(fp)) != '\n' && c != '\r' && !feof(fp)) { if (l > max) { max += 
QUERY_LEN; seq->qual = (char *)realloc(seq->qual, sizeof(char)*max); p=seq->qual; p+=l; } *p++ = c; l++; } *p = '\0'; if (l != seq->l) { fprintf(stderr, "Length Error: incompitable seq and qual length\n"); fprintf(stderr, " %s\n", seq->name); return 0; } if (c == '@') ungetc(c,fp); seq->max = max; seq->ns = ns; // fprintf(stderr, "%d:%d\n", seq->l, l); return seq->l; } /* test int main(int argc, char *argv[]){ int fd = open(argv[1]) return EXIT_SUCCESS; } //*/ soap2.20/soapio.c0000644000105300011350000002177411231711370012605 0ustar yuchangrd/* * ============================================================================= * * Filename: soapio.c * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #include "soapio.h" #include #define BUF_SIZE 0xF00000 #define BUF_PER_LINE 0x800 void FreeMultiSeq(MULTISEQ *mseqs){ int i; int n = mseqs->n; ALNSEQ *p ; for(i=0; iseqList+i; free(p->name); free(p->seq); free(p->rc); free(p->qual); free(p->rcqual); if(p->report>0){ free(p->itemList->md); free(p->itemList->cigar); free(p->itemList); } } } static inline char * reverse(const char *seq, int len){ char *rc = ( char *)malloc(sizeof( char) * (len+1)); int i; for(i=len;i>0;--i) *(rc+len-i) = *(seq+i-1); return rc; } #define SEQDUP(dest, ori, pe) { \ dest->flag = pe; \ dest->tid = -1; \ dest->id = id; \ dest->len = ori.l; \ dest->nhits = dest->report = 0; \ dest->ns = ori.ns; \ dest->seq = (char *)malloc(sizeof(char)*ori.l); \ memcpy(dest->seq, ori.seq, ori.l); \ dest->rc = reverse(ori.rc, ori.l); \ dest->qual = strdup(ori.qual); \ dest->rcqual = reverse(ori.qual, ori.l); \ dest->rcqual[ori.l] = '\0'; \ dest->name = strdup(ori.name); \ dest++; \ } int GenMultiReads(const HSP *hsp, MULTISEQ *mseqs, const int len, const int pe, unsigned int *start, int *nb){ const 
unsigned int *pacRef = hsp->packedDNA; char **chrName = hsp->chrName; const unsigned int refLen = hsp->dnaLength; const ChrBlock *blockList = hsp->blockList; ALNSEQ *alnSeq; alnSeq = mseqs->seqList; int num, i, j; num = i = j = 0; unsigned int st = *start; int n = *nb ; unsigned int ori = (blockList+n)->ori; unsigned int blockStart = (blockList+n)->blockStart; unsigned int blockEnd = (blockList+n)->blockEnd; while (num < MAX_MULTI_READS && st < refLen-len) { alnSeq->name = (char *)malloc(sizeof(char)*MAX_NAME_LEN); alnSeq->seq = (char *)malloc(sizeof(char)*(len+1)); alnSeq->rc = (char *)malloc(sizeof(char)*(len+1)); alnSeq->qual = (char *)malloc(sizeof(char)*(len+1)); for (i=0; iseq+i) = ((*(pacRef+((st+i)>>4)))>>(30-((st+i)&0xf)*2))&0x3; *(alnSeq->rc+len-i-1) = (~((*(alnSeq->seq+i))&0x3))&0x3; *(alnSeq->qual+i) = 'h'; } if (st+len-1>blockEnd) { n++; ori = (blockList+n)->ori; blockStart = (blockList+n)->blockStart; blockEnd = (blockList+n)->blockEnd; continue; } sprintf(alnSeq->name, ">%s_%d", chrName[(blockList+n)->chrID], st-blockStart+ori+1); alnSeq->name[strlen(alnSeq->name)] = '\0'; alnSeq->report = 0; alnSeq->nhits = 0; alnSeq->qual[i] = '\0'; alnSeq->ns = 0; alnSeq->tid = -1; alnSeq->flag = 0; alnSeq->len = len; alnSeq->id = st++; num++; alnSeq++; } *start = st; *nb = n; mseqs->n = num; return num; } int GetMultiSeq (InFileList *ifp, MULTISEQ *mseqs, const int pe, int(*get_read)(FILE * , seq_t * , const int)){ #ifdef DEBUG // fprintf(stderr, "Get Multi Seqs\n"); #endif ALNSEQ *alnSeq; alnSeq= mseqs->seqList; int num, len, id; FILE * ifpA, * ifpB; num = 0; ifpA = ifp->ifpA; ifpB = ifp->ifpB; id = ifp->id; seq_t tmp; tmp.max = tmp.l = 0; tmp.seq = tmp.rc = tmp.qual = NULL; while(num < MAX_MULTI_READS){ tmp.ns = 0; if (feof(ifpA)||(pe && feof(ifpB))) break; if ((len=get_read(ifpA, &tmp, TRUE))>0){ SEQDUP(alnSeq, tmp, pe); ++num; if(pe &&(len=get_read(ifpB, &tmp, TRUE))>0){ SEQDUP(alnSeq, tmp, pe); ++num ; } ++id; } } mseqs->n = num; ifp->id = id; // 
fprintf(stderr, "%d\n", num); /* #ifdef DEBUG int j; fprintf(stderr, "len :%d\n", len); fprintf(stderr, "fw\n"); for(j = 0; jlen; j++) fprintf(stderr, "rc %d", *(alnSeq->rc+j)); fprintf(stderr, "\n"); fprintf(stderr, "%s\n%s\n", alnSeq->name, alnSeq->qual); fprintf(stderr, "soap get multisequences ...\n"); #endif exit(0); //*/ free(tmp.seq); free(tmp.qual);free(tmp.rc); return num; } #define SOAPOUT(file){ \ /* fprintf (stderr, "generate format\n"); \ */ \ int k = 0; \ if(o->id) \ ksprintf(str, "%d\t", alnSeq->id);\ else \ ksprintf(str, "%s\t", alnSeq->name); \ int n_cigar = hit->n_cigar; \ int beg=0, end=len; \ if(hit->cigar[0]>>14 == 3) beg = hit->cigar[0]&0x3ff; \ if(hit->cigar[n_cigar-1]>>14 == 3) end = len - (hit->cigar[n_cigar-1]&0x3ff); \ if(strain){ \ for(k=beg; krcqual[k], str); \ } \ ksprintf(str, "\t"); \ } else { \ for(k=beg; kqual[k], str); \ } \ ksprintf(str, "\t"); \ } \ /* \ fprintf(stderr, "%s\n", alnSeq->qual); \ fprintf(stderr, "%d\n", alnSeq->nhits); \ fprintf(stderr, "%c\n", "ab"[file]); fprintf(stderr, "%d\n", alnSeq->len);fprintf(stderr,"%c\n", "+-"[strain]);fprintf(stderr, "%s\n", chrName[(alnSeq->itemList+j)->chr]);fprintf(stderr, "%u\n", alnSeq->itemList->pos); \ */ \ ksprintf(str, "%d\t%c\t%d\t%c\t%s\t%d\t", alnSeq->nhits,"ab"[file], end-beg, "+-"[strain], chrName[hit->chr], hit->pos); \ if(!n_seedMM)ksprintf(str, "0\t"); \ else if (n_seedMM == 1) \ ksprintf(str, "1\t%c->%d%c%d\t", "ACGT"[(info_seedMM>>8)&3], info_seedMM&0xff, strain?"ACGT"[(int)rc[info_seedMM&0xff]]:"ACGT"[(int)seq[info_seedMM&0xff]], (strain?alnSeq->qual[len-(info_seedMM&0xff)]:alnSeq->qual[info_seedMM&0xff])-'@'); \ else if (n_seedMM == 2) \ ksprintf(str, "2\t%c->%d%c%d\t%c->%d%c%d\t", \ "ACGT"[(info_seedMM>>8)&3], info_seedMM&0xff, strain?"ACGT"[(int)rc[info_seedMM&0xff]]:"ACGT"[(int)seq[info_seedMM&0xff]], (strain?alnSeq->qual[len-(info_seedMM&0xff)-1]:alnSeq->qual[info_seedMM&0xff])-'@', \ "ACGT"[(info_seedMM>>20)&3], (info_seedMM>>12)&0xff, 
strain?"ACGT"[(int)rc[(info_seedMM>>12)&0xff]]:"ACGT"[(int)seq[(info_seedMM>>12)&0xff]], (strain?alnSeq->qual[len-1-((info_seedMM>>12)&0xff)]:alnSeq->qual[(info_seedMM>>12)&0xff])-'@'); \ else if (n_seedMM == 3) { \ ksprintf(str, "%d\t%d\t", (100+1+hit->n_gape), (info_seedMM>>12)&0xff); \ } \ else if (n_seedMM == 4) { \ ksprintf(str, "%d\t%d\t", (200+1+hit->n_gape), (info_seedMM>>12)&0xff); \ } \ if ((alnSeq->itemList+j)->n_cigar){ \ for (k=0; kn_cigar;k++) \ ksprintf(str, "%d%c", hit->cigar[k]&0x3ff, "MIDS"[(hit->cigar[k]>>14)]); \ }else ksprintf(str, "%dM\t", alnSeq->len); \ ksprintf(str, "\t%s\n", hit->md); \ } #if 0 #define BINARY_SOAP() { \ fwrite(&(alnSeq->itemList+j)->id, sizeof(unsigned int), 1, ofp); \ fwrite(&(alnSeq->itemList+j)->chr, sizeof(unsigned int), 1, ofp); \ fwrite(&(alnSeq->itemList+j)->pos, sizeof(unsigned int), 1, ofp); \ fwrite(&(alnSeq->itemList+j)->len, sizeof(unsigned int), 1, ofp); \ int k; \ if (strain) { \ for(k=0; k<(alnSeq->itemList+j)->len; ++k) \ fwrite(); \ } else { \ for(k=0; k<(alnSeq->itemList+j)->len; ++k) \ fwrite(); \ } \ } #endif #include void DumpAln(MULTISEQ *mseqs, OUTAUX *o, OutFileList *ofp,unsigned int *nAln, unsigned int *nSE){ int n, i; n = mseqs->n; unsigned int n_aln = *nAln; unsigned int n_se = *nSE; char **chrName = o->chrName ; ALNSEQ *alnSeq; HITITEM *hit; FILE *ofpAln, *ofpSe, *ofpUn; ofpAln = ofp->ofpAln; ofpSe = ofp->ofpSe; ofpUn = ofp->ofpUn; kstring_t *str = (kstring_t *)calloc(1, sizeof(kstring_t)); for (i=0; iseqList + i; int j = 0; if (alnSeq->report) { unsigned int strain, n_seedMM, n_mm, info_seedMM, len, flag; char *seq = alnSeq->seq; char *rc = alnSeq->rc; flag = alnSeq->flag; len = alnSeq->len; if((alnSeq->flag>>1&0x1) || !(alnSeq->flag&0x1)) { hit = alnSeq->itemList; for(j=0; jreport; ++j) { str->l = 0; strain = hit->strain; n_seedMM = hit->info >> 25 & 0x7; info_seedMM = hit->info & 0xffffff; n_mm = hit->n_mm; // int file =(alnSeq->flag)&1?(i&1):0; SOAPOUT((alnSeq->flag&1)?(i&1):0); 
fprintf(ofpAln, "%s", str->s); ++hit; } ++n_aln; } else { hit = alnSeq->itemList; for(j=0; jreport; ++j) { str->l = 0; strain = hit->strain; n_seedMM = hit->info >> 25 & 0x7; info_seedMM = hit->info & 0xffffff; n_mm = hit->n_mm; SOAPOUT((alnSeq->flag&1)?(i&1):0); fprintf(ofpSe, "%s", str->s); ++hit; } ++n_se; } } else if (o->un) { fprintf(ofpUn, ">%s\n", alnSeq->name); int j=0; for(;jlen;j++) fprintf(ofpUn, "%c", "ACGT"[(int)*(alnSeq->seq+j)]); fprintf(ofpUn, "\n"); } } *nAln = n_aln; *nSE = n_se; free(str->s); free(str); } soap2.20/stdaln.c0000644000105300011350000007146311231522664012606 0ustar yuchangrd/* The MIT License Copyright (c) 2003-2006, 2008, by Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include "stdaln.h" /* char -> 17 (=16+1) nucleotides */ unsigned char aln_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 }; char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; /* char -> 5 (=4+1) nucleotides */ unsigned char aln_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; char *aln_nt4_rev_table = "AGCTN-"; /* char -> 22 (=20+1+1) amino acids */ unsigned char aln_aa_table[256] = { 21,21,21,21, 21,21,21,21, 
21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 }; char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; /* 01234567890123456789012 */ /* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. 
*/ unsigned char aln_trans_table_eu[66] = { 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 }; char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; /* 01234567890123456789012345678901234567890123456789012345678901234 */ int aln_sm_blosum62[] = { /* A R N D C Q E G H I L K M F P S T W Y V * X */ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 }; int aln_sm_blosum45[] = { 
/* A R N D C Q E G H I L K M F P S T W Y V * X */ 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 }; int aln_sm_nt[] = { /* X A G R C M S V T W K D Y H B N */ -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, -2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, -2,-2,-2,-2,-1,-2,-2,-2, 2, 
1, 1, 0, 1, 0, 0, 0, -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int aln_sm_read[] = { /* X A G R C M S V T W K D Y H B N */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int aln_sm_hs[] = { /* A G C T N */ 91, -31,-114,-123, -44, -31, 100,-125,-114, -42, -123,-125, 100, -31, -42, -114,-114, -31, 91, -42, -44, -42, -42, -42, -43 }; int aln_sm_maq[] = { 11, -19, -19, -19, -13, -19, 11, -19, -19, -13, -19, -19, 11, -19, -13, -19, -19, -19, 11, -13, -13, -13, -13, -13, -13 //*/ }; int aln_sm_blast[] = { 1, -3, -3, -3, -2, -3, 1, -3, -3, -2, -3, -3, 1, -3, -2, -3, -3, -3, 1, -2, -2, -2, -2, -2, -2 }; /********************/ /* START OF align.c */ /********************/ AlnParam aln_param_blast = { 5, 2, 5, aln_sm_blast, 5, 50 }; AlnParam aln_param_bwa = { 30, 5, 0, aln_sm_maq, 5, 50 }; AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; 
AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; AlnAln *aln_init_AlnAln() { AlnAln *aa; aa = (AlnAln*)MYALLOC(sizeof(AlnAln)); aa->path = 0; aa->out1 = aa->out2 = aa->outm = 0; aa->path_len = 0; return aa; } void aln_free_AlnAln(AlnAln *aa) { MYFREE(aa->path); MYFREE(aa->cigar); MYFREE(aa->out1); MYFREE(aa->out2); MYFREE(aa->outm); MYFREE(aa); } /***************************/ /* START OF common_align.c */ /***************************/ #define LOCAL_OVERFLOW_THRESHOLD 32000 #define LOCAL_OVERFLOW_REDUCE 16000 #define NT_LOCAL_SCORE int #define NT_LOCAL_SHIFT 16 #define NT_LOCAL_MASK 0xffff #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; #define set_M(MM, cur, p, sc) \ { \ if ((p)->M >= (p)->I) { \ if ((p)->M >= (p)->D) { \ (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ } else { \ (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ } \ } else { \ if ((p)->I > (p)->D) { \ (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ } else { \ (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ } \ } \ } #define set_I(II, cur, p) \ { \ if ((p)->M - gap_open > (p)->I) { \ (cur)->It = FROM_M; \ (II) = (p)->M - gap_open - gap_ext; \ } else { \ (cur)->It = FROM_I; \ (II) = (p)->I - gap_ext; \ } \ } #define set_end_I(II, cur, p) \ { \ if (gap_end >= 0) { \ if ((p)->M - gap_open > (p)->I) { \ (cur)->It = FROM_M; \ (II) = (p)->M - gap_open - gap_end; \ } else { \ (cur)->It = FROM_I; \ (II) = (p)->I - gap_end; \ } \ } else set_I(II, cur, p); \ } #define set_D(DD, cur, p) \ { \ if ((p)->M - gap_open > (p)->D) { \ (cur)->Dt = FROM_M; \ (DD) = (p)->M - gap_open - gap_ext; \ } else { \ (cur)->Dt = FROM_D; \ (DD) = (p)->D - gap_ext; \ } \ } #define set_end_D(DD, cur, p) \ { \ if (gap_end >= 0) { \ if ((p)->M - gap_open > (p)->D) { \ (cur)->Dt = FROM_M; \ (DD) = (p)->M - gap_open - gap_end; \ } else { \ (cur)->Dt = FROM_D; \ (DD) = (p)->D - gap_end; \ } \ } else set_D(DD, cur, p); \ } typedef struct { unsigned char Mt:3, It:2, Dt:2; 
} dpcell_t; typedef struct { int M, I, D; } dpscore_t; /* build score profile for accelerating alignment, in theory */ void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) { int *tmp, *tmp2, i, k; i = k = 0; for (i = 0; i != row; ++i) { tmp = score_matrix + i * row; tmp2 = s_array[i]; for (k = 0; k != len; ++k) tmp2[k] = tmp[seq[k]]; } } /*************************** * banded global alignment * ***************************/ int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, path_t *path, int *path_len) { register int i, j; i = j = 0; dpcell_t **dpcell, *q; dpscore_t *curr, *last, *s; path_t *p; int b1, b2, tmp_end; int *mat, end, max; unsigned char type, ctype; int gap_open, gap_ext, gap_end, b; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; gap_end = ap->gap_end; b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; if (len1 == 0 || len2 == 0) { *path_len = 0; return 0; } /* calculate b1 and b2 */ if (len1 > len2) { b1 = len1 - len2 + b; b2 = b; } else { b1 = b; b2 = len2 - len1 + b; } if (b1 > len1) b1 = len1; if (b2 > len2) b2 = len2; --seq1; --seq2; /* allocate memory */ end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); dpcell = (dpcell_t**)MYALLOC(sizeof(dpcell_t*) * (len2 + 1)); for (j = 0; j <= len2; ++j) dpcell[j] = (dpcell_t*)MYALLOC(sizeof(dpcell_t) * end); for (j = b2 + 1; j <= len2; ++j) dpcell[j] -= j - b2; curr = (dpscore_t*)MYALLOC(sizeof(dpscore_t) * (len1 + 1)); last = (dpscore_t*)MYALLOC(sizeof(dpscore_t) * (len1 + 1)); /* set first row */ SET_INF(*curr); curr->M = 0; for (i = 1, s = curr + 1; i < b1; ++i, ++s) { SET_INF(*s); set_end_D(s->D, dpcell[0] + i, s - 1); } s = curr; curr = last; last = s; /* core dynamic programming, part 1 */ tmp_end = (b2 < len2)? 
b2 : len2 - 1; for (j = 1; j <= tmp_end; ++j) { q = dpcell[j]; s = curr; SET_INF(*s); set_end_I(s->I, q, last); end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; mat = score_matrix + seq2[j] * N_MATRIX_ROW; ++s; ++q; for (i = 1; i != end; ++i, ++s, ++q) { set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ set_I(s->I, q, last + i); set_D(s->D, q, s - 1); } set_M(s->M, q, last + i - 1, mat[seq1[i]]); set_D(s->D, q, s - 1); if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ set_end_I(s->I, q, last + i); } else s->I = MINOR_INF; s = curr; curr = last; last = s; } /* last row for part 1, use set_end_D() instead of set_D() */ if (j == len2 && b2 != len2 - 1) { q = dpcell[j]; s = curr; SET_INF(*s); set_end_I(s->I, q, last); end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; mat = score_matrix + seq2[j] * N_MATRIX_ROW; ++s; ++q; for (i = 1; i != end; ++i, ++s, ++q) { set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ set_I(s->I, q, last + i); set_end_D(s->D, q, s - 1); } set_M(s->M, q, last + i - 1, mat[seq1[i]]); set_end_D(s->D, q, s - 1); if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ set_end_I(s->I, q, last + i); } else s->I = MINOR_INF; s = curr; curr = last; last = s; ++j; } /* core dynamic programming, part 2 */ for (; j <= len2 - b2 + 1; ++j) { SET_INF(curr[j - b2]); mat = score_matrix + seq2[j] * N_MATRIX_ROW; end = j + b1 - 1; for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { set_M(s->M, q, last + i - 1, mat[seq1[i]]); set_I(s->I, q, last + i); set_D(s->D, q, s - 1); } set_M(s->M, q, last + i - 1, mat[seq1[i]]); set_D(s->D, q, s - 1); s->I = MINOR_INF; s = curr; curr = last; last = s; } /* core dynamic programming, part 3 */ for (; j < len2; ++j) { SET_INF(curr[j - b2]); mat = score_matrix + seq2[j] * N_MATRIX_ROW; for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { set_M(s->M, q, last + i - 1, mat[seq1[i]]); set_I(s->I, q, last + i); set_D(s->D, q, s - 1); } 
set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); set_end_I(s->I, q, last + i); set_D(s->D, q, s - 1); s = curr; curr = last; last = s; } /* last row */ if (j == len2) { SET_INF(curr[j - b2]); mat = score_matrix + seq2[j] * N_MATRIX_ROW; for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { set_M(s->M, q, last + i - 1, mat[seq1[i]]); set_I(s->I, q, last + i); set_end_D(s->D, q, s - 1); } set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); set_end_I(s->I, q, last + i); set_end_D(s->D, q, s - 1); s = curr; curr = last; last = s; } /* backtrace */ i = len1; j = len2; q = dpcell[j] + i; s = last + len1; max = s->M; type = q->Mt; ctype = FROM_M; if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } p = path; p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ ++p; do { switch (ctype) { case FROM_M: --i; --j; break; case FROM_I: --j; break; case FROM_D: --i; break; } q = dpcell[j] + i; ctype = type; switch (type) { case FROM_M: type = q->Mt; break; case FROM_I: type = q->It; break; case FROM_D: type = q->Dt; break; } p->ctype = ctype; p->i = i; p->j = j; ++p; } while (i || j); *path_len = p - path - 1; /* free memory */ for (j = b2 + 1; j <= len2; ++j) dpcell[j] += j - b2; for (j = 0; j <= len2; ++j) MYFREE(dpcell[j]); MYFREE(dpcell); MYFREE(curr); MYFREE(last); return max; } /************************************************* * local alignment combined with banded strategy * *************************************************/ int aln_local_core(unsigned char *seq1,const int len1, unsigned char *seq2,const int len2, const AlnParam *ap, path_t *path, int *path_len, int do_align) { register NT_LOCAL_SCORE *s; register int i; int q, r, qr, tmp_len, qr_shift; int **s_array, *score_array; int e, f; int is_overflow, of_base; NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; int j, start_i, start_j, end_i, end_j; path_t *p; int score_f, score_r, score_g; int start, end, max_score; int 
gap_open, gap_ext, b; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; if (len1 == 0 || len2 == 0) return -1; /* allocate memory */ eh = (NT_LOCAL_SCORE*)MYALLOC(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); s_array = (int**)MYALLOC(sizeof(int*) * N_MATRIX_ROW); for (i = 0; i != N_MATRIX_ROW; ++i) s_array[i] = (int*)MYALLOC(sizeof(int) * len1); /* initialization */ aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); q = gap_open; r = gap_ext; qr = q + r; qr_shift = (qr+1) << NT_LOCAL_SHIFT; tmp_len = len1 + 1; start_i = start_j = end_i = end_j = 0; for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) if (max_score < score_matrix[i]) max_score = score_matrix[i]; /* convert the coordinate */ --seq1; --seq2; for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; /* forward dynamic programming */ for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; score_f = 0; is_overflow = of_base = 0; for (j = 1; j <= len2; ++j) { last_h = f = 0; score_array = s_array[seq2[j]]; if (is_overflow) { /* adjust eh[] array if overflow occurs. */ /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. * If it is too large, this block will be excuted frequently and therefore * slow down the whole program. * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the * number of assignments because it sets some cells to zero when overflow * happens. 
*/ int tmp, tmp2; score_f -= LOCAL_OVERFLOW_REDUCE; of_base += LOCAL_OVERFLOW_REDUCE; is_overflow = 0; for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; else tmp2 -= LOCAL_OVERFLOW_REDUCE; if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; else tmp -= LOCAL_OVERFLOW_REDUCE; *s = (tmp << NT_LOCAL_SHIFT) | tmp2; } } for (i = 1, s = eh; i != tmp_len; ++i, ++s) { /* prepare for calculate current h */ curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; if (curr_h < 0) curr_h = 0; if (last_h > qr) { /* initialize f */ f = (f > last_h - q)? f - r : last_h - qr; if (curr_h < f) curr_h = f; } if (*(s+1) >= qr_shift) { /* initialize e */ curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; if (curr_h < e) curr_h = e; *s = (last_h << NT_LOCAL_SHIFT) | e; } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ last_h = curr_h; if (score_f < curr_h) { score_f = curr_h; end_i = i; end_j = j; if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; } } *s = last_h << NT_LOCAL_SHIFT; } score_f += of_base; if (path == 0) goto end_func; /* skip path-filling */ /* reverse dynamic programming */ for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; is_overflow = of_base = 0; start_i = end_i; start_j = end_j; eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ start = end_i - 1; end = end_i - 3; if (end <= 0) end = 0; /* second pass DP can be done in a band, speed will thus be enhanced */ for (j = end_j - 1; j != 0; --j) { last_h = f = 0; score_array = s_array[seq2[j]]; if (is_overflow) { /* adjust eh[] array if overflow occurs. 
*/ int tmp, tmp2; score_r -= LOCAL_OVERFLOW_REDUCE; of_base += LOCAL_OVERFLOW_REDUCE; is_overflow = 0; for (i = start, s = eh + start + 1; i >= end; --i, --s) { tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; else tmp2 -= LOCAL_OVERFLOW_REDUCE; if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; else tmp -= LOCAL_OVERFLOW_REDUCE; *s = (tmp << NT_LOCAL_SHIFT) | tmp2; } } for (i = start, s = eh + start + 1; i != end; --i, --s) { /* prepare for calculate current h */ curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; if (curr_h < 0) curr_h = 0; if (last_h > qr) { /* initialize f */ f = (f > last_h - q)? f - r : last_h - qr; if (curr_h < f) curr_h = f; } if (*(s-1) >= qr_shift) { /* initialize e */ curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; if (curr_h < e) curr_h = e; *s = (last_h << NT_LOCAL_SHIFT) | e; } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ last_h = curr_h; if (score_r < curr_h) { score_r = curr_h; start_i = i; start_j = j; if (score_r + of_base - qr == score_f) { j = 1; break; } if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; } } *s = last_h << NT_LOCAL_SHIFT; /* recalculate start and end, the boundaries of the band */ if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; if (start <= 0) start = 0; end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; if (end <= 0) end = 0; } if (path_len == 0) { path[0].i = start_i; path[0].j = start_j; path[1].i = end_i; path[1].j = end_j; goto end_func; } score_r += of_base; score_r -= qr; #ifdef DEBUG /* this seems not a bug */ if (score_f != score_r) fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); #endif if (do_align) { /* call global alignment to fill the path */ score_g = 0; j = (end_i - start_i > end_j - start_j)? 
end_i - start_i : end_j - start_j; ++j; /* j is the maximum band_width */ for (i = ap->band_width;; i <<= 1) { AlnParam ap_real = *ap; ap_real.gap_end = -1; ap_real.band_width = i; score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, end_j - start_j + 1, &ap_real, path, path_len); if (score_g == score_r || score_f == score_g) break; if (i > j) break; } #ifdef DEBUG if (score_r > score_g && score_f > score_g) fprintf(stderr, "[aln_local_core] Cannot find reasonable band width. Continue anyway.\n"); #endif score_f = score_g; /* convert coordinate */ for (p = path + *path_len - 1; p >= path; --p) { p->i += start_i - 1; p->j += start_j - 1; } } else { /* just store the start and end */ *path_len = 2; path[1].i = start_i; path[1].j = start_j; path->i = end_i; path->j = end_j; } end_func: /* free */ MYFREE(eh); for (i = 0; i != N_MATRIX_ROW; ++i) { ++s_array[i]; MYFREE(s_array[i]); } MYFREE(s_array); return score_f; } AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, int is_global, int do_align, int len1, int len2) { unsigned char *seq11, *seq22; int score; int i, j, l; path_t *p; char *out1, *out2, *outm; AlnAln *aa; if (len1 < 0) len1 = strlen(seq1); if (len2 < 0) len2 = strlen(seq2); aa = aln_init_AlnAln(); seq11 = (unsigned char*)MYALLOC(sizeof(unsigned char) * len1); seq22 = (unsigned char*)MYALLOC(sizeof(unsigned char) * len2); aa->path = (path_t*)MYALLOC(sizeof(path_t) * (len1 + len2 + 1)); if (ap->row < 10) { /* 4-nucleotide alignment */ for (i = 0; i < len1; ++i) seq11[i] = aln_nt4_table[(int)seq1[i]]; for (j = 0; j < len2; ++j) seq22[j] = aln_nt4_table[(int)seq2[j]]; } else if (ap->row < 20) { /* 16-nucleotide alignment */ for (i = 0; i < len1; ++i) seq11[i] = aln_nt16_table[(int)seq1[i]]; for (j = 0; j < len2; ++j) seq22[j] = aln_nt16_table[(int)seq2[j]]; } else { /* amino acids */ for (i = 0; i < len1; ++i) seq11[i] = aln_aa_table[(int)seq1[i]]; for (j = 0; j < len2; ++j) seq22[j] = 
aln_aa_table[(int)seq2[j]]; } if (is_global) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); else score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, do_align); aa->score = score; if (do_align) { out1 = aa->out1 = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1)); out2 = aa->out2 = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1)); outm = aa->outm = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1)); --seq1; --seq2; --seq11; --seq22; p = aa->path + aa->path_len - 1; for (l = 0; p >= aa->path; --p, ++l) { switch (p->ctype) { case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; break; case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; } } out1[l] = out2[l] = outm[l] = '\0'; ++seq11; ++seq22; } MYFREE(seq11); MYFREE(seq22); p = aa->path + aa->path_len - 1; aa->start1 = p->i? p->i : 1; aa->end1 = aa->path->i; aa->start2 = p->j? 
p->j : 1; aa->end2 = aa->path->j; aa->cigar = aln_path2cigar(aa->path, aa->path_len, &aa->n_cigar); return aa; } AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int is_global, int do_align) { return aln_stdaln_aux(seq1, seq2, ap, is_global, do_align, -1, -1); } cigar_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) { int i, n; cigar_t *cigar; unsigned char last_type; if (path_len == 0 || path == 0) { *n_cigar = 0; return 0; } last_type = path->ctype; for (i = n = 1; i < path_len; ++i) { if (last_type != path[i].ctype) ++n; last_type = path[i].ctype; } *n_cigar = n; cigar = (cigar_t*)MYALLOC(*n_cigar * sizeof(cigar_t)); cigar[0] = (int)path[path_len-1].ctype << 14 | 1; last_type = path[path_len-1].ctype; for (i = path_len - 2, n = 0; i >= 0; --i) { if (path[i].ctype == last_type) ++cigar[n]; else { cigar[++n] = (int)path[i].ctype << 14 | 1; last_type = path[i].ctype; } } return cigar; } soap2.20/TextConverter.c0000644000105300011350000006357611164534250014142 0ustar yuchangrd/* TextConverter.c Text Converter This module contains miscellaneous text conversion functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include #include #include "TextConverter.h" #include "MiscUtilities.h" #include "r250.h" unsigned int GetWordPackedText(const unsigned int *packedText, const unsigned int index, const unsigned int shift, const unsigned int numberOfBit, const unsigned int vacantBit) { unsigned int text; const static unsigned int mask[32] = { 0x00000000, 0x80000000, 0xC0000000, 0xE0000000, 0xF0000000, 0xF8000000, 0xFC000000, 0xFE000000, 0xFF000000, 0xFF800000, 0xFFC00000, 0xFFE00000, 0xFFF00000, 0xFFF80000, 0xFFFC0000, 0xFFFE0000, 0xFFFF0000, 0xFFFF8000, 0xFFFFC000, 0xFFFFE000, 0xFFFFF000, 0xFFFFF800, 0xFFFFFC00, 0xFFFFFE00, 0xFFFFFF00, 0xFFFFFF80, 0xFFFFFFC0, 0xFFFFFFE0, 0xFFFFFFF0, 0xFFFFFFF8, 0xFFFFFFFC, 0xFFFFFFFE }; if (shift > 0) { // packedText should be allocated with at least 1 Word buffer initialized to zero #ifdef DNA_ONLY text = (packedText[index] << shift) | (packedText[index + 1] >> (BITS_IN_WORD - shift)); #else text = (packedText[index] << shift) | (packedText[index + 1] >> (BITS_IN_WORD - shift) << vacantBit); #endif } else { text = packedText[index]; } if (numberOfBit < BITS_IN_WORD) { // Fill unused bit with zero text &= mask[numberOfBit]; } return text; } unsigned int ReadCharMap(unsigned char *charMap, const char *inputFileName, const unsigned char defaultMapping) { FILE *inputFile; char c; unsigned int v, alphabetSize; inputFile = (FILE*)fopen64(inputFileName, "r"); if (inputFile == NULL) { fprintf(stderr, "ReadCharMap() : Cannot open character map!\n"); exit(1); } for (v=0; v CHAR_MAP_SIZE) { fprintf(stderr, "ReadCharMap() : Invalid charMap!\n"); return 0; } charMap[(unsigned int)c] = (unsigned char)v; if (v > alphabetSize) { alphabetSize = v; } } fclose(inputFile); alphabetSize++; return alphabetSize; } void GenerateReverseCharMap(const unsigned char *charMap, unsigned char *reverseCharMap) { unsigned int i, j; for (i=0; i BITS_IN_BYTE) { fprintf(stderr, "BitPerBytePackedChar() : bitPerChar > BITS_IN_BYTE!\n"); exit(1); } #endif // Return the 
largest number of bit that does not affect packing efficiency if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) { bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); } return bitPerChar; } unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength) { if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); exit(1); } return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; } unsigned int BytePackedLengthFromText(unsigned int textLength, unsigned int bitPerChar) { return (textLength + (BITS_IN_BYTE / bitPerChar) - 1) / (BITS_IN_BYTE / bitPerChar); } unsigned char LastByteLength(unsigned int textLength, unsigned int bitPerChar) { return (unsigned char)(textLength % (BITS_IN_BYTE / bitPerChar)); } void ConvertTextToWordPacked(const unsigned char *input, unsigned int *output, const unsigned char *charMap, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int bitPerChar, charPerWord; unsigned int i, j, k; unsigned int c; unsigned int charValue; bitPerChar = BitPerWordPackedChar(alphabetSize); charPerWord = BITS_IN_WORD / bitPerChar; for (i=0; i= alphabetSize) { charValue = 0; } c = c | (charValue << (BITS_IN_WORD - (k+1) * bitPerChar)); } output[i] = c; } if (i * charPerWord < textLength) { c = 0; j = i * charPerWord; for (k=0; j+k < textLength; k++) { charValue = charMap[input[j+k]]; if (charValue >= alphabetSize) { charValue = 0; } c = c | (charValue << (BITS_IN_WORD - (k+1) * bitPerChar)); } output[i] = c; } } void ConvertTextToBytePacked(const unsigned char *input, unsigned char *output, const unsigned char *charMap, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int bitPerChar, charPerByte; unsigned int i, j, k; unsigned char c; bitPerChar = BitPerBytePackedChar(alphabetSize); //2 charPerByte = BITS_IN_BYTE / bitPerChar; //4 for (i=0; 
i> (BITS_IN_WORD - bitPerChar)]; c <<= bitPerChar; } } if (i * charPerWord < textLength) { c = input[i]; j = i * charPerWord; for (k=0; j+k> (BITS_IN_WORD - bitPerChar)]; c <<= bitPerChar; } } } void ConvertBytePackedToText(const unsigned char *input, unsigned char *output, const unsigned char *reverseCharMap, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int bitPerChar, charPerByte; unsigned int i, j, k; unsigned char c; bitPerChar = BitPerBytePackedChar(alphabetSize); charPerByte = BITS_IN_BYTE / bitPerChar; for (i=0; i> (BITS_IN_BYTE - bitPerChar)]; c <<= bitPerChar; } } if (i * charPerByte < textLength) { c = input[i]; j = i * charPerByte; for (k=0; j+k> (BITS_IN_BYTE - bitPerChar)]; c <<= bitPerChar; } } } void ConvertBytePackedToCode(const unsigned char *input, unsigned char *output, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int bitPerChar, charPerByte; unsigned int i, j, k; unsigned char c; bitPerChar = BitPerBytePackedChar(alphabetSize); charPerByte = BITS_IN_BYTE / bitPerChar; for (i=0; i> (unsigned char)(BITS_IN_BYTE - bitPerChar); c <<= bitPerChar; } } if (i * charPerByte < textLength) { c = input[i]; j = i * charPerByte; for (k=0; j+k> (unsigned char)(BITS_IN_BYTE - bitPerChar); c <<= bitPerChar; } } } void ConvertWordPackedToBytePacked(const unsigned int *input, unsigned char *output, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int i, j, k; unsigned int c; unsigned int bitPerBytePackedChar; unsigned int bitPerWordPackedChar; unsigned int charPerWord; unsigned int charPerByte; unsigned int bytePerIteration; unsigned int byteProcessed = 0; unsigned int wordProcessed = 0; unsigned int mask, shift; unsigned int buffer[BITS_IN_WORD]; bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); charPerWord = BITS_IN_WORD / bitPerBytePackedChar; charPerByte = BITS_IN_BYTE / bitPerWordPackedChar; bytePerIteration 
= charPerWord / charPerByte; mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); shift = BITS_IN_WORD - bitPerWordPackedChar; while ((wordProcessed + 1) * charPerWord < textLength) { c = input[wordProcessed]; for (i=0; i> shift; c <<= bitPerWordPackedChar; } wordProcessed++; k = 0; for (i=0; i> shift; c <<= bitPerWordPackedChar; } k = 0; while (byteProcessed * charPerByte < textLength) { c = 0; for (j=0; j < textLength - wordProcessed * charPerWord; j++) { c |= buffer[k] << (BITS_IN_BYTE - (j+1) * bitPerBytePackedChar); k++; } output[byteProcessed] = (unsigned char)c; byteProcessed++; } } void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int i, j, k; unsigned int c; unsigned int bitPerBytePackedChar; unsigned int bitPerWordPackedChar; unsigned int charPerWord; unsigned int charPerByte; unsigned int bytePerIteration; unsigned int byteProcessed = 0; unsigned int wordProcessed = 0; unsigned int mask, shift; unsigned int buffer[BITS_IN_WORD]; bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; charPerWord = BITS_IN_WORD / bitPerWordPackedChar; bytePerIteration = charPerWord / charPerByte; mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; while ((wordProcessed + 1) * charPerWord < textLength) { k = 0; for (i=0; i> bitPerWordPackedChar * i; } output[wordProcessed] = c; wordProcessed++; } k = 0; for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { c = (unsigned int)input[byteProcessed] << shift; for (j=0; j> bitPerWordPackedChar * i; } output[wordProcessed] = c; } void ConvertTextToCode(const unsigned char *input, unsigned char *output, const unsigned char *charMap, const unsigned int 
textLength) { unsigned int i; for (i=0; i< textLength; i++) { output[i] = charMap[input[i]]; } } void ConvertCodeToText(const unsigned char *input, unsigned char *output, const unsigned char *reverseCharMap, const unsigned int textLength) { unsigned int i; for (i=0; i< textLength; i++) { output[i] = reverseCharMap[input[i]]; } } void PackTextWithAllShift(const unsigned char *input, unsigned int **output, const unsigned char *charMap, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int bitPerChar; unsigned int numberOfShift; unsigned int numberOfWord; unsigned int shift; unsigned int i, j; bitPerChar = BitPerWordPackedChar(alphabetSize); numberOfShift = BITS_IN_WORD / bitPerChar; numberOfWord = WordPackedLengthFromText(textLength, bitPerChar); ConvertTextToWordPacked(input, output[0], charMap, alphabetSize, textLength); for (i=1; i> shift; for (j=1; j<=numberOfWord; j++) { output[i][j] = (output[0][j] >> shift) | (output[0][j-1] << (BITS_IN_WORD - shift)); } } } unsigned int ReadTextAsWordPacked(const char *inputFileName, const unsigned char *charMap, const unsigned int alphabetSize, unsigned int *targetAddress, const unsigned int maxTextLength) { FILE *inputFile; unsigned char *buffer; unsigned int charPerWord; unsigned int charRead; unsigned int charProcessed = 0, wordProcessed = 0; unsigned int charPerBuffer; inputFile = (FILE*)fopen64(inputFileName, "rb"); if (inputFile == NULL) { fprintf(stderr, "ReadTextAsWordPacked() : Cannot open inputFileName!\n"); exit(1); } charPerWord = BITS_IN_WORD / BitPerWordPackedChar(alphabetSize); charPerBuffer = PACKED_BUFFER_SIZE / charPerWord * charPerWord; buffer = MMUnitAllocate(charPerBuffer); charRead = (unsigned int)fread(buffer, 1, charPerBuffer, inputFile); while (charRead > 0 && charProcessed + charRead < maxTextLength) { ConvertTextToWordPacked(buffer, targetAddress + wordProcessed, charMap, alphabetSize, charRead); wordProcessed += charRead / charPerWord; charProcessed += charRead; charRead = 
(unsigned int)fread(buffer, 1, charPerBuffer, inputFile); } if (charRead > 0 && charProcessed < maxTextLength) { ConvertTextToWordPacked(buffer, targetAddress + wordProcessed, charMap, alphabetSize, min(charRead, maxTextLength - charProcessed)); charProcessed += charRead; } MMUnitFree(buffer, charPerBuffer); fclose(inputFile); return charProcessed; } unsigned int ReadBytePackedAsWordPacked(const char *inputFileName, const unsigned int alphabetSize, unsigned int *targetAddress, const unsigned int maxTextLength) { FILE *inputFile; unsigned char *buffer1, *buffer2; unsigned int charPerByte, charPerWord; unsigned int charPerBuffer, wordPerBuffer; unsigned int charProcessed = 0, wordProcessed = 0; unsigned int byteRead, tempByteRead; unsigned int charInLastBuffer; unsigned int bufferSize; inputFile = (FILE*)fopen64(inputFileName, "rb"); if (inputFile == NULL) { fprintf(stderr, "ReadBytePackedAsWordPacked() : Cannot open inputFileName!\n"); exit(1); } charPerByte = BITS_IN_BYTE / BitPerBytePackedChar(alphabetSize); charPerWord = BITS_IN_WORD / BitPerWordPackedChar(alphabetSize); bufferSize = PACKED_BUFFER_SIZE / charPerByte / charPerWord * charPerByte * charPerWord; charPerBuffer = bufferSize * charPerByte; wordPerBuffer = charPerBuffer / charPerWord; buffer1 = MMUnitAllocate(bufferSize); buffer2 = MMUnitAllocate(bufferSize); byteRead = (unsigned int)fread(buffer1, 1, bufferSize, inputFile); tempByteRead = (unsigned int)fread(buffer2, 1, bufferSize, inputFile); while (tempByteRead > 1 && charProcessed + charPerBuffer < maxTextLength) { ConvertBytePackedToWordPacked(buffer1, targetAddress + wordProcessed, alphabetSize, charPerBuffer); charProcessed += charPerBuffer; wordProcessed += wordPerBuffer; memcpy(buffer1, buffer2, bufferSize); byteRead = tempByteRead; tempByteRead = (unsigned int)fread(buffer2, 1, bufferSize, inputFile); } if (tempByteRead > 1) { ConvertBytePackedToWordPacked(buffer1, targetAddress + wordProcessed, alphabetSize, maxTextLength - charProcessed); 
charProcessed += charPerBuffer; } else { if (tempByteRead == 1) { charInLastBuffer = charPerBuffer - charPerByte + buffer2[0]; } else { charInLastBuffer = (byteRead - 2) * charPerByte + buffer1[byteRead - 1]; } ConvertBytePackedToWordPacked(buffer1, targetAddress + wordProcessed, alphabetSize, min(maxTextLength - charProcessed, charInLastBuffer)); charProcessed += charInLastBuffer; } MMUnitFree(buffer1, bufferSize); MMUnitFree(buffer2, bufferSize); fclose(inputFile); return charProcessed; } /* void *DNALoadPacked_bit64(const char *inputFileName, unsigned int *textLength){ FILE *inputFile; unsigned char tempChar[8]; unsigned long long *packedText; unsigned int packedFileLen; unsigned char lastByteLength; unsigned int wordToProcess; unsigned int i; inputFile = (FILE*)(FILE*)fopen64(inputFileName, "rb"); if (inputFile == NULL) { fprintf(stderr, "DNALoadPacked() : Cannot open inputFileName!\n"); exit(1); } fseek(inputFile, -1, SEEK_END); packedFileLen = ftell(inputFile); if ((int)packedFileLen < 0) { fprintf(stderr, "DNALoadPacked(): Cannot determine file length!\n"); exit(1); } fread(&lastByteLength, sizeof(unsigned char), 1, inputFile); *textLength = (packedFileLen - 1) * 4 + lastByteLength; wordToProcess = (*textLength + 32 - 1) / 32; packedText = malloc((wordToProcess + 1) * sizeof(unsigned long long)); // allocate 1 more word at end packedText[wordToProcess - 1] = 0; packedText[wordToProcess] = 0; fseek(inputFile, 0, SEEK_SET); fread(packedText, 1, packedFileLen, inputFile); fclose(inputFile); if (convertToWordPacked) { for (i=0; i packedFileLenForThisLoad) { fseek(packedFile, -((int)packedLengthPerLoad), SEEK_CUR); } *textLength = len; *textLengthForThisLoad = TextLengthFromBytePacked(packedFileLenForThisLoad, bitPerChar, lastByteLength); return packedFile; } void LoadPackedIncFromEnd(FILE *packedFile, unsigned char *packedOutput, const unsigned int packedLengthPerLoad) { fread(packedOutput, sizeof(unsigned char), packedLengthPerLoad, packedFile); 
fseek(packedFile, -(2*(int)packedLengthPerLoad), SEEK_CUR); } FILE *InitialLoadTextIncFromEnd(const char* inputFileName, unsigned char *textOutput, const unsigned int textLengthPerLoad, unsigned int *textLength, unsigned int *textLengthForThisLoad) { FILE *textFile; unsigned int len, textLenForThisLoad; textFile = (FILE*)fopen64(inputFileName, "rb"); if (textFile == NULL) { fprintf(stderr, "InitialLoadTextIncFromEnd() : Cannot open inputFileName!\n"); exit(1); } fseek(textFile, 0, SEEK_END); len = ftell(textFile); if ((int)len < 0) { fprintf(stderr, "InitialLoadTextIncFromEnd(): Cannot determine file length!\n"); exit(1); } textLenForThisLoad = len % textLengthPerLoad; if (textLenForThisLoad > 0) { fseek(textFile, -((int)textLenForThisLoad), SEEK_END); fread(textOutput, sizeof(unsigned char), textLenForThisLoad, textFile); fseek(textFile, -((int)textLenForThisLoad), SEEK_END); } *textLength = len; *textLengthForThisLoad = textLenForThisLoad; return textFile; } void LoadTextIncFromEnd(FILE *textFile, unsigned char *textOutput, const unsigned int textLengthPerLoad) { if (ftell(textFile) < (int)textLengthPerLoad) { fprintf(stderr, "LoadTextIncFromEnd(): file pointer is not correctly placed!\n"); exit(1); } fseek(textFile, -((int)textLengthPerLoad), SEEK_CUR); fread(textOutput, sizeof(unsigned char), textLengthPerLoad, textFile); fseek(textFile, -((int)textLengthPerLoad), SEEK_CUR); } soap2.20/Timing.c0000644000105300011350000000730211231711145012531 0ustar yuchangrd/* Timing.c Measuring Program running time This module contains functions for measuring program running time. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #ifdef RUSAGE #include #else #ifdef TIME_BY_CLOCK #include #else #include #endif #endif #include "Timing.h" void asciiTime(const char *c){ time_t now = time(NULL); fprintf(stderr, "\n%s\n%s", c, asctime(localtime(&now))); } double setStartTime() { #ifdef RUSAGE double usertime, systime; struct rusage usage; getrusage(RUSAGE_SELF, &usage); usertime = (double)usage.ru_utime.tv_sec + (double)usage.ru_utime.tv_usec / 1000000.0; systime = (double)usage.ru_stime.tv_sec + (double)usage.ru_stime.tv_usec / 1000000.0; return(usertime + systime); #else #ifdef TIME_BY_CLOCK return (double)clock() / (double)CLOCKS_PER_SEC; #else struct timeval tp; gettimeofday(&tp, NULL); return (double)tp.tv_sec + (double)tp.tv_usec / (double)1000000; #endif #endif } double getElapsedTime(double startTime) { #ifdef RUSAGE double usertime, systime; struct rusage usage; getrusage(RUSAGE_SELF, &usage); usertime = (double)usage.ru_utime.tv_sec + (double)usage.ru_utime.tv_usec / 1000000.0; systime = (double)usage.ru_stime.tv_sec + (double)usage.ru_stime.tv_usec / 1000000.0; return (usertime + systime) - startTime; #else #ifdef TIME_BY_CLOCK return (double)clock() / (double)CLOCKS_PER_SEC - startTime; #else struct timeval tp; gettimeofday(&tp, NULL); return (double)tp.tv_sec + (double)tp.tv_usec / (double)1000000 - startTime; #endif #endif } void printElapsedTime(FILE *file, const int printHour, const int printMin, const int printSec, const int secNumberOfDecimal, const double seconds) { printElapsedTimeNoNewLine(file, printHour, 
printMin, printSec, 0, secNumberOfDecimal, seconds); fprintf(file, "\n"); } void printElapsedTimeNoNewLine(FILE *file, const int printHour, const int printMin, const int printSec, const int secMinPrintLength, const int secNumberOfDecimal, const double seconds) { int hour, min; double sec; char secondDisplay[8] = "%0.0f s"; #ifdef DEBUG if (printHour && !printMin && printSec) { fprintf(stderr, "printElapsedTime(): Cannot skip minute only!\n"); exit(1); } if (secNumberOfDecimal > 9) { fprintf(stderr, "printElapsedTime(): secNumberOfDecimal > 9!\n"); exit(1); } #endif secondDisplay[1] = secondDisplay[1] + (char)secMinPrintLength; secondDisplay[3] = secondDisplay[3] + (char)secNumberOfDecimal; sec = seconds; min = (int)(seconds / 60); if (!printSec && printMin) { if (seconds - min * 60 >= 30) { min++; } } if (printMin) { sec -= min * 60; } hour = min / 60; if (!printMin) { min = hour * 60; if (min >= 30) { hour++; } } if (printHour) { min -= hour * 60; } if (printHour) { fprintf(file, "%d h ", hour); } if (printMin) { fprintf(file, "%d m ", min); } if (printSec) { fprintf(file, secondDisplay, sec); } } soap2.20/BWTAln.h0000644000105300011350000000454011164534250012404 0ustar yuchangrd/* * ============================================================================= * * Filename: BWTAln.h * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #ifndef _BWTALN_H__INC #define _BWTALN_H__INC #include "BWT.h" #include "extratools.h" #include "HSP.h" unsigned int REVBWTForwardSearch(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *rev_bwt,unsigned int *resultSaIndexLeft, unsigned int *resultSaIndexRight,unsigned int *rev_resultSaIndexLeft, unsigned int *rev_resultSaIndexRight); unsigned int REVBWTContForwardSearch(const unsigned char 
*convertedKey, const unsigned int start, const unsigned int len,const BWT *rev_bwt,unsigned int *saL, unsigned int *saR,unsigned int *rev_saL, unsigned int *rev_saR); unsigned int BWTContBackwardSearch(const unsigned char *convertedKey, const unsigned int start, const unsigned int len, const BWT *bwt, unsigned int *saL, unsigned int *saR); unsigned int BWTBackward1Error(const unsigned char *querypattern, const BWTOPT *bo, BWT *bwt, unsigned int start, unsigned int len, unsigned int pl, unsigned int pr, unsigned int info, HITTABLE *hits); unsigned int REVBWTForward1Error(const unsigned char *queryPattern,const BWTOPT *bo, BWT *bwt, BWT * rev_bwt, unsigned int start,unsigned int len, unsigned int pl,unsigned int pr, unsigned int rev_pl,unsigned int rev_pr, unsigned int info, HITTABLE *hits); int BWTExactMatching(const unsigned char *convertedKey, const BWTOPT *bo, const int chain, BWT *bwt, LOOKUPTABLE *lookup, HITTABLE *hits); int BWT1ErrorMatching(const unsigned char *convertedKey, const BWTOPT *bo, const int chain, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HITTABLE *hits); int BWT2ErrorMatching(const unsigned char *convertedKey, const BWTOPT *bo, const int chain, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HITTABLE *hits); int BWTGapMatching(const unsigned char *convertedKey, const BWTOPT *bo, const int chain, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HITTABLE *hits); #endif /* ----- #ifndef _BWTALN_H__INC ----- */ soap2.20/BWT.h0000644000105300011350000002556011167574713011771 0ustar yuchangrd/* BWT.h BWT-Index This module contains an implementation of BWT-index for alphabet size = 4. The functions provided include: Load functions for loading BWT to memory; Core functions for accessing core Inverse Psi values; Search functions for searching patterns from text; Text retrieval functions for retrieving text from BWT. Copyright (C) 2004, Wong Chi Kwong. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __BWT_H__ #define __BWT_H__ #include "HSP.h" #include "TypeNLimit.h" #include "MemManager.h" #include "TextConverter.h" #define BITS_PER_OCC_VALUE 16 #define OCC_VALUE_PER_WORD 2 #define OCC_INTERVAL 256 #define WORD_BETWEEN_OCC 16 #define OCC_INTERVAL_MAJOR 65536 #define SORT_ALL 0 #define SORT_16_BIT 1 #define SORT_NONE 2 #define BUCKET_BIT 16 #define NUM_BUCKET 65536 #define MAX_APPROX_MATCH_ERROR 7 #define MAX_ARPROX_MATCH_LENGTH 32 #define BWTDP_MAX_SUBSTRING_LENGTH 512 typedef struct _BWTOPT_TYPE_{ int cutoff; int alnLen, seqLen; int min_len; int h, x, y; int max_mm, gap_len, gap_fb; int nblock; ChrBlock *blockList; unsigned int *pacRef; unsigned int dnaLen; unsigned int extLen; char *fw, *rc; }BWTOPT; typedef struct SaIndexRange { unsigned int startSaIndex; unsigned int endSaIndex; } SaIndexRange; typedef struct BWT { unsigned int textLength; // length of the text unsigned int saInterval; // interval between two SA values stored explicitly unsigned int inverseSaInterval; // interval between two inverse SA stored explicitly unsigned int inverseSa0; // SA-1[0] unsigned int *cumulativeFreq; // cumulative frequency unsigned int *bwtCode; // BWT code unsigned int *occValue; // Occurrence values stored explicitly unsigned int *occValueMajor; // Occurrence values stored explicitly unsigned 
int *saValue; // SA values stored explicitly unsigned int *inverseSa; // Inverse SA stored explicitly SaIndexRange *saIndexRange; // SA index range int saIndexRangeNumOfChar; // Number of characters indexed in SA index range unsigned int *saValueOnBoundary; // Pre-calculated frequently referred data unsigned int *decodeTable; // For decoding BWT by table lookup unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated unsigned int occSizeInWord; // Temporary variable to hold the memory allocated unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated unsigned int saValueSize; // Temporary variable to hold the memory allocated unsigned int inverseSaSize; // Temporary variable to hold the memory allocated unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated } BWT; #define MAX_DIAGONAL_LEVEL 4 // Number of sub-pattern to keep for detecting diagonal hit // Error information is stored as: // 1. bitVector // After hamming distance match // 2. count // After edit distance match // 3. 
score // After the hits are processed with scoring functions typedef struct SaIndexGroupNew { // SA index range and information of a particular error arrangement of a matched sub-pattern unsigned int startSaIndex; // starting SA index unsigned int numOfMatch; // number of match unsigned int posQuery; // position in query; used for detecting diagonal hits unsigned int info; // extra hit information; to be copied to hitList.info } SaIndexGroupNew; typedef struct SaIndexGroupOld { // SA index range and information of a particular error arrangement of a matched sub-pattern unsigned int startSaIndex; // starting SA index unsigned int numOfMatch; // number of match unsigned int info; // extra hit information; to be copied to hitList.info } SaIndexGroupOld; typedef struct SaIndexGroup { // SA index range and information of a particular error arrangement of a matched sub-pattern unsigned int startSaIndex; // starting SA index unsigned int numOfMatch; // number of match unsigned int info; // extra hit information } SaIndexGroup; typedef struct SaIndexGroupWithErrorBitVector { // SA index range and information of a particular error arrangement of a matched sub-pattern unsigned int startSaIndex; // starting SA index unsigned int numOfMatch; // number of match unsigned int errorBitVector; // error bit vector } SaIndexGroupWithErrorBitVector; typedef struct SaIndexGroupWithLengthError { // SA index range and information of a particular error arrangement of a matched sub-pattern unsigned int startSaIndex; // starting SA index unsigned int numOfMatch; // number of match unsigned posQuery : 16; // position in query unsigned length : 8; // length of hit unsigned error : 8; // error in hit } SaIndexGroupWithLengthError; typedef struct SaIndexGroupProcessed { // Alternative usage of SaIndexGroup - once processed, error bit vector is replaced by index to text position unsigned int startSaIndex; // starting SA index unsigned int numOfMatch; // number of match unsigned int 
textPositionIndex; // storing the pointer to text position } SaIndexGroupProcessed; typedef struct DupSaIndexGroup { // Alternative usage of SaIndexGroup - the group duplicates another group unsigned int lastDupSaIndexGroupIndex; // index to last duplicated group unsigned int saIndexGroupIndex; // index to the first SA into group among the duplicates unsigned int textPositionIndex; // storing the pointer to text position } DupSaIndexGroup; typedef struct SaIndexGroupHash { // Hash table for checking duplicate SA index group unsigned int startSaIndex; unsigned int saIndexGroupIndex; } SaIndexGroupHash; typedef struct BWTSaRetrievalStatistics { unsigned int bwtSaRetrieved; unsigned int saDiagonalLinked; unsigned int saDiagonalFiltered; unsigned int saDuplicated; } BWTSaRetrievalStatistics; typedef struct BWTDPStatistics { int maxDepth; int maxDPCell; int maxDPMemoryInWord; int totalMaxDepth; int totalMaxDPCell; int totalMaxDPMemoryInWord; LONG acceptedPathDepth; LONG acceptedPath; LONG rejectedPathDepth; LONG rejectedPath; LONG* __restrict totalNode; LONG* __restrict rejectedNode; LONG* __restrict totalDPCell; } BWTDPStatistics; typedef struct SaIndexList { unsigned int saIndex; unsigned int textPositionIndex; } SaIndexList; typedef struct HitCombination { int numOfCombination; int maxError; int keyLength; int skipTableWidth; int *errorPos; int *skip; int *skipErrorIndex; } HitCombination; typedef struct DPText { int charBeingProcessed; int dpCellIndex; int numOfDpCellSegment; unsigned int dummy1; // Must not be removed; so that saIndexLeft and saIndexRight are aligned to 16 byte boundary unsigned int saIndexLeft[ALPHABET_SIZE]; unsigned int saIndexRight[ALPHABET_SIZE]; } DPText; typedef struct DPScanDepth { unsigned P : 31; unsigned withAmbiguity : 1; } DPScanDepth; // Load / unload functions BWT *BWTCreate(MMPool *mmPool, const unsigned int textLength, unsigned int *decodeTable); BWT *BWTLoad(MMPool *mmPool, const char *bwtCodeFileName, const char 
*occValueFileName, const char *saValueFileName, const char *inverseSaFileName, const char *saIndexRangeFileName, unsigned int *decodeTable); void BWTFree(MMPool *mmPool, BWT *bwt); //void BWTPrintMemoryUsage(const BWT *bwt, FILE *output, const unsigned int packedDNASize); // Precalculate frequenctly accessed data void BWTGenerateSaValueOnBoundary(MMPool *mmPool, BWT *bwt); // Core functions // The following must be customized for differenet compression schemes *** unsigned int BWTDecode(const BWT *bwt, const unsigned int index1, const unsigned int index2, const unsigned int character); void BWTDecodeAll(const BWT *bwt, const unsigned int index1, const unsigned int index2, unsigned int* __restrict occValue); unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character); void BWTOccValueTwoIndex(const BWT *bwt, unsigned int index1, unsigned int index2, const unsigned int character, unsigned int* __restrict occValue); void BWTAllOccValue(const BWT *bwt, unsigned int index, unsigned int* __restrict occValue); void BWTAllOccValueTwoIndex(const BWT *bwt, unsigned int index1, unsigned int index2, unsigned int* __restrict occValue1, unsigned int* __restrict occValue2); unsigned int BWTOccValueOnSpot(const BWT *bwt, unsigned int index, unsigned int* __restrict character); unsigned int BWTSearchOccValue(const BWT *bwt, const unsigned int character, const unsigned int searchOccValue); // Utility functions for no compression only unsigned int BWTResidentSizeInWord(const unsigned int numChar); unsigned int BWTFileSizeInWord(const unsigned int numChar); void BWTClearTrailingBwtCode(BWT *bwt); // These are generic to different compression schemes (and generic to no compression as well) unsigned int BWTPsiMinusValue(const BWT *bwt, const unsigned int index); unsigned int BWTPsiPlusValue(const BWT *bwt, const unsigned int index); unsigned int BWTSaValue(const BWT *bwt, unsigned int index); unsigned int BWTInverseSa(const BWT *bwt, unsigned int saValue); 
unsigned int BWTOccIntervalMajor(const unsigned int occInterval); unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar); unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar); // Search functions // packedText should be allocated with at least 1 Word buffer initialized to zero // Text retrieval functions // Position in text will be placed at the first word of hitListSizeInWord // startSaIndex + resultInfo must be sorted in increasing order; there must be no overlapping groups except that one group can completely enclose another // QSort comparison functions int SaIndexGroupStartSaIndexOrder(const void *saIndexGroup, const int index1, const int index2); int SaIndexGroupStartSaIndexLengthErrorOrder(const void *saIndexGroup, const int index1, const int index2); int HitListPosTextErrorLengthOrder(const void *hitList, const int index1, const int index2); int HitListPosText16BitOrder(const void *hitList, const int index1, const int index2); int HitListPosTextOrder(const void *hitList, const int index1, const int index2); int GappedHitListScorePosTextOrder(const void *gappedHitList, const int index1, const int index2); int GappedHitListDbSeqIndexScorePosTextOrder(const void *gappedHitList, const int index1, const int index2); #endif soap2.20/DNACount.h0000644000105300011350000001234011164534250012725 0ustar yuchangrd/* DNACount.h DNA Count This module contains DNA occurrence counting functions. The DNA must be in word-packed format. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __DNA_COUNT_H__ #define __DNA_COUNT_H__ #include "TypeNLimit.h" // DNA #define DNA_ALPHABET_SIZE 4 #define DNA_CHAR_PER_WORD 16 #define DNA_BIT_PER_CHAR 2 // DNA occurrence count table #define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 #define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) // DNA with 'n' #define DNA_N_ALPHABET_SIZE 5 #define DNA_N_CHAR_PER_WORD 10 #define DNA_N_BIT_PER_CHAR 3 // DNA with 'n' occurrence count table #define DNA_N_OCC_CNT_TABLE_SIZE_IN_WORD 32786 void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable); // The following functions can only count up to 255 characters unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); void ForwardDNAAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); void BackwardDNAAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); unsigned int Forward1OccCount(const unsigned int* bitVector, const unsigned int index, const unsigned int* dnaDecodeTable); // Count number of 1 bit unsigned int Backward1OccCount(const unsigned int* bitVector, const unsigned int index, const unsigned int* dnaDecodeTable); // Count number of 1 bit // The following functions have no limit on the number of characters unsigned int ForwardDNAOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); unsigned int BackwardDNAOccCountNoLimit(const 
unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); void BackwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); void GenerateDNA_NOccCountTable(unsigned int *dnaDecodeTable); // The following functions have no limit on the number of characters unsigned int ForwardDNA_NOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); unsigned int BackwardDNA_NOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); void ForwardDNA_NAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); void BackwardDNA_NAllOccCount(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); // The following functions have no limit on the number of characters unsigned int ForwardDNAnOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); unsigned int BackwardDNA_NOccCountNoLimit(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable); void ForwardDNA_NAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); void BackwardDNA_NAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable); // The first character from startAddr is indexed as 1 // DNA_NAllOccCount only count occurrence from character 0 to 3 // The 
following functions work for any word packed text unsigned int ForwardOccCount(const unsigned int* packed, const unsigned int index, const unsigned int character, const unsigned int alphabetSize); unsigned int BackwardOccCount(const unsigned int* packed, const unsigned int index, const unsigned int character, const unsigned int alphabetSize); void ForwardAllOccCount(const unsigned int* packed, const unsigned int index, const unsigned int alphabetSize, unsigned int* occCount); void BackwardAllOccCount(const unsigned int* packed, const unsigned int index, const unsigned int alphabetSize, unsigned int* occCount); #endif soap2.20/extratools.h0000644000105300011350000001100211206274405013510 0ustar yuchangrd#ifndef _EXTRATOOLS_H_ #define _EXTRATOOLS_H_ #include #include #include "MiscUtilities.h" #include "MemManager.h" #include "TextConverter.h" #include "Timing.h" #include "BWT.h" #include "kstring.h" #include #include #define MAX_DIFF 32 typedef struct LOOKUPTABLE_TYPE { unsigned int tableSize; unsigned int * table; }LOOKUPTABLE; typedef struct HASHCELL_TYPE { unsigned int count; unsigned int index; }HASHCELL; typedef struct HASHITEM_TYPE { unsigned int l; unsigned int r; unsigned int occIndex; }HASHITEM; typedef unsigned int OCC; typedef struct HASHTABLE_TYPE { unsigned int prime; unsigned int a; unsigned int b; unsigned int tableSize; HASHCELL * table; HASHITEM * itemList; OCC * occList; }HASHTABLE; typedef struct _HITITEM_TYPE_{ int info; int strain; int chr; unsigned int occ_pos, pos, blockid; int n_diff; int n_mm; int n_gapo, n_gape, gap_beg; int n_cigar; char *md; unsigned short *cigar; }HITITEM; typedef struct _HITTABLE_TYPE_{ int n; HITITEM *itemList; }HITTABLE; BWT * occBwt; HASHTABLE * occHashtable; unsigned int * occCollector; unsigned int occCollected; FILE * textPositionFile; void registerTPFile(FILE * filePtr,unsigned int searchMode); void registerQIndex(unsigned int queryIndex); void registerQSection(); void LoadLookupTable(LOOKUPTABLE * lookupTable, 
const char * fileName, const int tableSize); void FreeLookupTable(LOOKUPTABLE * lookupTable); unsigned int LookupSafe(LOOKUPTABLE lookupTable, BWT * bwt,unsigned long long lKey, unsigned long long rKey,unsigned int *l, unsigned int *r); void LoadHashTable(HASHTABLE * hashTable, const char * fileName); HASHITEM * HashFind(HASHTABLE * hashTable, unsigned int l,unsigned int r); void FreeHashTable(HASHTABLE * hashTable); void RegisterDecoder(BWT * bwt,HASHTABLE * hashTable); //void OCCClean(); //void OCCProcess(unsigned int l,unsigned int r); inline int altCalMM(unsigned int x); inline int CalMismatch(const char *,const unsigned int *,const unsigned int , const unsigned int, const unsigned int); int OCCProcess(unsigned int l, unsigned int r, const BWTOPT *bo, const unsigned int info, HITTABLE *hits); #define GenOCCArr(arr) do{ \ int occ = 0; \ if (r-l+1 >= 4) { \ HASHITEM *item = HashFind(occHashtable,(l),(r)); \ if (item==NULL) { \ unsigned int k; \ for (k=l;k<=r;++k) { \ arr[occ++] = BWTSaValue(occBwt,k); \ }\ } else {\ unsigned int k;\ for (k=0;kr-item->l+1;++k) {\ arr[occ++] = occHashtable->occList[item->occIndex+k];\ }\ }\ } else {\ unsigned int k;\ for (k=l;k<=r;++k) {\ arr[occ++] = BWTSaValue(occBwt,k);\ }\ }\ }while(0); #define OrientPacPos(){\ int start, end; \ start = end = 0; \ int l, m, h; \ l = 0; h = nblock; m = nblock/2; \ /* \ fprintf(stderr, "pacPos %u\n",occ_pos); \ */ \ while(l<=h){ \ m = (h+l)>>1; \ if((start=blockList[m].blockStart)>occ_pos){ \ h = m - 1; \ } \ else if((end = blockList[m].blockEnd)= (occ_pos + (strain?alnLen:seqLen)))){ \ chr = blockList[m].chrID; \ pos = occ_pos - start + (blockList+m)->ori + 1; \ blockid = m; \ break; \ }else break; \ } \ } #define MAX_MD_LEN 1024 #define HitInc(n) { \ /* \ if(cutoff == n) \ fprintf(stderr, "max %d->n %d\n", cutoff, n); \ */ \ int chr = -1;int pos = -1;int blockid = 0;int mm = 0; \ OrientPacPos(); \ /* if(chr > -1 && (pos-(strain?extLen:0)) > 0 && (!extLen || max_mm >= 
((info>>24)&0x7>3?0:((info>>24)&0x7))+(mm=CalMismatch(seq, pacRef, strain?(occ_pos-extLen):(occ_pos+alnLen), extLen, errTmp, strain?0:alnLen, allErr)))) { */ \ if(chr > -1 && (pos-(strain?extLen:0)) > 0 && (max_mm >= (mm=CalMismatch(seq, pacRef, strain?(occ_pos-extLen):(occ_pos), seqLen, dnaLength)))) { \ hit->strain = strain; \ hit->chr = chr; \ hit->pos = pos-(strain?extLen:0); \ if (hit->pos < 0 ) printf("%d\t%d\n", pos, extLen); \ hit->blockid = blockid; \ hit->occ_pos = occ_pos-(strain?extLen:0); \ hit->info = info; \ hit->n_cigar = 0; \ hit->n_mm = mm + ((info>>24)&0x7); \ n++; hit++; \ } \ } //void CleanDecoder(); double getTextPositionTime(); unsigned int getSARetrieved(); unsigned int getHASHRetrieved(); #endif /*_EXTRATOOLS_H_*/ soap2.20/HSP.h0000644000105300011350000000720711164534250011752 0ustar yuchangrd/* HSP.h BWTBlastn functions This module contains miscellaneous BWTBlastn functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef __HSP_H__ #define __HSP_H__ #include "TypeNLimit.h" #include "MemManager.h" #include "TextConverter.h" #define ALPHABET_SIZE 4 #define BIT_PER_CHAR 2 #define CHAR_PER_128 64 #define CHAR_PER_WORD 16 #define CHAR_PER_BYTE 4 #define MAX_ALIGNMENT_LENGTH 131072 #define SHORTEST 70 typedef struct _ChrBlock{ int chrID; unsigned int blockStart; unsigned int blockEnd; unsigned int ori; }ChrBlock; typedef struct _NewAnnotation{ char chrName[MAX_SEQ_NAME_LENGTH]; int nameLen; unsigned int chrStart; unsigned int chrEnd; int blockNum; ChrBlock *blockInChr; }NewAnnotation; typedef struct Annotation { int gi; char text[MAX_SEQ_NAME_LENGTH+1]; } Annotation; typedef struct HSP { unsigned int* packedDNA; int chrNum; char **chrName; int numOfBlock; ChrBlock *blockList; unsigned int dnaLength; }HSP; #define MAX_SEQ_NAME_LENGTH 256 #define MAX_HISTO_SIZE 256 #define INVALID_CHAR_INDEX 15 #define ALIGN_MATCH 0 #define ALIGN_MISMATCH_AMBIGUITY 1 #define ALIGN_INSERT 2 #define ALIGN_DELETE 3 #define ALIGN_PER_WORD 16 #define ALIGN_BIT 2 #define AUX_TEXT_PER_WORD 8 #define AUX_TEXT_BIT 4 static const char lowercaseDnaCharIndex = 14; // Seems that BLAST treat masked characters as 'N' (still have 1/4 chance of matching) static const char nonMatchDnaCharIndex = 15; static const char dnaChar[16] = {'A', 'C', 'G', 'T', 'M', 'R', 'S', 'V', 'W', 'Y', 'H', 'K', 'D', 'B', 'N', 'L'}; static const char dnaComplement[16] = {'T', 'G', 'C', 'A', 'K', 'Y', 'S', 'B', 'W', 'R', 'D', 'M', 'H', 'V', 'N', 'L'}; static const char ambiguityCount[16] = { 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 0 }; static const char ambiguityMatch[16][4] = {{0, 0, 0, 0}, {1, 0, 0, 0}, {2, 0, 0, 0}, {3, 0, 0, 0}, {0, 1, 0, 0}, {0, 2, 0, 0}, {1, 2, 0, 0}, {0, 1, 2, 0}, {0, 3, 0, 0}, {1, 3, 0, 0}, {0, 1, 3, 0}, {2, 3, 0, 0}, {0, 2, 3, 0}, {1, 2, 3, 0}, {0, 1, 2, 3}, {0, 0, 0, 0} }; // Map must be allocated with char[256] void HSPFillCharMap(unsigned char *charMap); void HSPFillComplementMap(unsigned 
char *complementMap); HSP *HSPLoad(MMPool *mmPool, const char *PackedDNAFileName, const char *AnnotationFileName); HSP *HSPConvertFromText(MMPool *mmPool, const unsigned char *text, const unsigned int textLength, const unsigned int FASTARandomSeed, const int maskLowerCase, const int gi, const char *seqName); void HSPFree(MMPool *mmPool, HSP *hsp); unsigned int HSPParseFASTAToPacked(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName, const unsigned int FASTARandomSeed, const int maskLowerCase); unsigned int HSPPackedToFASTA(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName); #endif soap2.20/kstring.h0000644000105300011350000000147111164534250012776 0ustar yuchangrd#ifndef KSTRING_H #define KSTRING_H #include #include #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif static inline int kputs(const char *p, kstring_t *s) { int l = strlen(p); if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } strcpy(s->s + s->l, p); s->l += l; return l; } static inline int kputc(int c, kstring_t *s) { if (s->l + 1 >= s->m) { s->m = s->l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } s->s[s->l++] = c; s->s[s->l] = 0; return c; } int ksprintf(kstring_t *s, const char *fmt, ...); #endif soap2.20/Match.h0000644000105300011350000000634011231712023012340 0ustar yuchangrd/* * ============================================================================= * * Filename: Match.h * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * 
============================================================================= */ #ifndef _MATCH_H_ #define _MATCH_H_ #include "SeqIO.h" #include "BWTAln.h" #include "BWT.h" #include "extratools.h" #include "soapio.h" #include "stdaln.h" #ifdef PTHREADS #include #define NSEQ_PER_THREAD 0xF00 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; #define SEQ_ALLOC() {\ if (opt->nthreads > 1) { \ pthread_mutex_lock(&lock); \ if (alnSeq->tid < 0) { \ int j; \ for (j = i; j < mseqs->n && j < i + NSEQ_PER_THREAD; ++j) \ alnSeq[j].tid = tid; \ } else if (alnSeq->tid != tid) { \ pthread_mutex_unlock(&lock); \ continue; \ } \ pthread_mutex_unlock(&lock); \ } \ } #else #define SEQ_ALLOC() #endif #define MULTI_SEQ 0x100000 #ifndef MAX_MISMATCH #define MAX_MISMATCH 20 #endif #ifndef MAX_GAP_LEN #define MAX_GAP_LEN 10 #endif #define MAX_SEQ_LEN 256 #define MAX_ALN 10000 #define FORWARD 0 #define REVERSE 1 #define ALN_MAT 0 #define ALN_MIS 0x11 #define ALN_INS 0x22 #define ALN_DEL 0x33 #include #define HITCPY(dest, ori) { \ (dest)->info = (ori)->info; \ (dest)->strain = (ori)->strain; \ (dest)->chr = (ori)->chr; \ (dest)->occ_pos = (ori)->occ_pos; \ (dest)->pos = (ori)->pos; \ (dest)->n_mm = (ori)->n_mm; \ (dest)->n_gapo = (ori)->n_gapo; \ (dest)->n_gape = (ori)->n_gape; \ (dest)->gap_beg = (ori)->gap_beg; \ (dest)->n_diff = (ori)->n_diff; \ (dest)->n_cigar = (ori)->n_cigar; \ } #define PacReadExt(fw, rc, start, len, seqPac, rcPac) {\ int j; \ for(j=0; j>4] <<= 2; \ seqPac[j>>4] |= *(fw+j+start); \ rcPac[j>>4] <<= 2; \ rcPac[j>>4] |= *(rc+j); \ } \ } typedef struct _SOAPOPT_{ int fast, o_format, chain; int aln_len, ns, max_mm, gap_len, gap_fb; int mode, cutoff; int pe; int zero_qual; int min_ins, max_ins, FR; int rr; int unmapped; int nthreads; //number of pthreads int id; int bisulfite; int allErr; int min_len; int uniq; }SOAPOPT; typedef struct _FILEDS_{ int ifdA, ifdB; int ofdAln, ofdSe, ofdUn; }FILEDS; typedef struct _MATCHAUX_TYPE_{ int max_mm; int len, ext; unsigned int 
*pac; unsigned int dnaLen; int allErr; }MATCHAUX; typedef struct _PEAUX_TYPE_{ int min_ins, max_ins; int FR; int cutoff, len; int allErr; }PEAUX; inline int CheckIns(HITITEM *, HITITEM *, PEAUX *); void MatchProcess (FILEDS *, BWT *, BWT *, LOOKUPTABLE *, LOOKUPTABLE *, HSP *, SOAPOPT * const ); inline void PickupHit(ALNSEQ *, const int ,int *, HITTABLE *,const unsigned int *, const unsigned int, unsigned short * ); void SEAlnCore(int , MULTISEQ *, BWT *, BWT *, LOOKUPTABLE *, LOOKUPTABLE *, HSP *, const SOAPOPT *); void PEAlnCore(int , MULTISEQ *, BWT *, BWT *, LOOKUPTABLE *, LOOKUPTABLE *, HSP *, const SOAPOPT *); int HITCMP(const void *a, const void *b); #endif /* ----- #ifndef _MATCH_H_INC ----- */ soap2.20/MemManager.h0000644000105300011350000001466411164534250013336 0ustar yuchangrd/* MemManager.h Memory Manager This module provides memory management functions. Copyright (C) 2004, Wong Chi Kwong. This program is FREEALIGN software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef __MEM_MANAGER_H__ #define __MEM_MANAGER_H__ #include "TypeNLimit.h" #include #include #define MAX_ALIGN 64 // All memory except pool memory are aligned to MAX_ALIGN; pool memory is aligned to finer boundary for small memory size #define MIN_ALIGN 1 #define RECORD_GRAND_TOTAL // Memory type: // // unit memory: allocation managed by malloc() individually; // to be used for large and less frequently accessed items // allocation can be freed individually at any time // pool memory: pre-allocated memory pool for items with varying sizes // allocation cannot be freed by individually // to be used for small and frequently accessed items // temp memory: temporary use granted from pool memory // allocation is allocated and freed like the items in a stack // pool memory allocation is disabled while temporary memory is in use // bulk memory: pre-allocated memory pool for items with the same size // to be used for massively numbered items // memory address of dispatched items can be calculated by dispatch index #ifdef DEBUG #define Mem(mmBulk, index) MMBulkAddress(mmBulk, index) #else #define Mem(mmBulk, index) (void*)&(mmBulk->directory[index >> mmBulk->itemPerAllocationInPowerOf2][(index & mmBulk->indexMask) * mmBulk->itemSize]) #endif typedef struct MMPool { unsigned int poolSize; // Size of memory pool; the beginning of the pool holds the MMPool structure unsigned int poolByteDispatched; // Includes any spillover and memory skipped for align unsigned int poolByteSpillover; // Exclude spillover pointers unsigned int currentTempByteDispatched; // Includes any spillover unsigned int currentTempByteSpillover; // Exclude spillover pointers unsigned int maxTotalByteDispatched; // The max of pool memory + temp memory dispatched void *firstSpillOverAddress; // if pool is freed, = address of mmPool } MMPool; typedef struct MMBulk { unsigned int itemSize; unsigned int itemPerAllocationInPowerOf2; unsigned int boundaryCushionSize; // boundary cushion is a piece of memory 
allocated so that the memory around items can be safely referenced unsigned int indexMask; unsigned int currentDirectoryEntry; unsigned int nextUnusedItem; unsigned int directorySize; unsigned char **directory; // if bulk is freed, = NULL } MMBulk; typedef struct MMMaster { unsigned int currentUnitByteAllocated; unsigned int maxUnitByteAllocated; unsigned int maxNumberOfPools; MMPool **mmPool; unsigned int maxNumberOfBulks; MMBulk **mmBulk; unsigned int maxTotalByteAllocated; unsigned int maxTotalByteDispatched; int traceUnitByteAllocation; FILE *unitByteTraceFile; } MMMaster; void *MMMalloc(const unsigned int memSize); void MMFree(void *address); void MMMasterInitialize(const unsigned int maxNumberOfPools, const unsigned int maxNumberOfBulks, const int traceUnitByteAllocation, FILE *unitByteTraceFile); void MMMasterFreeAll(); unsigned int MMMasterCurrentTotalByteAllocated(); unsigned int MMMasterCurrentTotalByteDispatched(); unsigned int MMMasterMaxTotalByteAllocated(); unsigned int MMMasterMaxTotalByteDispatched(); void MMMasterSetMaxTotalByteAllocated(); void MMMasterSetMaxTotalByteDispatched(); void MMMasterPrintReport(FILE *output, const unsigned int withUnitDetails, const unsigned int withPoolDetails, const unsigned int withBulkDetails); void *MMUnitAllocate(const unsigned int memSize); void *MMUnitReallocate(void *address, const unsigned int newMemSize, const unsigned int oldMemSize); void MMUnitFree(void *address, const unsigned int memSize); unsigned int MMUnitCurrentByteAllocated(); unsigned int MMUnitMaxByteAllocated(); void MMUnitPrintReport(FILE *output); MMPool *MMPoolCreate(const unsigned int poolSize); unsigned int MMPoolIsActive(const MMPool *mmPool); void MMPoolSetInactive(MMPool *mmPool); unsigned int MMPoolCurrentTotalByteAllocated(const MMPool *mmPool); unsigned int MMPoolCurrentTotalByteDispatched(const MMPool *mmPool); unsigned int MMPoolMaxTotalByteDispatched(const MMPool *mmPool); unsigned int MMPoolByteAvailable(const MMPool *mmPool); 
MMPool *MMPoolFree(MMPool *mmPool); void MMPoolReset(MMPool *mmPool); void MMPoolDestory(MMPool *mmPool); void *MMPoolDispatch(MMPool *mmPool, const unsigned int memSize); unsigned int MMPoolDispatchOffset(MMPool *mmPool, const unsigned int memSize); void MMPoolReturn(MMPool *mmPool, void *address, const unsigned int memSize); // Dummy function void MMPoolPrintReport(MMPool *mmPool, FILE *output); void *MMTempDispatch(MMPool *mmPool, const unsigned int memsize); void MMTempReturn(MMPool *mmPool, void *address, const unsigned int memSize); void MMTempPrintReport(MMPool *mmPool, FILE *output); MMBulk *MMBulkCreate(MMPool *mmPool, const unsigned int itemSize, const unsigned int itemPerAllocationInPowerOf2, unsigned int const boundaryCushionSize, unsigned int const directorySize); unsigned int MMBulkIsActive(const MMBulk *mmBulk); void MMBulkSetInactive(MMBulk *mmBulk); unsigned int MMBulkByteAllocated(const MMBulk *mmBulk); unsigned int MMBulkByteDispatched(const MMBulk *mmBulk); unsigned int MMBulkUnitDispatched(const MMBulk *mmBulk); void MMBulkFree(MMBulk *mmBulk); void MMBulkDestory(MMBulk *mmBulk); unsigned int MMBulkDispatch(MMBulk *mmBulk); void *MMBulkAddress(const MMBulk *mmBulk, const unsigned int index); MMPool *MMBulkFindPoolUsed(const MMBulk *mmBulk); void MMBulkPrintReport(MMBulk *mmBulk, FILE *output); void MMBulkSave(MMBulk *mmBulk, FILE *output); MMBulk *MMBulkLoad(MMPool *mmPool, FILE *input); #endif soap2.20/MiscUtilities.h0000644000105300011350000001021111164534250014074 0ustar yuchangrd/* MiscUtilities.h Miscellaneous Utilities This module contains miscellaneous utility functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __MISC_UTILITIES_H__ #define __MISC_UTILITIES_H__ #include "TypeNLimit.h" #include "stdio.h" #define init(variable) variable = 0; // this is for avoiding compiler warning // disable it if compiler becomes smarter! #define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) #define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) // alignBoundary must be power of 2 #define nextAlignedBoundary(offset, alignBoundary) ( ((offset) + (alignBoundary) - 1) & (- (alignBoundary)) ) #define lastAlignedBoundary(offset, alignBoundary) ( (offset) & (- (alignBoundary)) ) #define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) #define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) #define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) #define med3(a, b, c) ( ac ? b : a>c ? c : a)) #define med3Index(key, ia, ib, ic) ( key[ia]key[ic] ? ib : key[ia]>key[ic] ? 
ic : ia)) #define swap(a, b, t); t = a; a = b; b = t; void Dust(const unsigned int len, unsigned char *pattern, const unsigned int level, const unsigned int window, const unsigned int word); void LimitCodeGenerateCodeTable(const unsigned int limit, unsigned int** codeValue, unsigned int** codeLength); int QSortUnsignedIntOrder(const void *data, const int index1, const int index2); void QSort(void* __restrict data, const int numData, const int dataWidth, int (*QSortComp)(const void*, const int, const int) ); unsigned int checkDuplicate(int *input, const unsigned int numItem, const int minValue, const int maxValue, char* text); unsigned int leadingZero(const unsigned int input); unsigned int ceilLog2(const unsigned int input); unsigned int floorLog2(const unsigned int input); unsigned int power(const unsigned int base, const unsigned int power); void formatVALAsBinary(const unsigned int input, char* output, unsigned int bitGroup); unsigned int getRandomSeed(); void ConvertBytePackedDNAToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int textLength); unsigned int reverseBit(unsigned int x); void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue); void initializeCHAR(unsigned char *startAddr, const unsigned int length, const unsigned char initValue); unsigned int numberOfMatchInVAL(unsigned int *startAddr, const unsigned int length, const unsigned int searchValue); unsigned int numberOfMatchInCHAR(unsigned char *startAddr, const unsigned int length, const unsigned char searchValue); void bitCopyNoDestOffset(unsigned int *destinationAddress, const unsigned int *sourceAddress, int sourceBitOffset, int copyLengthInBit); void bitCopyNoDestBitOffset(unsigned int *destinationAddress, int destinationWordOffset, const unsigned int *sourceAddress, int sourceWordOffset, int sourceBitOffset, int copyLengthInBit); unsigned int bitCopy(unsigned int *destinationAddress, int destinationWordOffset, int 
destinationBitOffset, const unsigned int *sourceAddress, int sourceBitOffset, int copyLengthInBit); unsigned int nextPrime(const unsigned int number); unsigned int popCount(const unsigned int bitVector); #endif soap2.20/r250.h0000644000105300011350000000070711164534250012006 0ustar yuchangrd/* r250.h prototypes for r250 random number generator, Kirkpatrick, S., and E. Stoll, 1981; "A Very Fast Shift-Register Sequence Random Number Generator", Journal of Computational Physics, V.40 also: see W.L. Maier, DDJ May 1991 */ #ifndef _R250_H_ #define _R250_H_ 1.2 #ifdef __cplusplus extern "C" { #endif void r250_init(int seed); unsigned int r250( void ); double dr250( void ); #ifdef __cplusplus } #endif #endif soap2.20/SeqIO.h0000644000105300011350000000171411164534250012275 0ustar yuchangrd/* * ============================================================================= * * Filename: SeqIO.h * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #ifndef __SEQIO_H__ #define __SEQIO_H__ /* */ #include #include #include #include #include #include "HSP.h" #define MAX_NAME_LEN 256 #define QUERY_LEN 256 #define FASTA 0 #define FASTQ 1 typedef struct _SEQ_T_{ int max, l, ns; char name[MAX_NAME_LEN]; char *seq, *rc, *qual; }seq_t; int CheckFast(int fd); int fasta(FILE *fp, seq_t *seq, const int CONV); int fastq(FILE *fp, seq_t *seq, const int CONV); #endif /* ----- __SEQIO_H__ ----- */ soap2.20/soap.h0000644000105300011350000000114211164534250012252 0ustar yuchangrd/* * ============================================================================= * * Filename: soap.h * * Description: * * Version: 1.0 * Created: 2009年02月16日 02时23分25秒 * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: 
Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #ifndef __soap_H__ #define __soap_H__ #endif /* soap.h */ soap2.20/soapio.h0000644000105300011350000000276211226525665012624 0ustar yuchangrd/* * ============================================================================= * * Filename: soapio.h * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #ifndef _SOAPIO_H_ #define _SOAPIO_H_ #include #include #include "extratools.h" #include "SeqIO.h" #define MAX_MULTI_READS 0x20000 typedef struct _ALNSEQ_TYPE_{ int tid; int id, len, ns; char *name, *seq, *rc, *qual, *rcqual; unsigned int flag; int nhits; struct { int H0; int H1; int H2; }top; int report; HITITEM *itemList; }ALNSEQ; typedef struct _MULTISEQ_TYPE_{ int n, max; ALNSEQ *seqList; }MULTISEQ; typedef struct _OUTAUX_TYPE_{ int id, un, chrNum; char **chrName; int allErr; }OUTAUX; typedef struct _INFILELIST_{ FILE *ifpA, *ifpB; int id; int lock; }InFileList; typedef struct _OUTFILELIST_{ FILE *ofpAln, *ofpSe, *ofpUn; int id; int lock; }OutFileList; void FreeMultiSeq(MULTISEQ *); int GenMultiReads(const HSP *, MULTISEQ *, const int , const int , unsigned int *, int *); int GetMultiSeq (InFileList *, MULTISEQ *, const int , int (*)(FILE *, seq_t *, const int)); void DumpAln(MULTISEQ *, OUTAUX *, OutFileList *,unsigned int *, unsigned int *); #endif /* ----- #ifndef SOAPIO_INC ----- */ soap2.20/stdaln.h0000644000105300011350000001016511231510457012600 0ustar yuchangrd/* The MIT License Copyright (c) 2003-2006, 2008, by Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without 
limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* 2008-08-04, 0.9.8 - Fixed the wrong declaration of aln_stdaln_aux() - Avoid 0 coordinate for global alignment 2008-08-01, 0.9.7 - Change gap_end penalty to 5 in aln_param_bwa - Add function to convert path_t to the CIGAR format 2008-08-01, 0.9.6 - The first gap now costs (gap_open+gap_ext), instead of gap_open. Scoring systems are modified accordingly. - Gap end is now correctly handled. Previously it is not correct. - Change license to MIT. */ #ifndef LH3_STDALN_H_ #define LH3_STDALN_H_ #define STDALN_VERSION 0.9.8 #ifndef MYALLOC # define MYALLOC malloc #endif #ifndef MYFREE # define MYFREE free #endif #define FROM_M 0 #define FROM_I 1 #define FROM_D 2 /* This is the smallest integer. It might be CPU-dependent in very RARE cases. */ #define MINOR_INF -1073741823 typedef unsigned short cigar_t; typedef struct { int gap_open; int gap_ext; int gap_end; int *matrix; int row; int band_width; } AlnParam; typedef struct { int i, j; unsigned char ctype; } path_t; typedef struct { path_t *path; /* for advanced users... :-) */ int path_len; /* for advanced users... 
:-) */ int start1, end1; /* start and end of the first sequence, coordinations are 1-based */ int start2, end2; /* start and end of the second sequence, coordinations are 1-based */ int score; /* score */ char *out1, *out2; /* print them, and then you will know */ char *outm; int n_cigar; cigar_t *cigar; } AlnAln; #ifdef __cplusplus extern "C" { #endif AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, int is_global, int do_align, int len1, int len2); AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int is_global, int do_align); void aln_free_AlnAln(AlnAln *aa); int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, path_t *path, int *path_len); int aln_local_core(unsigned char *seq1,const int len1, unsigned char *seq2,const int len2, const AlnParam *ap, path_t *path, int *path_len, int do_align); cigar_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); #ifdef __cplusplus } #endif /******************** * global variables * ********************/ extern AlnParam aln_param_bwa ; /*= { 37, 9, 0, aln_sm_maq, 5, 50 };//*/ extern AlnParam aln_param_blast; /* = { 5, 2, 0, aln_sm_blast, 5, 50 }; */ extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */ extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */ /* common nucleotide score matrix for 16 bases */ extern int aln_sm_nt[]; /* BLOSUM62 and BLOSUM45 */ extern int aln_sm_blosum62[], aln_sm_blosum45[]; /* common read for 16 bases. 
note that read alignment is quite different from common nucleotide alignment */ extern int aln_sm_read[]; /* human-mouse score matrix for 4 bases */ extern int aln_sm_hs[]; #endif soap2.20/TextConverter.h0000644000105300011350000001357211164534250014136 0ustar yuchangrd/* TextConverter.h Text Converter This module contains miscellaneous text conversion functions. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __TEXTCONVERTOR_H__ #define __TEXTCONVERTOR_H__ #include "TypeNLimit.h" #include "MemManager.h" #define INVALID_CHAR 0xFF #define CHAR_MAP_SIZE 256 #define PACKED_BUFFER_SIZE (PACKED_BUFFER_SIZE_IN_WORD * BYTES_IN_WORD) #define PACKED_BUFFER_SIZE_IN_WORD 65536 #define MAX_SEQ_NAME_LENGTH 256 #define RANDOM_SUBSTITUTE 'R' // charMap is a char array of size 256. The index of the array is the input text value // and the content of the array is the output text value. e.g. 
A -> 0, C -> 1 // If the value of an entry = INVALID_CHAR, the indexed text value is an invalid input // Retrieve word packed text unsigned int GetWordPackedText(const unsigned int *packedText, const unsigned int index, const unsigned int shift, const unsigned int numberOfBit, const unsigned int vacantBit); // Character map functions unsigned int ReadCharMap(unsigned char *charMap, const char *inputFileName, const unsigned char defaultMapping); void GenerateReverseCharMap(const unsigned char *charMap, unsigned char *reverseCharMap); // Word packed text functions unsigned int BitPerWordPackedChar(const unsigned int alphabetSize); unsigned int TextLengthFromWordPacked(unsigned int wordPackedLength, unsigned int bitPerChar, unsigned int lastWordLength); unsigned int WordPackedLengthFromText(unsigned int textLength, unsigned int bitPerChar); unsigned int LastWordLength(unsigned int textLength, unsigned int bitPerChar); // Byte packed text functions unsigned int BitPerBytePackedChar(const unsigned int alphabetSize); unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength); unsigned int BytePackedLengthFromText(unsigned int textLength, unsigned int bitPerChar); unsigned char LastByteLength(unsigned int textLength, unsigned int bitPerChar); // Conversion functions void ConvertTextToWordPacked(const unsigned char *input, unsigned int *output, const unsigned char *charMap, const unsigned int alphabetSize, const unsigned int textLength); void ConvertTextToBytePacked(const unsigned char *input, unsigned char *output, const unsigned char *charMap, const unsigned int alphabetSize, const unsigned int textLength); void ConvertWordPackedToText(const unsigned int *input, unsigned char *output, const unsigned char *reverseCharMap, const unsigned int alphabetSize, const unsigned int textLength); void ConvertBytePackedToText(const unsigned char *input, unsigned char *output, const unsigned char *reverseCharMap, const 
unsigned int alphabetSize, const unsigned int textLength); void ConvertBytePackedToCode(const unsigned char *input, unsigned char *output, const unsigned int alphabetSize, const unsigned int textLength); void ConvertWordPackedToBytePacked(const unsigned int *input, unsigned char *output, const unsigned int alphabetSize, const unsigned int textLength); void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, const unsigned int textLength); void ConvertTextToCode(const unsigned char *input, unsigned char *output, const unsigned char *charMap, const unsigned int textLength); void ConvertCodeToText(const unsigned char *input, unsigned char *output, const unsigned char *reverseCharMap, const unsigned int textLength); // Pack text with all shift void PackTextWithAllShift(const unsigned char *input, unsigned int **output, const unsigned char *charMap, const unsigned int alphabetSize, const unsigned int textLength); // Full load function unsigned int ReadTextAsWordPacked(const char *inputFileName, const unsigned char *charMap, const unsigned int alphabetSize, unsigned int *targetAddress, const unsigned int maxTextLength); unsigned int ReadBytePackedAsWordPacked(const char *inputFileName, const unsigned int alphabetSize, unsigned int *targetAddress, const unsigned int maxTextLength); void *DNALoadPacked(const char *inputFileName, unsigned int *textLength, const unsigned int convertToWordPacked); void DNAFreePacked(void* packedDna, const unsigned int textLength); // Save functions void SaveText(const char *outputFileName, const unsigned char *text, const unsigned int textLength); void SaveBytePacked(const char *outputFileName, const unsigned char *wordPacked, const unsigned int textLength, const unsigned int alphabetSize); void SaveWordPacked(const char *outputFileName, const unsigned int *wordPacked, const unsigned int textLength, const unsigned int alphabetSize); // Incremental load functions (start from end of 
text) FILE *InitialLoadPackedIncFromEnd(const char* inputFileName, unsigned char *packedOutput, const unsigned int alphabetSize, const unsigned int packedLengthPerLoad, unsigned int *textLength, unsigned int *textLengthForThisLoad); void LoadPackedIncFromEnd(FILE *packedFile, unsigned char *packedOutput, const unsigned int packedLengthPerLoad); FILE *InitialLoadTextIncFromEnd(const char* inputFileName, unsigned char *textOutput, const unsigned int textLengthPerLoad, unsigned int *textLength, unsigned int *textLengthForThisLoad); void LoadTextIncFromEnd(FILE *textFile, unsigned char *textOutput, const unsigned int textLengthPerLoad); #endif soap2.20/Timing.h0000644000105300011350000000253711164534250012550 0ustar yuchangrd/* Timing.h Measuring Program running time This module contains functions for measuring program running time. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef __TIMING_H__ #define __TIMING_H__ void asciiTime(const char *c); double setStartTime(); double getElapsedTime(double startTime); void printElapsedTime(FILE *file, const int printHour, const int printMin, const int printSec, const int secNumberOfDecimal, const double seconds); void printElapsedTimeNoNewLine(FILE *file, const int printHour, const int printMin, const int printSec, const int secMinPrintLength, const int secNumberOfDecimal, const double seconds); #endif soap2.20/TypeNLimit.h0000644000105300011350000000427011231711171013345 0ustar yuchangrd/* TypeNLimit.h Miscellaneous Constants Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef __TYPENLIMIT_H__ #define __TYPENLIMIT_H__ #include #define BITS_IN_WORD 32 #define BITS_IN_WORD_MINUS_1 31 #define BITS_IN_WORD_MASK 0x0000001F #define BITS_IN_WORD_SHIFT 5 #define BITS_IN_HALF_WORD 16 #define BITS_IN_4_WORD 128 #define BITS_IN_4_WORD_MINUS_1 127 #define BITS_IN_4_WORD_SHIFT 7 #define FIRST_BIT_MASK 0x80000000 #define ALL_BUT_FIRST_BIT_MASK 0x7FFFFFFF #define ALL_ONE_MASK 0xFFFFFFFF #define FOUR_MULTIPLE_MASK 0xFFFFFFFC #define BITS_IN_BYTE 8 #define BITS_IN_BYTE_SHIFT 3 #define BYTES_IN_WORD 4 #define TRUE 1 #define FALSE 0 // Compatibilities #ifdef _WIN32 #define fopen64 fopen #define ftello64 ftell #define INLINE __inline #define ALIGN_16 __declspec(align(16)) #define ALIGN_32 __declspec(align(32)) #define ALIGN_64 __declspec(align(64)) #define MEMALIGN(a, b) _aligned_malloc(a, b) #define FREEALIGN(a) _aligned_free(a) #else #define fopen64 fopen #define ftello64 ftell #define INLINE __inline #define ALIGN_16 __attribute__((aligned(16))) #define ALIGN_32 __attribute__((aligned(32))) #define ALIGN_64 __attribute__((aligned(64))) #define MEMALIGN(a, b) _mm_malloc(a, b) #define FREEALIGN(a) _mm_free(a) #endif // To make sure that LONG means 64 bit integer #define LONG long long // For 32 & 64 bits compatibility on Windows and Linux //#define MAX_FILENAME_LEN 256 #endif soap2.20/soap.c0000644000105300011350000003272111231777427012266 0ustar yuchangrd/* * ============================================================================= * * Filename: soap.c * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #include #include #include #include #include #include #include #include "Match.h" #include "HSP.h" #include "BWT.h" #include "TypeNLimit.h" #include "extratools.h" #include "MemManager.h" #ifndef MAKE_TIME #define MAKE_TIME 
"00:00:00" #endif #ifndef VID //#define VID "2.11" //#define VID "2.15" // with gap and extend cigar //#define VID "2.16" // -r 2 segment fault //#define VID "2.17" // mm in rc //#define VID "2.18" // 07/05/2009 //#define VID "2.19" // 13/07/2009 #define VID "2.20" // 23/07/2009 gap missed in forward strand #endif #define MAX_FILENAME_LEN 1024 #define MAX_SUFFIX_LEN 255 const char *PROGRAM = "SOAPaligner/soap2"; const char *AUTHOR = "BGI shenzhen"; const char *VERSION = VID; /*release date: 14/01/2009*/ const char *CONTACT = "soap@genomics.org.cn"; char readAFileName[MAX_FILENAME_LEN] = ""; char readBFileName[MAX_FILENAME_LEN] = ""; char outFileName[MAX_FILENAME_LEN] = ""; char outUnpairFileName[MAX_FILENAME_LEN] = ""; char outUnmapFileName[MAX_FILENAME_LEN] = ""; char database_prefix[MAX_FILENAME_LEN] = ""; char AnnotationSuffix[MAX_SUFFIX_LEN] = ".ann"; char PackedDNASuffix[MAX_SUFFIX_LEN] = ".pac"; char BWTCodeSuffix[MAX_SUFFIX_LEN] = ".bwt"; char BWTOccValueSuffix[MAX_SUFFIX_LEN] = ".fmv"; char SaValueSuffix[MAX_SUFFIX_LEN] = ".sa"; char RevPackedDNASuffix[MAX_SUFFIX_LEN] = ".rev.pac"; char RevBWTCodeSuffix[MAX_SUFFIX_LEN] = ".rev.bwt"; char RevBWTOccValueSuffix[MAX_SUFFIX_LEN] = ".rev.fmv"; char LookupTableSuffix[MAX_SUFFIX_LEN] = ".lkt"; char RevLookupTableSuffix[MAX_SUFFIX_LEN] = ".rev.lkt"; char HighOccHashTableSuffix[MAX_SUFFIX_LEN] = ".hot"; // DatabaseFiles parameters char AnnotationFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char PackedDNAFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char BWTCodeFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char BWTOccValueFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char SaValueFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; //For Reversed BWT char RevPackedDNAFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char RevBWTCodeFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char RevBWTOccValueFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; //For Extra Data Structures char 
LookupTableFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char RevLookupTableFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; char HighOccHashTableFileName[MAX_FILENAME_LEN+MAX_SUFFIX_LEN] = ""; unsigned int LookUpTableSize = 13; unsigned int RevLookUpTableSize = 13; // Memory parameters /* int PoolSize = 20971520; // 2M - fixed; not configurable through ini int WorkingMemorySize = 67108864; // 64M - good for 8M hit; configurable through ini int AlignmentMemorySize = 4194304; // 4M //*/ int PoolSize = 20971520; int WorkingMemorySize = 1073741824; int AlignmentMemorySize = 536870912; unsigned char charMap[256]; unsigned char complementMap[256]; static SOAPOPT *opt; void Usage(void) { fprintf(stdout, "\nProgram: %s\n", PROGRAM); fprintf(stdout, "Compile Date: " MAKE_TIME"\n"); fprintf(stdout, "Author: %s\n", AUTHOR); fprintf(stdout, "Version: %s\n", VERSION); fprintf(stdout, "Contact: %s\n", CONTACT); fprintf(stdout, "\nUsage:\tsoap [options]\n"); fprintf(stdout, "\t-a query a file, *.fq, *.fa\n"); fprintf(stdout, "\t-b query b file\n"); fprintf(stdout, "\t-D reference sequences indexing table, *.index format\n"); fprintf(stdout, "\t-o output alignment file(txt)\n"); fprintf(stdout, "\t-M match mode for each read or the seed part of read, which shouldn't contain more than 2 mismaches, [4]\n" "\t 0: exact match only\n" "\t 1: 1 mismatch match only\n" "\t 2: 2 mismatch match only\n" "\t 4: find the best hits\n"); fprintf(stdout, "\t-u output unmapped reads file\n"); fprintf(stdout, "\t-t output reads id instead reads name, [none]\n"); fprintf(stdout, "\t-l align the initial n bps as a seed [%d] means whole length of read\n", opt->aln_len); fprintf(stdout, "\t-n filter low-quality reads containing >n Ns before alignment, [%d]\n", opt->ns); fprintf(stdout, "\t-r [0,1,2] how to report repeat hits, 0=none; 1=random one; 2=all, [%d]\n", opt->rr); fprintf(stdout, "\t-m minimal insert size allowed, [%d]\n", opt->min_ins); //minimal insert size fprintf(stdout, "\t-x maximal 
insert size allowed, [%d]\n", opt->max_ins); //max_insert_size fprintf(stdout, "\t-2 output file of unpaired alignment hits\n"); fprintf(stdout, "\t-v maximum number of mismatches allowed on a read. [%d] bp\n", opt->max_mm); fprintf(stdout, "\t-s minimal alignment length (for soft clip) [%d] bp\n", opt->min_len); // fprintf(stdout, "\t-U only find uniq mapped reads with n mismatches for single-end, [%d]\n", opt->uniq); // fprintf(stdout, "\t-A report all mismatches reads in SOAP Format, default [none] report number \n"); fprintf(stdout, "\t-g one continuous gap size allowed on a read. [%d] bp\n", opt->gap_len );//max_gap, allowed_gap fprintf(stdout, "\t-R for long insert size of pair end reads RF. [none](means FR pair)\n"); fprintf(stdout, "\t-e will not allow gap exist inside n-bp edge of a read, default=5\n"); //gap_edge //fprintf(stdout, "\t\t-z initial quality, default=@ [Illumina is using '@', Sanger Institute is using '!']\n");//zero_quality // fprintf(stdout, "\t\t-c [0,1,2] do alignment on which reference chain? 0:both; 1:forward only; 2:reverse only. 
default=%d");//chains #ifdef PTHREADS fprintf(stdout, "\t-p number of processors to use, [%d]\n", opt->nthreads); //number of processors #endif fprintf(stdout, "\n\t-h this help\n\n"); exit(1); //*/ } /* ----- end of function Usage ----- */ SOAPOPT *OptIni(){ SOAPOPT *o; o = (SOAPOPT *) malloc (1 * sizeof(SOAPOPT)); o->fast = FASTQ; o->aln_len = 256; o->ns = 5; o->max_mm = 5; o->gap_len = 5; o->gap_fb = 5; o->nthreads = 1; o->min_ins = 400; o->max_ins = 600; o->unmapped = 0; o->rr = 1; o->gap_len = 0; o->pe = FALSE; o->cutoff = 1; o->mode = 4; o->id = FALSE; o->FR = TRUE; o->allErr = FALSE; o->min_len = 255; o->uniq = 0; return o; } void ParseOpt(int argc, char *argv[]){ char c; while((c = getopt(argc, argv, "a:b:D:o:2:u:m:x:M:AK:l:v:U:g:w:i:e:q:c:Rz:r:B:s:p:tn:h"))!=-1){ switch(c){ //basic IO case 'a': snprintf(readAFileName, MAX_FILENAME_LEN, "%s", optarg); break; case 'D': snprintf(database_prefix, MAX_FILENAME_LEN, "%s", optarg); break; case 'o': snprintf(outFileName, MAX_FILENAME_LEN, "%s", optarg); break; case 'b': opt->pe = TRUE; snprintf(readBFileName, MAX_FILENAME_LEN, "%s", optarg); break; case '2': snprintf(outUnpairFileName, MAX_FILENAME_LEN, "%s", optarg); break; case 'm': opt->min_ins = atoi(optarg); break; case 'x': opt->max_ins = atoi(optarg); break; //advance options case 'u': opt->unmapped = TRUE; snprintf(outUnmapFileName, MAX_FILENAME_LEN, "%s", optarg); break; case 'l': opt->aln_len = atoi(optarg); break; case 'M': { opt->mode = atoi(optarg); if(opt->mode == 4) opt->cutoff = 1; else if(opt->mode == 5) opt->cutoff = MAX_ALN; break; } case 'K': opt->cutoff = min(atoi(optarg), MAX_ALN); break; case 'v': opt->max_mm = min(atoi(optarg), MAX_MISMATCH); break; case 'g': opt->gap_len = min(atoi(optarg), MAX_GAP_LEN); break; case 'e': opt->gap_fb = atoi(optarg); break; case 'R': opt->FR = 0; break; case 'z': opt->zero_qual = atoi(optarg); break; case 'r': opt->rr = atoi(optarg); break; case 't': opt->id = 1; break; case 'n': opt->ns = atoi(optarg); 
break; case 'B': opt->bisulfite = atoi(optarg); break; case 'U': opt->uniq = atoi(optarg); break; case 's': opt->min_len = atoi(optarg); break; #ifdef PTHREADS #define MAX_PTHREADS 20 case 'p': opt->nthreads = min(atoi(optarg), MAX_PTHREADS); break; #endif case 'c': opt->chain = atoi(optarg); break; //unrecognizable input case 'h': case '?': Usage(); } } } void FileNameIni(){ snprintf(AnnotationFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, AnnotationSuffix); snprintf(PackedDNAFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, PackedDNASuffix); snprintf(BWTCodeFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, BWTCodeSuffix); snprintf(BWTOccValueFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, BWTOccValueSuffix); snprintf(SaValueFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, SaValueSuffix); snprintf(RevPackedDNAFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, RevPackedDNASuffix); snprintf(RevBWTCodeFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, RevBWTCodeSuffix); snprintf(RevBWTOccValueFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s",database_prefix , RevBWTOccValueSuffix); snprintf(LookupTableFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, LookupTableSuffix); snprintf(RevLookupTableFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, RevLookupTableSuffix); snprintf(HighOccHashTableFileName, MAX_FILENAME_LEN+MAX_SUFFIX_LEN, "%s%s", database_prefix, HighOccHashTableSuffix); } #define MODE S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH void FileTest(FILEDS *fds){ if((fds->ifdA=open(readAFileName, O_RDONLY))==-1){ fprintf(stderr, "Query File Error: Can't read %s\n", readAFileName); exit(EXIT_FAILURE); } if((fds->ofdAln=creat(outFileName, MODE))==-1){ fprintf(stderr, "Output File Error: Can't write %s\n",outFileName); exit(EXIT_FAILURE); } if(opt->pe){ if((fds->ifdB=open(readBFileName, O_RDONLY))==-1){ 
fprintf(stderr, "Query File Error: Can't read %s\n", readBFileName); exit(EXIT_FAILURE); } if( (fds->ofdSe=creat(outUnpairFileName, MODE))==-1){ fprintf(stderr, "Output File Error: Can't write %s\n", outUnpairFileName); exit(EXIT_FAILURE); } } fprintf(stderr, "Query File a: %s\n", readAFileName); if (opt->pe)fprintf(stderr, "Query File b: %s\n", readBFileName); fprintf(stderr, "Output File: %s\n", outFileName); if (opt->pe)fprintf(stderr, " %s\n", outUnpairFileName); if(opt->unmapped){ if ((fds->ofdUn=creat(outUnmapFileName, MODE))==-1){ fprintf(stderr, "Output File Error: Can't write %s\n", outUnmapFileName); exit(EXIT_FAILURE); } else { fprintf(stderr, " %s\n", outUnmapFileName); } } } int main(int argc, char *argv[]){ opt = OptIni(); if (argc < 3) { Usage(); } HSP *hsp; BWT *bwt; BWT *rev_bwt; LOOKUPTABLE lookup; LOOKUPTABLE rev_lookup; HASHTABLE hashtable; MMPool *mmPool; MMMasterInitialize(3, 0, FALSE, NULL); mmPool = MMPoolCreate(PoolSize); HSPFillCharMap(charMap); HSPFillComplementMap(complementMap); asciiTime("Begin Program SOAPaligner/soap2"); double startTime = setStartTime(); double elapsedTime = 0; double loadTime=0; ParseOpt(argc, argv); FileNameIni(); fprintf(stderr, "Reference: %s\n", database_prefix); FILEDS fds; FileTest(&fds); fprintf(stderr, "Load Index Table ...\n"); //* hsp = HSPLoad(mmPool, PackedDNAFileName, AnnotationFileName); bwt = BWTLoad(mmPool, BWTCodeFileName, BWTOccValueFileName, SaValueFileName, NULL, NULL, NULL); rev_bwt = BWTLoad(mmPool, RevBWTCodeFileName, RevBWTOccValueFileName, NULL, NULL, NULL, NULL); LoadLookupTable(&lookup,LookupTableFileName,LookUpTableSize); LoadLookupTable(&rev_lookup,RevLookupTableFileName,RevLookUpTableSize); LoadHashTable(&hashtable,HighOccHashTableFileName); RegisterDecoder(bwt,&hashtable); loadTime = getElapsedTime(startTime); //*/ fprintf(stderr, "Load Index Table OK\n"); fprintf(stderr, "Begin Alignment ...\n"); MatchProcess(&fds, bwt, rev_bwt, &lookup, &rev_lookup, hsp, opt); //*/ elapsedTime = 
getElapsedTime(startTime); fprintf(stderr, "Total Elapsed Time: %9.2f\n" " - Load Index Table: %9.2f\n" " - Alignment: %9.2f\n", elapsedTime, loadTime, (elapsedTime-loadTime)); FreeLookupTable(&lookup); FreeLookupTable(&rev_lookup); FreeHashTable(&hashtable); HSPFree(mmPool, hsp); BWTFree(mmPool, bwt); BWTFree(mmPool, rev_bwt); MMPoolFree(mmPool); close(fds.ifdA);close(fds.ofdAln); if(opt->pe){close(fds.ifdB); close(fds.ofdSe);} if(opt->unmapped)close(fds.ofdUn); free(opt); asciiTime("SOAPaligner/soap2 End"); fprintf(stderr, "\n"); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */ soap2.20/Match.c0000644000105300011350000003126211231711753012345 0ustar yuchangrd/* * ============================================================================= * * Filename: Match.c * * Description: * * Revision: none * Compiler: gcc 4.3.2 or above * * Author: Chang Yu (yc), yuchang@genomics.org.cn * Company: BGI Shenzhen * CopyRight: Copyright (c) 2009, BGI Shenzhen * * ============================================================================= */ #include "Match.h" #define GenCigarMD() { \ if (hits->itemList[i].n_cigar == 0){ \ int j, match; \ match = 0; \ for(j=0; j>4)))>>(((~occPos)&0xf)<<1)))&0x3; \ if(!(((*(seq+j))&0x3) ^ c)){ \ ++match; \ } else { \ if(match||!j) ksprintf(str, "%d", match); \ kputc("ACGTN"[c], str); \ match = 0; \ } \ } \ ksprintf(str, "%d", match); \ (alnSeq->itemList+i)->n_cigar = 1; \ (alnSeq->itemList+i)->cigar = (unsigned short *)malloc(sizeof(unsigned short)*(alnSeq->itemList->n_cigar)); \ (alnSeq->itemList+i)->cigar[0] = (FROM_M << 14) | (len & 0x3ff); \ (alnSeq->itemList+i)->md = strdup(str->s); \ } else { \ int n_cigar = hits->itemList[i].n_cigar; \ hits->itemList[i].n_cigar = 0; \ (alnSeq->itemList+i)->cigar = (unsigned short *)malloc(sizeof(unsigned short)*(1+n_cigar)); \ memcpy((alnSeq->itemList+i)->cigar, cigar, n_cigar*sizeof(unsigned short)); \ unsigned int x = (alnSeq->itemList+i)->occ_pos; \ unsigned int y, z; \ y = z = 
0; \ int k, l, u; \ unsigned char c; \ for (k = u = 0; k < n_cigar; ++k) { \ l = cigar[k]&0x3fff; \ if (cigar[k]>>14 == FROM_M) { \ for (z = 0; z < l && x+z < dnaLength; ++z) { \ c = (((*(pacRef+((x+z)>>4)))>>(((~(x+z))&0xf)<<1))) & 0x3; \ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { \ if(u||!(y+z)) ksprintf(str, "%d", u); \ kputc("ACGTN"[c], str); \ u = 0; \ } else ++u; \ } \ x += l; y += l; \ } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { \ y += l; \ } else if (cigar[k]>>14 == FROM_D) { \ ksprintf(str, "%d", u); \ kputc('D', str); \ for (z = 0; z < l && x+z < dnaLength; ++z) \ kputc("ACGTN"[(((*(pacRef+((x+z)>>4)))>>(((~(x+z))&0xf)<<1))) & 0x3], str); \ u = 0; \ x += l; \ } \ } \ /* free(cigar); cigar = NULL; */ \ ksprintf(str, "%d", u);\ (alnSeq->itemList+i)->md = strdup(str->s); \ } \ } inline void PickupHit(ALNSEQ *alnSeq, const int rr,int *site, HITTABLE *hits, const unsigned int *pacRef, const unsigned int dnaLength, unsigned short *cigar){ #ifdef DEBUG // fprintf(stderr, "Pick up for output\n"); #endif int i = *site; kstring_t *str = (kstring_t *)calloc(1, sizeof(kstring_t)); str->l = 0; str->m = 0; if (!hits->n || (hits->n > 1 && !rr)) {alnSeq->report = 0; alnSeq->nhits =0; return;} else { int n; n = hits->n; if(rr == 1 || rr == 0) { alnSeq->report = 1; alnSeq->itemList = (HITITEM *)malloc(sizeof(HITITEM) *1); // assert(in); HITCPY(alnSeq->itemList, hits->itemList+i); unsigned int occPos = alnSeq->itemList->occ_pos; unsigned int len = alnSeq->len; char *seq = alnSeq->itemList->strain?alnSeq->rc:alnSeq->seq; if (hits->itemList[i].n_cigar == 0){ int j, match; match = 0; for(j=0; j>4)))>>(((~occPos)&0xf)<<1)))&0x3; if(!(((*(seq+j))&0x3) ^ c)){ ++match; } else { if(match || !j)ksprintf(str, "%d", match); kputc("ACGTN"[c], str); match = 0; } } if(match)ksprintf(str, "%d", match); alnSeq->itemList->n_cigar = 1; alnSeq->itemList->cigar = (unsigned short *)malloc(sizeof(unsigned short)*(alnSeq->itemList->n_cigar)); alnSeq->itemList->cigar[0] = (FROM_M 
<< 14) | (len & 0x3fff); alnSeq->itemList->md = strdup(str->s); // fprintf(stderr, "%d%c\n", alnSeq->itemList->cigar[0]&0x3ff, "MIDS"[alnSeq->itemList->cigar[0]>>14]); } else { int n_cigar = hits->itemList[i].n_cigar; hits->itemList[i].n_cigar = 0; alnSeq->itemList->cigar = (unsigned short *)malloc(sizeof(unsigned short)*(1+n_cigar)); memcpy(alnSeq->itemList->cigar, cigar, n_cigar*sizeof(unsigned short)); unsigned int x = alnSeq->itemList->occ_pos; unsigned int y, z; y = z = 0; int k, l, u; k = l = u = 0; unsigned char c; for (k = u = 0; k < n_cigar; ++k) { l = cigar[k]&0x3fff; if (cigar[k]>>14 == FROM_M) { for (z = 0; z < l && x+z < dnaLength; ++z) { c = (((*(pacRef+((x+z)>>4)))>>(((~(x+z))&0xf)<<1))) & 0x3; if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { if(u||!(y+z))ksprintf(str, "%d", u); kputc("ACGTN"[c], str); u = 0; } else ++u; } x += l; y += l; } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { y += l; } else if (cigar[k]>>14 == FROM_D) { ksprintf(str, "%d", u); kputc('D', str); for (z = 0; z < l && x+z < dnaLength; ++z) kputc("ACGTN"[(((*(pacRef+((x+z)>>4)))>>(((~(x+z))&0xf)<<1))) & 0x3], str); u = 0; x += l; } } if (u) ksprintf(str, "%d", u); alnSeq->itemList->md = strdup(str->s); } // GenCigarMD(); alnSeq->nhits = n; } else { alnSeq->report = n; alnSeq->itemList = (HITITEM *)malloc(sizeof(HITITEM) * n); for (i = 0; i < n; ++i){ str->l = 0; HITCPY(alnSeq->itemList+i, hits->itemList+i); unsigned int occPos = (alnSeq->itemList+i)->occ_pos; unsigned int len = alnSeq->len; char *seq = (alnSeq->itemList+i)->strain?alnSeq->rc:alnSeq->seq; GenCigarMD(); } alnSeq->nhits = n; } } free(str->s); free(str); } void SEAlnCore(int tid, MULTISEQ *mseqs, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HSP *hsp, const SOAPOPT *opt) { int i; ALNSEQ *alnSeq; HITTABLE *hits; hits = (HITTABLE *) malloc (sizeof(HITTABLE) * 1); hits->itemList = (HITITEM *) malloc (sizeof(HITITEM) * MAX_ALN); /* for(i=0; ipath = (unsigned short *) malloc 
(sizeof(unsigned short) * MAX_DIFF); //*/ int mode, seedLen, ns, rr, cutoff; mode = opt->mode; rr = opt->rr;ns = opt->ns; seedLen = opt->aln_len; cutoff = opt->cutoff; if (opt->uniq) mode = 4; BWTOPT bo; //* bo.nblock = hsp->numOfBlock; bo.blockList = hsp->blockList; bo.cutoff = MAX_ALN; bo.max_mm = opt->max_mm; bo.gap_len = opt->gap_len; bo.gap_fb = opt->gap_fb; bo.pacRef = hsp->packedDNA; bo.dnaLen = hsp->dnaLength; // int count = 0; //*/ #ifdef DEBUG // fprintf(stderr, "%d\n", mseqs->n); #endif for(i=0; i < mseqs->n; i+=1){ // fprintf(stderr, "n reads %d\n", i); alnSeq = mseqs->seqList+i; //* #ifdef PTHREADS ALNSEQ *p = mseqs->seqList + i; if (opt->nthreads > 1) { pthread_mutex_lock(&lock); if (alnSeq->tid < 0) { int j; for (j = i; j < mseqs->n && j < i + NSEQ_PER_THREAD; ++j) p++->tid = tid; } else if (alnSeq->tid != tid) { pthread_mutex_unlock(&lock); continue; } pthread_mutex_unlock(&lock); } #endif ///* ----- not PTHREADS -----*/ if (alnSeq->ns <= ns){ int h0, h1, h2, h3; h0 = h1 = h2 = h3 = 0; hits->n = 0; bo.seqLen = bo.alnLen = alnSeq->len; unsigned int extLen = 0; bo.fw = alnSeq->seq; bo.rc = alnSeq->rc; ALIGN: bo.h = bo.alnLen>>1; bo.x = bo.alnLen>39?bo.alnLen/3:13; bo.y = bo.alnLen-13-bo.x; if (bo.y <= 0) {fprintf(stderr, "length y < 0, countinue as 13\n");} switch (mode) { case 5: case 4: case 0: h0 = BWTExactMatching((unsigned char *)alnSeq->seq, &bo, FORWARD, bwt, lookup, hits); h0 += BWTExactMatching((unsigned char *)alnSeq->rc+extLen, &bo, REVERSE, bwt, lookup, hits); if(hits->n >= cutoff || mode == 0) break; case 1: h1 = BWT1ErrorMatching((unsigned char *)alnSeq->seq, &bo, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hits); h1 += BWT1ErrorMatching((unsigned char *)alnSeq->rc+extLen, &bo, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hits); if(hits->n >= cutoff || mode == 1) break; case 2: h2 = BWT2ErrorMatching((unsigned char *)alnSeq->seq, &bo, FORWARD, bwt, rev_bwt, lookup, rev_lookup, hits); h2 += BWT2ErrorMatching((unsigned char 
*)alnSeq->rc+extLen, &bo, REVERSE, bwt, rev_bwt, lookup, rev_lookup, hits); if(mode == 4 || hits->n >= cutoff || mode == 2) break; } if (!hits->n && seedLen < bo.alnLen) { bo.alnLen = seedLen; extLen = alnSeq->len-seedLen; goto ALIGN; } if (hits->n) { alnSeq->flag = 0; int site = hits->n?(hits->n == 1?0:rand()%hits->n):-1; PickupHit(alnSeq, rr, &site, hits, hsp->packedDNA, hsp->dnaLength, NULL); }else{ alnSeq->flag = 0; alnSeq->report = 0; } }else{ alnSeq->flag = 0; alnSeq->report = 0; } //*/ } // fprintf(stderr, "Alignment Time: %2.7f\n", getElapsedTime(startTime)); free(hits->itemList);free(hits); } #ifdef PTHREADS typedef struct _THREADAUX_TYPE_{ int tid; BWT *bwt; BWT *rev_bwt; LOOKUPTABLE *lookup; LOOKUPTABLE *rev_lookup; HSP *hsp; MULTISEQ *mseqs; SOAPOPT *o; }THREADAUX; static void *Workers(void *threadAux){ THREADAUX *aux = (THREADAUX *)threadAux; aux->o->pe? PEAlnCore(aux->tid, aux->mseqs, aux->bwt, aux->rev_bwt, aux->lookup,aux->rev_lookup, aux->hsp, aux->o) :SEAlnCore(aux->tid, aux->mseqs, aux->bwt, aux->rev_bwt, aux->lookup, aux->rev_lookup, aux->hsp, aux->o); } #endif void MatchProcess (FILEDS *fds, BWT *bwt, BWT *rev_bwt, LOOKUPTABLE *lookup, LOOKUPTABLE *rev_lookup, HSP *hsp, SOAPOPT * const opt) { InFileList *ifp; OutFileList *ofp; ifp = (InFileList *) malloc(sizeof(InFileList) * 1); ofp = (OutFileList *) malloc(sizeof(OutFileList) * 1); ifp->ifpA = fdopen(fds->ifdA, "r"); ofp->ofpAln = fdopen(fds->ofdAln, "w"); if (opt->pe) { ifp->ifpB = fdopen(fds->ifdB, "r"); ofp->ofpSe = fdopen(fds->ofdSe, "w"); } if (opt->unmapped) ofp->ofpUn = fdopen(fds->ofdUn, "w"); int fast = CheckFast(fds->ifdA); ifp->id = 0; MULTISEQ mseqs; mseqs.n = mseqs.max = 0; mseqs.seqList = (ALNSEQ *)malloc(sizeof(ALNSEQ) * MAX_MULTI_READS); #define INITALN(aln) { \ int j; \ for(j=0;jallErr; o_aux.un = opt->unmapped; o_aux.id = opt->id; o_aux.chrName = hsp->chrName; o_aux.chrNum = hsp->chrNum; //*/ #ifdef DEBUG // fprintf(stderr, "Begin Aln process\n"); #endif while 
(GetMultiSeq(ifp, &mseqs,opt->pe, fast?fastq:fasta) != 0) { nseq += mseqs.n; startAlnTime = setStartTime(); #ifndef PTHREADS // fprintf(stderr, "no threads\n"); opt->pe ? PEAlnCore(0, &mseqs, bwt, rev_bwt, lookup, rev_lookup, hsp, opt) :SEAlnCore(0, &mseqs, bwt, rev_bwt, lookup, rev_lookup, hsp, opt); #else if(opt->nthreads <= 1) opt->pe?PEAlnCore(0, &mseqs, bwt, rev_bwt, lookup, rev_lookup, hsp, opt) :SEAlnCore(0, &mseqs, bwt, rev_bwt, lookup, rev_lookup, hsp, opt); else { pthread_t *tid; pthread_attr_t attr; THREADAUX *threadAux; int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); threadAux = (THREADAUX *)calloc(opt->nthreads, sizeof(THREADAUX)); tid = (pthread_t*)calloc(opt->nthreads, sizeof(pthread_t)); for (j = 0; j < opt->nthreads; ++j) { threadAux[j].tid = j; threadAux[j].bwt = bwt; threadAux[j].rev_bwt = rev_bwt; threadAux[j].lookup = lookup; threadAux[j].rev_lookup = rev_lookup; threadAux[j].hsp = hsp; threadAux[j].mseqs = &mseqs; threadAux[j].o = opt; pthread_create(&tid[j], &attr, Workers, threadAux + j); } pthread_attr_destroy(&attr); for (j = 0; j < opt->nthreads; ++j) pthread_join(tid[j], 0); free(threadAux); free(tid); } #endif fprintf(stderr, "%d ok %7.2f sec\n", nseq, getElapsedTime(startAlnTime)); DumpAln(&mseqs, &o_aux, ofp, &nAln, &nSE); FreeMultiSeq(&mseqs); } if (opt->pe) fprintf(stderr, "Total Pairs: %d PE\n" "Paired: %d (%5.2f%%) PE\n" "Singled: %d (%5.2f%%) SE\n", nseq/2, nAln/2, (float)nAln/nseq*100, nSE, (float)nSE/(nseq)*100); else fprintf(stderr, "Total Reads: %d\n" "Alignment: %d (%5.2f%%)\n", nseq, nAln, (float)nAln/nseq*100); free(mseqs.seqList); fclose(ifp->ifpA); fclose(ofp->ofpAln); if(opt->pe){fclose(ifp->ifpB); fclose(ofp->ofpSe);} if(opt->unmapped) fclose(ofp->ofpUn); free(ifp); free(ofp); } /* */ soap2.20/NEWS0000644000105300011350000000020011370171567011637 0ustar yuchangrdBeta Release 2.20 (5 May, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ First source code release version. 
(2.20: 5 May, 2010) soap2.20/ChangeLog10000644000105300011350000000060511205266470012777 0ustar yuchangrdMay 21 2009 1. SWRescue serier bug fixed ------------------------------------------------------ Apr 10 2009 1. -l option compatible with diff read_length 2. -s min_length after soft clip 3. seq and quality real length is coordinated by soft clip 4. MD contain no 0 except first ------------------------------------------------------ Apr 8 2009 1. -t option 2. -r segfault 3. chang x, y soap2.20/ChangLogCode.Apr10000644000105300011350000000430711164550406014150 0ustar yuchangrd--- ../soap2.15/BWTAln.c 2009-03-27 10:42:36.000000000 +0800 +++ BWTAln.c 2009-04-01 10:25:41.000000000 +0800 @@ -163,6 +163,8 @@ unsigned char ec; unsigned int i; int k; + const int coord = (info>>24&1)?(bo->seqLen-bo->alnLen):0; + for (i=0;(i>24&1)?(bo->seqLen-bo->alnLen):0; // fprintf(stdout, "BWT2misMatching\n"); //Separate into 4 cases according to the documentation. //============================================== @@ -426,7 +429,7 @@ unsigned int mk_l=l; unsigned int mk_r=r; info &= 0x7000000; - info |= ((((ec&0x3)<<8)|(i&0xff))&0x3ff)<<12; + info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; mk_l = bwt->cumulativeFreq[ec] + occCount_pstart[ec] + 1; mk_r = bwt->cumulativeFreq[ec] + occCount_pend[ec]; @@ -523,7 +526,7 @@ if (convertedKey[i]==ec) continue; // fprintf(stdout, "%d\n", i); info &= 0x7000000; - info |= ((((ec&0x3)<<8)|(i&0xff))&0x3ff)<<12; + info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; unsigned int mk_l=l; unsigned int mk_r=r; unsigned int rev_mk_l=rev_l; @@ -596,7 +599,7 @@ for (ec=0;ec<4;ec++) { if (ec == (convertedKey[i] & 0x3)) continue; info &= 0x7000000; - info |= ((((ec&0x3)<<8)|(i&0xff))&0x3ff)<<12; + info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; unsigned long long err_packedPattern = packedPattern; unsigned long long err_rev_packedPattern = rev_packedPattern; @@ -681,7 +684,7 @@ for (ec=0;ec<4;ec++) { if ((convertedKey[i]&0x3) ==ec) continue; info 
&= 0x7000000; - info |= ((((ec&0x3)<<8)|(i&0xff))&0x3ff)<<12; + info |= ((((ec&0x3)<<8)|((i+coord)&0xff))&0x3ff)<<12; unsigned int mk_l=l; unsigned int mk_r=r; unsigned int rev_mk_l=rev_l; soap2.20/soap.man0000644000105300011350000001362611370170201012577 0ustar yuchangrdSOAPaligner/soap2(1) Bioinformatics tool SOAPaligner/soap2(1) NNAAMMEE SOAPaligner/soap2 - Short Oligonucleotide Analysis Package aligner SSYYNNOOPPSSIISS soap reference.index short_reads.fast[a|q] alignment.out [options] DDEESSCCRRIIPPTTIIOONN SOAPaligner/soap2 is a member of the SOAP (Short Oligonucleotide Analy- sis Package). It is an updated version of SOAP software for short oligonucleotide alignment. The new program features in super fast and accurate alignment for huge amounts of short reads generated by Illu- mina/Solexa Genome Analyzer. Compared to soap v1, it is one order of magnitude faster. It require only 2 minutes aligning one million sin- gle-end reads onto the human reference genome. Another remarkable improvement of SOAPaligner is that it now supports a wide range of the read length. SOAPaligner benefitted in time and space efficiency by a revolution in the basic data structures and algorithms used.The core algorithms and the indexing data structures (2way-BWT) are developed by the algorithms research group of the Department of Computer Science, the University of Hong Kong (T.W. Lam, Alan Tam, Simon Wong, Edward Wu and S.M. Yiu). CCOOMMMMAANNDD AANNDD OOPPTTIIOONNSS ssooaapp -D -a [-b ] -o [-2 ] [options] OOPPTTIIOONNSS:: --DD SSTTRR Prefix name for reference index [*.index]. 
See AAPPPPEENNDDIIXX How to build the reference index --aa SSTTRR Query file, for SE reads alignment or one end of PE reads --bb SSTTRR Query b file, one end of PE reads --oo SSTTRR Output file for alignment results --22 SSTTRR Output file contains mapped but unpaired reads when do PE alignment --uu SSTTRR Output file for unmapped reads, [none] --mm IINNTT Minimal insert size INT allowed for PE, [400] --xx IINNTT Maximal insert size INT allowed for PE, [600] --nn IINNTT Filter low quality reads containing more INT bp Ns, [5] --tt Output reads id instead reads name, [none] --rr IINNTT How to report repeat hits, 0=none; 1=random one; 2=all, [1] --RR RF alignment for long insert size(>= 2k bps) PE data, [none] FR alignment --ll IINNTT For long reads with high error rate at 3'-end, those can't align whole length, then first align 5' INT bp sub- sequence as a seed, [256] use whole length of the read --ss IINNTT minimal alignment length (for soft clip) --vv IINNTT Totally allowed mismatches in one read, when use subse- quence as a seed, [5] --gg IINNTT Allow gap size in one read, [0] --MM IINNTT Match mode for each read or the seed part of read, which shouldn't contain more than 2 mismaches, [4] 0: exact match only 1: 1 mismatch match only 2: 2 mismatch match only 4: find the best hits --pp IINNTT Multithreads, n threads, [1] OOUUTTPPUUTT FFOORRMMAATT SOAP2 output format contains following column information: 1. reads name / reads ID (if -t is available) 2. reads sequence (if read align to reverse strand, here is the reverse sequence of orignal read) 3. quality sequence (if input is fasta reads, the column will be all 'h', and the sequence is backward if reads mapping reverse ) 4. AAPPPPEENNDDIIXX Before use soap2 to do alignment, the reference index must be generated by 2bwt-builder. 22bbwwtt--bbuuiillddeerr NNOOTTEE:: 1. the reference input should only be FASTA format; 2. 
the program will auto generate the index files in the directory where the fasta file is located, so confirm the permission at first. EENNVVIIRROONNMMEENNTT The datastructure is incompatible with 32bit, so it can't be migrated on any 32bit platforms. Due to using the MMX instruction to optimize parts of code, the current version can only run on xx8866__6644 ppllaattffoorrmm.. We will provide a universal version for most of the 64bit platform later. HHAARRDDWWAARREE RREEQQUUIIRREEMMEENNTT 1.8Gb RAM (for a genome as large as human's) 2.at least 8Gb hard disk to store index (for a genome as large as human's) SSYYSSTTEEMM RREEQQUUIIRREEMMEENNTT Linux x86_64 SSEEEE AALLSSOO Website for SOAP , Google Group for SOAP PPuubblliiccaattiioonn:: "SOAP: short oligonucleotide alignment program" (2008) BIOINFOR- MATICS,Vol. 24 no.5 2008, pages 713-714 AAUUTTHHOORR BBGGII SShheennzzhheenn SOAP team. The core algorithm Bidirect-BWT is written by Prof. T.W. Lam and his team at HongKong University. RREEPPOORRTT BBUUGGSS Report bugs to AACCKKNNOOWWLLEEDDGGEEMMEENNTTSS We appreciate Prof. T.W. Lam, Alan Tam, Simon Wong, Edward Wu and S.M. Yiu prominent work on Bidirect-BWT. SOAPaligner-2.1X 25 May 2009 SOAPaligner/soap2(1) soap2.20/soap.10000644000105300011350000001101511370170163012161 0ustar yuchangrd.TH SOAPaligner/soap2 1 "25 May 2009" SOAPaligner-2.1X "Bioinformatics tool" .SH NAME .PP SOAPaligner/soap2 \- Short Oligonucleotide Analysis Package aligner .SH SYNOPSIS .PP soap reference.index short_reads.fast[a|q] alignment.out [options] .SH DESCRIPTION .PP SOAPaligner/soap2 is a member of the SOAP (Short Oligonucleotide Analysis Package). It is an updated version of SOAP software for short oligonucleotide alignment. The new program features super fast and accurate alignment for huge amounts of short reads generated by Illumina/Solexa Genome Analyzer. Compared to soap v1, it is one order of magnitude faster.
It requires only 2 minutes to align one million single-end reads onto the human reference genome. Another remarkable improvement of SOAPaligner is that it now supports a wide range of read lengths. .PP SOAPaligner benefits in time and space efficiency from a revolution in the basic data structures and algorithms used. The core algorithms and the indexing data structures (2way-BWT) were developed by the algorithms research group of the Department of Computer Science, the University of Hong Kong (T.W. Lam, Alan Tam, Simon Wong, Edward Wu and S.M. Yiu). .SH COMMAND AND OPTIONS .PP .B soap -D -a [-b ] -o [-2 ] [options] .P .B OPTIONS: .RS .TP .B -D STR Prefix name for reference index [*.index]. See .B APPENDIX How to build the reference index .TP .B -a STR Query file, for SE reads alignment or one end of PE reads .TP .B -b STR Query b file, one end of PE reads .TP .B -o STR Output file for alignment results .TP .B -2 STR Output file for mapped but unpaired reads when doing PE alignment .TP .B -u STR Output file for unmapped reads, [none] .TP .B -m INT Minimal insert size INT allowed for PE, [400] .TP .B -x INT Maximal insert size INT allowed for PE, [600] .TP .B -n INT Filter low quality reads containing more than INT Ns, [5] .TP .B -t Output read ID instead of read name, [none] .TP .B -r INT How to report repeat hits, 0=none; 1=random one; 2=all, [1] .TP .B -R RF alignment for PE data with long insert size (>= 2k bps), [none] (FR alignment) .TP .B -l INT For long reads with a high error rate at the 3'-end that cannot be aligned over their whole length, first align the 5' INT bp subsequence as a seed, [256] (use the whole length of the read) .TP .B -s INT Minimal alignment length (for soft clip), [255] .TP .B -v INT Total number of mismatches allowed in one read when a subsequence is used as a seed, [5] .TP .B -g INT Gap size allowed in one read, [0] .TP .B -M INT Match mode for each read or the seed part of read, which shouldn't contain more than 2 mismatches, [4] .RS .TP 0: exact match only .TP 1: 1 mismatch match only .TP
2: 2 mismatch match only .TP 4: find the best hits .RE .TP .B -p INT Multithreading, INT threads, [1] .SH OUTPUT FORMAT .PP SOAP2 output format contains the following column information: .PP 1. read name / read ID (if -t is specified) .P 2. read sequence (if the read aligns to the reverse strand, this is the reverse sequence of the original read) .P 3. quality sequence (if the input reads are FASTA, this column is all 'h'; the sequence is reversed if the read maps to the reverse strand) .P 4. .SH APPENDIX .PP Before using soap2 for alignment, the reference index must be generated by 2bwt-builder. .P .RS .B 2bwt-builder <reference.fasta> .P .B NOTE: 1. the reference input must be in FASTA format; 2. the program will automatically generate the index files in the directory where the FASTA file is located, so check the directory permissions first. .RE .SH ENVIRONMENT .PP The data structure is incompatible with 32-bit systems, so it cannot be migrated to any 32-bit platform. Because MMX instructions are used to optimize parts of the code, the current version can only run on the .B x86_64 platform. We will provide a universal version for most 64-bit platforms later. .TP .B HARDWARE REQUIREMENT .RS 1. 8 GB RAM (for a genome as large as human's) .P 2. at least 8 GB hard disk to store index (for a genome as large as human's) .RE .TP .B SYSTEM REQUIREMENT .RS Linux x86_64 .RE .SH SEE ALSO .PP Website for SOAP , .P Google Group for SOAP .TP .BR Publication: "SOAP: short oligonucleotide alignment program" (2008) BIOINFORMATICS, Vol. 24 no.5 2008, pages 713\-714 .SH AUTHOR .PP .B BGI Shenzhen SOAP team. The core algorithm Bidirect-BWT was written by Prof. T.W. Lam and his team at the University of Hong Kong. .SH REPORT BUGS .PP Report bugs to .SH ACKNOWLEDGEMENTS .PP We appreciate the prominent work of Prof. T.W. Lam, Alan Tam, Simon Wong, Edward Wu and S.M. Yiu on Bidirect-BWT. soap2.20/INSTALL0000644000105300011350000000031311370172123012163 0ustar yuchangrdSystem Requirements =================== Compiling SOAP2 requires an Intel x86_64 CPU and gcc version 4.2.3 or above.
Compilation =========== Type `make' to compile SOAP2. Installation ============ soap2.20/GPLv30000644000105300011350000010451311370170434011762 0ustar yuchangrd GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights.
Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. 
Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. 
A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. 
All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>.