pax_global_header00006660000000000000000000000064130756337410014523gustar00rootroot0000000000000052 comment=7c992efb9c404351ef6d129bf317583d11a3a2be ClonalFrameML-1.11/000077500000000000000000000000001307563374100140615ustar00rootroot00000000000000ClonalFrameML-1.11/.gitignore000066400000000000000000000000751307563374100160530ustar00rootroot00000000000000 src/ClonalFrameML src/main.o src/version.h src/.vscode/* ClonalFrameML-1.11/README.md000066400000000000000000000052631307563374100153460ustar00rootroot00000000000000# ClonalFrameML # Introduction # This is the homepage of ClonalFrameML, a software package that performs efficient inference of recombination in bacterial genomes. ClonalFrameML was created by [Xavier Didelot](http://www.imperial.ac.uk/medicine/people/x.didelot/) and [Daniel Wilson](http://www.danielwilson.me.uk/). ClonalFrameML can be applied to any type of aligned sequence data, but is especially aimed at analysis of whole genome sequences. It is able to compare hundreds of whole genomes in a matter of hours on a standard Desktop computer. There are three main outputs from a run of ClonalFrameML: a phylogeny with branch lengths corrected to account for recombination, an estimation of the key parameters of the recombination process, and a genomic map of where recombination took place for each branch of the phylogeny. ClonalFrameML is a maximum likelihood implementation of the Bayesian software [ClonalFrame](http://www.xavierdidelot.xtreemhost.com/clonalframe.htm) which was previously described by [Didelot and Falush (2007)](http://www.genetics.org/cgi/content/abstract/175/3/1251). The recombination model underpinning ClonalFrameML is exactly the same as for ClonalFrame, but this new implementation is a lot faster, is able to deal with much larger genomic dataset, and does not suffer from MCMC convergence issues. 
A scientific paper describing ClonalFrameML in detail has been published, see [Didelot X, Wilson DJ (2015) ClonalFrameML: Efficient Inference of Recombination in Whole Bacterial Genomes. PLoS Comput Biol 11(2): e1004041. doi:10.1371/journal.pcbi.1004041](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004041). # Download and Installation # You can obtain the most up to date version of ClonalFrameML by downloading and compiling the C++ source code via GIT using the command: ``` git clone https://github.com/xavierdidelot/ClonalFrameML ``` Please note that the code for ClonalFrameML is distributed under the terms of the GNU GPL v3 license, for more details see https://www.gnu.org/copyleft/gpl.html You can compile the code using the following command: ``` cd ClonalFrameML/src ./make.sh ``` Compilation requires a C++ compiler, such as [GCC](https://gcc.gnu.org/), to be installed. Running the bundled R scripts requires [R](http://cran.r-project.org/) to be installed with the ape and phangorn packages. # User guide # The user guide for ClonalFrameML is available [here](https://github.com/xavierdidelot/clonalframeml/wiki). # Getting help # If you need assistance using ClonalFrameML, you can get in touch by emailing either [Xavier Didelot](http://www.xavierdidelot.xtreemhost.com/contact.htm) or [Daniel Wilson](http://www.danielwilson.me.uk/contact.html). ClonalFrameML-1.11/src/000077500000000000000000000000001307563374100146505ustar00rootroot00000000000000ClonalFrameML-1.11/src/README.txt000066400000000000000000000057501307563374100163550ustar00rootroot00000000000000ClonalFrameML Xavier Didelot and Daniel Wilson. 
2015 This program reads in a Newick tree and FASTA file and, for all variable sites, reconstructs the joint maximum likelihood sequences at all nodes (including, for the purposes of imputation, tips) using the HKY85 nucleotide substitution model and an algorithm described in: A Fast Algorithm for Joint Reconstruction of Ancestral Amino Acid Sequences Tal Pupko, Itsik Peer, Ron Shamir, and Dan Graur. Mol. Biol. Evol. 17(6):890–896. 2000 Branch lengths of the tree are corrected for heterospecific horizontal gene transfer using a new maximum-likelihood algorithm implementing the ClonalFrame model that was described in: Inference of Bacterial Microevolution Using Multilocus Sequence Data Xavier Didelot, and Daniel Falush. Genetics 175(3):1251-1266. 2007 Syntax: ClonalFrameML newick_file fasta_file output_file [OPTIONS] newick_file The tree specified in Newick format. It must be an unrooted bifurcating tree. All tips should be uniquely labelled and the internal nodes must not be labelled. Note that the branch lengths must be scaled in units of expected number of substitutions per site. Failure to provide appropriately scaled branch lengths will adversely affect results. fasta_file The nucleotide sequences specified in FASTA format, with labels exactly matching those in the newick_file. The letter codes A, C, G and T are interpreted directly, U is converted to T, and N, -, ? and X are treated equivalently as ambiguity codes. No other codes are allowed. output_file The prefix for the output files, described below. [OPTIONS] Run ClonalFrameML with no arguments to see the options available. The program reports the empirical nucleotide frequencies and the joint log-likelihood of the reconstructed sequences for variable sites. Files are output with the following suffixes: .labelled_tree.newick The corrected Newick tree is output with internal nodes labelled so that they correspond with the reconstructed ancestral sequence file. 
.ML_sequence.fasta The reconstructed sequences (ancestral and, for the purposes of imputation, observed) in FASTA format with letter codes A, C, G and T only. The labels match exactly those in the output Newick tree. .position_cross_reference.txt A vector of comma-separated values equal in length to the input FASTA file relating the positions of (variable) sites in the input FASTA file to the positions of their reconstructed sequences in the output FASTA file, starting with position 1. Sites in the input file not reconstructed are assigned a 0. .importation_status.txt A FASTA file representing the inferred importation status of every site coded as 0 (unimported) 1 (imported) 2 (unimported homoplasy/multiallelic) 3 (imported homoplasy/multiallelic) 4 (untested compatible) 5 (untested homoplasy). ClonalFrameML-1.11/src/bank/000077500000000000000000000000001307563374100155635ustar00rootroot00000000000000ClonalFrameML-1.11/src/bank/MLST.h000066400000000000000000000077551307563374100165310ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * MLST.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . 
*/ #ifndef _MLST_H_ #define _MLST_H_ #pragma warning(disable: 4786) #include "myutils/myerror.h" #include "myutils/vector.h" #include "myutils/matrix.h" #include "myutils/DNA.h" using namespace myutils; class MLST { public: int n; // number of sequences int nloc; // number of loci Vector nhap; // nhap[l] (l=0..nloc-1) gives the number of unique alleles at locus l Vector allele; // allele[l] (l=0..nloc-1) stores the DNA sequences of the nhap[l] unique alleles at locus l Matrix count; // count[l][i] (l=0..nloc-1,i=0..nhap[l]-1) is the count of unique allele i at locus l Matrix haplotype; // haplotype[i] (i=0..n-1) gives the allelic profile for sequence i, so that // haplotype[i][l] (l=0..nloc-1) is allele number at locus l, so that the DNA sequence // is accessed using allele[l][haplotype[i][l]]. However, a short-cut would be, rather than // using MLST.allele[l][haplotype[i][l]], to use MLST.seq(i,l). public: string& seq(const int i, const int l) { return allele[l][haplotype[i][l]]; } MLST() {}; MLST(const int nloc_in, const char* filename[]) { nloc = nloc_in; Vector temp(nloc); int l; for(l=0;l &temp) { initialize(temp); } void initialize(Vector &temp) { nloc = temp.size(); if(nloc<1) myutils::error("MLST::initialize(): must be at least one locus"); int l; n = temp[0]->nseq; for(l=1;lnseq!=n) myutils::error("MLST(): all loci should have the same number of sequences"); nhap.resize(nloc); allele.resize(nloc); haplotype = Matrix(n,nloc,-1); count = Matrix(nloc,n,0); Vector convert(n); int i,j; for(l=0;lsequence[i]==temp[l]->sequence[j]) { ++count[l][j]; haplotype[i][l] = j; break; } int check_total = 0; for(i=0;i0) ? 
1 : 0; check_total += count[l][i]; } if(check_total!=n) myutils::error("MLST(): problem in counting haplotypes"); allele[l].resize(nhap[l],temp[l]->lseq); int hap = 0; for(i=0;i0) { allele[l][hap] = temp[l]->sequence[i]; count[l][hap] = count[l][i]; convert[i] = hap; ++hap; } } if(hap!=nhap[l]) myutils::error("MLST(): hap and nhap disagree"); for(;hap1) { double pi = allele[l].pi(); double H = allele[l].H(); }*/ } /*cout << "Allelic profiles of the " << n << " haplotypes" << endl; for(i=0;i. */ #ifndef _APPROXDF_H_ #define _APPROXDF_H_ #include "myutils/vector.h" #include "myutils/myerror.h" #include namespace myutils { class approxdf { public: int n; Vector CDF,G,EV,PR; public: approxdf() { n = 0; CDF = G = EV = PR = Vector(0); } approxdf(Vector &EV_in, Vector &PR_in) { initialize(EV_in,PR_in); } void initialize(Vector &EV_in, Vector &PR_in) { n = EV_in.size(); if(PR_in.size()!=n) error("approxdf(): EV and PR must have same length"); EV = Vector(n); PR = Vector(n); int i; for(i=0;i0 && EV[i] PDF(n-1); for(i=1;i(n); CDF[0] = 0; for(i=1;i(n-1); for(i=1;ix) { --wh; break; } } if(wh==-1 || wh==n) error("cdf(): x lies outside original range"); //# A piecewise quadratic approximation to the c.d.f. return CDF[wh]+(x-EV[wh])*(PR[wh]+G[wh]/2*(x-EV[wh])); } double icdf(const double U) { int wh; for(wh=0;whU) { --wh; break; } } if(wh==-1) error("icdf(): U is less than 0"); if(wh==n) error("icdf(): U is greater than 1"); //# A piecewise inverse-quadratic approximation to the i.c.d.f. return ((G[wh]*EV[wh]-PR[wh]+sqrt(PR[wh]*PR[wh]+2*G[wh]*(U-CDF[wh])))/G[wh]); } double pdf(const double x) { int wh; for(wh=0;whx) { --wh; break; } } if(wh==-1 || wh==n) error("cdf(): x lies outside original range"); //# A piecewise linear approximation to the p.d.f. return PR[wh]+(x-EV[wh])*G[wh]; } }; }; //namespace myutils #endif//_APPROXDF_H_ ClonalFrameML-1.11/src/bank/census.h000066400000000000000000000175661307563374100172530ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. 
* * census.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* census.h 28th August 2009 */ /* */ /* Keeps track of the membership of a */ /* finite number of individuals among a */ /* finite number of populations. */ /* */ /* (c) Danny Wilson. */ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_CENSUS_H_ #define _MYUTILS_CENSUS_H_ #include "myutils/myerror.h" #include "myutils/vector.h" #include "myutils/utils.h" #include //#include using std::cout; using std::endl; namespace myutils { class Census { public: /*Default constructor*/ Census() { Vector where(0); initialize(0,0,where); } /*Constructor*/ Census(const unsigned int npop, const unsigned int nind) { Vector where(nind,0); initialize(npop,nind,where); } /*Constructor*/ Census(const unsigned int npop, const unsigned int nind, Vector &where) { initialize(npop,nind,where); } /*Copy constructor*/ Census(const Census& cen) { _npop = cen._npop; _nind = cen._nind; _where = cen._where; _who = cen._who; _index = cen._index; _mind = cen._mind; _cind = cen._cind; } /*Assignment operator*/ Census& operator=(const Census& cen) { _npop = cen._npop; _nind = cen._nind; _where = cen._where; _who = cen._who; _index = cen._index; _mind = cen._mind; _cind = cen._cind; return *this; } Census& initialize(const unsigned 
int npop, const unsigned int nind, Vector &where) { if(npop<0) error("Census::Census(): number of populations must be non-negative"); if(nind<0) error("Census::Census(): number of individuals must be non-negative"); /* Accept the arguments */ _npop = npop; _nind = nind; /* Initialize the membership lists */ _where = Vector(_nind); _mind = Vector(_npop,0); int i; for(i=0;i<_nind;i++) { if(where[i]<0) error("Census::Census(): population cannot be negative"); if(where[i]>=_npop) error("Census::Census(): population number exceeds maximum"); _where[i] = where[i]; _mind[where[i]]++; } /* Calculate the cumulative membership numbers */ _cind = Vector(_npop,0); int p; for(p=1;p<_npop;p++) { _cind[p] = _cind[p-1] + _mind[p-1]; } if(_npop>0 && _cind[_npop-1]+_mind[_npop-1]!=_nind) error("Census::Census(): number of individuals doesn't match"); /* Initialize the who list */ _who = Vector(_nind); _index = Vector(_nind); Vector _tind(_npop,0); for(i=0;i<_nind;i++) { const int pop = _where[i]; const int ix = _cind[pop]+_tind[pop]; _who[ix] = i; _index[i] = ix; ++_tind[pop]; } return *this; } /*Destructor*/ ~Census() {} /*Simple functions*/ int npop() {return _npop;} int nind() {return _nind;} int nind(const int p) {return _mind[p];} Vector where() {return _where;} int where(const int i) {return _where[i];} Vector who(const int p) { if(p<0 || p>=_npop) error("Census::who(): Population p out of range"); Vector ret(_mind[p]); int i; for(i=0;i<_mind[p];i++) { ret[i] = _who[_cind[p]+i]; } return ret; } int who(const int p, const int i) {return _who[_cind[p]+i];} int ferocious_who(const int p, const int i) { if(p<0 || p>=_npop) error("Census::who(): Population p out of range"); if(i<0 || i>=_mind[p]) error("Census::who(): Index i out of range for population p"); return _who[_cind[p]+i]; } int meek_who(const int p, const int i) { if(p<0 || p>=_npop) return -1; if(i<0 || i>=_mind[p]) return -1; return _who[_cind[p]+i]; } /* Last individual in the population */ int last(const int p) { 
if(p<0 || p>=_npop) error("Census::last(): population out of range"); if(_mind[p]==0) error("Census::last(): population is empty"); return _who[_cind[p]+_mind[p]-1]; } /*Not-so simple functions*/ int migrate(const int from, const int to) { const int ind = last(from); migrate(ind,from,to); return ind; } /* ind is the absolute index of the individual */ Census& migrate(const int ind, const int from, const int to) { if(from==to) return *this; if(from<0 || from>=_npop) error("Census::migrate(): donor population out of range"); if(_where[ind]!=from) error("Census::migrate(): individual is not member of donor population"); if(to<0 || to>=_npop) error("Census::migrate(): recipient population out of range"); if(fromto;p--) { /* 1. Add to new pop */ --_mind[p]; ++_mind[p-1]; ++_cind[p]; /* 2. Swap from last to first position */ const int ix_from = _cind[p-1]+_mind[p-1]-1; const int ix_to = _cind[p-1]; const int ifrom = _who[ix_from]; const int ito = _who[ix_to]; SWAP(_who[ix_from],_who[ix_to]); SWAP(_index[ifrom],_index[ito]); } /* Update _where */ _where[ind] = to; } return *this; } Census& inspect() { int i; cout << "_where = {" << _where[0]; for(i=1;i<_nind;i++) cout << " " << _where[i]; cout << "}" << endl; cout << "_who = {" << _who[0]; for(i=1;i<_nind;i++) cout << " " << _who[i]; cout << "}" << endl; cout << "_index = {" << _index[0]; for(i=1;i<_nind;i++) cout << " " << _index[i]; cout << "}" << endl; cout << "_mind = {" << _mind[0]; for(i=1;i<_npop;i++) cout << " " << _mind[i]; cout << "}" << endl; cout << "_cind = {" << _cind[0]; for(i=1;i<_npop;i++) cout << " " << _cind[i]; cout << "}" << endl; return *this; } protected: /* Number of populations */ int _npop; /* Number of individuals */ int _nind; /* _where[i], i=0.._nind-1, has value [0,_npop-1], Population to which individual i belongs */ Vector _where; /* _who[i], i=0.._nind-1, has value [0,_nind-1], Collapsed unordered list of individuals belonging to the population to which i corresponds */ Vector _who; /* 
_index[i], i=0.._nind-1, has value [0,_nind-1], Position of individual i in vector _who */ Vector _index; /* _mind[p], p=0.._npop-1, has value [0,_nind], Number of members of population p */ Vector _mind; /* _cind[p], p=0.._npop-1, has value [0,_nind], Cumulative number of members of population p */ Vector _cind; }; }; #endif // _MYUTILS_CENSUS_H_ ClonalFrameML-1.11/src/bank/cmatrix.h000066400000000000000000000103251307563374100174040ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * cmatrix.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* cmatrix.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _CMATRIX_H_ #define _CMATRIX_H_ #include #include namespace myutils { /*Cannot accept objects of type class*/ template class CMatrix { public: /*Preserve public access for back-compatibility*/ T **element; protected: int protected_nrows; int protected_ncols; int initialized; public: /*Default constructor*/ CMatrix() { initialized=0; initialize(0,0); } /*Constructor*/ CMatrix(int nrows, int ncols) { initialize(nrows,ncols); } /*Constructor*/ CMatrix(int nrows, int ncols, T value) { initialize(nrows,ncols); int i,j; for(i=0;i=0;i--) free((T*) element[i]); free((T**) element); } CMatrix& initialize(int nrows, int ncols) { element=(T **) malloc((unsigned) nrows*sizeof(T*)); if (!element) error("row allocation failure in Matrix::initialize()"); int i; for(i=0;i& resize(int nrows, int ncols) { int i; if (!initialized) initialize(nrows,ncols); else { if(nrows!=protected_nrows) { element=(T **) realloc(element,(unsigned) nrows*sizeof(T*)); if (!element) error("row allocation failure in Matrix::resize()"); if(nrows=nrows;i--) free ((T*) element[i]); } if(nrows>protected_nrows) { for(i=protected_nrows;i& mat) /* Copy constructor for the following cases: Matrix mat2(mat); Matrix mat2=mat; and when Matrix is returned from a function */ { initialize(mat.nrows(),mat.ncols()); int i,j; for(i=0;i& operator=(CMatrix& mat) { if(this==&mat)return *this; resize(mat.nrows(),mat.ncols()); int i,j; for(i=0;i. */ #ifndef _HRCOALESCENT_H_ #define _HRCOALESCENT_H_ //#include "coalesce/coalescent_control.h" //#include "coalesce/coalescent_process.h" #include "coalesce/coalescent_record.h" #endif ClonalFrameML-1.11/src/bank/coalescent_control.h000066400000000000000000000213441307563374100216200ustar00rootroot00000000000000/* Copyright 2013 Daniel Wilson. * * coalescent_control.h * Part of the coalesce library. 
* * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ #ifndef _CONTROL_H_ #define _CONTROL_H_ #pragma warning(disable: 4786) #include #include #include #include using namespace std; #include "myutils/matrix.h" #include "myutils/controlwizard.h" using myutils::Matrix; using myutils::ControlWizard; using myutils::TP_UNRECOGNISED; using myutils::TP_INT; using myutils::TP_DOUBLE; using myutils::TP_STRING; using myutils::TP_VEC_INT; using myutils::TP_VEC_DOUBLE; using myutils::TP_EXT_VEC_DOUBLE; //using myutils::DATA_TYPE; class Control { public: int nsamp; // Sample size vector ntimes; // Times (in Ne gens) of samples, ordering unimportant double Negens; // Expresses 1 unit of Ne gens in the same units as ntimes int loci; // Number of independent loci simulated int seq_len; // Total length of sequences simulated vector len; // Lengths for each locus double r; // Per site rate of crossing-over per Negens (standard model, lambda = 0) // or TWICE the per site rate of initiation of recombination per Negens // (bacterial model, lambda > 0) vector rmap; // Map for heterogeneous recombination rates double lambda; // 1/mean tract length double M; // headline per site mutation rate int n_states; // number of states (e.g. 
4 nucleotides, 64 codons) vector state_freq; // initial state frequencies vector state_rel_mut_rate; // mutation rates relative to headline mutation rate vector state_M; // 1/(state-specific per site mutation rate) // (to be calculated) Matrix mut_matrix; // transition matrix for the states vector state_name; // letters for the states int nruns; // store this in control class also int update_interval; bool coutput; /* Variables for structured coalescent */ int ndemes; vector deme_config; // for each sample member, their starting deme (0..ndemes-1) Matrix mig; // ndemes * ndemes matrix: double the backwards in time migration rate from i to j vector N_deme_over_D; // for each deme, the pop size relative to the total public: Control() { coutput=true; /* Set defaults */ nsamp = 2; ntimes = vector(2,0.0); Negens = 1.0; loci = 1; seq_len = 1; len = vector(1,1); r = lambda = M = 0.0; rmap = vector(0); ndemes = 0; } Control& read_input(char* filename) { seq_len=-1; Negens=1.0; //By default ControlWizard control_file; control_file.coutput=coutput; control_file.add_ITEM("n",TP_INT,&nsamp); control_file.add_ITEM("ntimes",TP_VEC_DOUBLE,&ntimes); control_file.add_item("Negens",TP_DOUBLE,&Negens); control_file.add_ITEM("loci",TP_INT,&loci); control_file.add_ITEM("len",TP_VEC_INT,&len); control_file.add_ITEM("lambda",TP_DOUBLE,&lambda); control_file.add_ITEM("n_states",TP_INT,&n_states); control_file.add_ITEM("mu",TP_DOUBLE,&M); control_file.add_ITEM("r",TP_DOUBLE,&r); vector temp_mut_matrix; control_file.add_ITEM("mut_matrix",TP_EXT_VEC_DOUBLE,&temp_mut_matrix); control_file.add_item("state_rel_mut_rate",TP_EXT_VEC_DOUBLE,&state_rel_mut_rate); control_file.add_item("state_M",TP_EXT_VEC_DOUBLE,&state_M); control_file.add_ITEM("state_freq",TP_EXT_VEC_DOUBLE,&state_freq); control_file.add_item("nruns",TP_INT,&nruns); control_file.add_item("seq_len",TP_INT,&seq_len); control_file.add_ITEM("update_interval",TP_INT,&update_interval); control_file.read_input(filename); 
if(coutput)control_file.check_required(); else if(!control_file.got_required)error("Not all necessary items found in control file"); /*Check for necessary parameters*/ if(!control_file.got_required)error("read_input(): necessary parameters not found"); vector_to_Matrix(&temp_mut_matrix,&mut_matrix,n_states,n_states); if(ntimes.size()!=nsamp)error("ntimes inconsistent in size with n"); sort(ntimes.begin(),ntimes.end()); if(coutput) { int o; for(o=0;o<(int)ntimes.size();o++)printf("%g ",ntimes[o]); } if(len.size()!=loci)error("len inconsistent in size with loci"); if(state_freq.size()!=n_states)error("state_freq inconsistent in size with n_states"); if(update_interval==0)update_interval=1; int ind_seq_len=0; int i; for(i=0;i0) { one_or_the_other=true; got_state_M=true; } if(state_rel_mut_rate.size()>0) one_or_the_other=true; if(!one_or_the_other)error("read_input(): neither state_M or state_rel_mut_rate received"); if(got_state_M) { if(state_M.size()!=n_states)error("state_M inconsistent in size with n_states"); state_rel_mut_rate.resize(n_states); int i; for(i=0;i *vec, Matrix *mat, int rows, int cols) { if((int)vec->size()<(rows*cols))error("vector_to_Matrix(): vector too small to fill matrix"); if((int)vec->size()>(rows*cols))error("vector_to_Matrix(): vector too large to fit matrix"); mat->resize(rows,cols); int current_row=0; int i,j; for(i=0;i<(int)vec->size();i+=cols) { for(j=0;jelement[current_row][j]=vec->at(i+j); } ++current_row; } return *this; } Control& read_mut_matrix(Matrix &G, vector &pi) { /*Check it is a rate matrix*/ int i,j; if(G.nrows()!=G.ncols())error("read_mut_matrix(): not a square matrix"); for(i=0;i. 
*/ #ifndef _COALESCENT_PROCESS_H_ #define _COALESCENT_PROCESS_H_ #include #include #include #include using namespace std; #include "myutils/matrix.h" using myutils::Matrix; #include "myutils/random.h" using myutils::Random; #include "myutils/myerror.h" using myutils::error; #include "coalesce/coalescent_record.h" #include "coalesce/coalescent_control.h" #include "coalesce/mutation.h" class ptr_vector { int size; public: class mt_node **ptr; //vec of generic ptrs public: ptr_vector() {}; ptr_vector& initialize(const int size_in) { size=size_in; ptr=(class mt_node**) malloc((size_t) size*sizeof(class mt_node*)); int i; for(i=0;iptr[i]; return *this; } ptr_vector& copy(ptr_vector *donor, const int position) { ptr[position]=donor->ptr[position]; return *this; } ptr_vector& copy(ptr_vector *donor) { int i; for(i=0;iptr[i]; return *this; } inline ptr_vector& assign(class mt_node *target, const int position) { ptr[position]=target; return *this; } inline ptr_vector& assign(class mt_node *target, const int from, const int to) { int i; for(i=from;i<=to;i++) ptr[i]=target; return *this; } int get_size() {return size;}; ~ptr_vector() { //nullify(); free((class mt_node**) ptr); } }; class ap_node { public: /*Fixed once*/ int id; /*Recyclable*/ enum {NOT_IN_USE,IN_USE,FIXED_NODE} flag; int deme; // records the deme the node belongs to double time; class ptr_vector AMP; double rlen; double L; int ltr; int rtr; int active_id; // records the position in active_node int ctree_id; // records the position in the conditional marginal tree public: ap_node() {}; ap_node& initialize(const int id_in, const int size) { id=id_in; active_id = ctree_id = -1; AMP.initialize(size); recycle(); return *this; } ap_node& recycle() { flag = NOT_IN_USE; active_id = ctree_id = deme = -1; //time=0.0; //AMP.nullify(); return *this; } ap_node& activate(double *time_in) { flag = IN_USE; time=*time_in; return *this; } ~ap_node() {}; }; class eventChain { public: enum eventType 
{NONE,COALESCENCE,RECOMBINATION,ADD_LINEAGE,END}; protected: class eventChainEvent { public: double k,rlen,time; eventType type; eventChainEvent() { k = rlen = time = 0.0; type = NONE; } }; vector ev; public: double rho; public: eventChain() { ev = vector(0); } eventChain(const int size_in) { ev = vector(size_in); } const int size() const { return (int)ev.size(); } void resize(const int size_in) { // cout << "resize to " << size_in << endl; if(size_in<0) myutils::error("eventChain::resize(): cannot have a negative size"); ev.resize(size_in); // cout << "done resizing" << endl; } eventChainEvent& operator[](const int pos) { return ev[pos]; } double loglikelihood(const double rh) { if(ev.size()<=0) myutils::error("eventChain::loglikelihood(): chain has zero length"); if(ev[0].type==NONE) myutils::error("eventChain::loglikelihood(): no chain exists"); int e = 0; if(ev[e].type==END) myutils::error("eventChain::loglikelihood(): first event is the last"); double rate = (ev[e].k*ev[e].rlen*rh + ev[e].k*(ev[e].k-1.0))/2.0; double prec = ev[e].k*ev[e].rlen*rh/2.0/rate; double L = 0.0; if(ev[e].type==ADD_LINEAGE) { L += - rate * (ev[e].time); } else if(ev[e].type==RECOMBINATION) { L += log(rate) - rate * (ev[e].time) + log(prec); } else if(ev[e].type==COALESCENCE) { L += log(rate) - rate * (ev[e].time) + log(1.-prec); } for(e=1;e<(int)ev.size();e++) { if(ev[e].type==END) break; rate = (ev[e].k*ev[e].rlen*rh + ev[e].k*(ev[e].k-1.0))/2.0; prec = ev[e].k*ev[e].rlen*rh/2.0/rate; if(ev[e].type==ADD_LINEAGE) { L += - rate * (ev[e].time - ev[e-1].time); } else if(ev[e].type==RECOMBINATION) { L += log(rate) - rate * (ev[e].time - ev[e-1].time) + log(prec); } else if(ev[e].type==COALESCENCE) { L += log(rate) - rate * (ev[e].time - ev[e-1].time) + log(1.-prec); } } if(e==ev.size()) error("eventChain::loglikelihood() chain has no end"); return L; } }; class coalescent { public: /*Fixed*/ class Control *con; //ptr to con class Random *ran; //ptr to ran class marginal_tree *tree; //vec 
of tree's class ap_node **node; //vec of ptrs to ap_node's class ap_node **active_node;//vec of ptrs to ap_node's class ap_node **inactive_node;//vec of ptrs to ap_node's int nodes_reserved; int L; int **segregating_tree; int **internal_seg_tree; int seg_tree_id; Matrix genotype; bool no_gene_conversion; /*Recyclable*/ int n_inactive; int ARG_k; //#lineages in ARG int gen; //number of events double total_rlen; double rho; //total_rlen/ARG_k int n_segregating; bool samples_waiting; double time_next_sample; // contemporaneous samples vector::iterator ntimes_itr;// iterator for rifling through con->ntimes int next_waiting_sample; int nrecTypeI; int nrecTypeII; int nrecTypeIII; int nco,nrec,naddbase,nmut; vector nrecWatt; int ncoI,ncoIIa,ncoIIb,ncoIII; /* Variables for conditional simulation */ int ARG_k_fixed; vector ftimes; // times for fixed events vector fnode; /* Variables for structured coalescent */ ap_node ***ptr_deme; // con->ndemes * con->nsamp matrix: members of each deme Vector k_deme; // number of ancestral lineages in each deme Vector rho_deme; // effective recn rate of each deme Vector sum_mig; // total backwards-in-time mig for each deme i double rate_coal, rate_recn, rate_mign; Vector coal_deme; protected: vector _uniqueHaps; vector _sites; LowerTriangularMatrix ____B; vector _M; vector _F; vector _four; LowerTriangularMatrix< vector > _G; LowerTriangularMatrix _A,___B,___C; Matrix _D; public: coalescent() {}; coalescent& initialize(class Control *con_in, class Random *ran_in) { con=con_in; ran=ran_in; if(con->nsamp<0) con->nsamp = 0; if(con->ntimes.size()==0) con->ntimes = vector(con->nsamp,0.0); if(con->seq_len<0) con->seq_len = 0; if(con->Negens<0) con->Negens = 1; if(con->len.size()==0) con->len = vector(1,con->seq_len); if(con->r<0.0) con->r = 0.0; if(con->lambda<0.0) con->lambda = 0.0; if(con->lambda==0.0)no_gene_conversion=true; else no_gene_conversion=false; L=con->seq_len; tree=(class marginal_tree*) malloc((size_t) L*sizeof(marginal_tree)); 
int i; for(i=0;insamp); internal_seg_tree=(int**) malloc((size_t) 2*sizeof(int*)); internal_seg_tree[0]=(int*) malloc((size_t) L*sizeof(int)); internal_seg_tree[1]=(int*) malloc((size_t) L*sizeof(int)); seg_tree_id=0; segregating_tree=&(internal_seg_tree[seg_tree_id]); ARG_k=ARG_k_fixed=0; node=(class ap_node**) malloc((size_t) ARG_k*sizeof(ap_node*)); active_node=(class ap_node**) malloc((size_t) ARG_k*sizeof(ap_node*)); inactive_node=(class ap_node**) malloc((size_t) ARG_k*sizeof(ap_node*)); if(con->ndemes>0) { ptr_deme = (ap_node***) malloc((size_t) con->ndemes*sizeof(ap_node**)); for(i=0;indemes;i++) ptr_deme[i] = (ap_node**) malloc((size_t) ARG_k*sizeof(ap_node*)); } n_inactive=0; nodes_reserved=0; reserve_nodes(10*con->nsamp); genotype.initialize(con->nsamp,L); return *this; } coalescent& go() { recycle(); if(add_next_sample()!=0.0)error("Most recent node does not occur at time zero"); double current_time=0.0; gen=0; while((ARG_k>1)||(samples_waiting)) { //event(¤t_time); double denom = 1.0/((double)ARG_k*rho+(double)ARG_k*((double)ARG_k-1.0)); current_time += constant_size_model(2.0*denom); if((samples_waiting)&&(current_time>=time_next_sample)) { current_time = add_next_sample(); } else { /*2nd, choose type of event*/ double rnum1 = ran->U(); double pr_recom=(double)ARG_k*rho*denom; if (rnum1 <= pr_recom) recombine(¤t_time); else coalesce(¤t_time); } ++gen; } return *this; } coalescent& go(eventChain& e) { recycle(); if(add_next_sample()!=0.0)error("Most recent node does not occur at time zero"); double current_time=0.0; gen=0; e.rho = 2.0*con->r; if(e.size()<1000) e.resize(1000); while((ARG_k>1)||(samples_waiting)) { if(e.size()<=gen) e.resize(2*e.size()); double denom = 1.0/((double)ARG_k*rho+(double)ARG_k*((double)ARG_k-1.0)); current_time += constant_size_model(2.0*denom); e[gen].k = (double)ARG_k; e[gen].rlen = rho/e.rho; e[gen].time = current_time; if((samples_waiting)&&(current_time>=time_next_sample)) { current_time = add_next_sample(); e[gen].time 
= current_time; e[gen].type = eventChain::ADD_LINEAGE; } else { /*2nd, choose type of event*/ double rnum1 = ran->U(); double pr_recom=(double)ARG_k*rho*denom; if (rnum1 <= pr_recom) { recombine(¤t_time); e[gen].type = eventChain::RECOMBINATION; } else { coalesce(¤t_time); e[gen].type = eventChain::COALESCENCE; } } ++gen; } e[gen-1].type = eventChain::END; return *this; } coalescent& migrate() { recycle(); if(add_next_sample()!=0.0)error("Most recent node does not occur at time zero"); double current_time=0.0; gen=0; int i,j; //const char tab = '\t'; //for(i=0;indemes;i++) cout << tab << "k" << i << tab << "co" << tab << "mig"; //cout << endl; //cout << setprecision(3); //Vector k_avg(con->ndemes,0); while((ARG_k>1)||(samples_waiting)) { //if(gen%2==0) { rate_coal = rate_recn = rate_mign = 0.0; for(i=0;indemes;i++) { coal_deme[i] = (double)k_deme[i] * (double)(k_deme[i]-1) / con->N_deme_over_D[i]; rate_coal += coal_deme[i]; rate_recn += rho_deme[i]; rate_mign += (double)k_deme[i] * sum_mig[i]; } rate_coal /= 2.0; rate_recn /= 2.0; rate_mign /= 2.0; //} //cout << current_time; //for(i=0;indemes;i++) { // cout << tab << k_deme[i] << tab << coal_deme[i]/2. 
<< tab << (double)k_deme[i] * sum_mig[i]/2.; // k_avg[i] += k_deme[i]; //} //cout << endl; double denom = (rate_coal + rate_recn + rate_mign); current_time += constant_size_model(1.0/denom); if((samples_waiting)&&(current_time>=time_next_sample)) { current_time = add_next_sample(); } else { /*2nd, choose type of event*/ double rnum1 = ran->U() * denom; if(rnum1 <= rate_coal) migrate_coalesce(¤t_time); else if(rnum1 <= rate_coal+rate_recn) migrate_recombine(¤t_time); else migrate_migrate(¤t_time); } for(i=0;indemes;i++) for(j=0;jflag==ap_node::NOT_IN_USE) error("migrate(): pointer problem"); ++gen; } //cout << current_time; //for(i=0;indemes;i++) cout << tab << (double)k_avg[i]/(double)gen; //cout << endl; return *this; } coalescent& conditional(class marginal_tree &ctree) { recycle(); int i; fnode = vector(ctree.size,(ap_node*)NULL); ftimes = vector(ctree.size,0.0); for(i=0;i1)||(samples_waiting)) { //event(¤t_time); //double denom = (double)ARG_k*rho+2.*(double)(ARG_k_fixed)*(double)(ARG_k-ARG_k_fixed); //if(ARG_k>ARG_k_fixed) denom += (double)(ARG_k-ARG_k_fixed)*((double)(ARG_k-ARG_k_fixed)-1.0); double denom = (double)ARG_k*rho+(double)(ARG_k)*(double)(ARG_k-1); if(ARG_k_fixed>1) denom -= (double)(ARG_k_fixed)*(double)(ARG_k_fixed-1); if(denom == 0.0) { if(samples_waiting) current_time = add_conditional_event(ctree); else error("conditional(): infinite time until next event"); } else { denom = 1.0/denom; current_time += constant_size_model(2.0*denom); if((samples_waiting)&&(current_time>=time_next_sample)) { current_time = add_conditional_event(ctree); } else { /*2nd, choose type of event*/ double rnum1 = ran->U(); double pr_recom=(double)ARG_k*rho*denom; if (rnum1 <= pr_recom) { conditionally_recombine(¤t_time); /*if(tree[0].node[2].time!=0) { warning("MRCA is not supposed to be found during recombination"); }*/ } else { conditionally_coalesce(¤t_time); } } } if(ARG_k_fixed==1) { warning("Single fixed lineage left"); } if(ARG_k_fixed>0) 
for(i=0;i<(int)fnode.size();i++) if(fnode[i]!=NULL) if(fnode[i]->active_id==-1) { warning("fnode inconsistency"); } if(ARG_k_fixed>ARG_k) error("conditional(): ARG_k_fixed > ARG_k"); for(i=0;iactive_id!=i) error("conditional(): active_id's incorrect"); int ctr_fnode = 0; for(i=0;i ARG_k_fixed"); ++gen; } return *this; } coalescent& mutate() { int i; for(i=0;idraw(),M); return *this; } coalescent& mutate(const int site, Mutation_Matrix *M) { mutate_tree(site,tree[site].size-1,M->draw(),M); return *this; } coalescent& mutate(const int site, Mutation_Matrix *M, vector& mutLog) { mutLog.clear(); mutate_tree_and_record(site,tree[site].size-1,M->draw(),M,mutLog); return *this; } coalescent& output_FASTA(vector &code, const char* filename) { FILE* fout=fopen(filename,"w"); // fprintf(fout,"%d %d\n\n",con->nsamp,con->seq_len); int n; for(n=0;nnsamp;n++) { fprintf(fout,">seq%d_%g\n",n,con->ntimes[n]*con->Negens); int pos; for(pos=0;posseq_len;pos++) fprintf(fout,"%c",code[(int)genotype[n][pos]]); fprintf(fout,"\n"); } fclose(fout); return *this; } coalescent& output_FASTA(vector code, const char* filename) { // FILE* fout=fopen(filename,"w"); // fprintf(fout,"%d %d\n\n",con->nsamp,con->seq_len); ofstream fout(filename); // fout << con->nsamp << " " << con->seq_len << endl << endl; int n; for(n=0;nnsamp;n++) { fout << ">seq" << n << "_" << con->ntimes[n]*con->Negens << endl; //fprintf(fout,">seq%d_%g\n",n,con->ntimes[n]*con->Negens); int pos; for(pos=0;posseq_len;pos++) fout << code[(int)genotype[n][pos]]; //fprintf(fout,"%s",code[genotype[n][pos]].c_str()); fout << endl; //fprintf(fout,"\n"); } //fclose(fout); fout.close(); return *this; } /* which is a vector true or false whether to include each sequence */ coalescent& output_FASTA(vector &code, const char* filename, vector &which) { if(which.size()!=con->nsamp) error("coalescent::output_FASTA(): which must have length nsamp"); FILE* fout=fopen(filename,"w"); int n; for(n=0;nnsamp;n++) { if(which[n]) { 
fprintf(fout,">seq%d_%g\n",n,con->ntimes[n]*con->Negens); int pos; for(pos=0;posseq_len;pos++) fprintf(fout,"%c",code[(int)genotype[n][pos]]); fprintf(fout,"\n"); } } fclose(fout); return *this; } /* which is a vector true or false whether to include each sequence */ coalescent& output_FASTA(vector &code, const char* filename, vector &which) { if(which.size()!=con->nsamp) error("coalescent::output_FASTA(): which must have length nsamp"); ofstream fout(filename); int n; for(n=0;nnsamp;n++) { if(which[n]) { fout << ">seq" << n << "_" << con->ntimes[n]*con->Negens << endl; int pos; for(pos=0;posseq_len;pos++) fout << code[(int)genotype[n][pos]]; fout << endl; } } fout.close(); return *this; } coalescent& output_MEP(const char* filename) { FILE* fout=fopen(filename,"w"); fprintf(fout,"n=%d, mu=%g, r=%g, Negens=%g\n",con->nsamp,con->M/con->Negens,con->r/con->Negens,con->Negens); fprintf(fout,"Time points = "); int i; for(i=0;i<(int)con->ntimes.size();i++)fprintf(fout,"%g ",con->ntimes[i]*con->Negens); fprintf(fout,"\n\n"); int mrca=2*con->nsamp-2; double t_height=0.0; for(i=0;iseq_len;i++) { double temp=tree[i].node[mrca].time; if(t_height!=tree[i].node[mrca].time) { t_height=tree[i].node[mrca].time; fprintf(fout,"Position %d\tHeight %g\t\t%g\n",i,t_height,t_height*con->Negens); } } fclose(fout); return *this; } coalescent& output_tree(const int site) { /*This node is always the mrca*/ int mrca=2*(con->nsamp-1); int i=site; /*Create names for the files*/ stringstream ageout_file; ageout_file << "age" << i << ".dat"; stringstream treeout_file; treeout_file << "tree" << i << ".dat"; /*Open them for writing*/ FILE *ageout = fopen(ageout_file.str().c_str(), "w"); FILE *treeout = fopen(treeout_file.str().c_str(), "w"); int tree_id=site; /*ageout contains ages for each of the nodes*/ int j; for(j=0;j<=mrca;j++) { fprintf(ageout,"%d %g\n",mrca-j,10.0*tree[tree_id].node[j].time); } /*treeout contains the labels for each of the base nodes*/ /*followed by a colon then a list of 
all nodes ancestral to it*/ for(j=0;jnsamp;j++) { int gt=(int)genotype[j][tree_id]; fprintf(treeout,"%2d : %d ",gt+1,mrca-j); class mt_node* anc; class mt_node* nextanc=tree[tree_id].node[j].ancestor; do { anc=nextanc; fprintf(treeout,"%d ",mrca-anc->id); nextanc=anc->ancestor; }while(nextanc!=NULL); fprintf(treeout,"\n"); } fclose(ageout); fclose(treeout); FILE *tpicout = fopen("tpic.bat","w"); int number=1; for(i=0;indemes>0) { for(i=con->ndemes-1;i>=0;i--) free((ap_node**) ptr_deme[i]); free((ap_node***) ptr_deme); } for(i=0;indemes>0) { for(i=0;indemes;i++) ptr_deme[i] = (ap_node**) realloc(ptr_deme[i],(size_t) number*sizeof(ap_node*)); } for(i=nodes_reserved;iinitialize(i,L); inactive_node[n_inactive]=node[i]; ++n_inactive; } nodes_reserved=number; return *this; } coalescent& recycle() { int i,j; for(i=0;intimes.begin(); next_waiting_sample=0; nrecTypeI=0; nrecTypeII=0; nrecTypeIII=0; nco=nrec=naddbase=nmut=0; nrecWatt = vector(con->seq_len,0); ncoI=ncoIIa=ncoIIb=ncoIII=0; if(con->ndemes>0) { for(i=0;indemes;i++) for(j=0;j(con->ndemes,0); rho_deme = Vector(con->ndemes,0.0); if(con->N_deme_over_D.size()!=con->ndemes) error("recycle(): con->N_deme_over_D wrong # demes"); if(con->mig.nrows()!=con->ndemes) error("recycle(): con->mig wrong number of rows"); if(con->mig.ncols()!=con->ndemes) error("recycle(): con->mig wrong number of columns"); sum_mig = Vector(con->ndemes,0.0); coal_deme = Vector(con->ndemes,0.0); if(con->deme_config.size()!=con->nsamp) error("recycle(): con->deme_config wrong sample size"); for(i=0;indemes;i++) for(j=0;jndemes;j++) sum_mig[i] += con->mig[i][j]; } if(!no_gene_conversion && con->lambda<=0.0) error("coalescent::recycle(): lambda<=0.0 in gene conversion model"); return *this; } coalescent& event(double *time) { double denom = 1.0/((double)ARG_k*rho+(double)ARG_k*((double)ARG_k-1.0)); (*time)+=constant_size_model(2.0*denom); if((samples_waiting)&&((*time)>=time_next_sample)) { (*time) = add_next_sample(); } else { /*2nd, choose type of 
event*/ double rnum1 = ran->U(); double pr_recom=(double)ARG_k*rho*denom; if (rnum1 <= pr_recom) recombine(time); else coalesce(time); } return *this; } coalescent& coalesce(double *time) { ++nco; /*Create new lineage in the ARG*/ class ap_node *new_node=create_node(time); //printf(" : ");int o;for(o=0;oid);printf("\n"); /*NB If you deactivate the nodes before */ /*creating the new one then you overwrite*/ /*memory you want to read from! */ /*However, must not allow the new node to*/ /*be chosen as one of the coalescing */ /*nodes. Since it is added at the end of */ /*the active_node vector, simply restrict*/ /*the maximum node that can be chosen. */ /*Choose 1st lineage to coalesce*/ int lin1=ran->discrete(0,ARG_k-2); //New node is in position ARG_k-1, so do not class ap_node *ap_node1=active_node[lin1]; //allow this position to be chosen deactivate_node(lin1); //printf(" : ");for( o=0;oid);printf("\n"); /*Unfortunately, now the new node has */ /*been switched into position lin1 in the*/ /*vector. Therefore, restrict the maximum*/ /*again and if lin1 is chosen, override */ /*it and choose the end lineage. */ /*Choose 2nd lineage to coalesce*/ //New node is now in position lin1, so if it int lin2=ran->discrete(0,ARG_k-2); //is chosen, force it to choose the last old node if(lin2==lin1)lin2=ARG_k-1; //instead. As a result, do not let the last old node class ap_node *ap_node2=active_node[lin2]; //(in position ARG_k-1) be chosen initially. 
deactivate_node(lin2); //printf(" : ");for( o=0;oid);printf("\n"); if((new_node->id==ap_node1->id)||(new_node->id==ap_node2->id) ||(ap_node1->id==ap_node2->id))error("coalesce(): nodes not chosen correctly"); // printf("Identity of New node: %d Coalescing nodes: %d and %d\n",new_node->id,ap_node1->id,ap_node2->id); // printf(" which point to: %d and %d\n",ap_node1->AMP.ptr[0]->id,ap_node2->AMP.ptr[0]->id); /*Perform copying and coalescing*/ int imax=n_segregating; int i; for(i=0;iAMP.ptr[tree_id]==NULL) { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoI; new_node->AMP.assign(NULL,tree_id);} /*Rule 2.i */ else { ++ncoIIa; new_node->AMP.assign(ap_node2->AMP.ptr[tree_id],tree_id);} } else { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoIIb; new_node->AMP.assign(ap_node1->AMP.ptr[tree_id],tree_id);} /*Rule 2.iii */ else { ++ncoIII; new_node->AMP.assign(tree[tree_id].coalesce(*time,ap_node1->AMP.ptr[tree_id]->id,ap_node2->AMP.ptr[tree_id]->id),tree_id); } } } if(!samples_waiting) { deactivate_trees(); /*i=0; while(iAMP.ptr[0]->id); return *this; } virtual coalescent& recombine(double *time) { ++nrec; int rtype=-1; /*Create new lineages in the ARG*/ class ap_node *new_node1=create_node(time); class ap_node *new_node2=create_node(time); /*First choose the lineage*/ double rnum1=ran->U()*total_rlen; int lin; for(lin=0;linrlen) break; rnum1 -= active_node[lin]->rlen; } if (lin>=ARG_k) error("recombine(): lineage not chosen correctly"); class ap_node *old_node=active_node[lin]; deactivate_node(lin); //active_node[lin]->edge_time=(*time)-active_node[lin]->time; if(new_node1==old_node)error("Aah!"); if(new_node2==old_node)error("Aah!"); /*Determine the number of breakpoints*/ if (old_node->L==0.0)error("recombine(): recombination at an empty locus"); int ltr=old_node->ltr; int rtr=old_node->rtr; /*Is it a swap?*/ if(no_gene_conversion) { perform_single_crossover(<r,&rtr); if(tree[ltr-1].get_k()>1 && tree[ltr].get_k()>1 && ltr!=old_node->ltr && 
old_node->AMP.ptr[ltr-1]!=NULL && old_node->AMP.ptr[ltr]!=NULL) ++nrecWatt[ltr-1]; else if(tree[rtr-1].get_k()>1 && tree[rtr].get_k()>1 && rtr!=old_node->rtr && old_node->AMP.ptr[rtr-1]!=NULL && old_node->AMP.ptr[rtr]!=NULL) ++nrecWatt[rtr-1]; rtype=2; ++nrecTypeII; } /* else { double a,b,c,swap_yn,rnum3,single_yn; a = con->lambda*old_node->L; b = exp(-a); c = a+b; swap_yn = b/c; //=1 if L=0 so "swap", but has no effect //this error shouldve been caught anyway rnum3 = ran->U(); if (rnum3<=swap_yn) { //It's a swap! //In which case all of the recipient's genome is //ancestral except for the locus of interest //ltr and rtr do not need modifying rtype=1; ++nrecTypeI; }*/ else { /*Is it a single cross-over?*/ //single_yn = (1-a)/c; //rnum3 -= swap_yn; double rnum3 = ran->U() * old_node->rlen; /* Before 11.08.06 the next line was the same, but changes to calc_node_rlen imply that the relative rate of single to double xovers is altered. */ double single_yn = con->r/con->lambda*(1.-pow(1.-con->lambda,(double)(old_node->L-1))); if (rnum3<=single_yn) { /*It's a single cross-over!*/ perform_single_crossover(<r,&rtr); rtype=2; ++nrecTypeII; if(tree[ltr-1].get_k()>1 && tree[ltr].get_k()>1 && ltr!=old_node->ltr && old_node->AMP.ptr[ltr-1]!=NULL && old_node->AMP.ptr[ltr]!=NULL) ++nrecWatt[ltr-1]; else if(tree[rtr-1].get_k()>1 && tree[rtr].get_k()>1 && rtr!=old_node->rtr && old_node->AMP.ptr[rtr-1]!=NULL && old_node->AMP.ptr[rtr]!=NULL) ++nrecWatt[rtr-1]; } else { /*It's a double cross-over!*/ perform_double_crossover(<r,&rtr); rtype=3; ++nrecTypeIII; //++nrecWatt[ltr-1]; //++nrecWatt[rtr-1]; } } //} /*Copy the relevant parts of AMP*/ int i,pos; for(i=0,pos=(*segregating_tree)[0];(posAMP.assign(old_node->AMP.ptr[pos],pos); new_node2->AMP.assign(NULL,pos); } for(;(posAMP.assign(old_node->AMP.ptr[pos],pos); new_node1->AMP.assign(NULL,pos); } for(;iAMP.assign(old_node->AMP.ptr[pos],pos); new_node2->AMP.assign(NULL,pos); } /*Recalculate rlen*/ calc_rlen(new_node1,new_node2); return 
*this; } coalescent& conditionally_coalesce(double *time) { ++nco; /*Create new lineage in the ARG*/ class ap_node *new_node=create_node(time); int lin1,lin2; class ap_node *ap_node1,*ap_node2; if(true){//ARG_k_fixed>=ARG_k) { while(true) { lin1=ran->discrete(0,ARG_k-2); ap_node1=active_node[lin1]; /* Always accept if not a FIXED_NODE */ if(ap_node1->flag!=ap_node::FIXED_NODE) break; /* otherwise accept with probability */ else if(ran->U()<1.-(ARG_k_fixed-1.)/(ARG_k-2.)) break; } /* If ap_node1 is a FIXED_NODE make new_node a FIXED_NODE */ if(ap_node1->flag==ap_node::FIXED_NODE) { new_node->flag = ap_node::FIXED_NODE; new_node->ctree_id = ap_node1->ctree_id; } deactivate_node(lin1); while(true) { /* Choose a different lineage */ lin2=ran->discrete(0,ARG_k-2); if(lin2==lin1)lin2=ARG_k-1; ap_node2=active_node[lin2]; /* Don't accept if both nodes have FIXED_NODE status */ if(!(new_node->flag==ap_node::FIXED_NODE && ap_node2->flag==ap_node::FIXED_NODE)) break; } /* If ap_node2 is a FIXED_NODE make new_node a FIXED_NODE */ if(ap_node2->flag==ap_node::FIXED_NODE) { new_node->flag = ap_node::FIXED_NODE; new_node->ctree_id = ap_node2->ctree_id; } //else new_node->ctree_id = -1; if(new_node->ctree_id>-1) fnode[new_node->ctree_id] = new_node; /*Finally, deactivate*/ deactivate_node(lin2); } else { } /* //Choose 1st lineage to coalesce int lin1=ran->discrete(0,ARG_k-2); //Do not choose FIXED_NODEs while(active_node[lin1]->flag==ap_node::FIXED_NODE) lin1 = ran->discrete(0,ARG_k-2); class ap_node *ap_node1=active_node[lin1]; deactivate_node(lin1); //Choose 2nd lineage to coalesce int lin2=ran->discrete(0,ARG_k-2); if(lin2==lin1)lin2=ARG_k-1; class ap_node *ap_node2=active_node[lin2]; //Sort out fnode stuff new_node->flag = ap_node2->flag; new_node->ctree_id = ap_node2->ctree_id; if(new_node->ctree_id>-1) fnode[new_node->ctree_id] = new_node; //Finally, deactivate deactivate_node(lin2);*/ if((new_node->id==ap_node1->id)||(new_node->id==ap_node2->id) 
||(ap_node1->id==ap_node2->id))error("coalesce(): nodes not chosen correctly"); /*Give new_node the flag of ap_node2, which might be a FIXED_NODE int found_fnode = 0; int a; for(a=0;aflag==ap_node::FIXED_NODE && found_fnode!=1) error("coalescent::conditionally_coalesce(): problem finding FIXED_NODE");*/ /*Perform copying and coalescing*/ int imax=n_segregating; int i; for(i=0;iAMP.ptr[tree_id]==NULL) { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoI; new_node->AMP.assign(NULL,tree_id);} /*Rule 2.i */ else { ++ncoIIa; new_node->AMP.assign(ap_node2->AMP.ptr[tree_id],tree_id);} } else { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoIIb; new_node->AMP.assign(ap_node1->AMP.ptr[tree_id],tree_id);} /*Rule 2.iii */ else { ++ncoIII; new_node->AMP.assign(tree[tree_id].coalesce(*time,ap_node1->AMP.ptr[tree_id]->id,ap_node2->AMP.ptr[tree_id]->id),tree_id); } } } if(!samples_waiting) { //deactivate_trees2(); i=0; while(iU()*total_rlen; int lin; for(lin=0;linrlen) break; rnum1 -= active_node[lin]->rlen; } if (lin>=ARG_k) error("recombine(): lineage not chosen correctly"); class ap_node *old_node=active_node[lin]; /*new_node1 is always the recipient*/ new_node1->flag = old_node->flag; new_node1->ctree_id = old_node->ctree_id; if(new_node1->ctree_id!=-1) fnode[new_node1->ctree_id] = new_node1; int old_node_flag = (int)old_node->flag; deactivate_node(lin); //active_node[lin]->edge_time=(*time)-active_node[lin]->time; if(new_node1==old_node)error("Aah!"); if(new_node2==old_node)error("Aah!"); /*Determine the number of breakpoints*/ if (old_node->L==0.0)error("recombine(): recombination at an empty locus"); int ltr=old_node->ltr; int rtr=old_node->rtr; /*Is it a swap?*/ if(no_gene_conversion) { //error("coalescent::conditionally_recombine(): only donor-recipient style rec defined"); perform_single_crossover(<r,&rtr); if(tree[ltr-1].get_k()>1 && tree[ltr].get_k()>1 && ltr!=old_node->ltr && old_node->AMP.ptr[ltr-1]!=NULL && old_node->AMP.ptr[ltr]!=NULL) 
++nrecWatt[ltr-1]; else if(tree[rtr-1].get_k()>1 && tree[rtr].get_k()>1 && rtr!=old_node->rtr && old_node->AMP.ptr[rtr-1]!=NULL && old_node->AMP.ptr[rtr]!=NULL) ++nrecWatt[rtr-1]; rtype=2; ++nrecTypeII; } else { double swap_yn,rnum3,single_yn; rnum3 = ran->U() * old_node->rlen; swap_yn = (old_node_flag == (int)ap_node::FIXED_NODE) ? 0.5 * con->r/con->lambda*pow(1.-con->lambda,(double)(old_node->L-1)) : 0.0; /* Before 11.08.06 swap_yn = (old_node_flag == (int)ap_node::FIXED_NODE) ? con->r/con->lambda*pow(1.-con->lambda,(double)(old_node->L-1)) : 0.0;*/ if (rnum3<=swap_yn) { //It's a swap! //In which case all of the recipient's genome is //ancestral except for the locus of interest //ltr and rtr do not need modifying rtype=1; ++nrecTypeI; } else { rnum3 -= swap_yn; /*Is it a single cross-over?*/ //single_yn = (1-a)/c; //rnum3 -= swap_yn; /* The following line was the same before 11.08.06, but the changes in calc_node_rlen imply that the relative probability of double crossovers are altered as a result. 
*/ single_yn = con->r/con->lambda*(1.-pow(1.-con->lambda,(double)(old_node->L-1))); if (rnum3<=single_yn) { /*It's a single cross-over!*/ perform_single_crossover(<r,&rtr); rtype=2; ++nrecTypeII; if(tree[ltr-1].get_k()>1 && tree[ltr].get_k()>1 && ltr!=old_node->ltr && old_node->AMP.ptr[ltr-1]!=NULL && old_node->AMP.ptr[ltr]!=NULL) ++nrecWatt[ltr-1]; else if(tree[rtr-1].get_k()>1 && tree[rtr].get_k()>1 && rtr!=old_node->rtr && old_node->AMP.ptr[rtr-1]!=NULL && old_node->AMP.ptr[rtr]!=NULL) ++nrecWatt[rtr-1]; } else { /*It's a double cross-over!*/ perform_double_crossover(<r,&rtr); rtype=3; ++nrecTypeIII; //++nrecWatt[ltr-1]; //++nrecWatt[rtr-1]; } } } /*new_node1 is always the recipient int found_fnode = 0; int a; for(a=0;aflag==ap_node::FIXED_NODE && found_fnode!=1) error("coalescent::conditionally_recombine(): problem finding FIXED_NODE");*/ /*Copy the relevant parts of AMP*/ int i,pos; for(i=0,pos=(*segregating_tree)[0];(posAMP.assign(old_node->AMP.ptr[pos],pos); new_node2->AMP.assign(NULL,pos); } for(;(posAMP.assign(old_node->AMP.ptr[pos],pos); new_node1->AMP.assign(NULL,pos); } for(;iAMP.assign(old_node->AMP.ptr[pos],pos); new_node2->AMP.assign(NULL,pos); } /*Recalculate rlen*/ calc_rlen(new_node1,new_node2); return *this; } coalescent& migrate_coalesce(double *time) { ++nco; /*First choose deme for coalescence*/ double rdeme = ran->U() * rate_coal; int deme; for(deme=0;demendemes;deme++) { if(rdeme <= coal_deme[deme]/2.) 
break; else rdeme -= coal_deme[deme]/2.; } if(deme==con->ndemes) error("migrate_coalesce(): deme chosen incorrectly"); /*Create new lineage in the ARG*/ class ap_node *new_node=create_node(time); int lin1,lin2; class ap_node *ap_node1,*ap_node2; lin1 = ran->discrete(0,k_deme[deme]-1); ap_node1 = ptr_deme[deme][lin1]; if(ap_node1->deme!=deme) error("migrate_coalesce(): node 1 deme not right deme"); SWAP(ptr_deme[deme][lin1],ptr_deme[deme][k_deme[deme]-1]); --k_deme[deme]; deactivate_node(ap_node1->active_id); lin2 = ran->discrete(0,k_deme[deme]-1); ap_node2 = ptr_deme[deme][lin2]; if(ap_node2->deme!=deme) error("migrate_coalesce(): node 2 deme not right deme"); ptr_deme[deme][lin2] = new_node; deactivate_node(ap_node2->active_id); new_node->deme = deme; if((new_node->id==ap_node1->id)||(new_node->id==ap_node2->id) ||(ap_node1->id==ap_node2->id))error("migrate_coalesce(): nodes not chosen correctly"); /*Perform copying and coalescing*/ int imax=n_segregating; int i; mt_node *ptr; double last_update; for(i=0;iAMP.ptr[tree_id]==NULL) { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoI; new_node->AMP.assign(NULL,tree_id);} /*Rule 2.i */ else { ++ncoIIa; new_node->AMP.assign(ap_node2->AMP.ptr[tree_id],tree_id);} } else { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoIIb; new_node->AMP.assign(ap_node1->AMP.ptr[tree_id],tree_id);} /*Rule 2.iii */ else { ++ncoIII; /*** Structured coalescent stuff ***/ ptr = ap_node1->AMP.ptr[tree_id]; last_update = ptr->last_update; ptr->edge_time += (*time-last_update) * con->N_deme_over_D[deme]; ptr->last_update = *time; ptr = ap_node2->AMP.ptr[tree_id]; last_update = ptr->last_update; ptr->edge_time += (*time-last_update) * con->N_deme_over_D[deme]; ptr->last_update = *time; /***********************************/ new_node->AMP.assign(tree[tree_id].migrate_coalesce(*time,ap_node1->AMP.ptr[tree_id]->id,ap_node2->AMP.ptr[tree_id]->id),tree_id); } } } if(!samples_waiting) { //deactivate_trees2(); i=0; while(irlen / 
con->N_deme_over_D[deme]; rho_deme[deme] -= ap_node2->rlen / con->N_deme_over_D[deme]; calc_node_rlen(new_node); rho_deme[deme] += new_node->rlen / con->N_deme_over_D[deme]; rate_recn += rho_deme[deme]; rate_coal -= k_deme[deme] / con->N_deme_over_D[deme]; coal_deme[deme] -= k_deme[deme] / con->N_deme_over_D[deme]; rate_mign -= sum_mig[deme];*/ return *this; } coalescent& migrate_recombine(double *time) { ++nrec; int rtype=-1; /*First choose deme*/ double rdeme = ran->U() * rate_recn; int deme; for(deme=0;demendemes;deme++) { if(rdeme <= rho_deme[deme]/2.) break; else rdeme -= rho_deme[deme]/2.; } if(deme==con->ndemes) error("migrate_recombine(): deme not chosen correctly"); /*Create new lineages in the ARG*/ class ap_node *new_node1=create_node(time); class ap_node *new_node2=create_node(time); /*First choose the lineage*/ double rnum1=ran->U() * rho_deme[deme] / 2.0 * con->N_deme_over_D[deme]; class ap_node *old_node; int lin; for(lin=0;linrlen) break; else rnum1 -= old_node->rlen; } if(lin>=k_deme[deme]) error("migrate_recombine(): lineage not chosen correctly"); if(old_node->deme!=deme) error("migrate_recombine(): lineage deme not right deme"); new_node1->deme = new_node2->deme = deme; ptr_deme[deme][lin] = new_node1; ++k_deme[deme]; ptr_deme[deme][k_deme[deme]-1] = new_node2; /*new_node1 is always the recipient*/ deactivate_node(old_node->active_id); if(new_node1==old_node)error("Aah!"); if(new_node2==old_node)error("Aah!"); /*Determine the number of breakpoints*/ if(old_node->L==0.0)error("recombine(): recombination at an empty locus"); int ltr=old_node->ltr; int rtr=old_node->rtr; /*Is it a swap?*/ if(no_gene_conversion) { //error("coalescent::conditionally_recombine(): only donor-recipient style rec defined"); perform_single_crossover(<r,&rtr); if(tree[ltr-1].get_k()>1 && tree[ltr].get_k()>1 && ltr!=old_node->ltr && old_node->AMP.ptr[ltr-1]!=NULL && old_node->AMP.ptr[ltr]!=NULL) ++nrecWatt[ltr-1]; else if(tree[rtr-1].get_k()>1 && tree[rtr].get_k()>1 && 
rtr!=old_node->rtr && old_node->AMP.ptr[rtr-1]!=NULL && old_node->AMP.ptr[rtr]!=NULL) ++nrecWatt[rtr-1]; rtype=2; ++nrecTypeII; } else { double swap_yn,rnum3,single_yn; rnum3 = ran->U() * old_node->rlen; swap_yn = con->r/con->lambda*pow(1.-con->lambda,(double)(old_node->L-1)); if (rnum3<=swap_yn) { //It's a swap! //In which case all of the recipient's genome is //ancestral except for the locus of interest //ltr and rtr do not need modifying rtype=1; ++nrecTypeI; } else { rnum3 -= swap_yn; /*Is it a single cross-over?*/ //single_yn = (1-a)/c; //rnum3 -= swap_yn; single_yn = con->r/con->lambda*(1.-pow(1.-con->lambda,(double)(old_node->L-1))); if (rnum3<=single_yn) { /*It's a single cross-over!*/ perform_single_crossover(<r,&rtr); rtype=2; ++nrecTypeII; if(tree[ltr-1].get_k()>1 && tree[ltr].get_k()>1 && ltr!=old_node->ltr && old_node->AMP.ptr[ltr-1]!=NULL && old_node->AMP.ptr[ltr]!=NULL) ++nrecWatt[ltr-1]; else if(tree[rtr-1].get_k()>1 && tree[rtr].get_k()>1 && rtr!=old_node->rtr && old_node->AMP.ptr[rtr-1]!=NULL && old_node->AMP.ptr[rtr]!=NULL) ++nrecWatt[rtr-1]; } else { /*It's a double cross-over!*/ perform_double_crossover(<r,&rtr); rtype=3; ++nrecTypeIII; //++nrecWatt[ltr-1]; //++nrecWatt[rtr-1]; } } } /*Copy the relevant parts of AMP*/ int i,pos; for(i=0,pos=(*segregating_tree)[0];(posAMP.assign(old_node->AMP.ptr[pos],pos); new_node2->AMP.assign(NULL,pos); } for(;(posAMP.assign(old_node->AMP.ptr[pos],pos); new_node1->AMP.assign(NULL,pos); } for(;iAMP.assign(old_node->AMP.ptr[pos],pos); new_node2->AMP.assign(NULL,pos); } /*Recalculate rlen*/ migrate_calc_rlen(new_node1,new_node2); /*Recalculate rates* rate_recn -= rho_deme[deme]; rho_deme[deme] -= old_node->rlen / con->N_deme_over_D[deme]; calc_node_rlen(new_node1); calc_node_rlen(new_node2); rho_deme[deme] += new_node1->rlen / con->N_deme_over_D[deme]; rho_deme[deme] += new_node2->rlen / con->N_deme_over_D[deme]; rate_recn += rho_deme[deme]; rate_coal += (double)(k_deme[deme]-1) / con->N_deme_over_D[deme]; 
coal_deme[deme] += (double)(k_deme[deme]-1) / con->N_deme_over_D[deme]; rate_mign += sum_mig[deme];*/ return *this; } coalescent& migrate_migrate(double *time) { /*Choose source deme*/ int source; double rdeme = ran->U() * rate_mign; for(source=0;sourcendemes;source++) { if(rdeme <= k_deme[source]*sum_mig[source]/2.) break; else rdeme -= k_deme[source]*sum_mig[source]/2.; } if(source>=con->ndemes) error("migrate_migrate(): source deme not chosen correctly"); /*Choose target deme*/ int target; rdeme = ran->U() * sum_mig[source]; for(target=0;targetndemes;target++) { if(rdeme <= con->mig[source][target]) break; else rdeme -= con->mig[source][target]; } if(target>=con->ndemes) error("migrate_migrate(): target deme not chosen correctly"); /*Choose lineage*/ int lin = ran->discrete(0,k_deme[source]-1); ap_node *old_node = ptr_deme[source][lin]; if(old_node->deme!=source) error("migrate_migrate(): lineage belongs to wrong deme"); /*Perform migration*/ if(old_node->deme!=target) { SWAP(ptr_deme[source][lin],ptr_deme[source][k_deme[source]-1]); --k_deme[source]; ++k_deme[target]; ptr_deme[target][k_deme[target]-1] = old_node; old_node->deme = target; } /*Update edge_time for migrating node*/ int i,pos; mt_node *ptr; double last_update; for(i=0;iAMP.ptr[pos]; if(ptr!=NULL) { last_update = ptr->last_update; ptr->edge_time += (*time-last_update) * con->N_deme_over_D[source]; ptr->last_update = *time; } } /*Recalculate rlen*/ migrate_calc_rlen(); /*Recalculate rates* rate_recn -= rho_deme[source] + rho_deme[target]; rho_deme[source] -= old_node->rlen / con->N_deme_over_D[source]; rho_deme[target] += old_node->rlen / con->N_deme_over_D[target]; rate_recn += rho_deme[source] + rho_deme[target]; rate_coal -= coal_deme[source] + coal_deme[target]; coal_deme[source] -= (double)k_deme[source] / con->N_deme_over_D[source]; coal_deme[target] += (double)(k_deme[target]-1) / con->N_deme_over_D[target]; rate_coal += coal_deme[source] + coal_deme[target]; rate_mign += sum_mig[target] - 
sum_mig[source];*/ return *this; } double add_next_sample() { double samptime=*ntimes_itr; double currenttime=samptime; class ap_node* new_node; class mt_node* new_tree_node; while((currenttime==*ntimes_itr)&&(ntimes_itr!=con->ntimes.end())) //relies on ordering of ntimes { //add the new node ++naddbase; new_node=create_node(&(*ntimes_itr)); int i; for(i=0;iAMP.assign(new_tree_node,i); } calc_node_rlen(new_node); if(con->ndemes>0) { int deme = con->deme_config[next_waiting_sample]; new_node->deme = deme; ++k_deme[deme]; ptr_deme[deme][k_deme[deme]-1] = new_node; /*rho_deme[deme] += new_node->rlen / con->N_deme_over_D[deme]; rate_recn += new_node->rlen / con->N_deme_over_D[deme]; rate_coal += (double)(k_deme[deme]-1) / con->N_deme_over_D[deme]; coal_deme[deme] += (double)(k_deme[deme]-1) / con->N_deme_over_D[deme]; rate_mign += sum_mig[deme];*/ } ++next_waiting_sample; //++ARG_k; ++ntimes_itr; } if(ntimes_itr==con->ntimes.end()) samples_waiting=false; else time_next_sample=*ntimes_itr; if(con->ndemes>0) { migrate_calc_rlen(); } else calc_rlen(); return currenttime; } double add_conditional_event(class marginal_tree &ctree) { double samptime=*ntimes_itr; double currenttime=samptime; class ap_node* new_node; class mt_node* new_tree_node; int i; class mt_node* conditional_event; while((currenttime==*ntimes_itr)&&(ntimes_itr!=con->ntimes.end())) //relies on ordering of ntimes { conditional_event = &(ctree.node[next_waiting_sample]); if(conditional_event->descendant[0]==NULL) { // add a base node ++naddbase; new_node = create_node(&(*ntimes_itr)); for(i=0;iAMP.assign(new_tree_node,i); } fnode[next_waiting_sample] = new_node; new_node->flag = ap_node::FIXED_NODE; new_node->ctree_id = next_waiting_sample; calc_node_rlen(new_node); ++ARG_k_fixed; } else { // add a coalescence //fnode[conditional_event->id] = coalesce(fnode[conditional_event->descendant[0]->id],fnode[conditional_event->descendant[1]->id]); class ap_node *new_node = create_node(&(*ntimes_itr)); class ap_node 
*ap_node1 = fnode[conditional_event->descendant[0]->id]; deactivate_node(ap_node1->active_id); class ap_node *ap_node2 = fnode[conditional_event->descendant[1]->id]; if(ap_node1==ap_node2) error("add_conditional_event(): lineage cannot coalesce with itself"); deactivate_node(ap_node2->active_id); if((new_node->id==ap_node1->id)||(new_node->id==ap_node2->id) ||(ap_node1->id==ap_node2->id))error("add_conditional_event(): nodes not chosen correctly"); fnode[conditional_event->id] = new_node; fnode[conditional_event->descendant[0]->id] = NULL; fnode[conditional_event->descendant[1]->id] = NULL; /* give node FIXED_NODE status only if it is not the mrca */ if(conditional_event->idflag = ap_node::FIXED_NODE; --ARG_k_fixed; } else ARG_k_fixed -= 2; new_node->ctree_id = conditional_event->id; /*Perform copying and coalescing*/ int imax=n_segregating; for(i=0;iAMP.ptr[tree_id]==NULL) { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoI; new_node->AMP.assign(NULL,tree_id);} /*Rule 2.i */ else { ++ncoIIa; new_node->AMP.assign(ap_node2->AMP.ptr[tree_id],tree_id);} } else { /*Rule 2.ii */ if(ap_node2->AMP.ptr[tree_id]==NULL) { ++ncoIIb; new_node->AMP.assign(ap_node1->AMP.ptr[tree_id],tree_id);} /*Rule 2.iii */ else { ++ncoIII; new_node->AMP.assign(tree[tree_id].coalesce(*ntimes_itr,ap_node1->AMP.ptr[tree_id]->id,ap_node2->AMP.ptr[tree_id]->id),tree_id); } } } if(!samples_waiting) { //deactivate_trees2(); i=0; while(i1 && samples_waiting==false) error("add_conditional_event(): not all fixed events completed"); calc_rlen(); return currenttime; } double constant_size_model(const double mean) { double time=ran->exponential(1.0); time *= mean; return time; } coalescent& deactivate_node(const int id) { inactive_node[n_inactive]=active_node[id]; inactive_node[n_inactive]->recycle(); ++n_inactive; active_node[id]=active_node[ARG_k-1]; active_node[id]->active_id = id; active_node[ARG_k-1]=NULL; --ARG_k; /*NB no memory reallocation occurs*/ return *this; } /*coalescent& 
deactivate_node(ap_node *id) { int lin1 = 0; while(active_node[lin1]!=id) ++lin1; return deactivate_node(lin1); }*/ class ap_node* create_node(double *time) { if(ARG_k==nodes_reserved)reserve_nodes(2*(ARG_k+1)); active_node[ARG_k]=inactive_node[n_inactive-1]; active_node[ARG_k]->activate(time); active_node[ARG_k]->active_id = ARG_k; inactive_node[n_inactive-1]=NULL; ++ARG_k; --n_inactive; return active_node[ARG_k-1]; } coalescent& deactivate_trees() { int new_seg_tree_id=1-seg_tree_id; int i,j; for(i=0,j=0;iL; calc_node_rlen(active_node[i]); after = active_node[i]->L; if(before!=after) { warning("weird"); } } calc_rlen(); }*/ n_segregating = new_n_segregating; seg_tree_id=new_seg_tree_id; segregating_tree=&(internal_seg_tree[seg_tree_id]); return *this; } coalescent& deactivate_trees2() { int new_seg_tree_id=seg_tree_id; int i,j; for(i=0,j=0;iL; calc_node_rlen(active_node[i]); after = active_node[i]->L; if(before!=after) { warning("weird"); } } calc_rlen(); }*/ n_segregating = new_n_segregating; seg_tree_id=new_seg_tree_id; segregating_tree=&(internal_seg_tree[seg_tree_id]); return *this; } coalescent& deactivate_tree(const int id) { int new_seg_tree_id=1-seg_tree_id; //segregating_tree[id]=segregating_tree[n_segregating-1]; int j=0; int i; for(i=0;iAMP.ptr[left]!=NULL)break; } /*right is the first non-NULL seg site from the right*/ for(i=n_segregating-1;i>=0;i--) { right=(*segregating_tree)[i]; if(id->AMP.ptr[right]!=NULL)break; } if(i<0)/*This occurs when there are no non-NULL seg sites*/ { id->rlen=0.0; id->L=0.0; id->ltr=0; id->rtr=0; } else { /*L is the total number of sites bounded by non-NULL seg sites*/ id->L = (double)(right - left + 1); if(no_gene_conversion) id->rlen = con->r * (double)(id->L - 1); else { id->rlen = 0.5 * con->r * ((double)(id->L - 1) + 1./con->lambda * (1. 
- pow(1.-con->lambda,(double)(id->L-1)))); if(id->flag == ap_node::FIXED_NODE && id->L>0) id->rlen += 0.5 * con->r/con->lambda*pow(1.-con->lambda,(double)(id->L-1)); /* Before 11.08.06, id->rlen = con->r * (double)(id->L - 1); if(id->flag == ap_node::FIXED_NODE && id->L>0) id->rlen += con->r/con->lambda*pow(1.-con->lambda,(double)(id->L-1));*/ /* Even earlier, id->rlen = con->r*con->lambda*(1.-pow(1.-1./con->lambda,(double)(id->L-1))); */ } // + (con->r)/(con->lambda) * exp(-con->lambda * (double)id->L); // if((id->L==0.0)&&(con->r!=0.0)) // { // error("calc_rlen: non-zero rlen value when L=0"); // } id->ltr = left; id->rtr = right+1; } return *this; } coalescent& calc_rlen() { total_rlen=0.0; int i; for(i=0;irlen; rho=2.0*total_rlen/(double)ARG_k; return *this; } coalescent& calc_rlen(class ap_node* id) { calc_node_rlen(id); total_rlen=0.0; int i; for(i=0;irlen; rho=2.0*total_rlen/(double)ARG_k; return *this; } coalescent& calc_rlen(class ap_node* id1, class ap_node* id2) { calc_node_rlen(id1); calc_node_rlen(id2); total_rlen=0.0; int i; for(i=0;irlen; rho=2.0*total_rlen/(double)ARG_k; return *this; } coalescent& migrate_calc_rlen() { total_rlen=0.0; int i; for(i=0;indemes;i++) rho_deme[i] = 0.0; for(i=0;irlen; rho_deme[active_node[i]->deme] += active_node[i]->rlen; } rho=2.0*total_rlen/(double)ARG_k; for(i=0;indemes;i++) rho_deme[i] *= 2.0 / con->N_deme_over_D[i]; return *this; } coalescent& migrate_calc_rlen(class ap_node* id) { calc_node_rlen(id); total_rlen=0.0; int i; for(i=0;indemes;i++) rho_deme[i] = 0.0; for(i=0;irlen; rho_deme[active_node[i]->deme] += active_node[i]->rlen; } rho=2.0*total_rlen/(double)ARG_k; for(i=0;indemes;i++) rho_deme[i] *= 2.0 / con->N_deme_over_D[i]; return *this; } coalescent& migrate_calc_rlen(class ap_node* id1, class ap_node* id2) { calc_node_rlen(id1); calc_node_rlen(id2); total_rlen=0.0; int i; for(i=0;indemes;i++) rho_deme[i] = 0.0; for(i=0;irlen; rho_deme[active_node[i]->deme] += active_node[i]->rlen; } 
rho=2.0*total_rlen/(double)ARG_k; for(i=0;indemes;i++) rho_deme[i] *= 2.0 / con->N_deme_over_D[i]; return *this; } coalescent& perform_single_crossover(int *ltr, int *rtr) { /*ltr and rtr are modified so that when they are */ /*returned they dictate the recombination boundaries */ /*Fragment length, 1<=f<=L-1, where L=rtr-ltr */ /*Simulate X=(f-1) Truncated exponential, mean=1/lambda,*/ /*truncation point=L-1 s.t. Pr(L-1)=0. */ //printf("\nLTR: %3d RTR: %3d ",*ltr,*rtr); int L = (*rtr)-(*ltr); int X; if(no_gene_conversion) X = ran->discrete(0,L-2); else { //double mean = 1.0/(con->lambda); //X = floor(ran->trunc_exponential(mean,L-1)); X = ran->trunc_geometric(con->lambda,L-1) - 1; } /*Implement it according to direction*/ bool dir = ran->bernoulliTF(0.5); if (dir) /*left to right*/ { (*rtr) = (*ltr) + (X+1); } else /*right to left*/ { (*ltr) = (*rtr) - (X+1); } //printf("X: %3d Dir: %d LTR: %3d RTR: %3d\n",X,dir,*ltr,*rtr); return *this; } coalescent& perform_double_crossover(int *ltr, int *rtr) { /*First part as for single cross-over */ int L = (*rtr)-(*ltr); //double mean = 1.0/(con->lambda); //int X = floor(ran->trunc_exponential(mean,L-1)); int X = ran->trunc_geometric(con->lambda,L-1) - 1; /*Implement it according to direction*/ int dir = ran->discrete(0,1); if (dir) /*left to right*/ { (*ltr) += X+1; /*except now this is ltr*/ } else /*right to left*/ { (*rtr) -= (X+1); /*and this is now rtr */ } /*And repeat with re-defined boundaries */ L = (*rtr)-(*ltr); //X = floor(ran->trunc_exponential(mean,L-1)); X = ran->trunc_geometric(con->lambda,L-1) - 1; if (dir) /*left to right*/ { (*rtr) = (*ltr)+X+1; } else /*right to left*/ { (*ltr) = (*rtr)-X-1; } return *this; } coalescent& mutate_tree(const int tid, const int nid, const int state) { int my_state=mutate_edge(state,tree[tid].node[nid].edge_time); if(tree[tid].node[nid].descendant[0]==NULL) { /*I.e. 
a base node*/ if(tree[tid].node[nid].descendant[0]!=NULL)error("mutate_tree() err1: node has one, not two descendants"); genotype.element[nid][tid]=my_state; } else { if(tree[tid].node[nid].descendant[1]==NULL)error("mutate_tree() err2: node has one, not two descendants"); int desc=tree[tid].node[nid].descendant[0]->id; mutate_tree(tid, desc, my_state); desc=tree[tid].node[nid].descendant[1]->id; mutate_tree(tid, desc, my_state); } return *this; } coalescent& mutate_tree(const int tid, const int nid, const int state, Mutation_Matrix *M) { int my_state=mutate_edge(state,tree[tid].node[nid].edge_time,M); if(tree[tid].node[nid].descendant[0]==NULL) { /*I.e. a base node*/ if(tree[tid].node[nid].descendant[0]!=NULL)error("mutate_tree() err1: node has one, not two descendants"); genotype.element[nid][tid]=my_state; } else { if(tree[tid].node[nid].descendant[1]==NULL)error("mutate_tree() err2: node has one, not two descendants"); int desc=tree[tid].node[nid].descendant[0]->id; mutate_tree(tid, desc, my_state, M); desc=tree[tid].node[nid].descendant[1]->id; mutate_tree(tid, desc, my_state, M); } return *this; } coalescent& mutate_tree_and_record(const int tid, const int nid, const int state, Mutation_Matrix *M, vector &mutLog) { int my_state=mutate_edge_and_record(state,tree[tid].node[nid].edge_time,M,mutLog); if(tree[tid].node[nid].descendant[0]==NULL) { /*I.e. 
a base node*/ if(tree[tid].node[nid].descendant[0]!=NULL)error("mutate_tree_and_record() err1: node has one, not two descendants"); genotype.element[nid][tid]=my_state; } else { if(tree[tid].node[nid].descendant[1]==NULL)error("mutate_tree_and_record() err2: node has one, not two descendants"); int desc=tree[tid].node[nid].descendant[0]->id; mutate_tree_and_record(tid, desc, my_state, M, mutLog); desc=tree[tid].node[nid].descendant[1]->id; mutate_tree_and_record(tid, desc, my_state, M, mutLog); } return *this; } int mutate_mrca() { double rnum1 = ran->U(); int i; for(i=0;in_states;i++) { if (rnum1<=con->state_freq[i]) break; rnum1 -= con->state_freq[i]; } if (i>=con->n_states) error("mutate_mrca(): initial state chosen incorrectly"); //It's important state_freq sums to one return i; } int mutate_edge(const int state, const double edge_time) { int gt=state; if (gt==-1) { error("mutate_edge(): genotype does not exist"); } double time_left=edge_time; double next_mut=ran->exponential(con->state_M[gt]); while (next_mutU(); int i; for(i=0;in_states;i++) { if(rnum1<=con->mut_matrix.element[gt][i]) break; rnum1-=con->mut_matrix.element[gt][i]; } if(i>=con->n_states) { printf("\nCurrent state: %d Random uniform[0,1] deviate: %g",gt,rnum1); printf("\nError by-passed, mutation anulled\n"); //error("mutate_edge(): transition incorrectly chosen"); i=gt; } //node[st_node]->genotype[site]=i; gt=i; ++nmut; //++mutation_spectrum[count_descendants(site,st_node)]; time_left-=next_mut; next_mut=ran->exponential(con->state_M[gt]); } /*Make descendants inherit state*/ //inherit(site,st_node); return gt; } int mutate_edge(const int state, const double edge_time, Mutation_Matrix *M) { int gt=state; if (gt<0||gt>=M->n_states) { error("mutate_edge(): genotype does not exist"); } double time_left=edge_time; double next_mut=ran->exponential(M->mutation_mean[gt]); double rp; while (next_mutU(); int i; for(i=0;in_states-1;i++) { rp = M->D[gt][i]; if(rnum1 <= rp) break; rnum1 -= rp; } gt=i; 
++nmut; time_left-=next_mut; next_mut=ran->exponential(M->mutation_mean[gt]); } return gt; } int mutate_edge_and_record(const int state, const double edge_time, Mutation_Matrix *M, vector &mutLog) { int gt=state; if (gt<0||gt>=M->n_states) { error("mutate_edge(): genotype does not exist"); } double time_left=edge_time; double next_mut=ran->exponential(M->mutation_mean[gt]); double rp; while (next_mutU(); int i; for(i=0;in_states-1;i++) { rp = M->D[gt][i]; if(rnum1 <= rp) break; rnum1 -= rp; } gt=i; ++nmut; mutLog.push_back(gt); time_left-=next_mut; next_mut=ran->exponential(M->mutation_mean[gt]); } return gt; } public: /* Number of segregating sites */ double S() { double result = 0.0; if(con->nsamp==0) return 0.0; int i,j; for(j=0;jseq_len;j++) { double hap = genotype[0][j]; for(i=1;insamp;i++) if(genotype[i][j]!=hap) { ++result; break; } } return result; } /* Number of unique haplotypes */ double H() { int result = 1; if(con->nsamp==0) return 0.0; _uniqueHaps = vector(con->nsamp,-1); _uniqueHaps[0] = 0; int i,ii,j; bool unique; for(i=1;insamp;i++) { unique = true; for(ii=0;iiseq_len;j++) { if(genotype[i][j]!=genotype[_uniqueHaps[ii]][j]) break; } if(j==con->seq_len) unique = false; } if(unique==true) { _uniqueHaps[result] = i; ++result; } } return (double)result; } /* Average number of pairwise differences */ double pi() { double result = 0.0; int i,j,k; for(i=0;insamp;i++) for(j=0;jseq_len;k++) result += (genotype[i][k]==genotype[j][k]) ? 0.0 : 1.0; result *= 2.0/(double)(con->nsamp)/(double)(con->nsamp-1); return result; } /* Variance in number of pairwise differences */ double Varpi() { double E,EE,pi; int i,j,k; E = EE = 0.0; for(i=0;insamp;i++) for(j=0;jseq_len;k++) pi += (genotype[i][k]==genotype[j][k]) ? 
0.0 : 1.0; E += pi; EE += pi*pi; } E *= 2.0/(double)(con->nsamp)/(double)(con->nsamp-1); EE *= 2.0/(double)(con->nsamp)/(double)(con->nsamp-1); double result = EE - E*E; return result; } double Tajima() { double D = 0.0; int i,j,k,n,L; n = con->nsamp; L = con->seq_len; double a1,a2,b1,b2,c1,c2,e1,e2,khat,S; bool segregating; khat = S = 0.0; for(k=0;k &diff) { double result = 0.0; int i,j,k; for(i=0;insamp;i++) for(j=0;jseq_len;k++) result += (diff[(int)genotype[i][k]][(int)genotype[j][k]]==0); result *= 2.0/(double)(con->nsamp)/(double)(con->nsamp-1); return result; } /* Hudson and Kaplan's Rm, the minimum # recombinations. See Myers and Griffiths(2003)*/ double Rm() { if(con->nsamp==0) return 0.0; if(con->seq_len==0) return 0.0; /* Determine which sites are biallelic segregating */ _sites = vector(con->seq_len,0); int i,j,k; int S = 0; double hap0,hap1; bool segregating; for(j=0;jseq_len;j++) { segregating = false; hap0 = genotype[0][j]; for(i=1;insamp;i++) { if(!segregating && genotype[i][j]!=hap0) { segregating = true; hap1 = genotype[i][j]; } else if(segregating && genotype[i][j]!=hap0 && genotype[i][j]!=hap1) { segregating = false; // define segregating only for biallelic sites break; } } if(segregating) { _sites[S] = j; ++S; } } if(S<2) return 0.0; /* Calculate the compatibility matrix */ ____B = LowerTriangularMatrix(S,0); // so j>=k always // ____B[j][k] = 0 for compatible, 1 for incompatible bool comb[3]; for(j=0;jnsamp;i++) { if(genotype[i][_sites[j]]==hap0 && genotype[i][_sites[k]]!=hap1) comb[0] = true; if(genotype[i][_sites[j]]!=hap0 && genotype[i][_sites[k]]==hap1) comb[1] = true; if(genotype[i][_sites[j]]!=hap0 && genotype[i][_sites[k]]!=hap1) comb[2] = true; if(comb[0] && comb[1] && comb[2]) break; } ____B[j][k] = (comb[0] && comb[1] && comb[2]) ? 
1 : 0; } /* Calculate the dynamic programming partition matrix */ _M = vector(S,0); int maxM = 0; _M[S-1] = 0; _M[S-2] = ____B[S-1][S-2]; for(i=S-3;i>=0;i--) { _M[i] = ____B[i+1][i] + _M[i+1]; for(k=i+2;k_M[i]) _M[i] = ____B[k][i]+_M[k]; } return (double)_M[0]; } void RecCorrelations(double* result) { RecCorrelations(result,true); } void RecCovariances(double* result) { RecCorrelations(result,false); } void RecCorrelations(double* result, bool normalize) { result[0] = result[1] = result[2] = 0.0; if(con->nsamp==0) return; if(con->seq_len==0) return; /* Determine which sites are biallelic segregating */ _sites = vector(con->seq_len,0); int i,j,k; int S = 0; double hap0,hap1; bool segregating; for(j=0;jseq_len;j++) { segregating = false; hap0 = genotype[0][j]; for(i=1;insamp;i++) { if(!segregating && genotype[i][j]!=hap0) { segregating = true; hap1 = genotype[i][j]; } else if(segregating && genotype[i][j]!=hap0 && genotype[i][j]!=hap1) { segregating = false; // define segregating only for biallelic sites break; } } if(segregating) { _sites[S] = j; ++S; } } if(S<3) return; /* Calculate frequency statistics */ _F = vector(S,1.0); /* _F is the marginal frequency of hap0 at site j */ for(j=0;jnsamp;i++) if(genotype[i][_sites[j]]==hap0) _F[j]++; _F[j] /= (double)con->nsamp; } _four = vector(4,0.0); /* _G[j][k] is the frequency of AB (_G[j][k][0]), */ _G = LowerTriangularMatrix< vector >(S,_four); /* Ab (1), aB (2), ab (3) for sites j and k */ for(j=0;jnsamp;i++) { if(genotype[i][_sites[j]]==hap0 && genotype[i][_sites[k]]==hap1) ++_G[j][k][0]; else if(genotype[i][_sites[j]]==hap0 && genotype[i][_sites[k]]!=hap1) ++_G[j][k][1]; else if(genotype[i][_sites[j]]!=hap0 && genotype[i][_sites[k]]==hap1) ++_G[j][k][2]; else if(genotype[i][_sites[j]]!=hap0 && genotype[i][_sites[k]]!=hap1) ++_G[j][k][3]; else warning("Unexpected choice"); } for(i=0;i<4;i++) _G[j][k][i] /= (double)con->nsamp; } /* Calculate LD statistics for pairs of sites */ _A = LowerTriangularMatrix(S,0.0); // rsq 
___B = LowerTriangularMatrix(S,0.0); // Dprime ___C = LowerTriangularMatrix(S,0.0); // G4 _D = Matrix(S,S,0.0); double temp; for(i=0;i0.0 && _G[i][j][1]>0.0 && _G[i][j][2]>0.0 && _G[i][j][3]>0.0) ? 1.0 : 0.0; _D[i][j] = _D[j][i] = _sites[i] - _sites[j]; } } double E[4] = {0.0,0.0,0.0,0.0}; double EE[4] = {0.0,0.0,0.0,0.0}; double ED[3] = {0.0,0.0,0.0}; int ctr; // ofstream out("ld.txt"); for(i=0,ctr=0;i. */ /************************************************/ /* control_wizard.h 23rd February 2005 */ /* (c) Danny Wilson. */ /* www.danielwilson.me.uk */ /************************************************/ #ifndef _CONTROL_WIZARD_H_ #define _CONTROL_WIZARD_H_ #pragma warning(disable: 4786) #include #include #include #include #include #include #include #include #include //using namespace std; namespace myutils { #ifndef _CONTROL_AND_ARGUMENT_WIZARD_TYPES_ #define _CONTROL_AND_ARGUMENT_WIZARD_TYPES_ typedef void RTRV;//functions that retrieve the data typedef void GENERIC;//for the generic pointers enum DATA_TYPE {TP_UNRECOGNISED,TP_INT,TP_DOUBLE,TP_STRING,TP_VEC_INT,TP_VEC_DOUBLE,TP_EXT_VEC_DOUBLE,TP_VEC_STRING}; #endif // _CONTROL_AND_ARGUMENT_WIZARD_TYPES_ class ControlWizard { /*MEMBER VARIABLES*/ public: std::vector line_delimiters; std::vector label_delimiters; std::vector white_space; std::vector elem_delimiters; std::vector rem_delimiters; std::vector eof_delimiters; std::list required; bool coutput; // Set to true to print out all comments bool unrecognised; // Set to true to print out unrecognised options bool got_required; bool case_sensitive; bool _EOF; /* used to avoid function pointers in selecting data-read function */ DATA_TYPE switcher; protected: std::map label_map; std::map data_map; /*MEMBER FUNCTIONS*/ public: ControlWizard(){set_defaults();} void read_input(const char* filename) { std::ifstream infile(filename); if(infile.is_open()==false) { string errTxt(filename); errTxt += " not found"; error(errTxt.c_str()); } required.unique(); std::string label; 
while(eof(infile)==false) { if(read_label(infile,label)) { data_format(label); //(*this.*read_data)(infile,label); read_data(infile,label); required.remove(label); } } if(coutput)printf("Finished reading in control file.\n\n"); got_required=auto_check_required(); infile.close(); } std::ifstream& read_input(std::ifstream &infile) { if(infile.is_open()==false)error("File not found"); required.unique(); std::string label; while(eof(infile)==false) { if(read_label(infile,label)) { data_format(label); //(*this.*read_data)(infile,label); read_data(infile,label); required.remove(label); } } if(coutput)printf("Finished reading in control file.\n\n"); got_required=auto_check_required(); return infile; } void add_item(std::string label,const DATA_TYPE data_type,GENERIC* location) //GENERIC* lets any type of pointer be passed to the function { if(!case_sensitive) { //std::transform(label.begin(),label.end(),label.begin(),tolower); int i; for(i=0;i<(int)label.length();i++) label[i] = tolower(label[i]); } label_map[label]=data_type; //printf("Assigned label \"%s\"\n",label.c_str()); data_map[label]=location; } void add_ITEM(std::string label,const DATA_TYPE data_type,GENERIC* location) //These are essential items { if(!case_sensitive) { //std::transform(label.begin(),label.end(),label.begin(),tolower); int i; for(i=0;i<(int)label.length();i++) label[i] = tolower(label[i]); } label_map[label]=data_type; data_map[label]=location; required.push_back(label); } bool check_required() //Returns false if some required items are not found { if(!got_required) { printf("The following required items have not been found: "); //std::copy(required.begin(),required.end(),std::ostream_iterator(std::cout," ")); std::list::iterator i; for(i=required.begin();i!=required.end();i++) cout << *i << " "; cout << endl; } else printf("All required items were found\n"); return got_required; } bool eof(std::ifstream &infile) { return (infile.eof() || _EOF); } char get(std::ifstream &infile) { char ch = 
infile.get(); int i; if(!_EOF) for(i=0;i<(int)eof_delimiters.size();i++) if((int)ch==eof_delimiters[i]) { _EOF = true; break; } return ch; } protected: void set_defaults() { coutput=true; unrecognised=true; case_sensitive=false; white_space.push_back(' '); white_space.push_back(-1); white_space.push_back(10); white_space.push_back(13); label_delimiters.push_back('='); line_delimiters.push_back(10); line_delimiters.push_back(13); elem_delimiters.push_back(','); //elem_delimiters.push_back('\t'); rem_delimiters.push_back('#'); _EOF = false; } void error(const char* error_text) { printf("Run-time error in ControlWizard::"); printf("%s\n", error_text); printf("Exiting to system...\n"); exit(13); } bool read_label(std::ifstream &infile, std::string &word) /*Returns true if a label is found*/ { int character; word=""; bool label_delim_found=false; bool line_delim_found=false; bool include_char=true; while(eof(infile)==false) { character=get(infile); label_delim_found=false; include_char=true; int i; for(i=0;i<(int)white_space.size();i++) { if(character==white_space[i])include_char=false; } for(i=0;i<(int)line_delimiters.size();i++) { if(character==line_delimiters[i]) { include_char=false; line_delim_found=true; } } for(i=0;i<(int)label_delimiters.size();i++) { if(character==label_delimiters[i]) { include_char=false; label_delim_found=true; } } for(i=0;i<(int)rem_delimiters.size();i++) { if(character==rem_delimiters[i]) { snail(infile); return false; } } if(include_char==true)word += static_cast(character); if(line_delim_found==true) { if(word.size()>0)printf("Incomplete line \"%s\"\n",word.c_str()); break; } if(label_delim_found==true)break; } if(!case_sensitive) { //std::transform(word.begin(),word.end(),word.begin(),tolower); int ii; for(ii=0;ii<(int)word.length();ii++) word[ii] = tolower(word[ii]); } //cout << "Returning string: (" << word << ")" << endl; return label_delim_found; } void data_format(std::string& label) { label_map[label]; //DATA_TYPE 
switcher=label_map[label]; switcher=label_map[label]; /*switch(switcher) { case TP_UNRECOGNISED: read_data=function_get_unrecognised;break; case TP_INT: read_data=function_get_int;break; case TP_DOUBLE: read_data=function_get_double;break; case TP_STRING: read_data=function_get_string;break; case TP_VEC_INT: read_data=function_get_vector_int;break; case TP_VEC_DOUBLE: read_data=function_get_vector_double;break; case TP_EXT_VEC_DOUBLE: read_data=function_get_external_vector_double;break; default: read_data=function_get_unrecognised;break; }*/ } bool auto_check_required() //Returns false if some required items are not found { if(required.size()>0)return false; return true; } void get_single(std::ifstream &infile,std::string &word) { int character; word=""; bool line_delim_found=false; bool include_char=true; while(eof(infile)==false) { character=get(infile); include_char=true; int i; for(i=0;i<(int)white_space.size();i++) { if(character==white_space[i])include_char=false; } for(i=0;i<(int)line_delimiters.size();i++) { if(character==line_delimiters[i]) { include_char=false; line_delim_found=true; } } if(include_char==true)word += static_cast(character); if(line_delim_found==true)break; } } bool get_element(std::ifstream &infile,std::string &word) /*Returns false when line delimiter or EOF is reached*/ { int character; word=""; bool line_delim_found=false; bool include_char=true; bool elem_delim_found=false; while(eof(infile)==false) { character=get(infile); include_char=true; int i; for(i=0;i<(int)white_space.size();i++) { if(character==white_space[i])include_char=false; } for(i=0;i<(int)line_delimiters.size();i++) { if(character==line_delimiters[i]) { include_char=false; line_delim_found=true; } } for(i=0;i<(int)elem_delimiters.size();i++) { if(character==elem_delimiters[i]) { include_char=false; elem_delim_found=true; } } if(include_char==true)word += static_cast(character); if(line_delim_found==true)return false; if(elem_delim_found==true)return true; } return 
false; } void get_multiple(std::ifstream &infile,std::vector &words) { bool loop=true; int elem=(int)words.size()-1; while(loop==true) { words.push_back(""); ++elem; loop=get_element(infile,words[elem]); } } void snail(std::ifstream &infile) /*Proceeds to next line*/ { int character; bool line_delim_found=false; while(eof(infile)==false) { character=get(infile); int i; for(i=0;i<(int)line_delimiters.size();i++) { if(character==line_delimiters[i])line_delim_found=true; } if(line_delim_found==true)break; } } protected: // RTRV (ControlWizard::*read_data)(std::ifstream &infile, std::string &label); void read_data(std::ifstream &infile, std::string &label) { switch(switcher) { case TP_UNRECOGNISED: function_get_unrecognised(infile,label); break; case TP_INT: function_get_int(infile,label); break; case TP_DOUBLE: function_get_double(infile,label); break; case TP_STRING: function_get_string(infile,label); break; case TP_VEC_INT: function_get_vector_int(infile,label); break; case TP_VEC_DOUBLE: function_get_vector_double(infile,label); break; case TP_EXT_VEC_DOUBLE: function_get_external_vector_double(infile,label); break; case TP_VEC_STRING: function_get_vector_string(infile,label); break; default: function_get_unrecognised(infile,label); break; } } RTRV function_get_unrecognised(std::ifstream &infile, std::string &label) { if((label.size()>0)&&(coutput || unrecognised)) printf("Label \"%s\" not recognised.\n",label.c_str()); snail(infile); } RTRV function_get_int(std::ifstream &infile, std::string &label) { std::string word=""; get_single(infile,word); int value=atoi(word.c_str()); GENERIC* ptr=data_map[label]; (*(static_cast(ptr)))=value; if(coutput)printf("%s = %d\n",label.c_str(),value); } RTRV function_get_double(std::ifstream &infile, std::string &label) { std::string word=""; get_single(infile,word); double value; if(word=="1.#INF") value = numeric_limits::infinity(); else if(word=="-1.#IND") value = numeric_limits::quiet_NaN(); else if(word=="-1.#INF") value = 
numeric_limits::signaling_NaN(); else value=atof(word.c_str()); GENERIC* ptr=data_map[label]; (*(static_cast(ptr)))=value; if(coutput)printf("%s = %g\n",label.c_str(),value); } RTRV function_get_string(std::ifstream &infile, std::string &label) /* TP_STRING must be enclosed by "double quotes" and must fit on one line */ { int character; std::string word=""; bool line_delim_found=false; bool include_char=true; bool string_terminated=false; while(eof(infile)==false) { character=get(infile); include_char=true; int i; for(i=0;i<(int)white_space.size();i++) { if(character==white_space[i])include_char=false; } for(i=0;i<(int)line_delimiters.size();i++) { if(character==line_delimiters[i]) { include_char=false; line_delim_found=true; } } if(character=='\"') { include_char = false; break; } if(include_char==true) { word += static_cast(character); break; } if(line_delim_found==true)break; } if(line_delim_found==false) { while(eof(infile)==false) { character=get(infile); include_char=true; int i; if(character=='\"') { include_char = false; string_terminated = true; } for(i=0;i<(int)line_delimiters.size();i++) { if(character==line_delimiters[i]) { include_char=false; line_delim_found=true; } } if(include_char==true)word += static_cast(character); if(line_delim_found==true)break; if(string_terminated==true) { snail(infile); break; } } } GENERIC* ptr=data_map[label]; (*(static_cast(ptr)))=word; if(coutput)printf("%s = %s\n",label.c_str(),word.c_str()); } /* RTRV function_get_string(std::ifstream &infile, std::string &label) { int character; std::string word=""; bool line_delim_found=false; bool include_char=true; bool string_started=false; while(eof(infile)==false) { character=get(infile); include_char=true; int i; for(i=0;i(character); break; } if(line_delim_found==true)break; } if(line_delim_found==false) { while(eof(infile)==false) { character=get(infile); include_char=true; int i; for(i=0;i(character); if(line_delim_found==true)break; } } GENERIC* ptr=data_map[label]; 
(*(static_cast(ptr)))=word; if(coutput)printf("%s = %s\n",label.c_str(),word.c_str()); }*/ RTRV function_get_vector_int(std::ifstream &infile, std::string &label) { std::vector words; get_multiple(infile,words); GENERIC* g_ptr=data_map[label]; std::vector* ptr=static_cast*>(g_ptr); ptr->clear(); int i; for(i=0;i<(int)words.size();i++) ptr->push_back(atoi(words[i].c_str())); if(coutput)printf("%s read in %lu elements\n",label.c_str(),ptr->size()); } RTRV function_get_vector_double(std::ifstream &infile, std::string &label) { std::vector words; get_multiple(infile,words); GENERIC* g_ptr=data_map[label]; std::vector* ptr=static_cast*>(g_ptr); ptr->clear(); int i; for(i=0;i<(int)words.size();i++) ptr->push_back(atof(words[i].c_str())); if(coutput)printf("%s read in %lu elements\n",label.c_str(),ptr->size()); } RTRV function_get_external_vector_double(std::ifstream &infile, std::string &label) { std::string filename=""; std::string internal_call="Opening external file"; data_map[internal_call]=&filename; function_get_string(infile,internal_call); std::ifstream extfile(filename.c_str()); if(extfile.is_open()==false)error("function_get_external_vector_double():External file not found"); if(coutput)printf("\t"); function_get_vector_double(extfile,label); } RTRV function_get_vector_string(std::ifstream &infile, std::string &label) { std::vector words; get_multiple(infile,words); GENERIC* g_ptr=data_map[label]; std::vector* ptr=static_cast*>(g_ptr); ptr->clear(); int i; for(i=0;i<(int)words.size();i++) ptr->push_back(words[i]); if(coutput)printf("%s read in %lu elements\n",label.c_str(),ptr->size()); } }; }; #endif ClonalFrameML-1.11/src/bank/ess.h000066400000000000000000000051451307563374100165330ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * ess.h * Part of the myutils library. 
* * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ #ifndef _EFFECTIVE_SAMPLE_SIZE_H_ #define _EFFECTIVE_SAMPLE_SIZE_H_ #include #include "myutils/vector.h" namespace myutils { /* effectiveSampleSize(statistic, samples): returns gammaStat[0] * samples / varStat, where varStat is accumulated from pairs of gammaStat entries in the loop below. statistic is the input series and samples is presumably its length (TODO confirm). The lag tables always have maxLag = 1000 entries, independent of samples. */ inline double effectiveSampleSize(double* statistic, const int samples) { //int maxLag = samples; int maxLag = 1000; Vector gammaStat(maxLag,0.0); Vector varGammaStat(maxLag,0.0); double meanStat = 0.0; double varStat,varVarStat,assVarCor,del1, del2; int i,j,lag; /* NOTE(review): the text between "for(i=0; i" and " 0)) {" is missing from this copy, so the code that fills meanStat, gammaStat and varGammaStat and initialises varStat, varVarStat, assVarCor and lag is not visible; " 0)) {" is the tail of the condition of the loop whose body follows. */ for(i=0; i 0)) { /* add the contribution of lags lag and lag+1 to each running sum */ varStat += (2.0*(gammaStat[lag]+gammaStat[lag+1])); varVarStat += (2.0*(varGammaStat[lag] + varGammaStat[lag+1])); assVarCor += (2.0*((gammaStat[lag] * gammaStat[lag]) + (gammaStat[lag+1] * gammaStat[lag+1])) / (gammaStat[0] * gammaStat[0])); /* stop as soon as the next pair of gammaStat entries sums to more than the current pair */ if (gammaStat[lag]+gammaStat[lag+1] < gammaStat[lag+2]+gammaStat[lag+3] ) break; lag += 2; } /* NOTE: stdErrorOfMean, variance, stdErrorOfVariance and M below are computed but never used; only ESS is returned. */ // standard error of mean double stdErrorOfMean = sqrt(varStat/samples); // variance of statistic double variance = gammaStat[0]; // standard error of variance double stdErrorOfVariance = sqrt(varVarStat/samples); // effective sample size double ESS = gammaStat[0] * samples / varStat; // M int M = lag; return ESS; } }; #endif//_EFFECTIVE_SAMPLE_SIZE_H_ ClonalFrameML-1.11/src/bank/mutation.h000066400000000000000000000716541307563374100176110ustar00rootroot00000000000000/* Copyright 2013 Daniel Wilson. * * mutation.h * Part of the coalesce library. 
* * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ #ifndef _MUTATION_H_ #define _MUTATION_H_ #include #include "myutils/myutils.h" using std::vector; using namespace myutils; class Mutation_Matrix { public: int n_states; vector state_freq; vector state_char; Random *ran; Matrix C; /*continuous-time rate matrix*/ Matrix D; /*discrete-time transition matrix*/ vector mutation_rate; vector mutation_mean; protected: /*assumes C_in is a valid rate matrix of size n_states_in*/ void initialize(const int n_states_in, Matrix *C_in) { n_states = n_states_in; C = *C_in; D.resize(n_states,n_states); mutation_rate.resize(n_states); mutation_mean.resize(n_states); int i,j; for(i=0;i state_freq_in) { if(state_freq_in.size()!=n_states) error("Mutation_Matrix::set_state_freq(state_freq_in): state_freq_in inconsistent in size with n_states"); double tot = 0.0; int i; for(i=0;i state_char_in) { if(state_char_in.size()!=n_states) error("Mutation_Matrix::set_state_char(state_char_in): state_char_in inconsistent in size with n_states"); state_char = state_char_in; } void set_ran(Random *ran_in) { ran = ran_in; } inline double get_rate(const int state) { if(state<0||state>=n_states) error("Mutation_Matrix::get_rate(state): unknown state"); return mutation_rate[state]; } int draw() { int state; double rp; double U = ran->U(); for(state=0;state=n_states) error("Mutation_Matrix::mutate(state): unknown 
state"); int new_state = state; double U = ran->U(); double rp; for(new_state=0;new_state=n_states) error("Mutation_Matrix::mutate_edge(state,time): unknown state"); if(time<0.0) error("Mutation_Matrix::mutate_edge(state,time): time must be non-negative"); double time_remaining = time; double next_mutation = ran->exponential(mutation_mean[old_state]); double U,rp; int new_state = state; while(next_mutationU(); for(new_state=0;new_stateexponential(mutation_mean[old_state]); } return new_state; } int mutate_edge(const int state, const double time, int &nmut) { int old_state = state; if(old_state<0||old_state>=n_states) error("Mutation_Matrix::mutate_edge(state,time): unknown state"); if(time<0.0) error("Mutation_Matrix::mutate_edge(state,time): time must be non-negative"); double time_remaining = time; double next_mutation = ran->exponential(mutation_mean[old_state]); double U,rp; int new_state = state; while(next_mutationU(); for(new_state=0;new_stateexponential(mutation_mean[old_state]); ++nmut; } return new_state; } double expected_rate() { double result = 0.0; int i; for(i=0;iU(); switch(state) { case 0: if(U state_freq_in, Random *ran_in) { set_defaults(); ran = ran_in; set_state_freq(state_freq_in); C.resize(n_states,n_states); int i,j; for(i=0;i<4;i++) { C[i][i] = 0.0; for(j=0;j<4;j++) if(i!=j) { C[i][j] = lambda*state_freq[j]; C[i][i] -= C[i][j]; } } initialize(n_states,&C); } }; class K80 : public Nucleotide_Mutation_Matrix { public: K80(const double lambda, const double kappa, Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); update(lambda,kappa); } K80& update(const double lambda, const double kappa) { int i,j; for(i=0;i<4;i++) for(j=0;j<4;j++) C[i][j] = (i==j) ? 
0.0 : lambda/4.; C[0][1]*=kappa;C[1][0]*=kappa; C[2][3]*=kappa;C[3][2]*=kappa; for(i=0;i<4;i++) for(j=0;j<4;j++) if(i!=j) C[i][i] -= C[i][j]; initialize(n_states,&C); return *this; } int fast_mutate(const int state, const double time) { double lambda = 4.*C[0][2]; double kappa = C[0][1]/C[0][2]; double p0 = .25 + .25*exp(-2.*lambda*time) + .5*exp(-lambda*time*(1.+kappa)); double p1 = p0 + .25 - .25*exp(-2.*lambda*time); double p2 = p1 + .25 - .25*exp(-2.*lambda*time); double U = ran->U(); switch(state) { case 0: if(U state_freq_in, Random *ran_in) { set_defaults(); ran = ran_in; set_state_freq(state_freq_in); C.resize(n_states,n_states); int i,j; for(i=0;i<4;i++) for(j=0;j<4;j++) C[i][j] = (i==j) ? 0.0 : lambda*state_freq[j]; C[0][1]*=kappa;C[1][0]*=kappa; C[2][3]*=kappa;C[3][2]*=kappa; for(i=0;i<4;i++) for(j=0;j<4;j++) if(i!=j) C[i][i] -= C[i][j]; initialize(n_states,&C); } }; class TN93 : public Nucleotide_Mutation_Matrix { public: TN93(const double lambda, const double kappa_R, const double kappa_Y, const vector state_freq_in, Random *ran_in) { set_defaults(); ran = ran_in; set_state_freq(state_freq_in); C.resize(n_states,n_states); int i,j; for(i=0;i<4;i++) for(j=0;j<4;j++) C[i][j] = (i==j) ? 
0.0 : lambda*state_freq[j]; C[0][1]*=kappa_R;C[1][0]*=kappa_R; C[2][3]*=kappa_Y;C[3][2]*=kappa_Y; for(i=0;i<4;i++) for(j=0;j<4;j++) if(i!=j) C[i][i] -= C[i][j]; initialize(n_states,&C); } }; class Codon_Mutation_Matrix : public Mutation_Matrix { public: void set_defaults() { n_states = 64; state_freq.resize(n_states,1./61.); state_freq[10]=state_freq[11]=state_freq[14]=0.0; string default_char = string(3,'-'); state_char.resize(n_states,default_char); int i,j,k,l; vector base(4,'-'); base[0] = 'U'; base[1] = 'C'; base[2] = 'A'; base[3] = 'G'; for(i=0,l=0;i<4;i++) for(j=0;j<4;j++) for(k=0;k<4;k++,l++) { state_char[l][0] = base[i]; state_char[l][1] = base[j]; state_char[l][2] = base[k]; } //for(i=0;i<64;i++) state_char[i] = i+1; } virtual Codon_Mutation_Matrix& update(const double mu, const double kappa, const double omega) = 0; virtual Codon_Mutation_Matrix& update(const double mu, const double kappa, const double omega, const vector &pi) = 0; virtual Codon_Mutation_Matrix& build_C(const double mu, const double kappa, const double omega, const vector &pi) = 0; }; class NY98 : public Codon_Mutation_Matrix { protected: NY98() {} public: NY98(Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); D.resize(n_states,n_states); } NY98(const double mu, const double kappa, const double omega, Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); D.resize(n_states,n_states); update(mu,kappa,omega,state_freq); } NY98(const double mu, const double kappa, const double omega, const vector &pi, Random *ran_in) { set_defaults(); ran = ran_in; set_state_freq(pi); C.resize(n_states,n_states); D.resize(n_states,n_states); update(mu,kappa,omega,state_freq); } Codon_Mutation_Matrix& update(const double mu, const double kappa, const double omega) { build_C(mu,kappa,omega,state_freq); initialize(n_states,&C); return *this; } Codon_Mutation_Matrix& update(const double mu, const double kappa, const double omega, const vector &pi) { 
set_state_freq(pi); build_C(mu,kappa,omega,state_freq); initialize(n_states,&C); return *this; } Codon_Mutation_Matrix& build_C(const double mu, const double kappa, const double omega, const vector &pi) { int i,j; /*Initialize to zero*/ for(i=0;i<64;i++){for(j=0;j<64;j++)C[i][j]=0.0;} C[0][1]=kappa*mu; C[0][2]=omega*mu; C[0][3]=omega*mu; C[0][4]=kappa*omega*mu; C[0][8]=omega*mu; C[0][12]=omega*mu; C[0][16]=kappa*omega*mu; C[0][32]=omega*mu; C[0][48]=omega*mu; C[1][2]=omega*mu; C[1][3]=omega*mu; C[1][5]=kappa*omega*mu; C[1][9]=omega*mu; C[1][13]=omega*mu; C[1][17]=kappa*omega*mu; C[1][33]=omega*mu; C[1][49]=omega*mu; C[2][3]=kappa*mu; C[2][6]=kappa*omega*mu; C[2][10]=omega*mu; C[2][14]=omega*mu; C[2][18]=kappa*mu; /*Synonymous!*/ C[2][34]=omega*mu; C[2][50]=omega*mu; C[3][7]=kappa*omega*mu; C[3][11]=omega*mu; C[3][15]=omega*mu; C[3][19]=kappa*mu; /*Synonymous!*/ C[3][35]=omega*mu; C[3][51]=omega*mu; C[4][5]=kappa*mu; C[4][6]=mu; C[4][7]=mu; C[4][8]=omega*mu; C[4][12]=omega*mu; C[4][20]=kappa*omega*mu; C[4][36]=omega*mu; C[4][52]=omega*mu; C[5][6]=mu; C[5][7]=mu; C[5][9]=omega*mu; C[5][13]=omega*mu; C[5][21]=kappa*omega*mu; C[5][37]=omega*mu; C[5][53]=omega*mu; C[6][7]=kappa*mu; C[6][10]=omega*mu; C[6][14]=omega*mu; C[6][22]=kappa*omega*mu; C[6][38]=omega*mu; C[6][54]=omega*mu; C[7][11]=omega*mu; C[7][15]=omega*mu; C[7][23]=kappa*omega*mu; C[7][39]=omega*mu; C[7][55]=omega*mu; C[8][9]=kappa*mu; C[8][10]=omega*mu; C[8][11]=omega*mu; C[8][12]=kappa*omega*mu; C[8][24]=kappa*omega*mu; C[8][40]=omega*mu; C[8][56]=omega*mu; C[9][10]=omega*mu; C[9][11]=omega*mu; C[9][13]=kappa*omega*mu; C[9][25]=kappa*omega*mu; C[9][41]=omega*mu; C[9][57]=omega*mu; C[10][11]=kappa*mu; C[10][14]=kappa*mu; C[10][26]=kappa*omega*mu; C[10][42]=omega*mu; C[10][58]=omega*mu; C[11][15]=kappa*omega*mu; C[11][27]=kappa*omega*mu; C[11][43]=omega*mu; C[11][59]=omega*mu; C[12][13]=kappa*mu; C[12][14]=omega*mu; C[12][15]=omega*mu; C[12][28]=kappa*omega*mu; C[12][44]=omega*mu; C[12][60]=omega*mu; 
C[13][14]=omega*mu; C[13][15]=omega*mu; C[13][29]=kappa*omega*mu; C[13][45]=omega*mu; C[13][61]=omega*mu; C[14][15]=kappa*omega*mu; C[14][30]=kappa*omega*mu; C[14][46]=omega*mu; C[14][62]=omega*mu; C[15][31]=kappa*omega*mu; C[15][47]=omega*mu; C[15][63]=omega*mu; C[16][17]=kappa*mu; C[16][18]=mu; C[16][19]=mu; C[16][20]=kappa*omega*mu; C[16][24]=omega*mu; C[16][28]=omega*mu; C[16][32]=omega*mu; C[16][48]=omega*mu; C[17][18]=mu; C[17][19]=mu; C[17][21]=kappa*omega*mu; C[17][25]=omega*mu; C[17][29]=omega*mu; C[17][33]=omega*mu; C[17][49]=omega*mu; C[18][19]=kappa*mu; C[18][22]=kappa*omega*mu; C[18][26]=omega*mu; C[18][30]=omega*mu; C[18][34]=omega*mu; C[18][50]=omega*mu; C[19][23]=kappa*omega*mu; C[19][27]=omega*mu; C[19][31]=omega*mu; C[19][35]=omega*mu; C[19][51]=omega*mu; C[20][21]=kappa*mu; C[20][22]=mu; C[20][23]=mu; C[20][24]=omega*mu; C[20][28]=omega*mu; C[20][36]=omega*mu; C[20][52]=omega*mu; C[21][22]=mu; C[21][23]=mu; C[21][25]=omega*mu; C[21][29]=omega*mu; C[21][37]=omega*mu; C[21][53]=omega*mu; C[22][23]=kappa*mu; C[22][26]=omega*mu; C[22][30]=omega*mu; C[22][38]=omega*mu; C[22][54]=omega*mu; C[23][27]=omega*mu; C[23][31]=omega*mu; C[23][39]=omega*mu; C[23][55]=omega*mu; C[24][25]=kappa*mu; C[24][26]=omega*mu; C[24][27]=omega*mu; C[24][28]=kappa*omega*mu; C[24][40]=omega*mu; C[24][56]=omega*mu; C[25][26]=omega*mu; C[25][27]=omega*mu; C[25][29]=kappa*omega*mu; C[25][41]=omega*mu; C[25][57]=omega*mu; C[26][27]=kappa*mu; C[26][30]=kappa*omega*mu; C[26][42]=omega*mu; C[26][58]=omega*mu; C[27][31]=kappa*omega*mu; C[27][43]=omega*mu; C[27][59]=omega*mu; C[28][29]=kappa*mu; C[28][30]=mu; C[28][31]=mu; C[28][44]=omega*mu; C[28][60]=omega*mu; C[29][30]=mu; C[29][31]=mu; C[29][45]=omega*mu; C[29][61]=omega*mu; C[30][31]=kappa*mu; C[30][46]=mu; C[30][62]=omega*mu; C[31][47]=mu; C[31][63]=omega*mu; C[32][33]=kappa*mu; C[32][34]=mu; C[32][35]=omega*mu; C[32][36]=kappa*omega*mu; C[32][40]=omega*mu; C[32][44]=omega*mu; C[32][48]=kappa*omega*mu; C[33][34]=mu; 
C[33][35]=omega*mu; C[33][37]=kappa*omega*mu; C[33][41]=omega*mu; C[33][45]=omega*mu; C[33][49]=kappa*omega*mu; C[34][35]=kappa*omega*mu; C[34][38]=kappa*omega*mu; C[34][42]=omega*mu; C[34][46]=omega*mu; C[34][50]=kappa*omega*mu; C[35][39]=kappa*omega*mu; C[35][43]=omega*mu; C[35][47]=omega*mu; C[35][51]=kappa*omega*mu; C[36][37]=kappa*mu; C[36][38]=mu; C[36][39]=mu; C[36][40]=omega*mu; C[36][44]=omega*mu; C[36][52]=kappa*omega*mu; C[37][38]=mu; C[37][39]=mu; C[37][41]=omega*mu; C[37][45]=omega*mu; C[37][53]=kappa*omega*mu; C[38][39]=kappa*mu; C[38][42]=omega*mu; C[38][46]=omega*mu; C[38][54]=kappa*omega*mu; C[39][43]=omega*mu; C[39][47]=omega*mu; C[39][55]=kappa*omega*mu; C[40][41]=kappa*mu; C[40][42]=omega*mu; C[40][43]=omega*mu; C[40][44]=kappa*omega*mu; C[40][56]=kappa*omega*mu; C[41][42]=omega*mu; C[41][43]=omega*mu; C[41][45]=kappa*omega*mu; C[41][57]=kappa*omega*mu; C[42][43]=kappa*mu; C[42][46]=kappa*omega*mu; C[42][58]=kappa*omega*mu; C[43][47]=kappa*omega*mu; C[43][59]=kappa*omega*mu; C[44][45]=kappa*mu; C[44][46]=omega*mu; C[44][47]=omega*mu; C[44][60]=kappa*omega*mu; C[45][46]=omega*mu; C[45][47]=omega*mu; C[45][61]=kappa*omega*mu; C[46][47]=kappa*mu; C[46][62]=kappa*omega*mu; C[47][63]=kappa*omega*mu; C[48][49]=kappa*mu; C[48][50]=mu; C[48][51]=mu; C[48][52]=kappa*omega*mu; C[48][56]=omega*mu; C[48][60]=omega*mu; C[49][50]=mu; C[49][51]=mu; C[49][53]=kappa*omega*mu; C[49][57]=omega*mu; C[49][61]=omega*mu; C[50][51]=kappa*mu; C[50][54]=kappa*omega*mu; C[50][58]=omega*mu; C[50][62]=omega*mu; C[51][55]=kappa*omega*mu; C[51][59]=omega*mu; C[51][63]=omega*mu; C[52][53]=kappa*mu; C[52][54]=mu; C[52][55]=mu; C[52][56]=omega*mu; C[52][60]=omega*mu; C[53][54]=mu; C[53][55]=mu; C[53][57]=omega*mu; C[53][61]=omega*mu; C[54][55]=kappa*mu; C[54][58]=omega*mu; C[54][62]=omega*mu; C[55][59]=omega*mu; C[55][63]=omega*mu; C[56][57]=kappa*mu; C[56][58]=omega*mu; C[56][59]=omega*mu; C[56][60]=kappa*omega*mu; C[57][58]=omega*mu; C[57][59]=omega*mu; 
C[57][61]=kappa*omega*mu; C[58][59]=kappa*mu; C[58][62]=kappa*omega*mu; C[59][63]=kappa*omega*mu; C[60][61]=kappa*mu; C[60][62]=mu; C[60][63]=mu; C[61][62]=mu; C[61][63]=mu; C[62][63]=kappa*mu; /*Remove the STOP codons from the scheme*/ for(i=0;i<64;i++){ C[10][i]=0.0; C[i][10]=0.0; C[11][i]=0.0; C[i][11]=0.0; C[14][i]=0.0; C[i][14]=0.0; } /*Fill in the lower triangle*/ for(i=0;i<64;i++){ for(j=i+1;j<64;j++)C[j][i]=C[i][j];} /*Apply the equilibrium frequencies*/ for(i=0;i<64;i++) for(j=0;j<64;j++) C[i][j]*=pi[j]; /*Compute the diagonal*/ for(i=0;i<64;i++) { double rowsum=0.0; for(j=0;j<64;j++) rowsum+=C[i][j]; C[i][i]=-rowsum; } return *this; } }; class NY98_61 : public NY98 { public: void set_defaults() { n_states = 61; state_freq.resize(n_states,1./61.); string default_char = string(3,'-'); state_char.resize(n_states,default_char); int i,j,k,l,m; vector base(4,'-'); base[0] = 'U'; base[1] = 'C'; base[2] = 'A'; base[3] = 'G'; for(i=0,l=0,m=0;i<4;i++) for(j=0;j<4;j++) for(k=0;k<4;k++,l++,m++) { state_char[m][0] = base[i]; state_char[m][1] = base[j]; state_char[m][2] = base[k]; if(l==10 || l==11 || l==14) --m; } //for(i=0;i<61;i++) state_char[i] = i+1; } NY98_61(Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); D.resize(n_states,n_states); } NY98_61(const double mu, const double kappa, const double omega, Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); D.resize(n_states,n_states); update(mu,kappa,omega,state_freq); } NY98_61(const double mu, const double kappa, const double omega, const vector &pi, Random *ran_in) { set_defaults(); ran = ran_in; set_state_freq(pi); C.resize(n_states,n_states); D.resize(n_states,n_states); update(mu,kappa,omega,state_freq); } Codon_Mutation_Matrix& update(const double mu, const double kappa, const double omega) { build_C(mu,kappa,omega,state_freq); initialize(n_states,&C); return *this; } Codon_Mutation_Matrix& update(const double mu, const double kappa, const double omega, 
const vector &pi) { set_state_freq(pi); build_C(mu,kappa,omega,state_freq); initialize(n_states,&C); return *this; } Codon_Mutation_Matrix& build_C(const double mu, const double kappa, const double omega, const vector &pi) { int i,j; /*Initialize to zero*/ for(i=0;i<61;i++){for(j=0;j<61;j++)C[i][j]=0.0;} C[0][1]=kappa*mu; C[0][2]=omega*mu; C[0][3]=omega*mu; C[0][4]=kappa*omega*mu; C[0][8]=omega*mu; C[0][10]=omega*mu; C[0][13]=kappa*omega*mu; C[0][29]=omega*mu; C[0][45]=omega*mu; C[1][2]=omega*mu; C[1][3]=omega*mu; C[1][5]=kappa*omega*mu; C[1][9]=omega*mu; C[1][11]=omega*mu; C[1][14]=kappa*omega*mu; C[1][30]=omega*mu; C[1][46]=omega*mu; C[2][3]=kappa*mu; C[2][6]=kappa*omega*mu; C[2][15]=kappa*mu; C[2][31]=omega*mu; C[2][47]=omega*mu; C[3][7]=kappa*omega*mu; C[3][12]=omega*mu; C[3][16]=kappa*mu; C[3][32]=omega*mu; C[3][48]=omega*mu; C[4][5]=kappa*mu; C[4][6]=mu; C[4][7]=mu; C[4][8]=omega*mu; C[4][10]=omega*mu; C[4][17]=kappa*omega*mu; C[4][33]=omega*mu; C[4][49]=omega*mu; C[5][6]=mu; C[5][7]=mu; C[5][9]=omega*mu; C[5][11]=omega*mu; C[5][18]=kappa*omega*mu; C[5][34]=omega*mu; C[5][50]=omega*mu; C[6][7]=kappa*mu; C[6][19]=kappa*omega*mu; C[6][35]=omega*mu; C[6][51]=omega*mu; C[7][12]=omega*mu; C[7][20]=kappa*omega*mu; C[7][36]=omega*mu; C[7][52]=omega*mu; C[8][9]=kappa*mu; C[8][10]=kappa*omega*mu; C[8][21]=kappa*omega*mu; C[8][37]=omega*mu; C[8][53]=omega*mu; C[9][11]=kappa*omega*mu; C[9][22]=kappa*omega*mu; C[9][38]=omega*mu; C[9][54]=omega*mu; C[10][11]=kappa*mu; C[10][12]=omega*mu; C[10][25]=kappa*omega*mu; C[10][41]=omega*mu; C[10][57]=omega*mu; C[11][12]=omega*mu; C[11][26]=kappa*omega*mu; C[11][42]=omega*mu; C[11][58]=omega*mu; C[12][28]=kappa*omega*mu; C[12][44]=omega*mu; C[12][60]=omega*mu; C[13][14]=kappa*mu; C[13][15]=mu; C[13][16]=mu; C[13][17]=kappa*omega*mu; C[13][21]=omega*mu; C[13][25]=omega*mu; C[13][29]=omega*mu; C[13][45]=omega*mu; C[14][15]=mu; C[14][16]=mu; C[14][18]=kappa*omega*mu; C[14][22]=omega*mu; C[14][26]=omega*mu; C[14][30]=omega*mu; 
C[14][46]=omega*mu; C[15][16]=kappa*mu; C[15][19]=kappa*omega*mu; C[15][23]=omega*mu; C[15][27]=omega*mu; C[15][31]=omega*mu; C[15][47]=omega*mu; C[16][20]=kappa*omega*mu; C[16][24]=omega*mu; C[16][28]=omega*mu; C[16][32]=omega*mu; C[16][48]=omega*mu; C[17][18]=kappa*mu; C[17][19]=mu; C[17][20]=mu; C[17][21]=omega*mu; C[17][25]=omega*mu; C[17][33]=omega*mu; C[17][49]=omega*mu; C[18][19]=mu; C[18][20]=mu; C[18][22]=omega*mu; C[18][26]=omega*mu; C[18][34]=omega*mu; C[18][50]=omega*mu; C[19][20]=kappa*mu; C[19][23]=omega*mu; C[19][27]=omega*mu; C[19][35]=omega*mu; C[19][51]=omega*mu; C[20][24]=omega*mu; C[20][28]=omega*mu; C[20][36]=omega*mu; C[20][52]=omega*mu; C[21][22]=kappa*mu; C[21][23]=omega*mu; C[21][24]=omega*mu; C[21][25]=kappa*omega*mu; C[21][37]=omega*mu; C[21][53]=omega*mu; C[22][23]=omega*mu; C[22][24]=omega*mu; C[22][26]=kappa*omega*mu; C[22][38]=omega*mu; C[22][54]=omega*mu; C[23][24]=kappa*mu; C[23][27]=kappa*omega*mu; C[23][39]=omega*mu; C[23][55]=omega*mu; C[24][28]=kappa*omega*mu; C[24][40]=omega*mu; C[24][56]=omega*mu; C[25][26]=kappa*mu; C[25][27]=mu; C[25][28]=mu; C[25][41]=omega*mu; C[25][57]=omega*mu; C[26][27]=mu; C[26][28]=mu; C[26][42]=omega*mu; C[26][58]=omega*mu; C[27][28]=kappa*mu; C[27][43]=mu; C[27][59]=omega*mu; C[28][44]=mu; C[28][60]=omega*mu; C[29][30]=kappa*mu; C[29][31]=mu; C[29][32]=omega*mu; C[29][33]=kappa*omega*mu; C[29][37]=omega*mu; C[29][41]=omega*mu; C[29][45]=kappa*omega*mu; C[30][31]=mu; C[30][32]=omega*mu; C[30][34]=kappa*omega*mu; C[30][38]=omega*mu; C[30][42]=omega*mu; C[30][46]=kappa*omega*mu; C[31][32]=kappa*omega*mu; C[31][35]=kappa*omega*mu; C[31][39]=omega*mu; C[31][43]=omega*mu; C[31][47]=kappa*omega*mu; C[32][36]=kappa*omega*mu; C[32][40]=omega*mu; C[32][44]=omega*mu; C[32][48]=kappa*omega*mu; C[33][34]=kappa*mu; C[33][35]=mu; C[33][36]=mu; C[33][37]=omega*mu; C[33][41]=omega*mu; C[33][49]=kappa*omega*mu; C[34][35]=mu; C[34][36]=mu; C[34][38]=omega*mu; C[34][42]=omega*mu; C[34][50]=kappa*omega*mu; 
C[35][36]=kappa*mu; C[35][39]=omega*mu; C[35][43]=omega*mu; C[35][51]=kappa*omega*mu; C[36][40]=omega*mu; C[36][44]=omega*mu; C[36][52]=kappa*omega*mu; C[37][38]=kappa*mu; C[37][39]=omega*mu; C[37][40]=omega*mu; C[37][41]=kappa*omega*mu; C[37][53]=kappa*omega*mu; C[38][39]=omega*mu; C[38][40]=omega*mu; C[38][42]=kappa*omega*mu; C[38][54]=kappa*omega*mu; C[39][40]=kappa*mu; C[39][43]=kappa*omega*mu; C[39][55]=kappa*omega*mu; C[40][44]=kappa*omega*mu; C[40][56]=kappa*omega*mu; C[41][42]=kappa*mu; C[41][43]=omega*mu; C[41][44]=omega*mu; C[41][57]=kappa*omega*mu; C[42][43]=omega*mu; C[42][44]=omega*mu; C[42][58]=kappa*omega*mu; C[43][44]=kappa*mu; C[43][59]=kappa*omega*mu; C[44][60]=kappa*omega*mu; C[45][46]=kappa*mu; C[45][47]=mu; C[45][48]=mu; C[45][49]=kappa*omega*mu; C[45][53]=omega*mu; C[45][57]=omega*mu; C[46][47]=mu; C[46][48]=mu; C[46][50]=kappa*omega*mu; C[46][54]=omega*mu; C[46][58]=omega*mu; C[47][48]=kappa*mu; C[47][51]=kappa*omega*mu; C[47][55]=omega*mu; C[47][59]=omega*mu; C[48][52]=kappa*omega*mu; C[48][56]=omega*mu; C[48][60]=omega*mu; C[49][50]=kappa*mu; C[49][51]=mu; C[49][52]=mu; C[49][53]=omega*mu; C[49][57]=omega*mu; C[50][51]=mu; C[50][52]=mu; C[50][54]=omega*mu; C[50][58]=omega*mu; C[51][52]=kappa*mu; C[51][55]=omega*mu; C[51][59]=omega*mu; C[52][56]=omega*mu; C[52][60]=omega*mu; C[53][54]=kappa*mu; C[53][55]=omega*mu; C[53][56]=omega*mu; C[53][57]=kappa*omega*mu; C[54][55]=omega*mu; C[54][56]=omega*mu; C[54][58]=kappa*omega*mu; C[55][56]=kappa*mu; C[55][59]=kappa*omega*mu; C[56][60]=kappa*omega*mu; C[57][58]=kappa*mu; C[57][59]=mu; C[57][60]=mu; C[58][59]=mu; C[58][60]=mu; C[59][60]=kappa*mu; /*Fill in the lower triangle*/ for(i=0;i<61;i++){ for(j=i+1;j<61;j++)C[j][i]=C[i][j];} /*Apply the equilibrium frequencies*/ for(i=0;i<61;i++) for(j=0;j<61;j++) C[i][j]*=pi[j]; /*Compute the diagonal*/ for(i=0;i<61;i++) { double rowsum=0.0; for(j=0;j<61;j++) rowsum+=C[i][j]; C[i][i]=-rowsum; } return *this; } }; class FSM_Binary : public Mutation_Matrix { 
/****************************************************************/ /* Mutations occur at rate lambda/2 per unit time. */ /* */ /* Transition probability matrix, given time t is */ /* */ /* P[0,0] = P[1,1] = 1/2 + 1/2*exp(-lambda*t) */ /* P[0,1] = P[1,0] = 1/2 - 1/2*exp(-lambda*t) */ /* */ /* Reversible model, so Pr(observing unordered pair ab) = */ /* (2-delta[a,b])*pi[a] P[a,b]^(2t) */ /* where delta is the Kronecker delta and pi the equilibrium */ /* frequency which is 1/2. */ /* */ /* So Pr(observing unordered pair ab|mrca at t) */ /* = 1/4 + 1/4*exp(-lambda*t) if a=b */ /* or 1/2 - 1/2*exp(-lambda*t) otherwise */ /* */ /* Expected pairwise diversity in a coalescent model, where */ /* time is measured in units of PNe generations (P is ploidy */ /* Ne is effective population size), is lambda/(1+2*lambda) */ /* */ /****************************************************************/ public: void set_defaults() { n_states = 2; state_freq.resize(n_states,0.5); state_char.resize(n_states); state_char[0] = string(1,'0'); state_char[1] = string(1,'1'); } FSM_Binary(Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); initialize(2,&C); } FSM_Binary(const double lambda, Random *ran_in) { set_defaults(); ran = ran_in; C.resize(n_states,n_states); update(lambda); } FSM_Binary& update(const double lambda) { C[0][0] = -lambda/2.; C[0][1] = lambda/2.; C[1][0] = lambda/2.; C[1][1] = -lambda/2.; initialize(n_states,&C); return *this; } int fast_mutate(const int state, const double time) { return (ran->bernoulliTF(0.5+0.5*exp(-2.0*C[0][1]*time))) ? state : !state; } }; /*multinomial sampler*/ #endif // _MUTATION_H_ ClonalFrameML-1.11/src/bank/pause.h000066400000000000000000000026521307563374100170560ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * pause.h * Part of the myutils library. 
* * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* pause.h 23rd February 2005 */ /* (c) Danny Wilson. */ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_PAUSE_H_ #define _MYUTILS_PAUSE_H_ #ifdef _WIN32 #include #include namespace myutils { inline void pause() { printf("\nPress any key\n"); int ch=-99; while (ch==-99) ch=_getch(); } inline void silent_pause() { int ch=-99; while (ch==-99) ch=_getch(); } }; #else namespace myutils { inline void pause() {} }; #endif #endifClonalFrameML-1.11/src/bank/readme.doc000066400000000000000000001230001307563374100175030ustar00rootroot00000000000000ࡱ> NPM[@ bjbj44 .8ViVi^^^^^^^rzzzz4,ri*)))))))$_+R-*^*^^#*XXX^^)X)XXnrp(T^^p) Ɠ zrf()9*0i*(E.jE.p)rr^^^^E.^p)xZ@@X4**rr Brr ControlWizard Instructions for formatting control files The control file consists of various elements: Labels: a string used to identify the variable. Values: the values to be input to the variables. Delimiters: specify the format of the control file by indicating where labels and values start and finish. Whitespace and comment: simply passed over when the control file is read. Whitespace is user-defined. Whitespace is ignored before and after labels, before and after the assignment operator, and before values. Defaults are spaces, newlines, carriage returns and EOFs. 
To comment a line use the rem symbol at the start of the line. The rem symbol is a user-defined single character, whose default is #. Only full lines can be commented. The rest of the line will be ignored until the next line delimiter is reached. The assignment operator is a user-defined single character. Label reading continues until the assignment operator. Value reading proceeds immediately after the assignment operator. Default is =. Line delimiters are user-defined. Value reading continues until the line delimiter. Label reading proceeds immediately after the line delimiter. Defaults are newlines and carriage returns. Element delimiters separate values that go to make up the elements of a vector. Default is the comma. Labels that are prematurely terminated by a rem symbol or an EOF are simply ignored. Note that: Labels cannot contain whitespace. Values that are not entered or specified correctly will not be read correctly. All whitespace before a string is ignored, but it is included thereafter, including after the end of the string. Where there are duplicate labels the latest one will be used. The double quote " character is reserved for delimiting text strings (TP_STRING). Implementing Include the header file into your program. ControlWizard uses C++ classes, so the compiler must be compatible. Then in your code: Create an instance of the class by using the code ControlWizard control_file; Specify the label, type and location of your variables. Therefore the variables must already have been declared. 
For example control_file.add_item("apples",TP_INT,&apples); control_file.add_ITEM("oranges",TP_INT,&oranges); control_file.add_item("bananas",TP_DOUBLE,&bananas); control_file.add_item("hotdogs",TP_STRING,&hotdogs); control_file.add_item("cars",TP_VEC_DOUBLE,&cars); control_file.add_item("trees",TP_EXT_VEC_DOUBLE,&trees); Execute the reading of your control file in the following way control_file.read_input("control.ini"); The item "oranges" uses the capitalised version add_ITEM() to indicate that it is a required variable. The member variable got_required will return true if all required variables were input. The member function check_required() will give a visual result. Set the member variable coutput to false if you do not want screen output during data input. The default is true. control_file.coutput=false; By default ControlWizard is not case sensitive. To force it to be case sensitive use the following command before any add_item() or add_ITEM() commands, and do not change it thereafter. control_file.case_sensitive=true; Types currently available for reading in TP_INT Single integer value TP_DOUBLE Single double value TP_STRING A single string enclosed in "double quotes" TP_VEC_INT A vector of integers. Outputs to the STL container vector TP_VEC_DOUBLE A vector of doubles. Outputs to the STL container vector TP_EXT_VEC_DOUBLE A vector of doubles contained in an external file, the location of which only is specified. Customising the format ControlWizard is very simple. Only the member variables and functions that you might be interested in accessing are public. The public member functions are described above. 
The public member variables are those that you might wish to modify for custom formatting (defaults on right): white_space Character(s) treated as whitespace space, EOF, newline, CR label_delimiters The assignment operator(s) = line_delimiters The end-of-line character(s) newline, CR elem_delimiters The value-separator(s) for vector value lists , rem_delimiters The rem symbols(s) # To augment these lists, which are of type vector, the easiest way to do this is, prior to the read_input() command, use (for example) control_file.line_delimiters.push_back(';'); control_file.elem_delimiters.push_back('\t'); which would make ; into an end-of-line delimiter, and tab into a separator for lists of values. To remove items from the default lists, the easiest thing to do is first to clear all delimiters and then start from scratch, as follows control_file.label_delimiters.clear(); control_file.label_delimiters.push_back(':'); which would replace = with : as the assignment operator. Adding new data types To include new types would require modifying the source files for ControlWizard. For example, to add the type TP_EXT_VEC_INT it would be relatively straightforward to model it on the code for TP_EXT_VEC_DOUBLE. You would need to Add a new data type to the enumeration DATA_TYPE in the header file. Add a new member function in the same caste as existing functions for dealing with the data. The function would need a declaration in the class declaration (header file) and a definition in the source file. Note that these functions are of type RTRV and take (std::ifstream &infile, std::string &label) as their arguments. It is necessary to use this format for the next step, which uses function pointers. Locate ControlWizard::data_format(), which contains a switch control sequence. Depending upon the case (which is of type DATA_TYPE), a generic function pointer is redirected to one of the data handling functions. 
Add a case for the new DATA_TYPE which redirects the function pointer read_data() to your new function. 78hn4 > J ~ A 9 U W " x ef˿˿˿˿˨˨˒˨|tldSd h`hpCJOJQJ^JaJhpCJaJhV%/CJaJhl2wCJaJh`hBCJaJh`hCfnCJaJh`hmCJaJh`h(CJaJh`hl2wCJaJh`hL]5CJaJh`hVR5CJaJh`hVRCJaJh`h*5>*CJaJh`hl2w5>*CJaJh`h*5>*CJaJ 89h4 ~  A B 9 : " # x y f & FgdVR & FgdVR$a$gd*T$Xl $ @a$gd5rw $ a$gd` $ @a$gd` $ a$gd` & FgdVR & FgdVR= ITUa%1YeɾꭜꂭvgXh`h;CJOJQJaJh`h fbCJOJQJaJh5rwCJOJQJaJh`hVRCJOJQJaJh`h fbCJaJ h`hmCJOJQJ^JaJ h`hVRCJOJQJ^JaJh`h5rwCJaJhVRCJaJh`hmCJaJh`hVR5>*CJaJh`hVRCJaJh`hpCJaJ" *.4etuz"CUfjklmyzʿʷʷʷʷʷwʨoghCrICJaJh fbCJaJ h`h5rwCJOJQJ^JaJhoDKCJOJQJaJhC0CJOJQJaJhoDKCJaJh`h5rwCJOJQJaJh5rwCJaJh5rwh5rwCJaJh5rwCJOJQJaJh fbCJOJQJaJh`h;CJOJQJaJh`h fbCJOJQJaJ%DghL  !">? $ $ @a$gd5rw $ @ a$gdm $ @a$gd fb$  @ (^ `(a$gd5rw $  @a$gd` $ @a$gdCrI $ @ a$gd fb DEQRfgh LY να~m~m~m~e~m~m~m~Z~Z~h`h;CJaJhHCJaJ h`h fbCJOJQJ^JaJh`h fbCJaJh`h fb5>*CJaJhCrIhCrICJaJhCrIhCrICJOJQJaJhCrICJOJQJaJ h`hCrICJOJQJ^JaJh`hCrICJOJQJaJ hCrIhCrICJOJQJ^JaJhCrICJaJhCrI6CJaJ  !fg =?J+,-;NPR|3467:$ux{ٻ٪٪٪٪ٻ٪ٻٟqh`h`5>*CJaJh`h"ZCJOJQJaJ h`h"ZCJOJQJ^JaJh`h"ZCJaJ h`hmCJOJQJ^JaJh`hh/CJaJh5rwCJaJh`h*CJaJh`hmCJaJh`h fb5>*CJaJh`hm5>*CJaJ+-QR :$L{$ & F @a$gd` $ @a$gd` $ @a$gd` $ @a$gdm $ $ @a$gd5rw:HT%/89zòòòäÙh`hV? CJaJhV? CJOJQJ^JaJ hV? hV? CJOJQJ^JaJhV? CJaJ hV? h`CJOJQJ^JaJ h`h`CJOJQJ^JaJh`h`CJaJh`CJaJc $ @a$gdV? $ & F @a$gd`,1h. A!"#$% @@@ NormalCJ_HaJmH sH tH DA@D Default Paragraph FontRi@R  Table Normal4 l4a (k@(No List*O* VRCode$a$889h4~AB9:"#xyfT$ X l D g h  L  !">?-QR :$L{c0000 0 0 0 000000000000000 0 0 0 0 000000 00 0000000 000080000 0000000000000000000000000000000000000 0 0 0 0 0089h4~AB9:"#xyfT$ X l h  L  !">?-QR :$L{c0000 0 0 0 000000000000000 0 0 0 0 00000 00 0000000 00h@0@0h0h0h0h0h0h0h0h000000000000000000000000000 0 0 0 0 0@ 0 UVVVWVlXVYVZV,8 8 ; ; 8*urn:schemas-microsoft-com:office:smarttagsCity9*urn:schemas-microsoft-com:office:smarttagsplace9*urn:schemas-microsoft-com:office:smarttagsState  4>$;?\_OR0=Ujl  ! 
% : < U Y n p * 2 u   m    E ` G J "/?Jcmz-;@C 2%HMt MWvfk  6 ; j o . 3   " L n ?J-;.3:?8Ipu{33333333333333333333333333333BytHUW WilsonG  Sfq8C>~>HWYh ^`hH.h ^`hH.h pLp^p`LhH.h @ @ ^@ `hH.h ^`hH.h L^`LhH.h ^`hH.h ^`hH.h PLP^P`LhH.h ^`hH.h ^`hH.h pLp^p`LhH.h @ @ ^@ `hH.h ^`hH.h L^`LhH.h ^`hH.h ^`hH.h PLP^P`LhH.h ^`hH.h ^`hH.h pLp^p`LhH.h @ @ ^@ `hH.h ^`hH.h L^`LhH.h ^`hH.h ^`hH.h PLP^P`LhH.h ^`hH.h ^`hH.h pLp^p`LhH.h @ @ ^@ `hH.h ^`hH.h L^`LhH.h ^`hH.h ^`hH.h PLP^P`LhH.~>HWG Sq8C                                    V? V%/h/4CrIoDK"Z` fbCfnl2w5rwBpH;4*L]VRmqjC0(@$@@UnknownGz Times New Roman5Symbol3& z Arial?5 z Courier New"q hwge )ge )!24d 3QH)?4)Instructions for formatting control filesWilsonWilson    Oh+'0 $0 L X dpx*Instructions for formatting control filesWonstWilsontilsils Normal.dotsWilsond14sMicrosoft Word 10.0@'@WM@ge՜.+,0$ hp  University of Oxfordr) { *Instructions for formatting control files Title  !"#$&'()*+,-./0123456789:;<>?@ABCDFGHIJKLORoot Entry F QData 1Table%a.WordDocument.8SummaryInformation(=DocumentSummaryInformation8ECompObjj  FMicrosoft Word Document MSWordDocWord.Document.89qClonalFrameML-1.11/src/bank/revolver.h000066400000000000000000000110121307563374100175730ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * revolver.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . 
*/ /********************************************/ /* revolver.h 28th August 2009 */ /* The revolver container has a fixed */ /* number of elements that it releases */ /* and takes back as required. Its purpose */ /* is to avoid unnecessary memory alloc- */ /* ation and freeing. */ /* (c) Danny Wilson. */ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_REVOLVER_H_ #define _MYUTILS_REVOLVER_H_ #include "myutils/myerror.h" #include #include //#include namespace myutils { template class Revolver { public: /*Preserve public access for back-compatibility*/ T **element; protected: int protected_size; int protected_available; public: /*Default constructor*/ Revolver() { initialize(0); } /*Constructor*/ Revolver(int size) { initialize(size); } /*Constructor*/ Revolver(int size, T &value) { initialize(size,value); } /*Destructor*/ ~Revolver() { // Do not delete items in use!! if(!full()) error("Revolver::~Revolver(): not full"); for(i=0;i0) delete[] element; } bool full() const { return protected_available==protected_size; } bool full() { return protected_available==protected_size; } bool empty() const { return protected_available==0; } bool empty() { return protected_available==0; } int size(){return protected_size;} int size() const {return protected_size;} int navail(){return protected_available;} int navail() const {return protected_available;} Revolver& initialize(int size) { element=new T*[size]; if(!element) error("Revolver::initialize() allocation failure"); int i; for(i=0;i& initialize(int size, T &value) { element=new T*[size]; if(!element) error("Revolver::initialize() allocation failure"); int i; for(i=0;i=protected_size) error("Revolver::operator[](int pos): pos>=size()"); return element[pos]; }; #else /* NB:- order is not stable in Revolver */ /*Subscript operator*/inline T* operator[](int pos){return element[pos];}; #endif /* Release an element for use */ T* pop() { if(empty()) { if(size()==0) 
error("Revolver::pop(): zero-sized container"); error("Revolver::pop(): empty container"); } --protected_available; return element[protected_available]; } #ifdef _MYUTILS_DEBUG /* Return an element to the container, checking that it belongs to the container */ Revolver& push(T* val) { if(full()) error("Revolver::push(): full container"); int i; for(i=protected_available;i. */ #ifndef _MYUTILS_SORT_H_ #define _MYUTILS_SORT_H_ #include #include namespace myutils { /* WARNING: this class has very limited utility. Syntax: sort(sortme.begin(),sortme.end(),sort_by_vector(sortby)); where sortby is the vector of interest, if sortme is a vector that starts of as the indeces of sortby, i.e. 0,1,2,...,size()-1 then following the sort, it will be reordered according to sortby. */ template class sort_by_vector : public std::binary_function { const vector &sort_by; public: sort_by_vector(const vector &sort_by_in) : sort_by(sort_by_in) {} bool operator()(int a, int b) const { return (sort_by.at(a). */ #ifndef _TSV_H_ #define _TSV_H_ #pragma warning(disable: 4786) #include "myutils/myerror.h" using myutils::error; #include #include #include #include #include #include "myutils/matrix.h" using myutils::Matrix; using namespace std; class tsv { public: bool coutput; Matrix data; vector fieldname; map fieldnum; vector< vector > fieldvalue; const int DUMPMAX; tsv(const int dumpmax = 1000) : DUMPMAX(dumpmax) { coutput=false; } tsv& read(const char* infilename) { ifstream infile(infilename); if(!infile.is_open()) error("Could not open file"); int nfields = 0; int character = 0; string this_fieldname = ""; fieldname.resize(0); if(coutput) cout << "Fields found: "; while(!infile.eof()) { // if(!infile.good()) error("Problem reading file - is buffer too small?"); character = infile.get(); if(character=='\t') { if(coutput) cout << this_fieldname << " "; fieldname.push_back(this_fieldname); fieldnum[this_fieldname]=nfields; this_fieldname = ""; ++nfields; } else 
if(character=='\r'||character=='\n'||character==-1) { if(coutput) cout << this_fieldname << " "; fieldname.push_back(this_fieldname); fieldnum[this_fieldname]=nfields; this_fieldname = ""; ++nfields; character = infile.peek(); if(character=='\r'||character=='\n'||character==-1) infile.get(); break; } else this_fieldname += (char)character; } if(coutput) cout << "(" << nfields << " fields in total)" << endl << flush; int nrows = 0, ntries = 0; char* dump = new char[DUMPMAX]; while(!infile.eof()) { infile.getline(dump,DUMPMAX); if(dump[0]!='\0')/*check the line isn't blank using the end-of-string character*/ ++nrows; ++ntries; if(coutput && ntries%1000==0) cout << "\r" << ntries << " attempts, " << nrows << " rows so far" << flush; } cout << endl; if(coutput) cout << "Found " << nrows << " rows of data" << endl << flush; data.resize(nrows,nfields); infile.close(); ifstream infile2(infilename); infile2.getline(dump,DUMPMAX); delete[] dump; int row = 0; int col = 0; string value = ""; while(!infile2.eof()) { character = infile2.get(); if(character=='\t') { if(row n_values() { vector result(data.ncols(),0); int f; for(f=0;f=fieldvalue[f].size()) return false; return true; }; bool field_exist(string f) { bool exists = false; unsigned int i; for(i=0;i::iterator m = fieldnum.find(f); if(m==fieldnum.end()) return -1; return m->second; } }; #endif //_TSV_H_ ClonalFrameML-1.11/src/brent.h000077500000000000000000000326561307563374100161520ustar00rootroot00000000000000/* Copyright 2013 Daniel Wilson. * * brent.h * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . * * Parts of this code are based on code in Numerical Recipes in C++ * WH Press, SA Teukolsky, WT Vetterling, BP Flannery (2002). * */ #ifndef _BRENT_MINIMISATION_ #define _BRENT_MINIMISATION_ #include #include "myutils/myerror.h" using namespace std; /* Class Brent performs parabolic interpolation and Brent's method on a one- dimensional member function, BrentFunc.f(x). BrentFunc must be an instance of a class derived from the abstract class BrentFunction. Its member function f(x) takes only a single parameter, but using a derived class allows for it to be controlled by other member variables and/or call other member functions, enabling a neater alternative to using function pointers and global variables. See Numerical Recipes in C++ [Press et al 2002] for details of the algorithm. */ class BrentFunction { public: virtual double f(const double x) = 0; }; /* An example derived class might look like MyFunction below. By passing an instance of MyFunction to an instance of Brent in its constructor, the function MyFunction::f(x) can be minimized with respect to x, whilst having an auxilliary variable y, which is not minimized. 
class MyFunction : public BrentFunction { double y; public: MyFunction(const double y_in) : y(y_in) {} double f(const double x) { return (x+y)*(x+y); } }; */ class Brent { public: BrentFunction & BrentFunc; bool coutput; double evala_BrentFunc, evalb_BrentFunc, evalc_BrentFunc; double pointa,pointb,pointc; double GLIMIT, TINY, tolerance; int ITMAX; double ZEPS,EPS; double function_minimum; bool bracketed; bool fail; public: Brent(BrentFunction &BrentFunc_in) : BrentFunc(BrentFunc_in), GLIMIT(100.), TINY(1.e-20), ITMAX(100), coutput(false), EPS(3.0e-8) {} double minimize(const double pointa_in, const double pointb_in, const double tol) { fail = false; ZEPS=numeric_limits::epsilon()*1.0e-3; pointa = pointa_in; pointb = pointb_in; pointc = 0.0; tolerance = tol; mnbrak(pointa, pointb, pointc, evala_BrentFunc, evalb_BrentFunc, evalc_BrentFunc); if(coutput) { cout << "Function is bracketed by:" << endl; cout << "f(" << pointa << ") = " << evala_BrentFunc << endl; cout << "f(" << pointb << ") = " << evalb_BrentFunc << endl; cout << "f(" << pointc << ") = " << evalc_BrentFunc << endl; } double result = 0.0; function_minimum = brent(pointa, pointb, pointc, result); if(coutput) cout << "Function is minimized at f(" << result << ") = " << function_minimum << endl; return result; }; double rootfind(double x1, double x2, double tol) { //Using Brent�s method, find the root of a function func known to lie between x1 and x2. The //root, returned as zbrent, will be refined until its accuracy is tol. 
int iter; double a=x1,b=x2,c=x2,d,e,min1,min2; double fa=BrentFunc.f(a),fb=BrentFunc.f(b),fc,p,q,r,s,tol1,xm; bracketed = true; if ((fa > 0.0 && fb > 0.0) || (fa < 0.0 && fb < 0.0)) { if(coutput) cout << "f(" << x1 << ") = " << fa << "\tf(" << x2 << ") = " << fb << endl; //myutils::warning("Root must be bracketed in rootfind"); bracketed = false; return 0.0; } fc=fb; for (iter=1;iter<=ITMAX;iter++) { if ((fb > 0.0 && fc > 0.0) || (fb < 0.0 && fc < 0.0)) { c=a; //Rename a, b, c and adjust bounding interval d. fc=fa; e=d=b-a; } if (fabs(fc) < fabs(fb)) { a=b; b=c; c=a; fa=fb; fb=fc; fc=fa; } tol1=2.0*EPS*fabs(b)+0.5*tol; //Convergence check. xm=0.5*(c-b); if (fabs(xm) <= tol1 || fb == 0.0) return b; if (fabs(e) >= tol1 && fabs(fa) > fabs(fb)) { s=fb/fa; //Attempt inverse quadratic interpolation. if (a == c) { p=2.0*xm*s; q=1.0-s; } else { q=fa/fc; r=fb/fc; p=s*(2.0*xm*q*(q-r)-(b-a)*(r-1.0)); q=(q-1.0)*(r-1.0)*(s-1.0); } if (p > 0.0) q = -q; //Check whether in bounds. p=fabs(p); min1=3.0*xm*q-fabs(tol1*q); min2=fabs(e*q); if (2.0*p < (min1 < min2 ? min1 : min2)) { e=d; //Accept interpolation. d=p/q; } else { d=xm; //Interpolation failed, use bisection. e=d; } } else { //Bounds decreasing too slowly, use bisection. d=xm; e=d; } a=b; //Move last best guess to a. fa=fb; if (fabs(d) > tol1) //Evaluate new trial root. b += d; else b += SIGN(tol1,xm); fb=BrentFunc.f(b); } myutils::warning("Maximum number of iterations exceeded in zbrent"); return 0.0; //Never get here. 
} protected: /* The hard work is done by algorithms modified from Numerical Recipes in C++ [Press et al 2002] */ inline void shft3(double &a, double &b, double &c, const double d) { a=b; b=c; c=d; } inline void shft2(double &a, double &b, const double c) { a=b; b=c; } void mnbrak(double &ax, double &bx, double &cx, double &fa, double &fb, double &fc) { const double GOLD=1.618034; double ulim,u,r,q,fu; fa = BrentFunc.f(ax); fb = BrentFunc.f(bx); if (fb > fa) { SWAP(ax,bx); SWAP(fb,fa); } cx=bx+GOLD*(bx-ax); fc=BrentFunc.f(cx); while (fb > fc) { r=(bx-ax)*(fb-fc); q=(bx-cx)*(fb-fa); u=bx-((bx-cx)*q-(bx-ax)*r)/ (2.0*SIGN(MAX(FABS(q-r),TINY),q-r)); ulim=bx+GLIMIT*(cx-bx); if ((bx-u)*(u-cx) > 0.0) { fu=BrentFunc.f(u); if (fu < fc) { ax=bx; bx=u; fa=fb; fb=fu; return; } else if (fu > fb) { cx=u; fc=fu; return; } u=cx+GOLD*(cx-bx); fu=BrentFunc.f(u); } else if ((cx-u)*(u-ulim) > 0.0) { fu=BrentFunc.f(u); if (fu < fc) { shft3(bx,cx,u,cx+GOLD*(cx-bx)); shft3(fb,fc,fu,BrentFunc.f(u)); } } else if ((u-ulim)*(ulim-cx) >= 0.0) { u=ulim; fu=BrentFunc.f(u); } else { u=cx+GOLD*(cx-bx); fu=BrentFunc.f(u); } shft3(ax,bx,cx,u); shft3(fa,fb,fc,fu); } } inline void SWAP(double &a, double &b) { double dum=a;a=b;b=dum; } inline double SIGN(const double &a, const double &b) { return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a); } inline double MAX(const double &a, const double &b) { return b > a ? (b) : (a); } inline double FABS(const double &a) { return a < 0.0 ? -a : a; } double brent(const double ax, const double bx, const double cx, double &xmin) { const double CGOLD=0.3819660; int iter; double a,b,d=0.0,etemp,fu,fv,fw,fx; double p,q,r,tol1,tol2,u,v,w,x,xm; double e=0.0; a=(ax < cx ? ax : cx); b=(ax > cx ? ax : cx); x=w=v=bx; fw=fv=fx=BrentFunc.f(x); for (iter=0;iter tol1) { r=(x-w)*(fx-fv); q=(x-v)*(fx-fw); p=(x-v)*q-(x-w)*r; q=2.0*(q-r); if (q > 0.0) p = -p; q=FABS(q); etemp=e; e=d; if (FABS(p) >= FABS(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x)) d=CGOLD*(e=(x >= xm ? 
a-x : b-x)); else { d=p/q; u=x+d; if (u-a < tol2 || b-u < tol2) d=SIGN(tol1,xm-x); } } else { d=CGOLD*(e=(x >= xm ? a-x : b-x)); } u=(FABS(d) >= tol1 ? x+d : x+SIGN(tol1,d)); fu=BrentFunc.f(u); if (fu <= fx) { if (u >= x) a=x; else b=x; shft3(v,w,x,u); shft3(fv,fw,fx,fu); } else { if (u < x) a=u; else b=u; if (fu <= fw || w == x) { v=w; w=u; fv=fw; fw=fu; } else if (fu <= fv || v == x || v == w) { v=u; fv=fu; } } } //myutils::error("Brent: Too many iterations"); fail = true; xmin=x; return fx; } }; class ConstrainedBrent { public: BrentFunction & BrentFunc; bool coutput; double evala_BrentFunc, evalb_BrentFunc, evalc_BrentFunc; double pointa,pointb,pointc; double GLIMIT, TINY, tolerance; int ITMAX; double ZEPS; double function_minimum; double min_x,max_x; public: ConstrainedBrent(BrentFunction &BrentFunc_in) : BrentFunc(BrentFunc_in), GLIMIT(100.), TINY(1.e-20), ITMAX(100), coutput(false) {} double minimize(const double pointa_in, const double pointb_in, const double tol, const double min_x_in, const double max_x_in) { min_x = min_x_in; max_x = max_x_in; ZEPS=numeric_limits::epsilon()*1.0e-3; pointa = pointa_in; pointb = pointb_in; pointc = min_x; if(pointamax_x) error("ConstrainedBrent::minimize(): point a falls outside range"); if(pointbmax_x) error("ConstrainedBrent::minimize(): point b falls outside range"); tolerance = tol; mnbrak(pointa, pointb, pointc, evala_BrentFunc, evalb_BrentFunc, evalc_BrentFunc); if(coutput) { cout << "Function is bracketed by:" << endl; cout << "f(" << pointa << ") = " << evala_BrentFunc << endl; cout << "f(" << pointb << ") = " << evalb_BrentFunc << endl; cout << "f(" << pointc << ") = " << evalc_BrentFunc << endl; } double result = 0.0; function_minimum = brent(pointa, pointb, pointc, result); if(coutput) cout << "Function is minimized at f(" << result << ") = " << function_minimum << endl; return result; }; protected: /* The hard work is done by algorithms modified from Numerical Recipes in C++ [Press et al 2002] */ inline void 
shft3(double &a, double &b, double &c, const double d) { a=b; b=c; c=d; } inline void shft2(double &a, double &b, const double c) { a=b; b=c; } void mnbrak(double &ax, double &bx, double &cx, double &fa, double &fb, double &fc) { const double GOLD=1.618034; double ulim,u,r,q,fu; fa = BrentFunc.f(ax); fb = BrentFunc.f(bx); if (fb > fa) { SWAP(ax,bx); SWAP(fb,fa); } cx=bx+GOLD*(bx-ax); if(cxmax_x) cx = max_x; fc=BrentFunc.f(cx); while (fb > fc) { r=(bx-ax)*(fb-fc); q=(bx-cx)*(fb-fa); u=bx-((bx-cx)*q-(bx-ax)*r)/ (2.0*SIGN(MAX(FABS(q-r),TINY),q-r)); if(umax_x) u = max_x; ulim=bx+GLIMIT*(cx-bx); if ((bx-u)*(u-cx) > 0.0) { fu=BrentFunc.f(u); if (fu < fc) { ax=bx; bx=u; fa=fb; fb=fu; return; } else if (fu > fb) { cx=u; fc=fu; return; } u=cx+GOLD*(cx-bx); if(umax_x) u = max_x; fu=BrentFunc.f(u); } else if ((cx-u)*(u-ulim) > 0.0) { fu=BrentFunc.f(u); if (fu < fc) { shft3(bx,cx,u,cx+GOLD*(cx-bx)); if(umax_x) u = max_x; shft3(fb,fc,fu,BrentFunc.f(u)); } } else if ((u-ulim)*(ulim-cx) >= 0.0) { u=ulim; if(umax_x) u = max_x; fu=BrentFunc.f(u); } else { u=cx+GOLD*(cx-bx); if(umax_x) u = max_x; fu=BrentFunc.f(u); } shft3(ax,bx,cx,u); shft3(fa,fb,fc,fu); } } inline void SWAP(double &a, double &b) { double dum=a;a=b;b=dum; } inline double SIGN(const double &a, const double &b) { return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a); } inline double MAX(const double &a, const double &b) { return b > a ? (b) : (a); } inline double FABS(const double &a) { return a < 0.0 ? -a : a; } double brent(const double ax, const double bx, const double cx, double &xmin) { const double CGOLD=0.3819660; int iter; double a,b,d=0.0,etemp,fu,fv,fw,fx; double p,q,r,tol1,tol2,u,v,w,x,xm; double e=0.0; a=(ax < cx ? ax : cx); b=(ax > cx ? ax : cx); x=w=v=bx; fw=fv=fx=BrentFunc.f(x); for (iter=0;iter tol1) { r=(x-w)*(fx-fv); q=(x-v)*(fx-fw); p=(x-v)*q-(x-w)*r; q=2.0*(q-r); if (q > 0.0) p = -p; q=FABS(q); etemp=e; e=d; if (FABS(p) >= FABS(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x)) d=CGOLD*(e=(x >= xm ? 
a-x : b-x)); else { d=p/q; u=x+d; if (u-a < tol2 || b-u < tol2) d=SIGN(tol1,xm-x); } } else { d=CGOLD*(e=(x >= xm ? a-x : b-x)); } u=(FABS(d) >= tol1 ? x+d : x+SIGN(tol1,d)); fu=BrentFunc.f(u); if (fu <= fx) { if (u >= x) a=x; else b=x; shft3(v,w,x,u); shft3(fv,fw,fx,fu); } else { if (u < x) a=u; else b=u; if (fu <= fw || w == x) { v=w; w=u; fv=fw; fw=fu; } else if (fu <= fv || v == x || v == w) { v=u; fv=fu; } } } myutils::error("Brent: Too many iterations"); xmin=x; return fx; } }; #endif // _BRENT_MINIMISATION_ ClonalFrameML-1.11/src/cfml_results.R000066400000000000000000000325451307563374100175060ustar00rootroot00000000000000# ClonalFrameML results # Planned usage: Rscript cfml_results.R ... help = paste( "cfml_results.R summarizes the results of a ClonalFrameML analysis", "Daniel Wilson (2014)", "", "Usage: Rscript cfml_results.R prefix [coresites_list]", sep="\n") # Preliminaries library(ape) library(phangorn) ### Read a FASTA file read.fasta <- function(fname, as.char=FALSE) { a = scan(fname,what=character(0),sep="\n",quiet=TRUE,na.strings="") wh = as.vector(sapply(a,substr,1,1))==">" labs = substr(as.character(a[wh]),2,1000); lseqs = a[!wh] nlines = length(lseqs)%/%length(labs) n = length(lseqs)%/%nlines seqs = rep("",n); names(seqs) <- labs for(i in 1:n) { ibeg = (i-1)*nlines+1 iend = i*nlines seqs[i] = paste(lseqs[ibeg:iend],collapse="") } seqlen = as.numeric(sapply(seqs,nchar)) if(length(seqlen)>1 & var(seqlen)>0) { warning("Sequences have differing lengths"); mx = max(seqlen) for(i in 1:n) seqs[i] = paste(seqs[i],paste(rep("-",mx-seqlen[i]),collapse=""),sep="") } L = as.numeric(nchar(seqs[1])) SEQ = array("-",dim=c(n,L)) for(i in 1:n) SEQ[i,] = unlist(strsplit(seqs[i],"")) rownames(SEQ) <- labs; if(as.char==TRUE) { return(SEQ); } else { fSEQ = apply(toupper(SEQ),2,factor,levels=c("A","G","C","T")); return(fSEQ); } } ### Write a FASTA file write.fasta <- function(DNA,filename) { ofile <- file(filename,"w"); for(n in 1:nrow(DNA)) { 
writeLines(paste(">",rownames(DNA)[n],sep=""),ofile); writeLines(paste(DNA[n,],collapse=""),ofile); } close(ofile); } ### General totriplet = function(x) { L = floor(length(x)/3)*3 paste(x[seq(1,L,by=3)],x[seq(2,L,by=3)],x[seq(3,L,by=3)],sep="") } geneticCode = list( "TTT"="Phe","TTC"="Phe","TTA"="Leu","TTG"="Leu", "TCT"="Ser","TCC"="Ser","TCA"="Ser","TCG"="Ser", "TAT"="Tyr","TAC"="Tyr","TAA"="STO","TAG"="STO", "TGT"="Cys","TGC"="Cys","TGA"="STO","TGG"="Trp", "CTT"="Leu","CTC"="Leu","CTA"="Leu","CTG"="Leu", "CCT"="Pro","CCC"="Pro","CCA"="Pro","CCG"="Pro", "CAT"="His","CAC"="His","CAA"="Gln","CAG"="Gln", "CGT"="Arg","CGC"="Arg","CGA"="Arg","CGG"="Arg", "ATT"="Ile","ATC"="Ile","ATA"="Ile","ATG"="Met", "ACT"="Thr","ACC"="Thr","ACA"="Thr","ACG"="Thr", "AAT"="Asn","AAC"="Asn","AAA"="Lys","AAG"="Lys", "AGT"="Ser","AGC"="Ser","AGA"="Arg","AGG"="Arg", "GTT"="Val","GTC"="Val","GTA"="Val","GTG"="Val", "GCT"="Ala","GCC"="Ala","GCA"="Ala","GCG"="Ala", "GAT"="Asp","GAC"="Asp","GAA"="Glu","GAG"="Glu", "GGT"="Gly","GGC"="Gly","GGA"="Gly","GGG"="Gly") oneLetterCodes = unlist(list("Ala"="A","Arg"="R","Asn"="N","Asp"="D","Cys"="C","Glu"="E","Gln"="Q","Gly"="G","His"="H","Ile"="I","Leu"="L","Lys"="K","Met"="M","Phe"="F","Pro"="P","Ser"="S","Thr"="T","Trp"="W","Tyr"="Y","Val"="V","STO"="X","---"="-")) aminoAcids = names(table(unlist(geneticCode))) oneLetterAminoAcids = names(table(unlist(oneLetterCodes))) tripletNames = names(geneticCode) transcribe = function(x) { y = t(sapply(1:nrow(x),function(i) totriplet(x[i,]))) rownames(y) = rownames(x) return(y) } translate = function(x,oneLetter=FALSE) { x = toupper(x) tr = t(apply(x,1,function(y)sapply(y,function(i) {aa=geneticCode[[i]];ifelse(is.null(aa),"---",aa)} ))) if(oneLetter) tr = t(apply(tr,1,function(y) oneLetterCodes[y])) rownames(tr) = rownames(x) return(tr) } view.nucleotide = function(x) { 
image(0:ncol(x),0:nrow(x),t(matrix(as.numeric(factor(x,levels=c("-","A","G","C","T"))),nrow=nrow(x))),col=c("white","red","green","yellow","blue")) } view.codon = function(x) { image(0:ncol(x),0:nrow(x),t(matrix(as.numeric(factor(x),levels=tripletNames),nrow=nrow(x))),col=rainbow(20)) } view.protein = function(x,oneLetter=FALSE) { levs = aminoAcids if(oneLetter) levs = oneLetterAminoAcids cols = rainbow(20) if(oneLetter) cols = c("white",cols) image(0:ncol(x),0:nrow(x),t(matrix(as.numeric(factor(x,levels=levs)),nrow=nrow(x))),col=cols) } # Assumes a fasta file representing a single genome, possibly split across contigs read.fasta.ref = function(ref_file) { r = scan(ref_file,what=character(0),sep="\n") beg = substr(r,1,1) gd = beg!=">" rcat = paste(r[gd],collapse="") return(toupper(unlist(strsplit(rcat,"")))) } # Assumes a fasta file representing a single genome, possibly split across contigs read.fasta.ref.contig = function(ref_file) { r = scan(ref_file,what=character(0),sep="\n") beg = substr(r,1,1) gd = beg!=">" contig = rep(cumsum(!gd)[gd],times=nchar(r[gd])) return(contig) } # Alternative method of plotting using lines. Assume m>0 is interesting alt.image = function(m,col=heat.colors(1+max(m,na.rm=TRUE)),xpos=NULL,ypos=NULL,length=1,background.fun=NULL,...) { if(is.null(xpos)) xpos = 1:nrow(m) if(is.null(ypos)) ypos = 1:ncol(m) x = matrix(rep(xpos,ncol(m)),nrow=nrow(m)) y = matrix(rep(ypos,each=nrow(m)),ncol=ncol(m)) plot(range(xpos),range(ypos)+c(-length,length)/2,type="n",...) 
rect(min(xpos),min(ypos)-length/2,max(xpos),max(ypos)+length/2,col=col[1],border="NA") if(!is.null(background.fun)) background.fun() gd = m>0 COL = matrix(col[1+m],nrow=nrow(m)) arrows(x[gd],y[gd]-length/2,x[gd],y[gd]+length/2,col=COL[gd],len=0) } # Read options from command line args = commandArgs(trailingOnly = TRUE) if(length(args)!=1 & length(args)!=2) { cat(help,sep="\n") stop("\nIncorrect usage\n") } prefix = args[1] coresites_list = ifelse(length(args)==2,args[2],NA) if(!is.na(coresites_list)) { coresites = scan(coresites_list) } else { coresites = NA } # Automatically set treefile = paste(prefix,".labelled_tree.newick",sep="") xreffile = paste(prefix,".position_cross_reference.txt",sep="") ML_seqfile = paste(prefix,".ML_sequence.fasta",sep="") istatefile = paste(prefix,".importation_status.txt",sep="") if(!file.exists(istatefile)) istatefile = NA # Load the phyML tree estimated from all core variant and invariant sites #tree0 = read.tree(treefile); tree = midpoint(tree0); tree$node.label = c(tree$node.label,setdiff(tree0$node.label,tree$node.label)) tree = read.tree(treefile) # Load a list cross-referencing patterns in the original data to the output FASTA file xref = scan(xreffile,sep=",") genome_length = length(xref) if(is.na(coresites_list)) { coresites = 1:genome_length } else if(any(coresites>genome_length)) stop("Core site ",which(coresites>genome_length)[1]," exceeds genome length ",genome_length) if(any(coresites<1)) stop("Core sites must be positive") # Load the imputed and reconstructed ancestral sequences ML_seq=scan(ML_seqfile,what=character(0)) tp = substr(ML_seq[seq(1,length(ML_seq),by=2)],2,1000) ML_seq = ML_seq[seq(2,length(ML_seq),by=2)]; names(ML_seq) = tp # M is a matrix containing the FASTA file base calls M = matrix("",length(ML_seq),nchar(ML_seq[1])) for(i in 1:length(ML_seq)) { v = unlist(strsplit(ML_seq[i],"")) M[i,] = v gc() } rownames(M) = names(ML_seq) # Precompute various mappings # Combine the tip and node labels treelabels = 
c(tree$tip.label,tree$node.label) # For each row of M, identify the node index M_node_index = match(rownames(M),treelabels) # And the reverse operation rev_M_node_index = match(treelabels,rownames(M)) # For each row of M, identify the node index of its ancestor # To do this, identify the node index in tree$edge[,2] and read tree$edge[,1] M_anc_node_index = tree$edge[match(M_node_index,tree$edge[,2]),1] # Find, by name, the ancestor M_anc_node = treelabels[M_anc_node_index] # Find its position in M M_anc_node_M_index = match(M_anc_node,rownames(M)) # Not-root nonroot = !is.na(M_anc_node_index) # Map edge order on to M order, and vice versa edge2M = match(tree$edge[,2],M_node_index) M2edge = match(M_node_index,tree$edge[,2]) # Precompute the positions of mutations on branches of the tree # For each pattern, record the mutated nodes # wh.mut is a matrix, in the same order as M, recording whether the base represents a mutation wh.mut = apply(M,2,function(m) 1*(m!=m[M_anc_node_M_index])); wh.mut[nrow(wh.mut),] = 0 # Weight of each pattern wpat = as.vector(table(factor(xref,levels=1:ncol(M)))) # For each node, what proportion of mutations are shared with each other node? #tp = sapply(1:nrow(wh.mut),function(i) apply(t(t(wh.mut[,wh.mut[i,]==1,drop=FALSE])*wpat[wh.mut[i,]==1]),1,sum)/sum(wpat[wh.mut[i,]==1])) # A homoplasy is a mutation that occurs on multiple branches. Count the number of homoplasic mutations per branch # Exclude reference sequences from the count gd = !is.na(as.numeric(rownames(wh.mut))) | substr(rownames(wh.mut),1,4)=="NODE" n.mut = apply(wh.mut[gd,],2,sum) is.homoplasy = n.mut>1 is.core = !is.na(match(1:genome_length,coresites)) # A homoplasy is a mutation that occurs on multiple branches. 
Count the number of homoplasic mutations per branch # Exclude reference sequences from the count #gd = !is.na(as.numeric(rownames(wh.mut))) | substr(rownames(wh.mut),1,4)=="NODE" #plot.mut = t(wh.mut[,xref[xref>0]])*(1+is.homoplasy[xref[xref>0]]) spectrum.mut = t(wh.mut[,xref[xref>0]])*(n.mut[xref[xref>0]]) # Identify contiguous non-core regions noncore.beg = 1+which(is.core[2:length(is.core)]==0 & (is.core[2:length(is.core)]!=is.core[1:(length(is.core)-1)])); if(!is.core[1]) noncore.beg = c(1,noncore.beg) noncore.end = which(is.core[2:length(is.core)]==1 & (is.core[2:length(is.core)]!=is.core[1:(length(is.core)-1)])); if(!is.core[length(is.core)]) noncore.end = c(noncore.end,length(is.core)) noncore.len = noncore.end-noncore.beg+1 noncore.plot = noncore.len>=1000 # Plot "raw" mutations/homoplasies #f = function() rect(noncore.beg[noncore.plot],0,noncore.end[noncore.plot],ncol(wh.mut),col="white",border=NA) #noncore.plot = noncore.len>=1000 #alt.image(plot.mut,col=c("skyblue","yellow","yellow"),xlab="Position",ylab="Branch",axes=FALSE,xaxs="i",yaxs="i",xpos=which(xref>0),background.fun=f) #axis(1); axis(2,1:nrow(wh.mut),rownames(wh.mut),las=2,cex.axis=.4); box() # Plot the recombination intervals #ypos = match(itv2$Node,rownames(wh.mut)) #arrows(itv2$Beg,ypos,itv2$End,ypos,len=0,lwd=2,col="blue",lend=2) if(!is.na(istatefile)) itv2 = read.table(istatefile,h=T,as.is=T,sep="\t") if(FALSE){ # Histogram of recombination tract lengths tlen = itv2$End-itv2$Beg # Identify ones that straddle the original wh = which(itv2$End==genome_length) for(i in wh) { if(any(itv2$Beg[itv2$Node==itv2$Node[i]]==1)) { wh2 = which(itv2$Beg[itv2$Node==itv2$Node[i]]==1) tlen[i] = tlen[i]+tlen[itv2$Node==itv2$Node[i]][wh2] tlen[itv2$Node==itv2$Node[i]][wh2] = NA } } hist(tlen,100,col="orange3",prob=T) hist(log10(tlen),100,col="orange3",prob=T) plot.ecdf(log10(tlen),col="orange3") } # Make all branch lengths equal #tree.bkp = tree #tree$edge.length = rep(1,length(tree$edge.length)) tree$comid = 
ifelse(is.na(as.numeric(tree$tip.label)),tree$tip.label,paste("C0000",as.numeric(tree$tip.label),sep="")) wh.mlst = ifelse(is.na(as.numeric(rownames(wh.mut))),rownames(wh.mut),paste("C0000",as.numeric(rownames(wh.mut)),sep="")) #wh.mlst_or_ref = ifelse(is.na(as.numeric(rownames(wh.mut))),rownames(wh.mut),mlst[paste(">",ifelse(is.na(as.numeric(rownames(wh.mut))),rownames(wh.mut),paste("C0000",as.numeric(rownames(wh.mut)),sep="")),"_n1",sep="")]); wh.mlst_or_ref[(1+ceiling(nrow(wh.mut)/2)):nrow(wh.mut)] = "" pdf(file="/dev/null",width=14,height=7) par(mfrow=c(1,2)) plot(tree,type="phylogram") dev.off() # Based on the phylogram tree plot, find the vertical positions and horizontal end-points of every branch vpos = get("last_plot.phylo", envir = .PlotPhyloEnv)$yy[M_node_index] lpos = get("last_plot.phylo", envir = .PlotPhyloEnv)$xx[M_anc_node_index] rpos = get("last_plot.phylo", envir = .PlotPhyloEnv)$xx[M_node_index] # Manipulate the vertical positions new_plot.phylo = get("last_plot.phylo", envir = .PlotPhyloEnv); new_plot.phylo$yy = rank(vpos)[rev_M_node_index] assign("last_plot.phylo",new_plot.phylo,envir=.PlotPhyloEnv) vpos = get("last_plot.phylo", envir = .PlotPhyloEnv)$yy[M_node_index] pdf(file=paste0(prefix,".cfml.pdf"),width=14,height=7) par(mfrow=c(1,2)) xrg = range(get("last_plot.phylo", envir = .PlotPhyloEnv)$xx) plot(xrg+c(0,diff(xrg)/20),range(get("last_plot.phylo", envir = .PlotPhyloEnv)$yy)+c(-0.5,0.5),type="n",axes=FALSE,xlab="",ylab="",xaxs="i",yaxs="i") # Plot the horizontal branches arrows(lpos,vpos,rpos,vpos,col=1,len=0) # Plot the vertical branches sapply(sort(union(M_anc_node_index,c())),function(i) { vpos = get("last_plot.phylo", envir = .PlotPhyloEnv)$yy[M_node_index[!is.na(M_anc_node_index) & M_anc_node_index==i]] hpos = get("last_plot.phylo", envir = .PlotPhyloEnv)$xx[M_node_index[!is.na(M_node_index) & M_node_index==i]] if(length(vpos)==2) arrows(hpos,vpos[1],hpos,vpos[2],len=0,col=1) }) #text(max(get("last_plot.phylo", envir = 
.PlotPhyloEnv)$xx)+diff(xrg)/20/2,get("last_plot.phylo", envir = .PlotPhyloEnv)$yy,wh.mlst_or_ref[rev_M_node_index],cex=.4) # Draw lines from the nodes arrows(rpos,vpos,rep(xrg[2],length(vpos)),vpos,lty=2,len=0,col="grey") # Plot "raw" mutations/homoplasies od = order(vpos) if(length(noncore.beg)>0) background.noncore = function() rect(noncore.beg[noncore.plot],0,noncore.end[noncore.plot],ncol(wh.mut),col="grey",border=NA) else background.noncore = function() {} noncore.plot = noncore.len>=10000 alt.image(spectrum.mut[,od],col=c("skyblue","white","yellow",colorRampPalette(c("orange","red"))(pmax(0,max(spectrum.mut)-2))),xlab="Position",ylab="Branch",axes=FALSE,xaxs="i",yaxs="i",xpos=which(xref>0),background.fun=background.noncore) axis(1); axis(2,1:nrow(wh.mut),ifelse((1:nrow(wh.mut))<=ceiling(nrow(wh.mut)/2),rownames(wh.mut),"")[od],las=2,cex.axis=.4); box() # Plot the recombination intervals if(!is.na(istatefile)) { ypos = match(itv2$Node,rownames(wh.mut)[od]) arrows(itv2$Beg,ypos,itv2$End,ypos,len=0,lwd=2,col="blue",lend=2) } dev.off() ClonalFrameML-1.11/src/coalesce/000077500000000000000000000000001307563374100164265ustar00rootroot00000000000000ClonalFrameML-1.11/src/coalesce/coalescent_record.h000066400000000000000000000121351307563374100222570ustar00rootroot00000000000000/* Copyright 2013 Daniel Wilson. * * coalescent_record.h * Part of the coalesce library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ #ifndef _RECORD_H_ #define _RECORD_H_ class mt_node { static int number; public: /*Fixed once*/ int id; /*Recyclable*/ bool in_use; double time; double edge_time; double last_update; // in a structured coalescent, the last time edge_time was updated class mt_node *ancestor; //ptr to ancestor class mt_node *descendant[2];//vec of ptrs to descendant public: mt_node() {}; mt_node& initialize(const int id_in) { id=id_in; recycle(); return *this; } mt_node& recycle() { in_use=false; time=0.0; edge_time=0.0; last_update=0.0; ancestor=NULL; descendant[0]=NULL; descendant[1]=NULL; return *this; } }; class marginal_tree { int id; int k; public: /*Fixed once*/ int size; //class Control *con; //ptr to con int n; class mt_node *node; //vec of mt_node's /*Recyclable*/ int genotype; int next_free_node; int nco; public: marginal_tree() {}; /*marginal_tree& initialize(const int id_in, class Control *con_in) { id=id_in; con=con_in; size=con->nsamp+(con->nsamp-1); node=(class mt_node*) malloc((size_t) size*sizeof(class mt_node)); int i; for(i=0;i. * */ #include "main.h" int main (const int argc, const char* argv[]) { clock_t start_time = clock(); cout << "ClonalFrameML " << ClonalFrameML_GITRevision << endl; // Process the command line arguments if(argc<4) { stringstream errTxt; errTxt << "Syntax: ClonalFrameML newick_file fasta_file output_file [OPTIONS]" << endl; errTxt << endl; errTxt << "Options specifying the analysis type:" << endl; errTxt << "-em true (default) or false Estimate parameters by a Baum-Welch expectation maximization algorithm." << endl; errTxt << "-embranch true or false (default) Estimate parameters for each branch using the EM algorithm." << endl; errTxt << "-rescale_no_recombination true or false (default) Rescale branch lengths for given sites with no recombination model." 
<< endl; errTxt << "-imputation_only true or false (default) Perform only ancestral state reconstruction and imputation." << endl; errTxt << "Options affecting all analyses:" << endl; errTxt << "-kappa value > 0 (default 2.0) Relative rate of transitions vs transversions in substitution model" << endl; errTxt << "-fasta_file_list true or false (default) Take fasta_file to be a white-space separated file list." << endl; errTxt << "-xmfa_file true or false (default) Take fasta_file to be an XMFA file."< 0 (default 1e-7) Minimum branch length." << endl; errTxt << "-reconstruct_invariant_sites true or false (default) Reconstruct the ancestral states at invariant sites." << endl; // errTxt << "-compress_reconstructed_sites true (default) or false Reduce the number of columns in the output FASTA file." << endl; // Alternative not currently implemented, so not optional errTxt << "-label_uncorrected_tree true or false (default) Regurgitate the uncorrected Newick tree with internal nodes labelled." << endl; errTxt << "Options affecting -em and -embranch:" << endl; errTxt << "-prior_mean df \"0.1 0.001 0.1 0.0001\" Prior mean for R/theta, 1/delta, nu and M." << endl; errTxt << "-prior_sd df \"0.1 0.001 0.1 0.0001\" Prior standard deviation for R/theta, 1/delta, nu and M." << endl; errTxt << "-initial_values default \"0.1 0.001 0.05\" Initial values for R/theta, 1/delta and nu." << endl; errTxt << "-guess_initial_m true (default) or false Initialize M and nu jointly in the EM algorithms." << endl; errTxt << "-emsim value >= 0 (default 0) Number of simulations to estimate uncertainty in the EM results." << endl; errTxt << "-embranch_dispersion value > 0 (default .01) Dispersion in parameters among branches in the -embranch model." << endl; errTxt << "Options affecting -rescale_no_recombination:" << endl; errTxt << "-brent_tolerance tolerance (default .001) Set the tolerance of the Brent routine for -rescale_no_recombination." 
<< endl; errTxt << "-powell_tolerance tolerance (default .001) Set the tolerance of the Powell routine for -rescale_no_recombination." << endl; error(errTxt.str().c_str()); } // Process required arguments const char* newick_file = argv[1]; const char* fasta_file = argv[2]; const char* out_file = argv[3]; string tree_out_file = string(out_file) + ".labelled_tree.newick"; string oritree_out_file = string(out_file) + ".labelled_uncorrected_tree.newick"; string fasta_out_file = string(out_file) + ".ML_sequence.fasta"; string xref_out_file = string(out_file) + ".position_cross_reference.txt"; string import_out_file = string(out_file) + ".importation_status.txt"; string em_out_file = string(out_file) + ".em.txt"; string emsim_out_file = string(out_file) + ".emsim.txt"; // Set default options ArgumentWizard arg; arg.case_sensitive = false; string fasta_file_list="false", xmfa_file="false", imputation_only="false", ignore_incomplete_sites="false", ignore_user_sites="", reconstruct_invariant_sites="false"; string use_incompatible_sites="true", rescale_no_recombination="false"; string show_progress="false", compress_reconstructed_sites="true"; string string_prior_mean="0.1 0.001 0.1 0.0001", string_prior_sd="0.1 0.001 0.1 0.0001", string_initial_values = "0.1 0.001 0.05"; string guess_initial_m="true", em="true", embranch="false", label_original_tree="false", chr_name=""; double brent_tolerance = 1.0e-3, powell_tolerance = 1.0e-3, global_min_branch_length = 1.0e-7; double embranch_dispersion = 0.01, kappa = 2.0; int emsim = 0; // Process options arg.add_item("fasta_file_list", TP_STRING, &fasta_file_list); arg.add_item("xmfa_file", TP_STRING, &xmfa_file); arg.add_item("imputation_only", TP_STRING, &imputation_only); arg.add_item("ignore_incomplete_sites", TP_STRING, &ignore_incomplete_sites); arg.add_item("ignore_user_sites", TP_STRING, &ignore_user_sites); arg.add_item("reconstruct_invariant_sites", TP_STRING, &reconstruct_invariant_sites); 
arg.add_item("use_incompatible_sites", TP_STRING, &use_incompatible_sites); arg.add_item("brent_tolerance", TP_DOUBLE, &brent_tolerance); arg.add_item("chromosome_name", TP_STRING, &chr_name); arg.add_item("powell_tolerance", TP_DOUBLE, &powell_tolerance); arg.add_item("rescale_no_recombination", TP_STRING, &rescale_no_recombination); arg.add_item("show_progress", TP_STRING, &show_progress); arg.add_item("compress_reconstructed_sites",TP_STRING, &compress_reconstructed_sites); arg.add_item("min_branch_length", TP_DOUBLE, &global_min_branch_length); arg.add_item("prior_mean", TP_STRING, &string_prior_mean); arg.add_item("prior_sd", TP_STRING, &string_prior_sd); arg.add_item("initial_values", TP_STRING, &string_initial_values); arg.add_item("guess_initial_m", TP_STRING, &guess_initial_m); arg.add_item("em", TP_STRING, &em); arg.add_item("emsim", TP_INT, &emsim); arg.add_item("embranch", TP_STRING, &embranch); arg.add_item("embranch_dispersion", TP_DOUBLE, &embranch_dispersion); arg.add_item("kappa", TP_DOUBLE, &kappa); arg.add_item("label_uncorrected_tree", TP_STRING, &label_original_tree); arg.read_input(argc-3,argv+3); bool FASTA_FILE_LIST = string_to_bool(fasta_file_list, "fasta_file_list"); bool XMFA_FILE = string_to_bool(xmfa_file, "xmfa_file"); bool CORRECT_BRANCH_LENGTHS = !string_to_bool(imputation_only, "imputation_only"); bool IGNORE_INCOMPLETE_SITES = string_to_bool(ignore_incomplete_sites, "ignore_incomplete_sites"); bool RECONSTRUCT_INVARIANT_SITES = string_to_bool(reconstruct_invariant_sites, "reconstruct_invariant_sites"); bool USE_INCOMPATIBLE_SITES = string_to_bool(use_incompatible_sites, "use_incompatible_sites"); bool RESCALE_NO_RECOMBINATION = string_to_bool(rescale_no_recombination, "rescale_no_recombination"); bool SHOW_PROGRESS = string_to_bool(show_progress, "show_progress"); bool COMPRESS_RECONSTRUCTED_SITES = string_to_bool(compress_reconstructed_sites, "compress_reconstructed_sites"); bool GUESS_INITIAL_M = string_to_bool(guess_initial_m, 
"guess_initial_m"); bool EM = string_to_bool(em, "em"); bool EMBRANCH = string_to_bool(embranch, "embranch"); bool LABEL_ORIGINAL_TREE = string_to_bool(label_original_tree, "label_uncorrected_tree"); bool MULTITHREAD = false; if(brent_tolerance<=0.0 || brent_tolerance>=0.1) { stringstream errTxt; errTxt << "brent_tolerance value out of range (0,0.1], default 0.001"; error(errTxt.str().c_str()); } if(powell_tolerance<=0.0 || powell_tolerance>=0.1) { stringstream errTxt; errTxt << "powell_tolerance value out of range (0,0.1], default 0.001"; error(errTxt.str().c_str()); } if(!CORRECT_BRANCH_LENGTHS || EMBRANCH || RESCALE_NO_RECOMBINATION) EM = false; if(((int)RESCALE_NO_RECOMBINATION + (int)EM +(int)EMBRANCH)>1) { stringstream errTxt; errTxt << "rescale_no_recombination, em and embranch are mutually incompatible"; error(errTxt.str().c_str()); } if((RESCALE_NO_RECOMBINATION || EM || EMBRANCH) && !CORRECT_BRANCH_LENGTHS) { stringstream wrnTxt; wrnTxt << "advanced options will be ignored because imputation_only=true"; warning(wrnTxt.str().c_str()); } if(CORRECT_BRANCH_LENGTHS && !(RESCALE_NO_RECOMBINATION || EM || EMBRANCH)) { error("One of -em, -embranch or -rescale_no_recombination must be specified when imputation_only=false"); } if(MULTITHREAD) { cout << "WARNING: multithreaded version not implemented, ignoring." 
<< endl; } if(global_min_branch_length<=0.0) { error("Minimum branch length must be positive"); } // Process the prior mean and standard deviation vector prior_mean(0), prior_sd(0); stringstream sstream_prior_mean; sstream_prior_mean << string_prior_mean; int i; for(i=0;i<1000;i++) { if(sstream_prior_mean.eof()) break; double prior_mean_elem; sstream_prior_mean >> prior_mean_elem; if(sstream_prior_mean.fail()) error("Could not interpret value specified by prior_mean"); prior_mean.push_back(prior_mean_elem); } if(i==1000) error("Maximum length of vector exceeded by prior_mean"); stringstream sstream_prior_sd; sstream_prior_sd << string_prior_sd; for(i=0;i<1000;i++) { if(sstream_prior_sd.eof()) break; double prior_sd_elem; sstream_prior_sd >> prior_sd_elem; if(sstream_prior_sd.fail()) error("Could not interpret value specified by prior_sd"); prior_sd.push_back(prior_sd_elem); } if(prior_mean.size()!=4) error("prior_mean must have 4 values separated by spaces"); if(prior_sd.size()!=4) error("prior_sd must have 4 values separated by spaces"); // Process the initial values vector initial_values(0); if(string_initial_values!="") { stringstream sstream_initial_values; sstream_initial_values << string_initial_values; for(i=0;i<1000;i++) { if(sstream_initial_values.eof()) break; double initial_values_elem; sstream_initial_values >> initial_values_elem; if(sstream_initial_values.fail()) error("Could not interpret value specified by initial_values"); initial_values.push_back(initial_values_elem); } if(i==1000) error("Maximum length of vector exceeded by initial_values"); if(!(initial_values.size()==3)) error("initial values must have 3 values separated by spaces"); } if(emsim<0) error("-emsim cannot be negative"); if(emsim>0 && !(EM || EMBRANCH)) error("-emsim only applicable with -em or -embranch"); if(embranch_dispersion<=0.0) error("-embranch_dispersion must be positive"); if(kappa<=0.0) error("-kappa must be positive"); // Open the FASTA file(s) vector sites_to_ignore; 
DNA fa; if(FASTA_FILE_LIST) { ifstream file_list(fasta_file); if(!file_list.is_open()) { stringstream errTxt; errTxt << "could not find file " << fasta_file; error(errTxt.str().c_str()); } int n = 0; int L = -1; while(!file_list.eof()) { string filename; file_list >> filename; // Pre-check: does it exist? ifstream file_list1(filename.c_str()); if(!file_list1.is_open()) { stringstream errTxt; errTxt << "could not find listed file " << fasta_file; error(errTxt.str().c_str()); } // Read the file DNA fa1(filename.c_str()); n += fa1.nseq; if(L==-1) L = fa1.lseq; if(fa1.lseq!=L) { stringstream errTxt; errTxt << "listed file " << fasta_file << " had sequence length " << fa1.lseq << " expecting " << L; error(errTxt.str().c_str()); } // Add to list int ni; for(ni=0;ni ctree_node_labels; const bool is_rooted = (newick.root.dec.size()==2); marginal_tree ctree = (is_rooted) ? convert_rooted_NewickTree_to_marginal_tree(newick,fa.label,ctree_node_labels) : convert_unrooted_NewickTree_to_marginal_tree(newick,fa.label,ctree_node_labels); const int root_node = (is_rooted) ? 
ctree.size-1 : ctree.size-2; // If requested, regurgitate the input tree with the internal nodes labelled, before anything is done to the branch lengths if(LABEL_ORIGINAL_TREE) { write_newick(ctree,ctree_node_labels,oritree_out_file.c_str()); } // Open the list of sites to ignore vector ignore_site(fa.lseq,false); for (int i=0;i> elem; elem--; if(!(elem>=0 && elem anyN; vector compat = compute_compatibility(fa,ctree,anyN,false); if(IGNORE_INCOMPLETE_SITES) { for(i=0;i isIRAS(fa.lseq,false); for(i=0;i isBLC(fa.lseq,false); for(i=0;i empirical_nucleotide_frequencies(4,0.25); Matrix nuc = FASTA_to_nucleotide(fa,empirical_nucleotide_frequencies,isIRAS); // Identify and count unique patterns vector pat; // Pattern as string of AGCTNs vector pat1, cpat, ipat; // First example of each pattern, number of sites with that pattern, the pattern at each (compatible) site (-1 otherwise) vector nuc_ispoly(nuc.ncols(),true); find_alignment_patterns(nuc,nuc_ispoly,pat,pat1,cpat,ipat); // Storage for the MLE of the nucleotide sequence at every node Matrix node_nuc; // Sanity check: are all branch lengths non-negative for(i=0;i empirical_nucleotide_frequencies(4,0.25); Matrix nuc = FASTA_to_nucleotide(fa,empirical_nucleotide_frequencies,isBLC); // Identify and count unique patterns vector pat; // Pattern as string of AGCTNs vector pat1, cpat, ipat; // First example of each pattern, number of sites with that pattern, the pattern at each (compatible) site (-1 otherwise) vector nuc_ispoly(nuc.ncols(),true); find_alignment_patterns(nuc,nuc_ispoly,pat,pat1,cpat,ipat); // Storage for the MLE of the nucleotide sequence at every node Matrix node_nuc; // Begin by computing the joint maximum likelihood ancestral sequences mydouble ML = maximum_likelihood_ancestral_sequences(nuc,ctree,kappa,empirical_nucleotide_frequencies,pat1,cpat,node_nuc); cout << "BRANCH LENGTH CORRECTION/RECOMBINATION ANALYSIS:" << endl; cout << "Analysing " << nBLC << " sites" << endl; // Report the estimated equilibrium 
frequencies cout << "Empirical nucleotide frequencies: A " << round(1000*empirical_nucleotide_frequencies[Adenine])/10 << "% C " << round(1000*empirical_nucleotide_frequencies[Cytosine])/10; cout << "% G " << round(1000*empirical_nucleotide_frequencies[Guanine])/10 << "% T " << round(1000*empirical_nucleotide_frequencies[Thymine])/10 << "%" << endl; if(RESCALE_NO_RECOMBINATION) { // Rescale the branch lengths using given sites without a model of recombination cout << "Beginning branch optimization. Key to parameters (and constraints):" << endl; cout << "B uncorrected branch length" << endl; cout << "L maximum log-likelihood per branch" << endl; cout << "M corrected branch length/expected number of mutations (> 0)" << endl; double ML = 0.0; for(i=0;iid; int j,k; for(j=0,k=0;j param(1,log10(initial_branch_length)); param = Pow.minimize(param,powell_tolerance); double final_branch_length = pow(10.,param[0]); if(final_branch_length 0)" << endl; cout << "I mean DNA import length per branch (> 0)" << endl; cout << "D divergence of DNA imported by recombination (> 0)" << endl; cout << "M expected number of mutations per branch (> 0)" << endl; double ML = 0.0; vector< vector > is_imported(root_node); // Calculate the a and b parameters of the priors vector prior_a(4), prior_b(4); for(i=0;i<4;i++) { // Mean = a/b and variance = a/b/b so sd = sqrt(a)/b // So b = mean/sd/sd and a = b*mean if(prior_mean[i]<=0.0) error("EM: prior_mean must be positive"); if(prior_sd[i]<=0.0) error("EM: prior_sd must be positive"); prior_b[i] = prior_mean[i]/prior_sd[i]/prior_sd[i]; prior_a[i] = prior_b[i]*prior_mean[i]; } // Initial values for R_over_theta, mean_import_length and import_divergence from prior vector param(3); param[0] = initial_values[0]; param[1] = 1.0/initial_values[1]; param[2] = initial_values[2]; // Do inference clock_t pow_start_time = clock(); ClonalFrameBaumWelch 
cff(ctree,node_nuc,isBLC,ipat,kappa,empirical_nucleotide_frequencies,is_imported,prior_a,prior_b,root_node,GUESS_INITIAL_M,SHOW_PROGRESS); param = cff.maximize_likelihood(param); ML = cff.ML; cout << " L = " << ML << " P = " << cff.priorL << " R = " << param[0] << " I = " << param[1] << " D = " << param[2] << " in " << (double)(clock()-pow_start_time)/CLOCKS_PER_SEC << " s and " << cff.neval << " evaluations" << endl; cout << " Posterior alphas: R = " << cff.posterior_a[0] << " I = " << cff.posterior_a[1] << " D = " << cff.posterior_a[2] << endl; const double cfmlLLR = ML-cff.priorL-cff.ML0; if(cfmlLLR>6.0) { cout << " ClonalFrameML log-likelihood ratio of " << cfmlLLR << " indicates evidence for recombination" << endl; } else { cout << " WARNING: ClonalFrameML log-likelihood ratio of " << cfmlLLR << " indicates lack of evidence for recombination" << endl; } for(i=0;i0) { Matrix sim = cff.simulate_posterior(param,emsim); if(sim.nrows()!=3 || sim.ncols()!=emsim) error("ClonalFrameBaumWelch::simulate_posterior() produced unexpected results"); ofstream eout(emsim_out_file.c_str()); eout << "R/theta" << tab << "delta" << tab << "nu" << endl; for(i=0;i 0)" << endl; cout << "I mean DNA import length per branch (> 0)" << endl; cout << "D divergence of DNA imported by recombination (> 0)" << endl; cout << "M expected number of mutations per branch (> 0)" << endl; double ML = 0.0; vector< vector > is_imported(root_node); // Calculate the a and b parameters of the priors vector prior_a(5), prior_b(5); for(i=0;i<4;i++) { // Mean = a/b and variance = a/b/b so sd = sqrt(a)/b // So b = mean/sd/sd and a = b*mean if(prior_mean[i]<=0.0) error("EMBRANCH: prior_mean must be positive"); if(prior_sd[i]<=0.0) error("EMBRANCH: prior_sd must be positive"); prior_b[i] = prior_mean[i]/prior_sd[i]/prior_sd[i]; prior_a[i] = prior_b[i]*prior_mean[i]; } // Set the prior on the fifth parameter prior_a[4] = prior_b[4] = 1.0/embranch_dispersion; // Initial values for rho_over_theta, 
mean_import_length and import_divergence from prior // Note that the fourth value (mean branch length) is ignored and computed from the tree vector param(4); param[0] = initial_values[0]; param[1] = 1.0/initial_values[1]; param[2] = initial_values[2]; param[3] = 1.0e-5; // Do inference clock_t pow_start_time = clock(); ClonalFrameBaumWelchRhoPerBranch cff(ctree,node_nuc,isBLC,ipat,kappa,empirical_nucleotide_frequencies,is_imported,prior_a,prior_b,root_node,GUESS_INITIAL_M,SHOW_PROGRESS); cff.maximize_likelihood(param); ML = cff.ML; cout << "Mean parameters:" << endl; cout << " L = " << ML << " R = " << cff.mean_param[0] << " I = " << 1.0/cff.mean_param[1] << " D = " << cff.mean_param[2] << " M = " << cff.mean_param[3] << " in " << (double)(clock()-pow_start_time)/CLOCKS_PER_SEC << " s and " << cff.neval << " evaluations" << endl; cout << "Parameters per branch:" << endl; for(i=0;i0) { warning("-emsim not yet implemented for -embranch"); // Matrix sim = cff.simulate_posterior(param,emsim); // if(sim.nrows()!=3 || sim.ncols()!=emsim) error("ClonalFrameBaumWelch::simulate_posterior() produced unexpected results"); // ofstream eout(emsim_out_file.c_str()); // eout << "R/theta" << tab << "delta" << tab << "nu" << endl; // for(i=0;i &tip_labels, vector &all_node_labels) { size_t i; vector order = tip_labels; const int n = tip_labels.size(); for(i=0;i &allnodes = newick.allnodes; size_t nnode = allnodes.size(); vector tips(0); vector coals(0); NewickNode* root = 0; for(i=0;idec.size()==0) { tips.push_back(allnodes[i]); } else { coals.push_back(allnodes[i]); } // Test for multifurcations if(allnodes[i]->dec.size()>2) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Number of descendant nodes (" << allnodes[i]->dec.size(); errTxt << ") incompatible with a strictly bifurcating rooted tree"; error(errTxt.str().c_str()); } // Test for the root if(allnodes[i]->anc==0) { if(root==0) { root = allnodes[i]; } else { stringstream errTxt; errTxt 
<< "convert_NewickTree_to_marginal_tree(): "; errTxt << "Found multiple roots in Newick tree"; error(errTxt.str().c_str()); } } } size_t ntips = tips.size(); // Make sure the number of tips equals that specified by the tip labels if(ntips!=tip_labels.size()) { stringstream errTxt; errTxt << "Number of nodes in Newick tree inconsistent with that expected"; error(errTxt.str().c_str()); } // Check that the Newick tree is strictly bifurcating, and assume it is correctly rooted if(nnode!=2*ntips-1) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Number of nodes (" << nnode << ") and number of tips (" << ntips; errTxt << ") incompatible with a strictly bifurcating rooted tree"; error(errTxt.str().c_str()); } // Calculate node times. Ensure all branches have non-zero length const double minbranchlength = 1e-12; vector root2tip(1,root); // temporary ordering of nodes from root to tips vector ageroot2tip(1,0.0); // corresponding age of each node in root2tip double youngest_node = 0.0; size_t iroot2tip; for(iroot2tip=0;iroot2tip=root2tip.size()) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "iroot2tip exceeded size of root2tip"; error(errTxt.str().c_str()); } // Add descendants of current node to list and calculate node times // counting with age increasing backwards in time, but the root node at time 0 int idec; for(idec=0;idecdec.size();idec++) { root2tip.push_back(root2tip[iroot2tip]->dec[idec]); // Ensure the descendant is always younger than its ancestor double branchlength = root2tip[root2tip.size()-1]->len; if(branchlength(0) // and then impose this ordering thereafter. Note that the time-ordering of the tips is // unimportant. 
vector ixroot2tip(0); for(iroot2tip=0;iroot2tip labelorder; for(iroot2tip=0;iroot2tip::iterator _find = std::find(order.begin(),order.end(),root2tip[iroot2tip]->str); if(_find==order.end()) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Newick tree tip label " << root2tip[iroot2tip]->str << " was not expected"; error(errTxt.str().c_str()); } labelorder.push_back(_find-order.begin()); } // Re-order root2tip and ageroot2tip by (1) label (tips only) (2) age (coalescences only) std::stable_sort(ixroot2tip.begin(),ixroot2tip.end(),orderNewickNodesByStatusLabelAndAge(root2tip,ageroot2tip,labelorder)); } // Assign each node in root2tip an index by calculating the rank of each element in root2tip in ixroot2tip map nodeIndex; for(iroot2tip=0;iroot2tip(0); for(iroot2tip=0;iroot2tipdec.size()==0) { // If tip if(internal_nodes_begun) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "internal nodes added to marginal tree before all tips"; error(errTxt.str().c_str()); } double age = ageroot2tip[ix]-youngest_node; if(fabs(age)<1e-6) age = 0.0; tree.add_base_node(&age,nodeIndex[node]); } else if(node->dec.size()==2) { // If internal node internal_nodes_begun = true; double age = ageroot2tip[ix]-youngest_node; if(fabs(age)<1e-6) age = 0.0; tree.coalesce(age,nodeIndex[node->dec[0]],nodeIndex[node->dec[1]]); } else { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "only tips or bifurcating nodes expected. 
" << node->dec.size() << " descendants not allowed."; error(errTxt.str().c_str()); } if(node->str!="") { all_node_labels.push_back(node->str); } else { stringstream autolab; autolab << "NODE_" << iroot2tip+1; all_node_labels.push_back(autolab.str()); } } return tree; } marginal_tree convert_unrooted_NewickTree_to_marginal_tree(NewickTree &newick, vector &tip_labels, vector &all_node_labels) { size_t i; vector order = tip_labels; const int n = tip_labels.size(); for(i=0;i &allnodes = newick.allnodes; size_t nnode = allnodes.size(); vector tips(0); vector coals(0); NewickNode* root = 0; for(i=0;idec.size()==0) { tips.push_back(allnodes[i]); } else { coals.push_back(allnodes[i]); } // Test for multifurcations if(allnodes[i]->anc!=0 && allnodes[i]->dec.size()==3) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Only the root is allowed 3 descendant nodes"; error(errTxt.str().c_str()); } if(allnodes[i]->dec.size()>3) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Number of descendant nodes (" << allnodes[i]->dec.size(); errTxt << ") incompatible with a bifurcating unrooted tree"; error(errTxt.str().c_str()); } // Test for the root if(allnodes[i]->anc==0) { if(root==0) { root = allnodes[i]; if(root->dec.size()!=3) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Deepest node in unrooted Newick tree expected to have 3 descendants"; error(errTxt.str().c_str()); } } else { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Found multiple roots in Newick tree"; error(errTxt.str().c_str()); } } } size_t ntips = tips.size(); // Make sure the number of tips equals that specified by the tip labels if(ntips!=tip_labels.size()) { stringstream errTxt; errTxt << "Number of nodes in Newick tree inconsistent with that expected"; error(errTxt.str().c_str()); } // Check that the Newick tree is consistent with an unrooted strictly 
bifurcating tree if(nnode!=2*ntips-2) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Number of nodes (" << nnode << ") and number of tips (" << ntips; errTxt << ") incompatible with an unrooted bifurcating tree"; error(errTxt.str().c_str()); } // Calculate node times. Ensure all branches have non-zero length const double minbranchlength = 1e-12; vector root2tip(1,root); // temporary ordering of nodes from root to tips vector ageroot2tip(1,0.0); // corresponding age of each node in root2tip double youngest_node = 0.0; size_t iroot2tip; for(iroot2tip=0;iroot2tip=root2tip.size()) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "iroot2tip exceeded size of root2tip"; error(errTxt.str().c_str()); } // Add descendants of current node to list and calculate node times // counting with age increasing backwards in time, but the root node at time 0 int idec; for(idec=0;idecdec.size();idec++) { root2tip.push_back(root2tip[iroot2tip]->dec[idec]); // Ensure the descendant is always younger than its ancestor double branchlength = root2tip[root2tip.size()-1]->len; if(branchlength(0) // and then impose this ordering thereafter. Note that the time-ordering of the tips is // unimportant. 
vector ixroot2tip(0); for(iroot2tip=0;iroot2tip labelorder; for(iroot2tip=0;iroot2tip::iterator _find = std::find(order.begin(),order.end(),root2tip[iroot2tip]->str); if(_find==order.end()) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Newick tree tip label " << root2tip[iroot2tip]->str << " was not expected"; error(errTxt.str().c_str()); } labelorder.push_back(_find-order.begin()); } // Re-order root2tip and ageroot2tip by (1) label (tips only) (2) age (coalescences only) std::stable_sort(ixroot2tip.begin(),ixroot2tip.end(),orderNewickNodesByStatusLabelAndAge(root2tip,ageroot2tip,labelorder)); } // Assign each node in root2tip an index by calculating the rank of each element in root2tip in ixroot2tip map nodeIndex; for(iroot2tip=0;iroot2tip(0); for(iroot2tip=0;iroot2tipdec.size()==0) { // If tip if(internal_nodes_begun) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "internal nodes added to marginal tree before all tips"; error(errTxt.str().c_str()); } double age = ageroot2tip[ix]-youngest_node; if(fabs(age)<1e-6) age = 0.0; tree.add_base_node(&age,nodeIndex[node]); } else if(node->dec.size()==2) { // If internal node internal_nodes_begun = true; double age = ageroot2tip[ix]-youngest_node; if(fabs(age)<1e-6) age = 0.0; tree.coalesce(age,nodeIndex[node->dec[0]],nodeIndex[node->dec[1]]); } else { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "only tips or bifurcating nodes expected. 
" << node->dec.size() << " descendants not allowed."; error(errTxt.str().c_str()); } if(node->str!="") { all_node_labels.push_back(node->str); } else { stringstream autolab; autolab << "NODE_" << iroot2tip+1; all_node_labels.push_back(autolab.str()); } } // Deal with the root separately iroot2tip = nnode-1; size_t ix = ixroot2tip[iroot2tip]; const NewickNode *node = root2tip[ix]; // Sanity check if(nodeIndex[node]!=iroot2tip) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "Inconsistency in internal node numbering"; error(errTxt.str().c_str()); } if(node->dec.size()!=3) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "expected trifurcating root node"; error(errTxt.str().c_str()); } if(node->anc!=NULL) { stringstream errTxt; errTxt << "convert_NewickTree_to_marginal_tree(): "; errTxt << "expected orphan root node"; error(errTxt.str().c_str()); } double age = ageroot2tip[ix]-youngest_node; if(fabs(age)<1e-6) age = 0.0; // Coalesce the first two descendants tree.coalesce(age,nodeIndex[node->dec[0]],nodeIndex[node->dec[1]]); if(node->str!="") { all_node_labels.push_back(node->str); } else { stringstream autolab; autolab << "NODE_" << iroot2tip+1; all_node_labels.push_back(autolab.str()); } // Coalesce the resulting node with the third descendant to make the absolute root (this branch has exactly zero length) int penultimate_nodeid = nnode-1; tree.coalesce(age,penultimate_nodeid,nodeIndex[node->dec[2]]); stringstream autolab; autolab << "NODE_" << iroot2tip+2; all_node_labels.push_back(autolab.str()); return tree; } vector compute_compatibility(DNA &fa, marginal_tree &ctree, vector &anyN, bool purge_singletons) { // Sample size const int n = fa.nseq; // Sequence length const int L = fa.lseq; // Results of initial incompatibility test: -1 (invariant or singleton, compatible), 0 (2 alleles, not tested), 2 (>2 alleles, incompatible) vector iscompat(L,0); anyN = vector(L,false); // Convert FASTA 
file to binary: if more than two alleles mark as incompatible: -1 (uninitialized), 0 (reference allele), 1 (first non-reference allele), 2 (second non-reference allele) // Let -2 be a no-call (N) Matrix bip(n,L,-1); int i,pos; for(pos=0;pos treebip(n,n-2,-1); // Add "mutations" encoding the branches of the clonal frame // The first index is for the sequence (including internal sequences) and the second is for the branch encoded (equivalent to the site) Matrix cstate(2*n-1,2*n-1,-1); int j,k; // Assign 0 to the root node for every site for(k=0;k<2*n-1;k++) cstate[2*n-2][k] = 0; // Work from root to tips inheriting the state or, if the focal branch, introducing the mutated state for(j=2*n-3;j>=0;j--) { for(k=0;k<2*n-1;k++) { if(j==k) { cstate[j][k] = 1; } else { const mt_node *node = &(ctree.node[j]); const mt_node *parent = node->ancestor; const int parentState = cstate[parent->id][k]; cstate[j][k] = parentState; } } } // Determine compatibility with the clonal frame // Test whether the observed partitions in the FASTA file are incompatible with any branches in the Newick tree // by tracking whether each of the four possible "haplotypes" has been observed. 
// pos is the position in the FASTA file, j is the individual in the FASTA file and k is the branch in the Newick tree for(pos=0;pos > hap(2*n-1, Matrix(2,2,false)); if(iscompat[pos]==0) { for(j=0;j0) { int nd=snewick.find_first_of(":",pos); if (nd==string::npos) nd=snewick.size(); snewick.erase(snewick.begin()+pos+1,snewick.begin()+nd); pos=snewick.find_first_of(")",pos+1); } snewick.append(";"); return NewickTree(snewick); } Matrix FASTA_to_nucleotide(DNA &fa, vector &empirical_nucleotide_frequencies, vector usesite) { int i,j,k; int nsites = 0; for(j=0;j nuc(fa.nseq,nsites,N_ambiguous); empirical_nucleotide_frequencies = vector(4,0.0); double total_empirical_count = 0.0; for(j=0,k=0;j &nuc, vector &iscompat, vector &pat, vector &pat1, vector &cpat, vector &ipat) { pat = vector(0); pat1 = vector(0); cpat = vector(0); ipat = vector(nuc.ncols()); static const char AGCTN[5] = {'A','G','C','T','N'}; int i,j,pos; for(pos=0;pos > compute_HKY85_ptrans(const marginal_tree &ctree, const double kappa, const vector &pi) { const double k = 1.0/kappa; const int nnodes = ctree.size; Matrix ptrans_element(4,4,0.0); vector< Matrix > ptrans(nnodes,ptrans_element); int i; for(i=0;i1.0) { ptrans[i][j][l] = 1.0; } else if(ptrans[i][j][l]<1.0e-100) { ptrans[i][j][l] = 1.0e-100; } } } } return ptrans; } Matrix compute_HKY85_ptrans(const double x, const double kappa, const vector &pi) { const double k = 1.0/kappa; Matrix ptrans(4,4,0.0); double t1 = pi[2] + pi[3]; double t2 = t1 * pi[0]; double t3 = t1 * pi[1]; double t4 = pi[0] * pi[1] + pi[2] * pi[3] + (t2 + t3) * k; t4 = 0.1e1 / t4; double t5 = -0.1e1 / 0.2e1; double t6 = exp(t5 * (t1 * k + pi[0] + pi[1]) * x * t4); double t7 = pi[2] + pi[3] + pi[0] + pi[1]; double t8 = exp(t5 * k * t7 * x * t4); double t9 = pow(pi[1], 0.2e1); double t10 = pow(pi[0], 0.2e1); double t11 = pi[0] + pi[1]; double t12 = t7 * t6 - t1 * t8 - pi[0] - pi[1]; double t13 = t8 - 0.1e1; double t14 = 0.1e1 / t11; double t15 = 0.1e1 / t7; double t16 = t13 * pi[2] 
* t15; double t17 = t13 * pi[3] * t15; t4 = exp(t5 * (t11 * k + pi[2] + pi[3]) * x * t4); t5 = pow(pi[3], 0.2e1); double t18 = pow(pi[2], 0.2e1); t11 = t11 * t8; t7 = -t11 + t7 * t4 - pi[3] - pi[2]; t1 = 0.1e1 / t1; double t19 = t13 * pi[0] * t15; t13 = t13 * pi[1] * t15; double temp; temp = (t6 * t9 + ((pi[0] + pi[3] + pi[2]) * t6 + pi[0]) * pi[1] + t2 * t8 + t10) * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][0] = temp; temp = -pi[1] * t12 * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][1] = temp; temp = -t16; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][2] = temp; temp = -t17; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][3] = temp; temp = -pi[0] * t12 * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][0] = temp; temp = (t6 * t10 + ((pi[2] + pi[1] + pi[3]) * t6 + pi[1]) * pi[0] + t3 * t8 + t9) * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][1] = temp; temp = -t16; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][2] = temp; temp = -t17; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][3] = temp; temp = -t19; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][0] = temp; temp = -t13; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][1] = temp; temp = (t4 * t5 + ((pi[0] + pi[2] + pi[1]) * t4 + pi[2]) * pi[3] + t11 * pi[2] + t18) * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][2] = temp; temp = -t7 * pi[3] * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][3] = temp; temp = -t19; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][0] = temp; temp = -t13; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][1] = temp; temp = -t7 * pi[2] * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][2] = temp; temp = (t4 * t18 + ((pi[0] + pi[1] + pi[3]) * t4 + pi[3]) * pi[2] + 
t11 * pi[3] + t5) * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][3] = temp; return ptrans; } Matrix dcompute_HKY85_ptrans(const double x, const double kappa, const vector &pi) { const double k = 1.0/kappa; Matrix ptrans(4,4,0.0); double t1 = pi[2] + pi[3]; double t2 = t1 * pi[0]; double t3 = t1 * pi[1]; double t4 = pi[0] * pi[1] + pi[2] * pi[3] + (t2 + t3) * k; t4 = 0.1e1 / t4; double t5 = -0.1e1 / 0.2e1; double t6 = exp(t5 * (t1 * k + pi[0] + pi[1]) * x * t4); double t7 = pi[2] + pi[3] + pi[0] + pi[1]; double t8 = exp(t5 * k * t7 * x * t4); double t9 = pow(pi[1], 0.2e1); double t10 = pow(pi[0], 0.2e1); double t11 = pi[0] + pi[1]; double t12 = t7 * t6 - t1 * t8 - pi[0] - pi[1]; double t13 = t8 - 0.1e1; double t14 = 0.1e1 / t11; double t15 = 0.1e1 / t7; double t16 = t13 * pi[2] * t15; double t17 = t13 * pi[3] * t15; t4 = exp(t5 * (t11 * k + pi[2] + pi[3]) * x * t4); t5 = pow(pi[3], 0.2e1); double t18 = pow(pi[2], 0.2e1); t11 = t11 * t8; t7 = -t11 + t7 * t4 - pi[3] - pi[2]; t1 = 0.1e1 / t1; double t19 = t13 * pi[0] * t15; t13 = t13 * pi[1] * t15; double temp; temp = (t6 * t9 + ((pi[0] + pi[3] + pi[2]) * t6 + pi[0]) * pi[1] + t2 * t8 + t10) * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][0] = temp; temp = -pi[1] * t12 * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][1] = temp; temp = -t16; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][2] = temp; temp = -t17; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[0][3] = temp; temp = -pi[0] * t12 * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][0] = temp; temp = (t6 * t10 + ((pi[2] + pi[1] + pi[3]) * t6 + pi[1]) * pi[0] + t3 * t8 + t9) * t14 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][1] = temp; temp = -t16; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[1][2] = temp; temp = -t17; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; 
ptrans[1][3] = temp; temp = -t19; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][0] = temp; temp = -t13; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][1] = temp; temp = (t4 * t5 + ((pi[0] + pi[2] + pi[1]) * t4 + pi[2]) * pi[3] + t11 * pi[2] + t18) * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][2] = temp; temp = -t7 * pi[3] * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[2][3] = temp; temp = -t19; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][0] = temp; temp = -t13; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][1] = temp; temp = -t7 * pi[2] * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][2] = temp; temp = (t4 * t18 + ((pi[0] + pi[1] + pi[3]) * t4 + pi[3]) * pi[2] + t11 * pi[3] + t5) * t1 * t15; if(temp>1.0) temp = 1.0; if(temp<1e-100) temp=1e-100; ptrans[3][3] = temp; return ptrans; } /* Use the following in Maple to generate this code: (k is 1/transition:transversion ratio, i.e. 
k=1/kappa) M := Matrix([ [-g-k*(c+t),g,k*c,k*t], [a,-a-k*(c+t),k*c,k*t], [k*a,k*g,-k*(a+g)-t,t], [k*a,k*g,c,-k*(a+g)-c]]); R:= simplify(-a*M[1,1]-g*M[2,2]-c*M[3,3]-t*M[4,4]); M:=simplify(M/R); CodeGeneration:-C(subs([a=pi[1],g=pi[2],c=pi[3],t=pi[4]],-n[1]*M[1,1]-n[2]*M[2,2]-n[3]*M[3,3]-n[4]*M[4,4]),optimize,resultname="ptrans"); */ double HKY85_expected_rate(const vector &n, const double kappa, const vector &pi) { const double k = 1.0/kappa; double t2 = pi[1]; double t3 = pi[2]; double t4 = k * t3; double t5 = pi[3]; double t6 = k * t5; double t9 = pi[0]; double t11 = t9 * k; double t14 = t2 * k; double t19 = 0.1e1 / (t9 * t2 + t11 * t3 + t11 * t5 + t14 * t3 + t14 * t5 + t3 * t5); return n[0] * (t2 + t4 + t6) * t19 / 0.2e1 + n[1] * (t9 + t4 + t6) * t19 / 0.2e1 + n[2] * (t11 + t14 + t5) * t19 / 0.2e1 + n[3] * (t11 + t14 + t3) * t19 / 0.2e1; } /* For a full description of this algorithm: A Fast Algorithm for Joint Reconstruction of Ancestral Amino Acid Sequences Tal Pupko, Itsik Peer, Ron Shamir, and Dan Graur. Mol. Biol. Evol. 17(6):890–896. 
2000 */ mydouble maximum_likelihood_ancestral_sequences(Matrix &nuc, marginal_tree &ctree, const double kappa, const vector &pi, vector &pat1, vector &cpat, Matrix &node_sequence) { mydouble ML(1.0); // Every node in the tree has a likelihood attached of the best subtree likelihood, and the sequence eventually identified as the global maximum likelihood estimate const int nseq = nuc.nrows(); const int nnodes = 2*nseq-1; const int npat = pat1.size(); node_sequence = Matrix(nnodes,npat,N_ambiguous); // subtree_ML[i][j][k] is, for node i, pattern j, the subtree maximum likelihood given the parent node has state k = {A,G,C,T} Matrix subtree_ML_element(npat,4,0.0); vector< Matrix > subtree_ML(nnodes,subtree_ML_element); // path_ML[i][j][k] is, for node i, pattern j, the state of node i that maximizes the subtree likelihood given the parent node has state k = {A,G,C,T} Matrix path_ML_element(npat,4,N_ambiguous); vector< Matrix > path_ML(nnodes,path_ML_element); // For each node (except the root node), define an HKY85 transition probability matrix vector< Matrix > ptrans = compute_HKY85_ptrans(ctree,kappa,pi); // Nodes are ordered in the tree first in tip order (0..n-1) then in ascending time order towards the root node (2*n-2) // First, do the tips int i,j,k,l; for(i=0;isubtree_ML[i][j][k]) { subtree_ML[i][j][k] = subtree_ML_l; path_ML[i][j][k] = (Nucleotide)l; } } } else { stringstream errTxt; errTxt << "maximum_likelihood_ancestral_sequences(): unexpected base " << obs << " (out of range 0-5) in sequence " << i << " pattern " << j; error(errTxt.str().c_str()); } } } } // Now the internal nodes, all of which are bifurcating for(;iid; const int i1 = d1->id; if(i0<0 || i0>=nnodes || i1<0 || i1>=nnodes) { stringstream errTxt; errTxt << "maximum_likelihood_ancestral_sequences(): node index during Viterbi-like algorithm"; error(errTxt.str().c_str()); } // Check subtree ML has been computed for(l=0;l<4;l++) { if(subtree_ML[i0][j][l].iszero() || subtree_ML[i1][j][l].iszero()) 
{ stringstream errTxt; errTxt << "maximum_likelihood_ancestral_sequences(): uninitialized subtree ML during Viterbi-like algorithm"; error(errTxt.str().c_str()); } } for(k=0;k<4;k++) { // If the parent node's state is k, what is the maximum likelihood of the subtree? // And what is the state of the node that achieves that maximum value? // If multiple equally good paths are possible, the path is chosen in the following order of decreasing preference: A, G, C, T subtree_ML[i][j][k] = ptrans[i][k][0]*subtree_ML[i0][j][0]*subtree_ML[i1][j][0]; path_ML[i][j][k] = (Nucleotide)0; for(l=1;l<4;l++) { const mydouble subtree_ML_l = ptrans[i][k][l]*subtree_ML[i0][j][l]*subtree_ML[i1][j][l]; if(subtree_ML_l > subtree_ML[i][j][k]) { subtree_ML[i][j][k] = subtree_ML_l; path_ML[i][j][k] = (Nucleotide)l; } } } } } // Now work back from root to tips choosing the ML path // Start at the root (this is redundant as the root's ancestor has no bearing so subtree_ML[nnodes-1][j][l] and path_ML[nnodes-1][j][l] are the same for different l's) for(j=0;jML) { best_state = l; ML_temp = subtree_ML[nnodes-1][j][l]; } } node_sequence[nnodes-1][j] = path_ML[nnodes-1][j][best_state]; ML *= pow(ML_temp,cpat[j]); } for(i=nnodes-2;i>=0;i--) { const mt_node* anc = ctree.node[i].ancestor; // Check the descendant nodes exist if(anc==NULL) { stringstream errTxt; errTxt << "maximum_likelihood_ancestral_sequences(): null pointer during Viterbi-like algorithm second pass"; error(errTxt.str().c_str()); } const int ianc = anc->id; if(ianc<0 || ianc>=nnodes) { stringstream errTxt; errTxt << "maximum_likelihood_ancestral_sequences(): node index during Viterbi-like algorithm second pass"; error(errTxt.str().c_str()); } for(j=0;j &all_node_names, const char* file_name) { ofstream fout(file_name); if(!fout) { stringstream errTxt; errTxt << "write_newick(): could not open file " << file_name << " for writing"; error(errTxt.str().c_str()); } write_newick(ctree,all_node_names,fout); fout.close(); } void 
write_newick(const marginal_tree &ctree, const vector &all_node_names, ofstream &fout) { if(!fout) { stringstream errTxt; errTxt << "write_newick(): could not open file stream for writing"; error(errTxt.str().c_str()); } const int nnodes = ctree.size; if(all_node_names.size()!=nnodes) { stringstream errTxt; errTxt << "write_newick(): length of node names vector does not equal number of nodes"; error(errTxt.str().c_str()); } const mt_node* root = (const mt_node*)(&ctree.node[nnodes-1]); if(root==NULL) { stringstream errTxt; errTxt << "write_newick(): null pointer to root"; error(errTxt.str().c_str()); } const int id = root->id; const mt_node* d0 = root->descendant[0]; const mt_node* d1 = root->descendant[1]; // Check the descendant nodes exist if(d0==NULL || d1==NULL) { stringstream errTxt; errTxt << "write_newick(): null pointer to root descendant"; error(errTxt.str().c_str()); } // Write to Newick fout << "("; write_newick_node(d0,all_node_names,fout); fout << ","; write_newick_node(d1,all_node_names,fout); fout << ")" << all_node_names[id] << ";" << endl; } void write_newick_node(const mt_node *node, const vector &all_node_names, ofstream &fout) { const int id = node->id; const mt_node* d0 = node->descendant[0]; const mt_node* d1 = node->descendant[1]; // Check the descendant nodes exist if(d0==NULL && d1==NULL) { // Node is a tip fout << all_node_names[id] << ":" << node->edge_time; } else if(d0!=NULL && d1!=NULL) { // Node is internal fout << "("; write_newick_node(d0,all_node_names,fout); fout << ","; write_newick_node(d1,all_node_names,fout); fout << ")" << all_node_names[id] << ":" << node->edge_time; } else { stringstream errTxt; errTxt << "write_newick_node(): node has unexpectedly just one descendant"; error(errTxt.str().c_str()); } } void write_ancestral_fasta(Matrix &nuc, vector &all_node_names, const char* file_name) { ofstream fout(file_name); if(!fout) { stringstream errTxt; errTxt << "write_ancestral_fasta(): could not open file " << file_name << " 
for writing"; error(errTxt.str().c_str()); } write_ancestral_fasta(nuc,all_node_names,fout); fout.close(); } void write_ancestral_fasta(Matrix &nuc, vector &all_node_names, ofstream &fout) { static const char AGCTN[5] = {'A','G','C','T','N'}; if(!fout) { stringstream errTxt; errTxt << "write_ancestral_fasta(): could not open file stream for writing"; error(errTxt.str().c_str()); } if(nuc.nrows()!=all_node_names.size()) { stringstream errTxt; errTxt << "write_ancestral_fasta(): number of sequences (" << nuc.nrows() << ") does not equal number of node labels (" << all_node_names.size() << ")"; error(errTxt.str().c_str()); } int i,pos; for(i=0;i" << all_node_names[i] << endl; for(pos=0;pos &iscompat, vector &ipat, const char* file_name) { ofstream fout(file_name); if(!fout) { stringstream errTxt; errTxt << "write_position_cross_reference(): could not open file " << file_name << " for writing"; error(errTxt.str().c_str()); } write_position_cross_reference(iscompat,ipat,fout); fout.close(); } void write_position_cross_reference(vector &iscompat, vector &ipat, ofstream &fout) { if(!fout) { stringstream errTxt; errTxt << "write_position_cross_reference(): could not open file stream for writing"; error(errTxt.str().c_str()); } int i,j,pat; for(i=0,j=0;i=ipat.size()) { stringstream errTxt; errTxt << "write_position_cross_reference(): internal inconsistency in number of compatible sizes (" << j+1 << " or more) and number of patterns (" << ipat.size() << ")"; error(errTxt.str().c_str()); } pat = ipat[j]; ++j; } if(i>0) fout << ','; fout << pat+1; } fout << endl; } mydouble likelihood_branch(const int dec_id, const int anc_id, const Matrix &node_nuc, const vector &pat1, const vector &cpat, const double kappa, const vector &pinuc, const double branch_length) { mydouble ML(1.0); const int npat = pat1.size(); // Define an HKY85 emission probability matrix for Unimported sites Matrix pemis; pemis = compute_HKY85_ptrans(branch_length,kappa,pinuc); // Cycle through the patterns 
calculating the likelihood int i; for(i=0;i > &imported, vector &all_node_names, vector &isBLC, vector &compat, const char* file_name, const int root_node) { ofstream fout(file_name); if(!fout) { stringstream errTxt; errTxt << "write_importation_status(): could not open file " << file_name << " for writing"; error(errTxt.str().c_str()); } write_importation_status(imported,all_node_names,isBLC,compat,fout,root_node); fout.close(); } void write_importation_status(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, ofstream &fout, const int root_node) { if(!fout) { stringstream errTxt; errTxt << "write_importation_status(): could not open file stream for writing"; error(errTxt.str().c_str()); } if(imported.size()!=root_node) { stringstream errTxt; errTxt << "write_importation_status(): number of lineages (" << imported.size() << ") does not equal the number of non-root node labels (" << root_node << ")"; error(errTxt.str().c_str()); } if(all_node_names.size()" << all_node_names[i] << endl; int k = 0; for(pos=0;pos0) + (int)imported[i][k]; fout << out; ++k; } else if(compat[pos]<=0) { // If compatible but not used in branch length correction, 4 fout << 4; } else { // If homoplasy/multiallelic and not used in branch length correction, 5 fout << 5; } } fout << endl; } } void write_importation_status_intervals(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, const char* file_name, const int root_node, const char* chr_name) { ofstream fout(file_name); if(!fout) { stringstream errTxt; errTxt << "write_importation_status_intervals(): could not open file " << file_name << " for writing"; error(errTxt.str().c_str()); } write_importation_status_intervals(imported,all_node_names,isBLC,compat,fout,root_node, chr_name); fout.close(); } void write_importation_status_intervals(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, ofstream &fout, const int root_node, const char* chr_name) { 
if(!fout) { stringstream errTxt; errTxt << "write_importation_status_intervals(): could not open file stream for writing"; error(errTxt.str().c_str()); } if(imported.size()!=root_node) { stringstream errTxt; errTxt << "write_importation_status_intervals(): number of lineages (" << imported.size() << ") does not equal the number of non-root node labels (" << root_node << ")"; error(errTxt.str().c_str()); } if(all_node_names.size() &node_nuc, const vector &iscompat, const vector &ipat, const double kappa, const vector &pinuc, const double branch_length, const double rho_over_theta, const double mean_import_length, const double import_divergence, vector &is_imported) { mydouble ML(0.0); // Store the positions of **all** sites is_imported = vector(iscompat.size(),Unimported); // subseq_ML[i][j] is, for position i, the subsequence maximum likelihood given the next position has state j = {Unimported,Imported} Matrix subseq_ML(iscompat.size(),2); // path_ML[i][j] is, for position i, the state of position i that maximizes the subsequence likelihood given the next position has state j = {Unimported,Imported} Matrix path_ML(iscompat.size(),2); // Define an HKY85 emission probability matrix for Unimported sites Matrix pemisUnimported; pemisUnimported = compute_HKY85_ptrans(branch_length,kappa,pinuc); // Define an HKY85 emission probability matrix for Imported sites Matrix pemisImported; pemisImported = compute_HKY85_ptrans(import_divergence,kappa,pinuc); // Recombination parameters const double recrate = rho_over_theta*branch_length; const double endrecrate = 1.0/mean_import_length; const double totrecrate = recrate+endrecrate; // Equilibrium frequency of unimported and imported sites respectively const double pi[2] = {endrecrate/totrecrate,recrate/totrecrate}; // Define a transition probability matrix Matrix ptrans(2,2,0.0); // These probabilities do not change until (i==0) ptrans[0][0] = (mydouble)(exp(-totrecrate)+pi[0]*(1-exp(-totrecrate))); ptrans[0][1] = 
(mydouble)(pi[1]*(1-exp(-totrecrate))); ptrans[1][1] = (mydouble)(exp(-totrecrate)+pi[1]*(1-exp(-totrecrate))); ptrans[1][0] = (mydouble)(pi[0]*(1-exp(-totrecrate))); // Beginning at the last variable site, calculate the subsequence maximum likelihood int i,j; for(i=iscompat.size()-1,j=ipat.size();i>=0;i--) { if(i==0) { ptrans[0][0] = pi[0]; ptrans[1][0] = pi[0]; ptrans[0][1] = pi[1]; ptrans[1][1] = pi[1]; } // If the previous position's state (leftwards) is j, what is the maximum likelihood of the subsequence from the current position to the last (rightwards)? // And what is the state k of the position that achieves that maximum value? mydouble UU,UI,IU,II; if(iscompat[i]) { j--; if(j<0) { stringstream errTxt; errTxt << "maximum_likelihood_ClonalFrame_branch_allsites(): internal inconsistency in tracking informative sites"; error(errTxt.str().c_str()); } Nucleotide dec = node_nuc[dec_id][ipat[j]]; Nucleotide anc = node_nuc[anc_id][ipat[j]]; if(i=UI) ? UU : UI; path_ML[i][0] = (UU>=UI) ? Unimported : Imported; subseq_ML[i][1] = (IU>=II) ? IU : II; path_ML[i][1] = (IU>=II) ? 
Unimported : Imported; } // Beginning at the first variable site, identify the most likely path // Sanity check if(path_ML[0][0]!=path_ML[0][1]) { stringstream errTxt; errTxt << "maximum_likelihood_ClonalFrame_branch_allsites(): internal inconsistency when choosing the first importation state in the best path"; error(errTxt.str().c_str()); } is_imported[0] = path_ML[0][0]; ML = subseq_ML[0][0]; for(i=1;i &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const double branch_length, const double rho_over_theta, const double mean_import_length, const double import_divergence, Matrix &numEmis, vector &denEmis, Matrix &numTrans, vector &denTrans) { const int npos = position.size(); // Define an HKY85 emission probability matrix for Unimported sites Matrix pemisUnimported; pemisUnimported = compute_HKY85_ptrans(branch_length,kappa,pinuc); // Define an HKY85 emission probability matrix for Imported sites Matrix pemisImported; pemisImported = compute_HKY85_ptrans(import_divergence,kappa,pinuc); // Define storage space for the intermediate forward calculations Matrix A; A = Matrix(npos,2); // Resize if necessary and zero the output objects numEmis = Matrix(2,2,0.0); denEmis = vector(2,0.0); numTrans = Matrix(2,2,0.0); denTrans = vector(2,0.0); // cout << "numTrans = " << numTrans[0][0].todouble() << " " << numTrans[0][1].todouble() << " " << numTrans[1][0].todouble() << " " << numTrans[0][0].todouble() << endl; // Recombination parameters const double recrate = rho_over_theta*branch_length; const double endrecrate = 1.0/mean_import_length; const double totrecrate = recrate+endrecrate; // Transient storage mydouble aprev[2]; mydouble a[2]; // Equilibrium frequency of unimported and imported sites respectively const mydouble pi[2] = {endrecrate/totrecrate,recrate/totrecrate}; // Beginning at the first variable site, calculate the subsequence marginal likelihood int i; for(i=0;i=0;i--) { if(i==(npos-1)) { b[0] = mydouble(1.0); b[1] 
= mydouble(1.0); // Update the expected number of emissions mydouble pU = A[i][0]*b[0]; mydouble pI = A[i][1]*b[1]; // NB:- pU+pI should always equal ML but just in case it introduces small errors const mydouble MLi = pU + pI; pU /= MLi; pI /= MLi; const double ppost[2] = {pU.todouble(),1.0-pU.todouble()}; // Increment the numerator and denominator of the expected number of emissions from state j to observation k int j; // NB:- *** obs refers to the PRESENT site !!! *** const int obs = (int)(node_nuc[dec_id][ipat[i]]!=node_nuc[anc_id][ipat[i]]); // 0 = same, 1 = different for(j=0;j<2;j++) { // Total number of emissions from j to k equals indicator of actual observation k (0 or 1) weighted by probability the site was in state j numEmis[j][obs] += ppost[j]; // Total number of possible emissions from j to k equals the number of sites, each weighted by probability the site was in state j denEmis[j] += ppost[j]; // NB:- the denominator is the same for both observation states } } else { bnext[0] = b[0]; bnext[1] = b[1]; // Note that these retrieve the ancestral and descendant nucleotides at the 3prime adjacent site Nucleotide dec = node_nuc[dec_id][ipat[i+1]]; Nucleotide anc = node_nuc[anc_id][ipat[i+1]]; const mydouble pemisU = pemisUnimported[anc][dec]; const mydouble pemisI = pemisImported[anc][dec]; mydouble prnotrans; prnotrans.setlog(-totrecrate*(position[i+1]-position[i])); const mydouble prtrans = mydouble(1.0)-prnotrans; const mydouble sumbnext = prtrans*(pi[0]*pemisU*bnext[0] + pi[1]*pemisI*bnext[1]); b[0] = prnotrans*pemisU*bnext[0]+sumbnext; b[1] = prnotrans*pemisI*bnext[1]+sumbnext; // Update the expected number of transitions and emissions // Calculate the marginal probabilities that the hidden state is Unimported or Imported // if(fabs((A[i][0]*b[0]+A[i][1]*b[1]).LOG()-ML.LOG())>1e-6) { // cout << ML.LOG() << "\t" << (A[i][0]*b[0]+A[i][1]*b[1]).LOG() << endl; // } mydouble pU = A[i][0]*b[0]; mydouble pI = A[i][1]*b[1]; // NB:- pU+pI should always equal ML 
but just in case it introduces small errors const mydouble MLi = pU + pI; pU /= MLi; pI /= MLi; const double ppost[2] = {pU.todouble(),1.0-pU.todouble()}; // Increment the numerator and denominator of the expected number of emissions from state j to observation k int j; // NB:- *** obs refers to the PRESENT site !!! *** const int obs = (int)(node_nuc[dec_id][ipat[i]]!=node_nuc[anc_id][ipat[i]]); // 0 = same, 1 = different for(j=0;j<2;j++) { // Total number of emissions from j to k equals indicator of actual observation k (0 or 1) weighted by probability the site was in state j numEmis[j][obs] += ppost[j]; // Total number of possible emissions from j to k equals the number of sites, each weighted by probability the site was in state j denEmis[j] += ppost[j]; // NB:- the denominator is the same for both observation states } // Increment the numerator and denominator of the expected number of transitions from state j to state k // Impose maximum adjacent site distance of 1kb (needed for small-p Poisson approximation to heterogeneous bernoulli) const mydouble pemis[2] = {pemisU,pemisI}; const double dist = position[i+1]-position[i]; if(dist<=1000.) 
{ int k; for(j=0;j<2;j++) { for(k=0;k<2;k++) { const int istrans = (int)(j!=k); // Probability of transition from j to k given the data equals the joint likelihood of the data and transition from j to k, divided by marginal likelihood of the data if(istrans) { numTrans[j][k] += (A[i][j]*prtrans*pi[k]*pemis[k]*bnext[k]/MLi).todouble(); // Note the use of bnext, not b // if(j==0 && k==1) cout << "pos = " << i << " numTrans[0][1] = " << numTrans[j][k].todouble() << endl; //(A[i][j]*ptrans[istrans]*pemis[k]*bnext[k]/ML).LOG() << endl; } else { numTrans[j][k] += (A[i][j]*(prnotrans+prtrans*pi[k])*pemis[k]*bnext[k]/MLi).todouble(); // Note the use of bnext, not b } } // Expected distance between sites equals actual distance weighted by the probability the 5prime site was in state j denTrans[j] += dist*ppost[j]; // NB:- the denominator is the same for both destination states } } } } // Return the marginal likelihood // cout << "numTrans = " << numTrans[0][0].todouble() << " " << numTrans[0][1].todouble() << " " << numTrans[1][0].todouble() << " " << numTrans[0][0].todouble() << endl; return ML; } double Baum_Welch(const marginal_tree &tree, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector &prior_a, const vector &prior_b, vector &full_param, vector &posterior_a, int &neval, const bool coutput, double &priorL) { int i; if(coutput) cout << setprecision(9); // Initial parameters double rho_over_theta = full_param[0]; double mean_import_length = full_param[1]; double import_divergence = full_param[2]; posterior_a = vector(3+informative.size()); // Storage for the expected number of transitions and emissions in the HMM Matrix numEmiss(2,2), numTrans(2,2); vector denEmiss(2), denTrans(2); // Counters double mutI=0.0; // Running total divergence at imported sites double numU=0.0, numI=0.0; // Running total number of transitions *to* unimported, imported regions double nsiI=0.0; // 
Running total number of imported sites double lenU=0.0, lenI=0.0; // Running total length of unimported, imported regions // Calculate the marginal likelihood and expected number of transitions and emissions by the forward-backward algorithm // Include the effect of the prior double ML = 0.0; priorL = gamma_loglikelihood(full_param[0], prior_a[0], prior_b[0]) + gamma_loglikelihood(1.0/full_param[1], prior_a[1], prior_b[1]) + gamma_loglikelihood(full_param[2], prior_a[2], prior_b[2]); for(i=0;iid; const double branch_length = full_param[3+i]; ML += mydouble_forward_backward_expectations_ClonalFrame_branch(dec_id,anc_id,node_nuc,position,ipat,kappa,pinuc,branch_length,rho_over_theta,mean_import_length,import_divergence,numEmiss,denEmiss,numTrans,denTrans).LOG(); // Update estimate of the branch length const double mutU_br = numEmiss[0][1]; const double nsiU_br = denEmiss[0]; full_param[3+i] = (prior_a[3]+mutU_br)/(prior_b[3]+nsiU_br); posterior_a[3+i] = (prior_a[3]+mutU_br); // Increment counters for the other expectations mutI += numEmiss[1][1]; nsiI += denEmiss[1]; const double numI_br = numTrans[0][1]; const double lenU_br = denTrans[0]; numI += numI_br; lenU += full_param[3+i]*lenU_br; numU += numTrans[1][0]; lenI += denTrans[1]; if(coutput) { cout << "nmut = " << mutU_br << " nU = " << nsiU_br << " nsub = " << numEmiss[1][1] << " nI = " << denEmiss[1] << endl; cout << "nU>I = " << numI_br << " dU = " << lenU_br << " nI>U = " << numTrans[1][0] << " dI = " << denTrans[1] << endl; cout << "numTrans = " << numTrans[0][0] << " " << numTrans[0][1] << " " << numTrans[1][0] << " " << numTrans[0][0] << endl; } } } ML += priorL; ++neval; // Update estimates of the recombination parameters full_param[0] = (prior_a[0]+numI)/(prior_b[0]+lenU); full_param[1] = (prior_b[1]+lenI)/(prior_a[1]+numU); full_param[2] = (prior_a[2]+mutI)/(prior_b[2]+nsiI); posterior_a[0] = (prior_a[0]+numI); posterior_a[1] = (prior_a[1]+numU); posterior_a[2] = (prior_a[2]+mutI); if(coutput) { cout << 
"params ="; for(int j=0;jI = " << numI_br << " dU = " << lenU_br << " nI>U = " << numTrans[1][0] << " dI = " << denTrans[1] << endl; cout << "numTrans = " << numTrans[0][0] << " " << numTrans[0][1] << " " << numTrans[1][0] << " " << numTrans[0][0] << endl; } } } new_ML += priorL; ++neval; // Update estimates of the recombination parameters full_param[0] = (prior_a[0]+numI)/(prior_b[0]+lenU); full_param[1] = (prior_b[1]+lenI)/(prior_a[1]+numU); full_param[2] = (prior_a[2]+mutI)/(prior_b[2]+nsiI); posterior_a[0] = (prior_a[0]+numI); posterior_a[1] = (prior_a[1]+numU); posterior_a[2] = (prior_a[2]+mutI); if(coutput) { cout << "params ="; for(int j=0;jI = " << numI_br << " dU = " << lenU_br << " nI>U = " << numTrans[1][0] << " dI = " << denTrans[1] << endl; cout << "numTrans = " << numTrans[0][0] << " " << numTrans[0][1] << " " << numTrans[1][0] << " " << numTrans[0][0] << endl; } } } if(coutput) { cout << "ML0 = " << ML << endl; } return ML; } double gamma_loglikelihood(const double x, const double a, const double b) { return a*log(b)-lgamma(a)+(a-1)*log(x)-b*x; } void forward_backward_simulate_expectations_ClonalFrame_branch(const int dec_id, const int anc_id, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const double branch_length, const double rho_over_theta, const double mean_import_length, const double import_divergence, const int nsim, vector &mutU, vector &nsiU, vector &mutI, vector &nsiI, vector &numUI, vector &lenU, vector &numIU, vector &lenI) { const int npos = position.size(); // Define an HKY85 emission probability matrix for Unimported sites Matrix pemisUnimported; pemisUnimported = compute_HKY85_ptrans(branch_length,kappa,pinuc); // Define an HKY85 emission probability matrix for Imported sites Matrix pemisImported; pemisImported = compute_HKY85_ptrans(import_divergence,kappa,pinuc); // Define storage space for the intermediate forward calculations and counters Matrix A; A = Matrix(npos,2); 
Matrix numEmis, numTrans; vector denEmis, denTrans; // Define storage space for the observation at every site vector emittedState; emittedState = vector(npos); // Recombination parameters const double recrate = rho_over_theta*branch_length; const double endrecrate = 1.0/mean_import_length; const double totrecrate = recrate+endrecrate; // Transient storage mydouble aprev[2]; mydouble a[2]; // Equilibrium frequency of unimported and imported sites respectively const mydouble pi[2] = {endrecrate/totrecrate,recrate/totrecrate}; // Beginning at the first variable site, do the forward algorithm int i; for(i=0;i P; P = Matrix(npos,2); // P[i][j] is the probability of going from position (i+1) state j to position i state 1 mydouble bnext[2]; mydouble b[2]; // Beginning at the last variable site, do the backward algorithm and calculate backward simulation probabilities for(i=npos-1;i>=0;i--) { if(i==(npos-1)) { // Backward algorithm b[0] = mydouble(1.0); b[1] = mydouble(1.0); // Calculate the backwards simulation probability // A[npos-1][j]*b[j] is the joint probability of the data and state j at the final position const mydouble num = A[npos-1][1]*b[1]; const mydouble den = A[npos-1][0]*b[0] + num; P[npos-1][0] = P[npos-1][1] = (num/den).todouble(); } else { // Backward algorithm bnext[0] = b[0]; bnext[1] = b[1]; // Note that these retrieve the ancestral and descendant nucleotides at the 3prime adjacent site Nucleotide dec = node_nuc[dec_id][ipat[i+1]]; Nucleotide anc = node_nuc[anc_id][ipat[i+1]]; const mydouble pemisU = pemisUnimported[anc][dec]; const mydouble pemisI = pemisImported[anc][dec]; mydouble prnotrans; prnotrans.setlog(-totrecrate*(position[i+1]-position[i])); const mydouble prtrans = mydouble(1.0)-prnotrans; const mydouble sumbnext = prtrans*(pi[0]*pemisU*bnext[0] + pi[1]*pemisI*bnext[1]); b[0] = prnotrans*pemisU*bnext[0]+sumbnext; b[1] = prnotrans*pemisI*bnext[1]+sumbnext; // Calculate the backwards simulation probability const mydouble pemis[2] = 
{pemisU,pemisI}; // numjk is proportional to the probability of going from state j at position (i+1) to state k at position i mydouble num00 = A[i][0]*(prnotrans+prtrans*pi[0])*pemis[0]*bnext[0]; mydouble num01 = A[i][1]*prtrans*pi[0]*pemis[0]*bnext[0]; mydouble num10 = A[i][0]*prtrans*pi[1]*pemis[1]*bnext[1]; mydouble num11 = A[i][1]*(prnotrans+prtrans*pi[1])*pemis[1]*bnext[1]; P[i][0] = (num01/(num00+num01)).todouble(); P[i][1] = (num11/(num10+num11)).todouble(); } } // Simulate the number of transitions and emissions int sim; for(sim=0;sim(2,2,0.0); denEmis = vector(2,0.0); numTrans = Matrix(2,2,0.0); denTrans = vector(2,0.0); // Cycle from 3prime to 5prime int last; // Last hidden state for(i=npos-1;i>=0;i--) { if(i==(npos-1)) { // Start by simulating the 3prime-most position last = ran.bernoulli(P[i][0]); // Update relevant counters ++numEmis[last][emittedState[i]]; ++denEmis[last]; } else { // Simulate the 5prime-next position const int next = ran.bernoulli(P[i][last]); // Update all the counters ++numEmis[next][emittedState[i]]; ++denEmis[next]; const double dist = position[i+1]-position[i]; if(dist<=1000.0) { ++numTrans[next][last]; denTrans[next] += dist; } last = next; } } mutU[sim] = numEmis[0][1]; nsiU[sim] = denEmis[0]; mutI[sim] = numEmis[1][1]; nsiI[sim] = denEmis[1]; numUI[sim] = numTrans[0][1]; lenU[sim] = denTrans[0]; numIU[sim] = numTrans[1][0]; lenI[sim] = denTrans[1]; } } Matrix Baum_Welch_simulate_posterior(const marginal_tree &tree, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector &prior_a, const vector &prior_b, const vector &full_param, int &neval, const bool coutput, const int nsim) { // Storage for output: for each parameter, simulated values Matrix post(3,nsim,0.0); // Storage for the simulated counts of transitions and emissions vector /*mutU(nsim,0.0), nsiU(nsim,0.0),*/ mutI(nsim,0.0), nsiI(nsim,0.0); vector numUI(nsim,0.0), 
lenU(nsim,0.0), numIU(nsim,0.0), lenI(nsim,0.0); vector mutU_br(nsim,0.0), nsiU_br(nsim,0.0), mutI_br(nsim,0.0), nsiI_br(nsim,0.0); vector numUI_br(nsim,0.0), lenU_br(nsim,0.0), numIU_br(nsim,0.0), lenI_br(nsim,0.0); // Estimated parameters double rho_over_theta = full_param[0]; double mean_import_length = full_param[1]; double import_divergence = full_param[2]; // Do all the simulations for each branch individually, and combine int i; for(i=0;iid; const double branch_length = full_param[3+i]; forward_backward_simulate_expectations_ClonalFrame_branch(dec_id,anc_id,node_nuc,position,ipat,kappa,pinuc,branch_length,rho_over_theta,mean_import_length,import_divergence,nsim,mutU_br,nsiU_br,mutI_br,nsiI_br,numUI_br,lenU_br,numIU_br,lenI_br); // Update the running totals for each simulation int sim; for(sim=0;sim &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector &prior_a, const vector &prior_b, vector &mean_param, Matrix &full_param, Matrix &posterior_a, int &neval, const bool coutput) { int i; if(coutput) cout << setprecision(9); // Resize as necessary posterior_a = Matrix(informative.size(),4); // Storage for the expected number of transitions and emissions in the HMM per branch Matrix numEmiss(2,2), numTrans(2,2); vector denEmiss(2), denTrans(2); // Counters per branch vector mutU_br(informative.size(),0.0), mutI_br(informative.size(),0.0); vector nsiU_br(informative.size(),0.0), nsiI_br(informative.size(),0.0); vector numI_br(informative.size(),0.0), numU_br(informative.size(),0.0); vector lenU_br(informative.size(),0.0), lenI_br(informative.size(),0.0); // Calculate the marginal likelihood and expected number of transitions and emissions by the forward-backward algorithm // Include the effect of the prior (this is dubious - should instead compute loglikelihood of the pseudocounts) double ML = gamma_loglikelihood(mean_param[0], prior_a[0], prior_b[0]) + 
gamma_loglikelihood(mean_param[1], prior_a[1], prior_b[1]) + gamma_loglikelihood(mean_param[2], prior_a[2], prior_b[2]) + gamma_loglikelihood(mean_param[3], prior_a[3], prior_b[3]); for(i=0;iid; // Initial parameters const double rho_over_theta = mean_param[0]*full_param[i][0]; const double mean_import_length = 1.0/(mean_param[1]*full_param[i][1]); // NB internal definition const double import_divergence = mean_param[2]*full_param[i][2]; const double branch_length = mean_param[3]*full_param[i][3]; ML += mydouble_forward_backward_expectations_ClonalFrame_branch(dec_id,anc_id,node_nuc,position,ipat,kappa,pinuc,branch_length,rho_over_theta,mean_import_length,import_divergence,numEmiss,denEmiss,numTrans,denTrans).LOG(); // Store counters per branch mutU_br[i] = numEmiss[0][1]; nsiU_br[i] = denEmiss[0]; mutI_br[i] = numEmiss[1][1]; nsiI_br[i] = denEmiss[1]; numI_br[i] = numTrans[0][1]; lenU_br[i] = denTrans[0]; numU_br[i] = numTrans[1][0]; lenI_br[i] = denTrans[1]; // if(coutput) { // cout << "nmut = " << mutU_br << " nU = " << nsiU_br << " nsub = " << numEmiss[1][1] << " nI = " << denEmiss[1] << endl; // cout << "nU>I = " << numI_br << " dU = " << lenU_br << " nI>U = " << numTrans[1][0] << " dI = " << denTrans[1] << endl; // cout << "numTrans = " << numTrans[0][0] << " " << numTrans[0][1] << " " << numTrans[1][0] << " " << numTrans[0][0] << endl; // } } } ++neval; // Update estimates of all the parameters: start with the branch lengths double mean_param_num, mean_param_den; // First, iterate to update the mean branch length parameter (max 3 times) int j; for(j=0;j<3;j++) { mean_param_num = prior_a[3]; mean_param_den = prior_b[3]; for(i=0;i. 
* */ #ifndef _MAIN_H_ #define _MAIN_H_ #include #include #include "myutils/newick.h" //#include "coalesce/coalesce.h" #include "coalesce/coalescent_record.h" #include //#include "myutils/myutils.h" #include "xmfa.h" #include #include #include "myutils/DNA.h" #include "myutils/mydouble.h" //#include "coalesce/mutation.h" #include "powell.h" #include "myutils/argumentwizard.h" #include #include "myutils/random.h" #include #include #include "version.h" using std::cout; using myutils::NewickTree; using std::stringstream; using myutils::error; using myutils::ArgumentWizard; using myutils::DATA_TYPE; // Global definition of random number generator Random ran; enum Nucleotide {Adenine=0, Guanine, Cytosine, Thymine, N_ambiguous}; enum ImportationState {Unimported=0, Imported}; marginal_tree convert_rooted_NewickTree_to_marginal_tree(NewickTree &newick, vector &tip_labels, vector &all_node_labels); marginal_tree convert_unrooted_NewickTree_to_marginal_tree(NewickTree &newick, vector &tip_labels, vector &all_node_labels); vector compute_compatibility(DNA &fa, marginal_tree &tree, vector &anyN, bool purge_singletons=true); NewickTree read_Newick(const char* newick_file); Matrix FASTA_to_nucleotide(DNA &fa, vector &empirical_nucleotide_frequencies, vector usesite); void find_alignment_patterns(Matrix &nuc, vector &iscompat, vector &pat, vector &pat1, vector &cpat, vector &ipat); vector< Matrix > compute_HKY85_ptrans(const marginal_tree &ctree, const double kappa, const vector &pi); Matrix compute_HKY85_ptrans(const double x, const double k, const vector &pi); Matrix dcompute_HKY85_ptrans(const double x, const double kappa, const vector &pi); double HKY85_expected_rate(const vector &n, const double kappa, const vector &pi); mydouble maximum_likelihood_ancestral_sequences(Matrix &nuc, marginal_tree &ctree, const double kappa, const vector &pi, vector &pat1, vector &cpat, Matrix &node_sequence); void write_newick(const marginal_tree &ctree, const vector &all_node_names, const 
char* file_name); void write_newick(const marginal_tree &ctree, const vector &all_node_names, ofstream &fout); void write_newick_node(const mt_node *node, const vector &all_node_names, ofstream &fout); void write_ancestral_fasta(Matrix &nuc, vector &all_node_names, const char* file_name); void write_ancestral_fasta(Matrix &nuc, vector &all_node_names, ofstream &fout); void write_position_cross_reference(vector &iscompat, vector &ipat, const char* file_name); void write_position_cross_reference(vector &iscompat, vector &ipat, ofstream &fout); mydouble likelihood_branch(const int dec_id, const int anc_id, const Matrix &node_nuc, const vector &pat1, const vector &cpat, const double kappa, const vector &pinuc, const double branch_length); bool string_to_bool(const string s, const string label=""); void write_importation_status(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, const char* file_name, const int root_node); void write_importation_status(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, ofstream &fout, const int root_node); void write_importation_status_intervals(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, const char* file_name, const int root_node,const char* chr_name); void write_importation_status_intervals(vector< vector > &imported, vector &all_node_names, vector &isBLC, vector &compat, ofstream &fout, const int root_node, const char* chr_name); double Baum_Welch(const marginal_tree &tree, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector &prior_a, const vector &prior_b, vector &full_param, vector &posterior_a, int &neval, const bool coutput, double &priorL); double Baum_Welch0(const marginal_tree &tree, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector 
&prior_a, const vector &prior_b, const vector &full_param, const vector &posterior_a, const bool coutput); double gamma_loglikelihood(const double x, const double a, const double b); Matrix Baum_Welch_simulate_posterior(const marginal_tree &tree, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector &prior_a, const vector &prior_b, const vector &full_param, int &neval, const bool coutput, const int nsim); double Baum_Welch_Rho_Per_Branch(const marginal_tree &tree, const Matrix &node_nuc, const vector &position, const vector &ipat, const double kappa, const vector &pinuc, const vector &informative, const vector &prior_a, const vector &prior_b, vector &mean_param, Matrix &full_param, Matrix &posterior_a, int &neval, const bool coutput); mydouble maximum_likelihood_ClonalFrame_branch_allsites(const int dec_id, const int anc_id, const Matrix &node_nuc, const vector &iscompat, const vector &ipat, const double kappa, const vector &pi, const double branch_length, const double rho_over_theta, const double mean_import_length, const double import_divergence, vector &is_imported); class orderNewickNodesByStatusLabelAndAge : public std::binary_function { public: const vector &root2tip; // temporary ordering of Newick nodes from root to tips const vector &ageroot2tip; // corresponding age of each node in root2tip const vector &labelorder; // The position where each node comes in the label order (for tips; the label is ignored for internal nodes) orderNewickNodesByStatusLabelAndAge(const vector &root2tip_in, const vector &ageroot2tip_in, const vector &labelorder_in) : root2tip(root2tip_in), ageroot2tip(ageroot2tip_in), labelorder(labelorder_in) { } // Test if i is less than j bool operator()(size_t i, size_t j) const { if(root2tip[i]->dec.size()==0 && root2tip[j]->dec.size()!=0) { // If i is a tip and j is not return true; } else if(root2tip[i]->dec.size()==0 && 
root2tip[j]->dec.size()==0) { // If i and j are both tips // Then order by label if(labelorder[i]==labelorder[j]) { stringstream errTxt; errTxt << "orderNewickNodesByStatusLabelAndAge::operator(): "; errTxt << "tips cannot have the same label order"; error(errTxt.str().c_str()); } return labelorder[i] < labelorder[j]; } else if(root2tip[i]->dec.size()!=0 && root2tip[j]->dec.size()==0) { // If i is not a tip but j is return false; } else { // If neither are tips // Then order by age return ageroot2tip[i] < ageroot2tip[j]; } } }; class ClonalFrameRescaleBranchFunction : public PowellFunction { public: // References to non-member variables const mt_node &node; const Matrix &node_nuc; const vector &pat1; const vector &cpat; const double kappa; const vector π // True member variable mydouble ML; int neval; const bool multithread; double crude_branch_length; double min_branch_length; public: ClonalFrameRescaleBranchFunction(const mt_node &_node, const Matrix &_node_nuc, const vector &_pat1, const vector &_cpat, const double _kappa, const vector &_pi, const bool _multithread, const double _crude_branch_length, const double _min_branch_length) : node(_node), node_nuc(_node_nuc), pat1(_pat1), cpat(_cpat), kappa(_kappa), pi(_pi), neval(0), multithread(_multithread), crude_branch_length(_crude_branch_length), min_branch_length(_min_branch_length) {}; double f(const vector& x) { ++neval; // Process parameters if(!(x.size()==1)) error("ClonalFrameRescaleBranchFunction::f(): 1 argument required"); double branch_length = pow(10.,x[0]); if(branch_lengthid; // Calculate likelihood ML = likelihood_branch(dec_id,anc_id,node_nuc,pat1,cpat,kappa,pi,branch_length); return -ML.LOG(); } }; /* Maximum likelihood routine based on the Baum-Welch EM algorithm for estimating a single set of recombination parameters (R/M, import length, import divergence) and an independent branch length per branch. Note that the approach is classical and the priors act through pseudocounts - i.e. 
a form of data augmentation prior */ class ClonalFrameBaumWelch { public: // References to non-member variables const marginal_tree &tree; const Matrix &node_nuc; const vector &iscompat; const vector &ipat; const double kappa; const vector π vector< vector > &is_imported; // True member variable double ML,ML0,priorL; double PR; int neval; const vector prior_a; const vector prior_b; vector which_compat; const int root_node; vector informative; vector initial_branch_length; vector full_param; vector posterior_a; bool guess_initial_m; bool coutput; public: ClonalFrameBaumWelch(const marginal_tree &_tree, const Matrix &_node_nuc, const vector &_iscompat, const vector &_ipat, const double _kappa, const vector &_pi, vector< vector > &_is_imported, const vector &_prior_a, const vector &_prior_b, const int _root_node, const bool _guess_initial_m, const bool _coutput=false) : tree(_tree), node_nuc(_node_nuc), iscompat(_iscompat), ipat(_ipat), kappa(_kappa), pi(_pi), neval(0), is_imported(_is_imported), prior_a(_prior_a), prior_b(_prior_b), root_node(_root_node), initial_branch_length(_root_node), informative(_root_node), guess_initial_m(_guess_initial_m), coutput(_coutput) { if(prior_a.size()!=4) error("ClonalFrameBaumWelch: prior a must have length 4"); if(prior_b.size()!=4) error("ClonalFrameBaumWelch: prior b must have length 4"); int i; // Precompute which sites are compatible which_compat = vector(0); for(i=0;iid; for(j=0,k=0;j=2.0) ? 
true : false; } } vector maximize_likelihood(const vector ¶m) { if(!(param.size()==3)) error("ClonalFrameBaumWelch::maximize_likelihood(): 3 arguments required"); // Starting points for the shared parameters full_param = vector(0); posterior_a = vector(0); full_param.push_back(param[0]); // rho_over_theta full_param.push_back(param[1]); // mean_import_length: may need to invert full_param.push_back(param[2]); // import_divergence int i; for(i=0;iid; const double rho_over_theta = full_param[0]; const double mean_import_length = full_param[1]; const double import_divergence = full_param[2]; const double branch_length = (informative[i]) ? full_param[3+i] : initial_branch_length[i]; maximum_likelihood_ClonalFrame_branch_allsites(dec_id,anc_id,node_nuc,iscompat,ipat,kappa,pi,branch_length,rho_over_theta,mean_import_length,import_divergence,is_imported[i]); } ML0 = Baum_Welch0(tree,node_nuc,which_compat,ipat,kappa,pi,informative,prior_a,prior_b,full_param,posterior_a,coutput); return full_param; } Matrix simulate_posterior(const vector ¶m, const int nsim) { if(!(param.size()==3+informative.size())) error("ClonalFrameBaumWelch::simulate_posterior(): 3 arguments required"); return Baum_Welch_simulate_posterior(tree,node_nuc,which_compat,ipat,kappa,pi,informative,prior_a,prior_b,param,neval,coutput,nsim); } }; /* In this version, the Baum-Welch algorithm is used to maximize the likelihood of all four parameters (R/M, import length, import divergence, branch length) for each branch. As for ClonalFrameBaumWelch, the prior acts through pseudocounts i.e. a data augmentation prior, and there is an extra parameter whose prior determines the variance in estimates of the recombination parameters per branch. This parameter needs to be set fairly stringently to prevent wild estimates in the absence of strong information per branch. 
*/ class ClonalFrameBaumWelchRhoPerBranch { public: // References to non-member variables const marginal_tree &tree; const Matrix &node_nuc; const vector &iscompat; const vector &ipat; const double kappa; const vector π vector< vector > &is_imported; // True member variable double ML; double PR; int neval; const vector prior_a; const vector prior_b; vector which_compat; const int root_node; vector informative; vector initial_branch_length; vector mean_param; // Mean recombination parameters Matrix full_param; // Branch-specific recombination parameters and branch length Matrix posterior_a; bool guess_initial_m; bool coutput; public: ClonalFrameBaumWelchRhoPerBranch(const marginal_tree &_tree, const Matrix &_node_nuc, const vector &_iscompat, const vector &_ipat, const double _kappa, const vector &_pi, vector< vector > &_is_imported, const vector &_prior_a, const vector &_prior_b, const int _root_node, const bool _guess_initial_m, const bool _coutput=false) : tree(_tree), node_nuc(_node_nuc), iscompat(_iscompat), ipat(_ipat), kappa(_kappa), pi(_pi), neval(0), is_imported(_is_imported), prior_a(_prior_a), prior_b(_prior_b), root_node(_root_node), initial_branch_length(_root_node), informative(_root_node), guess_initial_m(_guess_initial_m), coutput(_coutput) { if(prior_a.size()!=5) error("ClonalFrameBaumWelchRhoPerBranch: prior a must have length 5"); if(prior_b.size()!=5) error("ClonalFrameBaumWelchRhoPerBranch: prior b must have length 5"); int i; // Precompute which sites are compatible which_compat = vector(0); for(i=0;iid; for(j=0,k=0;j=2.0) ? 
true : false; } } void maximize_likelihood(const vector ¶m) { if(!(param.size()==4)) error("ClonalFrameBaumWelchRhoPerBranch::maximize_likelihood(): 4 arguments required"); // Starting points for the shared parameters mean_param = vector(0); mean_param.push_back(param[0]); // rho_over_theta // NB:- **internally** define second parameter to be INVERSE mean import length mean_param.push_back(1.0/param[1]); // 1/mean_import_length mean_param.push_back(param[2]); // import_divergence // Specially for the mean branch length, set it to the crudely estimated value assuming no recombnation mean_param.push_back(0.0); // mean branch length int i; for(i=0;i(initial_branch_length.size(),4); posterior_a = Matrix(initial_branch_length.size(),4,0.0); for(i=0;iid; const double rho_over_theta = mean_param[0]*full_param[i][0]; const double mean_import_length = 1.0/(mean_param[1]*full_param[i][1]); const double import_divergence = mean_param[2]*full_param[i][2]; const double branch_length = (informative[i]) ? 
mean_param[3]*full_param[i][3] : initial_branch_length[i]; maximum_likelihood_ClonalFrame_branch_allsites(dec_id,anc_id,node_nuc,iscompat,ipat,kappa,pi,branch_length,rho_over_theta,mean_import_length,import_divergence,is_imported[i]); } return; } Matrix simulate_posterior(const vector ¶m, const int nsim) { error("Not implemented yet"); // if(!(param.size()==3+informative.size())) error("ClonalFrameBaumWelchRhoPerBranch::simulate_posterior(): 3 arguments required"); // return Baum_Welch_simulate_posterior(tree,node_nuc,which_compat,ipat,kappa,pi,informative,prior_a,prior_b,param,neval,coutput,nsim); return Matrix(0,0,0); } }; #endif // _MAIN_H_ ClonalFrameML-1.11/src/make.sh000077500000000000000000000002231307563374100161210ustar00rootroot00000000000000echo "#define ClonalFrameML_GITRevision \"`git describe --tags`\"" > version.h g++ main.cpp -o ClonalFrameML -I ./ -I ./myutils -I ./coalesce -O3 ClonalFrameML-1.11/src/make_win.bat000066400000000000000000000006541307563374100171370ustar00rootroot00000000000000@echo off rem This creates the version.h file. rem You need git installed (obviously) rem And to be in the folder where the ".git" directory exists. FOR /F "delims=" %%i IN ('git describe --tags') DO set GITRESULT=%%i echo #define ClonalFrameML_GITRevision %GITRESULT% > version.h rem The linux make.sh file now compiles the code. rem If you're in VS, remember you need _CRT_SECURE_NO_WARNINGS in the rem Pre-Processor code. 
ClonalFrameML-1.11/src/makefile000066400000000000000000000007201307563374100163470ustar00rootroot00000000000000# Make file for ClonalFrameML CC = g++ CFLAGS = -O3 -I./ -I./myutils -I./coalesce LDFLAGS = OBJECTS = main.o HEADERS = main.h brent.h powell.h version.h .PHONY: clean version all: version ClonalFrameML ClonalFrameML: $(OBJECTS) $(CC) $(LDFLAGS) -o ClonalFrameML $(OBJECTS) main.o: main.cpp $(HEADERS) $(CC) $(CFLAGS) -c -o main.o main.cpp version: /bin/echo "#define ClonalFrameML_GITRevision \"`git describe --tags`\"" > version.h clean: rm $(OBJECTS) ClonalFrameML-1.11/src/myutils/000077500000000000000000000000001307563374100163565ustar00rootroot00000000000000ClonalFrameML-1.11/src/myutils/DNA.h000066400000000000000000000611641307563374100171410ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * DNA.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* DNA.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _DNA_H_ #define _DNA_H_ #pragma warning(disable: 4786) #include #include #include #include #include "myutils/myutils.h" #include #include #include using namespace std; using namespace myutils; class DNA { public: vector label; vector sequence; int nseq; int lseq; vector ntimes; bool coutput; map baseToInt; // converts TUCAG- to 112345 map intToBase; // converts 012345 to NTCAG- protected: vector _uniqueHaps; vector _sites; LowerTriangularMatrix __B; vector _M; vector _F; vector _four; LowerTriangularMatrix< vector > _G; LowerTriangularMatrix _A; LowerTriangularMatrix _B; LowerTriangularMatrix _CC; Matrix _D; public: DNA() { coutput = false; init(); } DNA(const char* filename) { coutput = false; readFASTA_1pass(filename); init(); } DNA& init() { baseToInt['T'] = 1; baseToInt['U'] = baseToInt['T']; baseToInt['C'] = 2; baseToInt['A'] = 3; baseToInt['G'] = 4; baseToInt['-'] = 5; intToBase[0] = 'N'; intToBase[1] = 'T'; intToBase[2] = 'C'; intToBase[3] = 'A'; intToBase[4] = 'G'; intToBase[5] = '-'; return *this; } /*DNA& readFASTA(const char* filename) { ifstream in1(filename); if(!in1.is_open()) { string errmsg = "DNA::readFASTA(): File "; errmsg += string(filename); errmsg += " not found"; error(errmsg.c_str()); } int str; nseq = 0; while(!in1.eof()) { str = in1.get(); if((char)str=='>') { ++nseq; } } in1.close(); if(coutput) cout << "Read in " << nseq << " sequence" << endl; if(nseq==0) { lseq = 0; return *this; } ifstream in2(filename); if(!in2.is_open())error("File not found second time"); lseq = 0; string junk; while(!in2.eof()) { str = in2.get(); if((char)str=='>') { getline(in2,junk); if (!junk.empty()&&*junk.rbegin()=='\r') junk.erase(junk.length()-1,1); while(!in2.eof()) { str = in2.get(); if((char)str=='>') break; if(str!=-1 && (char)str!='\n' && (char)str!='\r') ++lseq; } if(coutput) cout << "Sequences are " << lseq << " long" << endl; break; } } in2.close(); string 
blank(lseq,' '); sequence.resize(nseq,blank); label.resize(nseq); ntimes.resize(nseq,0.0); ifstream in3(filename); if(!in3.is_open())error("File not found third time"); int NSEQ = 0; int LSEQ = 0; while(true) { str = in3.get(); if(in3.eof()) error("Cannot find sequences!"); if((char)str=='>') { getline(in3,label[NSEQ]); if (!label[NSEQ].empty()&&*label[NSEQ].rbegin()=='\r') label[NSEQ].erase(label[NSEQ].length()-1,1); break; } } while(true) { str = in3.get(); if(in3.eof()) break; if(LSEQ') { ++NSEQ; getline(in3,label[NSEQ]); if (!label[NSEQ].empty()&&*label[NSEQ].rbegin()=='\r') label[NSEQ].erase(label[NSEQ].length()-1,1); LSEQ=0; } } in3.close(); if(coutput) for(NSEQ=0;NSEQ0 && s[0]!='>') { string errmsg = "DNA::readFASTA_1pass(): File "; errmsg += string(filename); errmsg += " did not begin with '>'"; error(errmsg.c_str()); } label.push_back(s.substr(1)); string newseq = ""; while(!in1.eof()) { getline(in1,s); if (!s.empty()&&*s.rbegin()=='\r') s.erase(s.length()-1,1); s.erase(remove(s.begin(),s.end(),' '),s.end()); if(s.length()>0 && s[0]=='>') { if(lseq==-1) lseq = newseq.length(); if(newseq.length()!=lseq) { string errmsg = "DNA::readFASTA_1pass(): File "; errmsg += string(filename); errmsg += " sequences had different lengths"; error(errmsg.c_str()); } sequence.push_back(newseq); newseq = ""; ++nseq; label.push_back(s.substr(1)); } else { newseq += s; } } if(lseq==-1) lseq = newseq.length(); if(newseq.length()!=lseq) { string errmsg = "DNA::readFASTA_1pass(): File "; errmsg += string(filename); errmsg += " sequences had different lengths"; error(errmsg.c_str()); } sequence.push_back(newseq); newseq = ""; ++nseq; ntimes = vector(nseq,0.0); in1.close(); if(sequence.size()!=label.size()) { string errmsg = "DNA::readFASTA_1pass(): File "; errmsg += string(filename); errmsg += " different number of sequences and labels"; error(errmsg.c_str()); } if(coutput) for(int NSEQ=0;NSEQ" << label[n] << endl; for(pos=0;pos &code, const char* filename) { ofstream 
fout(filename); int n,pos; for(n=0;n" << label[n] << endl; for(pos=0;pos &code, const char* filename) { ofstream fout(filename); int n,pos; for(n=0;n" << label[n] << endl; for(pos=0;pos uniqueHaps(nseq,-1); uniqueHaps[0] = 0; int i,ii,j; bool unique; for(i=1;i &diff, map &chmap) { double result = 0.0; int i,j,k; for(i=0;i(lseq,0); int i,j,k; int S = 0; char hap0,hap1; bool segregating; for(j=0;j(S,0); // so j>=k always // __B[j][k] = 0 for compatible, 1 for incompatible bool comb[3]; for(j=0;j(S,0); // int maxM = 0; _M[S-1] = 0; _M[S-2] = __B[S-1][S-2]; for(i=S-3;i>=0;i--) { _M[i] = __B[i+1][i] + _M[i+1]; for(k=i+2;k_M[i]) _M[i] = __B[k][i]+_M[k]; } return (double)_M[0]; } void RecCorrelations(double* result) { RecCorrelations(result,true); } void RecCovariances(double* result) { RecCorrelations(result,false); } void RecCorrelations(double* result, bool normalize) { result[0] = result[1] = result[2] = 0.0; if(nseq==0) return; if(lseq==0) return; /* Determine which sites are biallelic segregating */ _sites = vector(lseq,0); int i,j,k; int S = 0; char hap0,hap1; bool segregating; for(j=0;j(S,1.0); /* _F is the marginal frequency of hap0 at site j */ for(j=0;j(4,0.0); /* _G[j][k] is the frequency of AB (_G[j][k][0]), */ _G = LowerTriangularMatrix< vector >(S,_four); /* Ab (1), aB (2), ab (3) for sites j and k */ for(j=0;j(S,0.0); // rsq _B = LowerTriangularMatrix(S,0.0); // Dprime _CC = LowerTriangularMatrix(S,0.0); // G4 _D = Matrix(S,S,0.0); double temp; for(i=0;i0.0 && _G[i][j][1]>0.0 && _G[i][j][2]>0.0 && _G[i][j][3]>0.0) ? 
1.0 : 0.0; _D[i][j] = _D[j][i] = _sites[i] - _sites[j]; } } double E[4] = {0.0,0.0,0.0,0.0}; double EE[4] = {0.0,0.0,0.0,0.0}; double ED[3] = {0.0,0.0,0.0}; int ctr; for(i=0,ctr=0;i &polypeptide) { if(offset<0) error("DNA::transcribe(): cannot have negative offset"); if((lseq-offset)%3!=0) error("DNA::transcribe(): DNA length minus offset isn't a multiple of 3"); const int tlen = (lseq-offset)/3; string blank(" ",tlen); polypeptide = vector(nseq,blank); int i,j,ctr; for(i=offset,ctr=0;i &codonsequence) { if(offset<0) error("DNA::tocodon(): cannot have negative offset"); if((lseq-offset)%3!=0) error("DNA::tocodon(): DNA length minus offset isn't a multiple of 3"); const int tlen = (lseq-offset)/3; string blank(" ",tlen); codonsequence = vector(nseq,blank); int i,j,ctr; for(i=offset,ctr=0;i &ntsequence) { if(offset<0) error("DNA::tonucleotide(): cannot have negative offset"); if(offset>=lseq) error("DNA::tonucleotide(): cannot offset the whole sequence"); const int tlen = lseq-offset; ntsequence = Matrix(nseq,tlen); int i,j,ctr; for(i=offset,ctr=0;i &codonsequence) { if(offset<0) error("DNA::tocodon(): cannot have negative offset"); if((lseq-offset)%3!=0) error("DNA::tocodon(): DNA length minus offset isn't a multiple of 3"); const int tlen = (lseq-offset)/3; codonsequence = Matrix(nseq,tlen); int i,j,ctr; for(i=offset,ctr=0;i &codonsequence) { if(offset<0) error("DNA::tocodon(): cannot have negative offset"); if((lseq-offset)%3!=0) error("DNA::tocodon(): DNA length minus offset isn't a multiple of 3"); const int tlen = (lseq-offset)/3; codonsequence = Matrix(nseq,tlen); int i,j,ctr; for(i=offset,ctr=0;i &codonsequence) { if(offset<0) error("DNA::tocodon(): cannot have negative offset"); if((lseq-offset)%3!=0) error("DNA::tocodon(): DNA length minus offset isn't a multiple of 3"); const int tlen = (lseq-offset)/3; codonsequence = Matrix(nseq,tlen); int i,j,ctr; for(i=offset,ctr=0;i &codonsequence) { if(offset<0) error("DNA::tocodon(): cannot have negative offset"); 
if((lseq-offset)%3!=0) error("DNA::tocodon(): DNA length minus offset isn't a multiple of 3"); const int tlen = (lseq-offset)/3; codonsequence = Matrix(nseq,tlen); int i,j,ctr; for(i=offset,ctr=0;i=14) --ret; /* (shouldn't ever be equal to because of previous line) */ if(ret>=11) --ret; if(ret>=10) --ret; return ret; } /* Returns 0-60 for non-STOP codons, 61 for indels and -1 for unknown */ int tripletToCodon61_noerror(string &tri) { const int a = baseToInt[tri[0]]; const int b = baseToInt[tri[1]]; const int c = baseToInt[tri[2]]; bool indel = false; if(a==5) indel = true; if(b==5) indel = true; if(c==5) indel = true; if(indel==true) { if(a==5 && b==5 && c==5) return 64; else return -1; } /* return a value from 0 to 63 */ int ret = (a-1)*16 + (b-1)*4 + c - 1; /* remove STOP codons so value ranges from 0 to 60 */ if(ret==10 || ret==11 || ret==14) return -2; // WARNING value instead of ERROR if(ret>=14) --ret; /* (shouldn't ever be equal to because of previous line) */ if(ret>=11) --ret; if(ret>=10) --ret; return ret; } char codonToPeptide(const int codon) { switch(codon) { case 0: return 'F'; case 1: return 'F'; case 2: return 'L'; case 3: return 'L'; case 4: return 'S'; case 5: return 'S'; case 6: return 'S'; case 7: return 'S'; case 8: return 'Y'; case 9: return 'Y'; case 10: return 'X'; case 11: return 'X'; case 12: return 'C'; case 13: return 'C'; case 14: return 'X'; case 15: return 'W'; case 16: return 'L'; case 17: return 'L'; case 18: return 'L'; case 19: return 'L'; case 20: return 'P'; case 21: return 'P'; case 22: return 'P'; case 23: return 'P'; case 24: return 'H'; case 25: return 'H'; case 26: return 'Q'; case 27: return 'Q'; case 28: return 'R'; case 29: return 'R'; case 30: return 'R'; case 31: return 'R'; case 32: return 'I'; case 33: return 'I'; case 34: return 'I'; case 35: return 'M'; case 36: return 'T'; case 37: return 'T'; case 38: return 'T'; case 39: return 'T'; case 40: return 'N'; case 41: return 'N'; case 42: return 'K'; case 43: return 
'K'; case 44: return 'S'; case 45: return 'S'; case 46: return 'R'; case 47: return 'R'; case 48: return 'V'; case 49: return 'V'; case 50: return 'V'; case 51: return 'V'; case 52: return 'A'; case 53: return 'A'; case 54: return 'A'; case 55: return 'A'; case 56: return 'D'; case 57: return 'D'; case 58: return 'E'; case 59: return 'E'; case 60: return 'G'; case 61: return 'G'; case 62: return 'G'; case 63: return 'G'; } return '?'; } }; #endif // _DNA_H_ ClonalFrameML-1.11/src/myutils/argumentwizard.h000066400000000000000000000163721307563374100216030ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * argumentwizard.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* argumentwizard.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _ARGUMENT_WIZARD_H_ #define _ARGUMENT_WIZARD_H_ #pragma warning(disable: 4786) #include #include #include #include #include #include #include "myutils/myerror.h" #include namespace myutils { #ifndef _CONTROL_AND_ARGUMENT_WIZARD_TYPES_ #define _CONTROL_AND_ARGUMENT_WIZARD_TYPES_ typedef void RTRV;//functions that retrieve the data typedef void GENERIC;//for the generic pointers enum DATA_TYPE {TP_UNRECOGNISED,TP_INT,TP_DOUBLE,TP_STRING,TP_VEC_INT,TP_VEC_DOUBLE,TP_EXT_VEC_DOUBLE}; #endif // _CONTROL_AND_ARGUMENT_WIZARD_TYPES_ /* ArgumentWizard: keeps a copy of argv, reads "-label" words from it, and stores the value(s) that follow through the pointer held in data_map for that label; label_map gives the DATA_TYPE used to pick the reader. NOTE(review): in this copy of the file the text between angle brackets appears to have been stripped (header names, template arguments, and code lying between a less-than and a later greater-than sign), so read_input() and the member(s) after it are fused and incomplete here - restore from a pristine copy before editing the code. */ class ArgumentWizard { /*MEMBER VARIABLES*/ public: std::list required; bool coutput; bool unrecognised; bool got_required; bool case_sensitive; bool fail_noprefix; /* used to avoid function pointers in selecting data-read function */ DATA_TYPE switcher; protected: std::map label_map; std::map data_map; int argc,argn; std::vector argv; /*MEMBER FUNCTIONS*/ public: ArgumentWizard(){set_defaults();} void read_input(const int argc_in, const char* argv_in[]) { argc = argc_in; argv = std::vector(argc); int i; for(i=0;i::iterator i; for(i=required.begin();i!=required.end();i++) std::cout << *i << " "; std::cout << std::endl; } else std::cout << "All required items were found" << std::endl; return got_required; } protected: /* Sets the option flags: console output on, report unrecognised labels, case-insensitive labels, and fail when a label lacks its prefix. */ void set_defaults() { coutput = true; unrecognised = true; case_sensitive = false; fail_noprefix = true; } /* Lower-cases s in place. */ void remove_case(std::string &s) { int i; for(i=0;i<(int)s.length();i++) s[i] = tolower(s[i]); } /* Returns true if a label is found */ /* Reads argv[argn] into word and advances argn. A word that does not start with '-' is either a fatal error (fail_noprefix) or returns false; otherwise the first character is dropped and, unless case_sensitive, the rest is lower-cased. */ bool read_label(std::string &word) { if(argn>=argc) error("Syntax error in ArgumentWizard::read_label: exceeded number of arguments"); word = argv[argn]; if(word[0]!='-') { if(fail_noprefix) error("Syntax error in ArgumentWizard::read_label: option must be prefixed with a \'-\'"); ++argn; return false; } std::string word2 = std::string(word.length()-1,' '); int i; for(i=1;i<(int)word.length();i++) word2[i-1] 
= word[i]; word = word2; if(!case_sensitive) remove_case(word); ++argn; return true; } /* Sets switcher to the DATA_TYPE stored in label_map for label. NOTE(review): label_map[label] creates an entry when the label is absent (assuming label_map is a std::map, as its declaration suggests), so unknown labels get a default-valued entry. */ void data_format(std::string &label) { label_map[label]; switcher = label_map[label]; } // Returns false if some required items are not found bool auto_check_required() { return (required.size()==0); } protected: /* Reports an unknown label on stdout when the label is non-empty and either coutput or unrecognised is set. */ RTRV function_get_unrecognised(std::string &label) { if((label.size()>0)&&(coutput || unrecognised)) printf("Label \"%s\" not recognised.\n",label.c_str()); } /* Parses argv[argn] as a T through a stringstream, assigns it through data_map[label] and advances argn; dummy is never read. */ template RTRV function_get_single(T dummy, std::string &label) { if(argn>=argc) error("Syntax error in ArgumentWizard::function_get_single(): exceeded number of arguments"); std::string word = argv[argn]; //if(word[0]=='-') error("Syntax error in ArgumentWizard::function_get_single(): expecting a value but got an option"); std::stringstream s; s << word; T value; s >> value; GENERIC* ptr = data_map[label]; (*(static_cast(ptr))) = value; if(coutput) std::cout << label << " = " << value << std::endl; ++argn; } /* Clears the vector reached through data_map[label] and appends successive arguments parsed as T, stopping at the end of argv or at the first word starting with '-' (so a negative number also ends the list). NOTE(review): stringstream is unqualified here but std::-qualified in function_get_single(). */ template RTRV function_get_vector(T dummy, std::string &label) { if(argn>=argc) error("Syntax error in ArgumentWizard::function_get_vector(): exceeded number of arguments"); std::string word; GENERIC* g_ptr = data_map[label]; std::vector* ptr = static_cast*>(g_ptr); ptr->clear(); if(coutput) std::cout << label << " = "; while(true) { if(argn==argc) break; word = argv[argn]; if(word[0]=='-') break; stringstream s; s << word; T value; s >> value; ptr->push_back(value); if(coutput) std::cout << word << " "; ++argn; } if(coutput) std::cout << std::endl; } /* Assigns argv[argn] unparsed through data_map[label] and advances argn. NOTE(review): the error text below names function_get_single(), not this function. */ RTRV function_get_string(std::string &label) { if(argn>=argc) error("Syntax error in ArgumentWizard::function_get_single(): exceeded number of arguments"); std::string word = argv[argn]; //if(word[0]=='-') error("Syntax error in ArgumentWizard::function_get_single(): expecting a value but got an option"); GENERIC* ptr = data_map[label]; (*(static_cast(ptr))) = word; if(coutput) std::cout << label << " = " << word << std::endl; ++argn; } RTRV 
function_get_external_vector_double(std::string &label) { error("ArgumentWizard:: TP_EXT_VEC_DOUBLE not available"); } /* Dispatches on switcher (set by data_format()) to the matching reader; any value without its own case is handled as unrecognised. */ void read_data(std::string &label) { switch(switcher) { case TP_UNRECOGNISED: function_get_unrecognised(label); break; case TP_INT: function_get_single((int)0,label); break; case TP_DOUBLE: function_get_single((double)0,label); break; case TP_STRING: function_get_string(label); break; case TP_VEC_INT: function_get_vector((int)0,label); break; case TP_VEC_DOUBLE: function_get_vector((double)0,label); break; case TP_EXT_VEC_DOUBLE: function_get_external_vector_double(label); break; default: function_get_unrecognised(label); break; } } }; // class ArgumentWizard }; // namespace myutils #endif // _ARGUMENT_WIZARD_H_ ClonalFrameML-1.11/src/myutils/lotri_matrix.h000066400000000000000000000113341307563374100212460ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * lotri_matrix.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* lotri_matrix.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _LOWER_TRIANGULAR_MATRIX_H_ #define _LOWER_TRIANGULAR_MATRIX_H_ #include #include /****************************************************************/ /* myutils::Matrix */ /* */ /* Matrix is a C++ style container whose memory storage is */ /* designed so that elements can easily be viewed at debug */ /* time in MSVC++ and to be compatible with some C code in */ /* which matrices are stored as one-dimensional arrays, where */ /* element (i,j) would be accessed as M[i*n+j]. */ /* */ /* Element (i,j) can be accessed in one of three ways: */ /* M[i][j] clearest syntax */ /* M.element[i][j] useful for viewing during debug */ /* M.array[i*n+j] compatible with C arrays */ /* */ /****************************************************************/ /* LowerTriangularMatrix: allocates n*(n+1)/2 elements in array plus a table of n row pointers in element; operator[] returns element[pos] and safe() reads element[i][j] only with j no greater than i. NOTE(review): the banner above is the Matrix banner and its M[i*n+j] indexing does not match the n*(n+1)/2 allocation made here. NOTE(review): text between angle brackets appears to have been stripped from this copy, so the loops in the value constructor, initialize() and resize() are truncated - restore from a pristine copy before changing code. */ namespace myutils { template class LowerTriangularMatrix { public: /*Preserve public access for back-compatibility*/ T *array; T **element; protected: int _n; /* dimension of the lower triangular square matrix */ int _size; /* number of elements of the matrix */ // int protected_ncols; int initialized; public: /*Default constructor*/ LowerTriangularMatrix() { initialized=0; initialize(0); } /*Constructor*/ LowerTriangularMatrix(int n) { initialize(n); } /*Constructor*/ LowerTriangularMatrix(int n, T value) { initialize(n); int i,j; for(i=0;i& initialize(int n) { int i; int size = n*(n+1)/2; array = new T[size]; /* NOTE(review): a plain new-expression reports failure by throwing std::bad_alloc, so the null checks in initialize() and resize() are not expected to fire. */ if (!array) error("array allocation failure in LowerTriangularMatrix::initialize()"); element = new T*[n]; if (!element) error("element allocation failure in LowerTriangularMatrix::initialize()"); for(i=0;i& resize(int n) { int i; int size = n*(n+1)/2; if (!initialized) return initialize(n); if(n==_n)return *this; /* existing contents are discarded, not copied, when the dimension changes */ delete[] array; delete[] element; array = new T[size]; if (!array) error("array allocation failure in LowerTriangularMatrix::resize()"); element = new T*[n]; if (!element) error("element allocation failure in 
LowerTriangularMatrix::resize()"); for(i=0;i &mat) /* Copy constructor for the following cases: LowerTriangularMatrix mat2(mat); LowerTriangularMatrix mat2=mat; and when LowerTriangularMatrix is returned from a function */ { initialize(mat._n); int i; for(i=0;i<_size;i++) array[i] = mat.array[i]; } /*Assignment operator*/ /* Resizes to mat's dimension and copies _size elements; the self-assignment guard below is commented out. */ LowerTriangularMatrix& operator=(const LowerTriangularMatrix& mat) { //if(this==mat)return *this; resize(mat._n); int i; for(i=0;i<_size;i++) array[i] = mat.array[i]; return *this; } /*Subscript operator*/inline T* operator[](int pos){return element[pos];}; /* Symmetric accessor: when j exceeds i the indices are swapped, so the stored lower-triangle element is returned for either order. No range check is made. */ inline T& safe(int i, int j) { return (j<=i) ? element[i][j] : element[j][i]; } }; }; #endif // _LOWER_TRIANGULAR_MATRIX_H_ClonalFrameML-1.11/src/myutils/matrix.h000066400000000000000000000223071307563374100200370ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * matrix.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* matrix.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MATRIX_H_ #define _MATRIX_H_ #include #include #include "myutils/vector.h" #include "myutils/utils.h" /****************************************************************/ /* myutils::Matrix */ /* */ /* Matrix is a C++ style container whose memory storage is */ /* designed so that elements can easily be viewed at debug */ /* time in MSVC++ and to be compatible with some C code in */ /* which matrices are stored as one-dimensional arrays, where */ /* element (i,j) would be accessed as M[i*n+j]. */ /* */ /* Element (i,j) can be accessed in one of three ways: */ /* M[i][j] clearest syntax */ /* M.element[i][j] useful for viewing during debug */ /* M.array[i*n+j] compatible with C arrays */ /* */ /****************************************************************/ /* NOTE(review): text between angle brackets appears to have been stripped from this copy (header names, template arguments, and code lying between a less-than and a later greater-than sign), so several members below are fused or truncated - restore from a pristine copy before changing code. */ namespace myutils { /* safeArray: wraps a raw row pointer with bounds lo and hi; operator[] calls error() when pos reaches hi (the lower-bound half of the check is truncated in this copy). */ template class safeArray { public: T *element; int lo,hi; public: safeArray(T *set_element, const int set_lo, const int set_hi) : element(set_element), lo(set_lo), hi(set_hi) {}; inline T& operator[](int pos){ if(pos=hi) error("safeArray::operator[](int pos): pos>=max"); return element[pos]; } inline const T& operator[](int pos) const { if(pos=hi) error("safeArray::operator[](int pos): pos>=max"); return element[pos]; } }; /* Matrix: allocates nrows*ncols elements in array plus a table of nrows row pointers in element. */ template class Matrix { public: /*Preserve public access for back-compatibility*/ T *array; T **element; protected: unsigned long int protected_nrows; unsigned long int protected_ncols; int initialized; public: /*Default constructor*/ Matrix() { initialized=0; initialize(0,0); } /*Constructor*/ Matrix(int nrows, int ncols) { initialize(nrows,ncols); } /*Constructor*/ Matrix(int nrows, int ncols, T value) { initialize(nrows,ncols); unsigned long int i,j; for(i=0;i& initialize(int nrows, int ncols) { unsigned long int i; /* the product is formed in unsigned long so it is not computed in int */ const unsigned long int newsize = (unsigned long int)(nrows)*(unsigned long int)(ncols); array = new T[newsize]; /* NOTE(review): a plain new-expression reports failure by throwing std::bad_alloc, so the null checks in initialize() and resize() are not expected to fire. */ if (!array) error("array allocation failure in 
Matrix::initialize()"); element = new T*[(unsigned long int)nrows]; if (!element) error("element allocation failure in Matrix::initialize()"); for(i=0;i& resize(int nrows, int ncols) { unsigned long int i; if (!initialized) return initialize(nrows,ncols); if((nrows==protected_nrows)&&(ncols==protected_ncols))return *this; /* existing contents are discarded, not copied, when the shape changes */ delete[] array; delete[] element; const unsigned long int newsize = (unsigned long int)(nrows)*(unsigned long int)(ncols); array = new T[newsize]; if (!array) error("array allocation failure in Matrix::resize()"); element = new T*[(unsigned long int)nrows]; if (!element) error("element allocation failure in Matrix::resize()"); for(i=0;i &mat) /* Copy constructor for the following cases: Matrix mat2(mat); Matrix mat2=mat; and when Matrix is returned from a function */ { initialize((int)mat.protected_nrows,(int)mat.protected_ncols); int i; for(i=0;i& operator=(const Matrix& mat) { //if(this==mat)return *this; resize(mat.nrows(),mat.ncols()); int i; for(i=0;i operator[](unsigned long int pos){ if(pos<0) error("Matrix::operator[](int row): row<0"); if(pos>=protected_nrows) error("Matrix::operator[](int row): row>=nrows()"); //return element[pos]; return safeArray< T >(element[pos],0,protected_ncols); }; /*DEBUG Subscript operator*/inline const safeArray< T > operator[](unsigned long int pos) const { if(pos<0) error("Matrix::operator[](int row): row<0"); if(pos>=protected_nrows) error("Matrix::operator[](int row): row>=nrows()"); //return element[pos]; return const safeArray< T >(element[pos],0,protected_ncols); }; /* NOTE(review): in the two debug accessors above pos is unsigned, so the pos<0 tests cannot be true, and the "return const safeArray" statement does not look like valid C++ - confirm against a pristine copy. */ #else /*Subscript operator*/inline T* operator[](unsigned long int pos){return element[pos];}; /*Subscript operator*/inline const T* operator[](unsigned long int pos) const {return element[pos];}; #endif /*Matrix multiplication*/ /* Calls error() unless this->ncols() equals mat.nrows(); the result starts as an nrows() by mat.ncols() matrix filled with 0.0. */ Matrix operator*(const Matrix& mat) { if(ncols()!=mat.nrows()) error("Matrix multiplication: matrices are not conformable"); Matrix result(nrows(),mat.ncols(),0.0); int i,j,k; for(i=0;i& multiply(const Matrix& op1, 
const Matrix& op2) { if(op1.ncols()!=op2.nrows()) error("Matrix multiplication: matrices are not conformable"); resize(op1.nrows(),op2.ncols()); int i,j,k; for(i=0;i map(T (* f)(T)) { Matrix result((int)protected_nrows,(int)protected_ncols); int i,j; for(i=0;i<(int)protected_nrows;i++) for(j=0;j<(int)protected_ncols;j++) result[i][j] = f(element[i][j]); return result; } /* Numerical Recipes in C++ routine for inverting a square real matrix */ /* NOTE(review): the guard below tests nrows against ncols, i.e. squareness, while its message says "symmetric". The routine works on a copy a of *this and a zero-initialised matrix b; most of the elimination loop is truncated in this copy. */ Matrix invert() { if(protected_nrows!=protected_ncols) error("Matrix inversion: must be a symmetric matrix"); Matrix a = *this; Matrix b(protected_nrows,protected_ncols,0); int i; for(i=0;i indxc(n); myutils::Vector indxr(n); myutils::Vector ipiv(n); for(j=0;j=big) { big=fabs(a[j][k]); irow=j; icol=k; } } } ++(ipiv[icol]); if(irow!=icol) { for(l=0;l=0;l--) { if(indxr[l]!=indxc[l]) for(k=0;k inline Matrix IdentityMatrix(const int n) { Matrix m(n,n,(T)0); int i; for(i=0;i. */ #ifndef _MY_DOUBLE_H_ #define _MY_DOUBLE_H_ #include #include #include "myutils/myerror.h" using myutils::error; /* This class behaves to the user like a non-negative double, but is stored internally as the natural logarithm. Standard mathematical operations are performed on the logarithm of the number so that it should not underflow or overflow like a double. 
*/ class mydouble { protected: double _log; bool _zero; public: /*Default constructor*/ mydouble() { _zero = false; }; /*Copy constructor*/ mydouble(const double &_doub) { _zero = false; if(_doub<0.0) myutils::error("mydouble::mydouble(const double&): cannot initialize with negative number"); if(_doub==0.0) setzero(); else _log = log(_doub); }; /*Copy constructor*/ mydouble(const mydouble &_mydoub) { _zero = _mydoub._zero; _log = _mydoub._log; } // Construct a zero static mydouble zero() { mydouble z(0); return z; } /*Conversion operator THIS CONVERSION OPERATOR HAS BEEN DISABLED BECAUSE IT ALLOWED THE COMPILER TO IMPLICITLY MAKE MYDOUBLE->DOUBLE CONVERSIONS WHICH RESULTED IN LOSS OF PRECISION WHEN DOUBLE->MYDOUBLE CONVERSIONS WERE REQUIRED TO MAINTAIN PRECISION. IT HAS BEEN REPLACED BY THE SUBSEQUENT FUNCTION WHICH IS AN EXPLICIT CONVERSION TO TYPE DOUBLE WHICH THE COMPILER CANNOT CALL IMPLICITLY. operator double const() { return (_zero) ? 0.0 : exp(_log); };*/ double todouble() const { return (_zero) ? 
0.0 : exp(_log); } /*Assignment operator*/ mydouble& operator=(const double &_doub) { _zero = false; if(_doub<0.0) myutils::error("mydouble::operator=(const double&): cannot assign a negative number"); if(_doub==0.0) setzero(); else _log = log(_doub); return *this; } /*Assignment operator*/ mydouble& operator=(const mydouble &_mydoub) { _zero = _mydoub._zero; _log = _mydoub._log; return *this; } mydouble& setlog(const double &log) { _zero = false; _log = log; return *this; } mydouble& setzero() { _zero = true; _log = -std::numeric_limits::max(); return *this; } bool iszero() const { return _zero; } bool isinfinity() const { return !_zero && _log==std::numeric_limits::infinity(); } bool isbad() const { return !_zero && _log!=_log; } /*** MULTIPLICATION ***/ mydouble operator*(const double &dbl) const { return operator*(mydouble(dbl)); } mydouble operator*(const mydouble &mydbl) const { mydouble a; if(_zero || mydbl._zero) a.setzero(); else a.setlog(_log + mydbl._log); return a; } mydouble& operator*=(const double &dbl) { if(_zero || dbl==0.0) setzero(); else _log += mydouble(dbl)._log; return *this; } mydouble& operator*=(const mydouble &mydbl) { if(_zero || mydbl._zero) setzero(); else _log += mydbl._log; return *this; } /*** DIVISION ***/ mydouble operator/(const double &dbl) const { return operator/(mydouble(dbl)); } mydouble operator/(const mydouble &mydbl) const { mydouble a; if(mydbl._zero) error("mydouble::operator/(const mydouble&): division by zero"); else if(_zero) a.setzero(); else a.setlog(_log - mydbl._log); return a; } mydouble& operator/=(const double &dbl) { if(dbl==0.0) error("mydouble::operator/=(const double&): division by zero"); else if(!_zero) _log -= mydouble(dbl)._log; return *this; } mydouble& operator/=(const mydouble &mydbl) { if(mydbl._zero) error("mydouble::operator/=(const mydouble&): division by zero"); else if(!_zero) _log -= mydbl._log; return *this; } /*** ADDITION ***/ mydouble operator+(const double &dbl) const { if(dbl==0.0) 
return mydouble(*this); if(dbl<0.0) return operator-(mydouble(-dbl)); return operator+(mydouble(dbl)); } mydouble operator+(const mydouble &mydbl) const { mydouble a; if(_zero) a = mydouble(mydbl); else if(mydbl._zero) a = mydouble(*this); else { double diff = _log - mydbl._log; if(diff==0.0) a.setlog(log(2.0) + _log); else if(diff<0.0) a.setlog(mydbl._log + log(1.0 + exp(diff))); else a.setlog(_log + log(1.0 + exp(-diff))); } return a; } mydouble& operator+=(const double &dbl) { if(dbl==0.0) return *this; return operator+=(mydouble(dbl)); } mydouble& operator+=(const mydouble &mydbl) { if(_zero) *this = mydbl; else if(!mydbl._zero) { double diff = _log - mydbl._log; if(diff==0.0) _log += log(2.0); else if(diff<0.0) _log = mydbl._log + log(1.0 + exp(diff)); else _log += log(1.0 + exp(-diff)); } return *this; } /*** SUBTRACTION - warning cannot have negative numbers ***/ mydouble operator-(const double &dbl) const { if(dbl==0.0) return mydouble(*this); return operator-(mydouble(dbl)); } mydouble operator-(const mydouble &mydbl) const { mydouble a; if(mydbl._zero) a = mydouble(*this); else if(_zero) error("mydouble::operator-(const mydouble&): subtracting a positive number from zero"); else { /* diff must always be positive */ double diff = _log - mydbl._log; if(diff==0.0) a.setzero(); else if(diff<0.0) myutils::error("mydouble::operator-(const mydouble&) cannot handle negative numbers"); else a.setlog(_log + log(1.0 - exp(-diff))); } return a; } mydouble& operator-=(const double &dbl) { if(dbl==0.0) return *this; return operator-=(mydouble(dbl)); } mydouble& operator-=(const mydouble &mydbl) { if(!mydbl._zero) { if(_zero) error("mydouble::operator-=(const mydouble&): subtracting a positive number from zero"); /* diff must always be positive */ double diff = _log - mydbl._log; if(diff==0.0) setzero(); else if(diff<0.0) myutils::error("mydouble::operator-=(const mydouble&) cannot handle negative numbers"); else _log += log(1.0 - exp(-diff)); } return *this; } /*** 
SPECIAL OPERATIONS ***/ double LOG() const { return _log; } /* Caution: ^ has lower precedence than /+-* */ mydouble operator^(const double &dbl) const { mydouble a; if(_zero) a.setzero(); else a.setlog(_log * dbl); return a; } /* Caution: ^ has lower precedence than /+-* */ mydouble operator^(const mydouble &mydbl) const { mydouble a; if(_zero) a.setzero(); else a.setlog(_log * exp(mydbl._log)); return a; } mydouble& operator^=(const double &dbl) { if(!_zero) _log *= dbl; return *this; } mydouble& operator^=(const mydouble &mydbl) { if(!_zero) _log *= exp(mydbl._log); return *this; } /*** COMPARISON OPERATORS ***/ bool operator<(const double &dbl) const { return operator<(mydouble(dbl)); } bool operator<(const mydouble &mydbl) const { return (_log < mydbl._log); } bool operator<=(const double &dbl) const { return operator<=(mydouble(dbl)); } bool operator<=(const mydouble &mydbl) const { return (_log <= mydbl._log); } bool operator>(const double &dbl) const { return operator>(mydouble(dbl)); } bool operator>(const mydouble &mydbl) const { return (_log > mydbl._log); } bool operator>=(const double &dbl) const { return operator>=(mydouble(dbl)); } bool operator>=(const mydouble &mydbl) const { return (_log >= mydbl._log); } bool operator==(const double &dbl) const { return operator==(mydouble(dbl)); } bool operator==(const mydouble &mydbl) const { return (_log == mydbl._log); } bool operator!=(const double &dbl) const { return operator!=(mydouble(dbl)); } bool operator!=(const mydouble &mydbl) const { return (_log != mydbl._log); } }; /*** MULTIPLICATION ***/ inline mydouble operator*(const double &dbl, const mydouble &mydbl) { mydouble a(dbl); return a *= mydbl; } /*** DIVISION ***/ inline mydouble operator/(const double &dbl, const mydouble &mydbl) { mydouble a(dbl); return a /= mydbl; } /*** ADDITION ***/ inline mydouble operator+(const double &dbl, const mydouble &mydbl) { mydouble a(dbl); return a += mydbl; } /*** SUBTRACTION - warning cannot have negative 
numbers ***/ inline mydouble operator-(const double &dbl, const mydouble &mydbl) { mydouble a(dbl); return a -= mydbl; } /*** SPECIAL OPERATIONS ***/ inline double log(const mydouble &mydbl) { return mydbl.LOG(); } inline mydouble pow(const mydouble &_X, const mydouble &_Y) { return _X^_Y; } inline mydouble pow(const mydouble &_X, const double &_Y) { return _X^_Y; } /* Caution: ^ has lower precedence than /+-* */ inline mydouble operator^(const double dbl, const mydouble &mydbl) { mydouble a(dbl); return a ^= mydbl; } /*** COMPARISON OPERATORS ***/ inline bool operator<(const double &dbl, const mydouble &mydbl) { return (mydouble(dbl)(const double &dbl, const mydouble &mydbl) { return (mydouble(dbl)>mydbl); } inline bool operator>=(const double &dbl, const mydouble &mydbl) { return (mydouble(dbl)>=mydbl); } inline bool operator==(const double &dbl, const mydouble &mydbl) { return (mydouble(dbl)==mydbl); } inline bool operator!=(const double &dbl, const mydouble &mydbl) { return (mydouble(dbl)!=mydbl); } #endif//_MY_DOUBLE_H_ ClonalFrameML-1.11/src/myutils/myerror.h000066400000000000000000000027321307563374100202320ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * myerror.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* myerror.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_ERROR_H #define _MYUTILS_ERROR_H #include #include // For use with MPI programs #ifdef _MYUTILS_MPI_ABORT_ON_EXIT #include #endif namespace myutils { inline void error(const char* error_text) { printf("ERROR: "); printf("%s\n", error_text); #ifdef _MYUTILS_MPI_ABORT_ON_EXIT MPI_Abort(MPI_COMM_WORLD,13); #endif exit(13); } inline void warning(const char* warning_text) { printf("WARNING: "); printf("%s\n", warning_text); return; } }; #endifClonalFrameML-1.11/src/myutils/myutils.h000066400000000000000000000032711307563374100202400ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * myutils.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* myutils.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_H_ #define _MYUTILS_H_ #pragma warning(disable: 4786) /*Includes all header files in the myutils directory*/ /*#include "cmatrix.h" #include "matrix.h" #include "random.h" #include "error.h" #include "DNA.h" #include "vector.h"*/ #include "myutils/myerror.h" #include "myutils/utils.h" //#include "myutils/cmatrix.h" #include "myutils/vector.h" #include "myutils/matrix.h" #include "myutils/lotri_matrix.h" #include "myutils/random.h" #include "myutils/DNA.h" //#include "myutils/pause.h" //#include "myutils/sort.h" //#include "controlwizard.h" /* has problems in Linux with pointers */ //#include "pause.h" /* removed because conio.h is not standard */ #endif ClonalFrameML-1.11/src/myutils/newick.h000066400000000000000000000142441307563374100200140ustar00rootroot00000000000000/* * newick.h * newick * * Created by Daniel Wilson on 05/03/2013. * Copyright 2013 __MyCompanyName__. All rights reserved. * */ #ifndef _NEWICK_H_ #define _NEWICK_H_ #include #include #include "myutils/myerror.h" #include #include using std::vector; using std::string; using myutils::error; using std::stringstream; using std::endl; using std::cout; using myutils::warning; /* NewickNode: one node of a tree parsed from a Newick token. The constructor records the ancestor, registers the node in the shared allnodes list and then parses its own token; descendants are created with new and pushed onto dec. NOTE(review): text between angle brackets appears to have been stripped from this copy, so process_token() is truncated in several places and the end of the class is missing; no destructor is visible in what remains - confirm ownership of the nodes against a pristine copy. */ namespace myutils { class NewickNode { public: // Member variables NewickNode *anc; // Ancestral node vector dec; // Descendant nodes (any number) double len; // Length string str; // Name vector *allnodes; // Pointer to all nodes in the tree // Member functions NewickNode() { initialize(); } NewickNode(string token, NewickNode *anc_in) { initialize(); anc = anc_in; if(anc!=0) { // Get pointer to allnodes allnodes = anc->allnodes; // Add self to list of descendants allnodes->push_back(this); } // Remember when a node is created to add it to one's descendants process_token(token); } /* Resets every member: no ancestor, no descendants, zero length, empty name, no allnodes list. */ void initialize(){ anc = 0; dec = vector(0); len = 0.0; str = ""; allnodes = 0; } void process_token(string token){ // If this is part of a nexus file, assume 
all comments enclosed by square brackets have been removed // If this is the outermost node, assume the trailing semi-colon has already been removed // Locate left-most open bracket size_t lbrkt = token.find('('); // Locate right-most close bracket size_t rbrkt = token.rfind(')'); // Locate right-most colon size_t rcoln = token.rfind(':'); // Some checks if(lbrkt!=token.npos && rbrkt!=token.npos && lbrkt>rbrkt) { stringstream errTxt; errTxt << "Token: " << token << endl; errTxt << "Left bracket to right of right bracket: " << lbrkt << ", " << rbrkt; error(errTxt.str().c_str()); } if(lbrkt==token.npos && rbrkt!=token.npos) { stringstream errTxt; errTxt << "Token: " << token << endl; errTxt << "Found right bracket but no left bracket"; error(errTxt.str().c_str()); } if(lbrkt!=token.npos && rbrkt==token.npos) { stringstream errTxt; errTxt << "Token: " << token << endl; errTxt << "Found left bracket but no right bracket"; error(errTxt.str().c_str()); } if(rbrkt==lbrkt+1) { stringstream errTxt; errTxt << "Token: " << token << endl; errTxt << "Empty brackets"; error(errTxt.str().c_str()); } /* has_coln below is true only for a colon that is not inside the bracketed descendant list: either there are no brackets, or the right-most colon lies after the right-most close bracket. */ // Some indicator variables // Has descendants within brackets bool has_brkt = (lbrkt!=token.npos); // Has a colon bool has_coln = (rcoln!=token.npos && (!has_brkt || rcoln>rbrkt)); if(has_coln && has_brkt) { // Name the node if(rcoln>rbrkt+1) { str = token.substr(rbrkt+1,rcoln-rbrkt-1); } else { str = ""; } // Get the length if(rcoln0) { str = token.substr(0,rcoln); } else { str = ""; } // Get the length if(rcoln poscomma(0); size_t pos; // Keep track of the opening and closing of brackets within the string int nlbrkt = 0; int nrbrkt = 0; for(pos=0;posnlbrkt) { stringstream errTxt; errTxt << "Token: " << desc << endl; errTxt << "Found right bracket before left bracket"; error(errTxt.str().c_str()); } } } if(nlbrkt!=nrbrkt) { stringstream errTxt; errTxt << "Token: " << desc << endl; errTxt << "Too few right brackets"; error(errTxt.str().c_str()); } // For each descendant separated by 
commas, start a new node /* With no top-level comma the whole of desc becomes a single descendant and only a warning is issued. */ if(poscomma.size()==0) { stringstream errTxt; errTxt << "Token: " << desc << endl; errTxt << "Single descendant found"; warning(errTxt.str().c_str()); dec.push_back(new NewickNode(desc.substr(0,desc.length()),this)); } else{ dec.push_back(new NewickNode(desc.substr(0,poscomma[0]),this)); int i; for(i=1;i allnodes; // Pointer to all nodes in the tree // Member functions NewickTree() { } NewickTree(string token) { process_token(token); } /* Parses a complete tree string: requires a trailing semi-colon, resets allnodes to hold just the address of root, and hands the token minus its last character to root.process_token(). NOTE(review): for an empty token, token.length()-1 wraps around and the subscript below is out of range - callers presumably never pass an empty string; confirm or guard. */ void process_token(string token) { // Check for a trailing semi-colon if(token[token.length()-1]!=';') { stringstream errTxt; errTxt << "Token: " << token << endl; errTxt << "Expected trailing semi-colon but none found"; error(errTxt.str().c_str()); } // Set member variables allnodes = vector< NewickNode* >(1,&root); root.allnodes = &allnodes; // Start from the root node, having removed the trailing semi-colon root.process_token(token.substr(0,token.length()-1)); } }; }; // namespace myutils #endif // _NEWICK_H_ ClonalFrameML-1.11/src/myutils/random.h000066400000000000000000000516361307563374100200220ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * random.h * Part of the myutils library. * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . 
*/ /********************************************/ /* random.h 23rd February 2005 */ /* (c) Danny Wilson and Numerical Recipes */ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _RANDOM_H_ #define _RANDOM_H_ #include #include #include #include "myutils/vector.h" #include "myutils/matrix.h" #include "myutils/lotri_matrix.h" #include "myutils/myerror.h" namespace myutils { class Random { protected: /* protected member variables */ int seed; /* protected member variables used by ran2() */ int idum; int idum2,iy; int *iv; const int NTAB; int protected_ncalls; /* protected member variables used by binomial() */ int nold; double pold,pc,plog,pclog,en,oldg; /* protected member variables used by poisson() */ double sq,alxm,g,oldm; /* protected member variables used by Z() */ int iset; double gset; protected: int autosetseed(void) { time_t lt; lt=time(NULL); return (int)lt; } /* uniform random number generation */ inline double ran2(void) { ++protected_ncalls; const int IM1=2147483563,IM2=2147483399; const int IA1=40014,IA2=40692,IQ1=53668,IQ2=52774; const int IR1=12211,IR2=3791,IMM1=IM1-1; const int NDIV=1+IMM1/NTAB; const double EPS=3.0e-16,RNMX=1.0-EPS,AM=1.0/double(IM1); int j,k; double temp; if (idum <= 0) { idum=(idum==0 ? 1 : -idum); idum2=idum; for (j=NTAB+7;j>=0;j--) { k=idum/IQ1; idum=IA1*(idum-k*IQ1)-k*IR1; if (idum < 0) idum += IM1; if (j < NTAB) iv[j] = idum; } iy=iv[0]; } k=idum/IQ1; idum=IA1*(idum-k*IQ1)-k*IR1; if (idum < 0) idum += IM1; k=idum2/IQ2; idum2=IA2*(idum2-k*IQ2)-k*IR2; if (idum2 < 0) idum2 += IM2; j=iy/NDIV; iy=iv[j]-idum2; iv[j] = idum; if (iy < 1) iy += IMM1; if ((temp=AM*iy) > RNMX) { return RNMX; } else { return temp; } } void rerror(const char* error_text) // Standard error handler { printf("Random Package run-time error...\n"); printf("%s\n", error_text); printf("...now exiting to system...\n"); exit(13); } /* 0 < a <= 1. From Devroye (1986) p. 
425 */ double ahrens_dieter74_gamma(const double a) { double b = (exp(1.)+a)/exp(1.); double c = 1./a; double U,V,W,X; while(true) { U = ran2(); W = ran2(); V = b * U; if(V<=1) { X = pow(V,c); if(W<=exp(-X)) break; } else { X = -log(c*(b-V)); if(W<=pow(X,a-1.)) break; } } return X; } /* a > 1. From Devroye (1986) p. 410 */ double best78_gamma(const double a) { double b = a - 1.; double c = 3.*a - 0.75; double U,V,W,X,Y,Z; while(true) { U = ran2(); V = ran2(); W = U*(1.-U); Y = sqrt(c/W)*(U-0.5); X = b + Y; if(X>=0) { Z = 64. * pow(W,3.) * pow(V,2.); if(Z <= 1.0 - 2.0*pow(Y,2.)/X) break; if(log(Z) <= 2.*(b * log(X/b) - Y)) break; } } return X; } /* positive integers for ia only. From Numerical Recipes */ double gamdev(const int ia) { int j; double am,e,s,v1,v2,x,y; if (ia < 1) error("Error in routine gamma"); if (ia < 6) { x=1.0; for (j=1;j<=ia;j++) x *= ran2(); x = -log(x); } else { do { do { do { v1=ran2(); v2=2.0*ran2()-1.0; } while (v1*v1+v2*v2 > 1.0); y=v2/v1; am=ia-1; s=sqrt(2.0*am+1.0); x=s*y+am; } while (x <= 0.0); e=(1.0+y*y)*exp(am*log(x/am)-s*y); } while (ran2() > e); } return x; } double gammln(const double xx) { int j; double x,y,tmp,ser; static const double cof[6]={76.18009172947146,-86.50532032941677, 24.01409824083091,-1.231739572450155,0.1208650973866179e-2, -0.5395239384953e-5}; y=x=xx; tmp=x+5.5; tmp -= (x+0.5)*log(tmp); ser=1.000000000190015; for (j=0;j<6;j++) ser += cof[j]/++y; return -tmp+log(2.5066282746310005*ser/x); } public: /* Default constructor */ Random() : NTAB(32) { iv = new int[NTAB]; setseed(-autosetseed()); nold = -1; pold = -1.0; oldm = -1.0; iset = 0; } /* Copy constructor */ Random(const Random &ran) : NTAB(32) { seed = ran.seed; iv = new int[NTAB]; int i; for(i=0;i0) error("Random must be seeded with a negative integer"); seed=seed_in; idum=seed; idum2=123456789; iy=0; protected_ncalls=0; return *this; } /* seed_in must be a negative integer. 
set_ncalls is # calls to ran2() */ Random& setseed(const int seed_in, const int set_ncalls) { if(seed_in>0) error("Random must be seeded with a negative integer"); if(set_ncalls<0) error("ncalls must be non-negative"); if(seed!=seed_in || protected_ncalls>set_ncalls) setseed(seed_in); while(protected_ncalls &iv_in) { if(iv_in.size()!=NTAB) error("Random::setidum(): iv must have size NTAB"); seed=1; /* positive seed indicates it was not properly set */ idum=idum_in; idum2=idum2_in; iy=iy_in; int i; for(i=0;i &iv_out) { idum_out = idum; idum2_out = idum2; iy_out = iy; iv_out = std::vector(NTAB); int i; for(i=0;i0"); double gam1,gam2; if(a == 1.0) gam1 = exponential(1.0); else if(a == (double)((int) a)) gam1 = gamdev((int)a); else if(a < 1.0) gam1 = ahrens_dieter74_gamma(a); else gam1 = best78_gamma(a); if(b == 1.0) gam2 = exponential(1.0); else if(b == (double)((int) b)) gam2 = gamdev((int)b); else if(b < 1.0) gam2 = ahrens_dieter74_gamma(b); else gam2 = best78_gamma(b); return gam1/(gam1+gam2); } double binomial(const int n, const double pp) { const double PI=3.141592653589793238; int j; // Static members made class members 13/04/09 //static int nold=(-1); double am,em,g,angle,p,bnl,sq,t,y; //static double pold=(-1.0),pc,plog,pclog,en,oldg; p=(pp <= 0.5 ? 
pp : 1.0-pp); am=n*p; if (n < 25) { bnl=0.0; for (j=0;j= (en+1.0)); em=floor(em); t=1.2*sq*(1.0+y*y)*exp(oldg-gammln(em+1.0) -gammln(en-em+1.0)+em*plog+(en-em)*pclog); } while (ran2() > t); bnl=em; } if (p != pp) bnl=n-bnl; return bnl; } //double *dirichlet(const int *p, const int k) //{ // double *result; // result=(double *)malloc((unsigned) k*sizeof(double)); // if (!result) error("Allocation failure in dirichlet"); // double total=0.0; // int i; // for (i=0;i dirichlet(const std::vector &p, const int k) //{ // std::vector result(k); // // double total=0.0; // int i; // for (i=0;i r */ void dirichlet(const std::vector &a, std::vector &r) { double total=0.0; int i; int k = (int) a.size(); if(r.size()!=k) r.resize(k); for(i=0;i(rnumber); // uniform discrete [0,b-a] return result + a; // uniform discrete [a,b] } double exponential(const double mean) { double dum; do dum=ran2(); while (dum == 0.0); return -log(dum)*mean; } double exponential_ratio() { double dum1,dum2; do dum1 = ran2(); while(dum1 == 0.0); do dum2 = ran2(); while(dum2 == 0.0); return log(dum1)/log(dum2); } /* b is the scale parameter, c the shape parameter. 
mean = bc, variance = bbc */ double gamma(const double b, const double c) { if (b<=0) error("Error in gamma: 1st parameter should be >0"); if (c<=0) error("Error in gamma: 2nd parameter should be >0"); if (c == 1.0) return exponential(b); int cint = (int) c; if (c == (double) cint) return b*gamdev(cint); if (c<1.0) return b*ahrens_dieter74_gamma(c); return b*best78_gamma(c); } /* If X ~ geometric(p) then E(X) = (1-p)/p and E(X+1) = 1/p */ int geometric(const double p) { return (int)ceil(log(U())/log(1.-p)-1.); } double inverse(const double a, const double b) { if(a<=0.0) error("Lower bound for inverse distribution must be positive"); if(b<=a) error("Upper bound must be greater than lower bound for inverse distribution"); return a*pow(b/a,U()); } /* Returns X where Y=log(X) ~ Normal(mu,sigma) */ double log_normal(const double mu, const double sigma) { return exp(normal(mu,sigma)); } /* Returns the minimum of n uniform(0,1) random deviates */ double minU(const int n) { return 1.-pow(1.-ran2(),1.0/(double)n); } int *multinomial(const double* p, const int n, const int k) { int *result; result=(int *)malloc((unsigned) k*sizeof(int)); if (!result) error("Allocation failure in multinomial"); int i; for (i=0;ipmax) pmax=pnow; } int j=n, rnum2; double rnum1,ratio; do { rnum1 = ran2(); rnum2 = discrete(0,k-1); ratio = p[rnum2]/pmax; if (rnum1 <= ratio) { ++result[rnum2]; --j; } } while (j>0); return result; } int *multinomial(const double* p, const double pmax, const int n, const int k) { int *result; result=(int *)malloc((unsigned) k*sizeof(int)); if (!result) error("Allocation failure in multinomial"); for (int i=0;i0); return result; } std::vector multinomial(const std::vector &p, const int n, const int k) { std::vector result(k); int i; for (i=0;ipmax) pmax=pnow; } int j=n, rnum2; double rnum1,ratio; do { rnum1 = ran2(); rnum2 = discrete(0,k-1); ratio = p[rnum2]/pmax; if (rnum1 <= ratio) { ++result[rnum2]; --j; } } while (j>0); return result; } std::vector 
multinomial(const std::vector &p, const double pmax, const int n, const int k) { std::vector result(k); int i; for (i=0;i0); return result; } /* p and result have length k. Sum of result equals n */ void multinomial(const double* p, const int k, int* result, const int n) { int i; for (i=0;ipmax) pmax=pnow; } int j=n, rnum2; double rnum1,ratio; do { rnum1 = ran2(); rnum2 = discrete(0,k-1); ratio = p[rnum2]/pmax; if (rnum1 <= ratio) { ++result[rnum2]; --j; } } while (j>0); } /* Returns the random variates in the Vector MN */ void multivariate_normal(Vector &mu, Matrix &Sigma, Vector &MN) { Matrix temp; Vector z; return multivariate_normal(mu,Sigma,MN,temp,z); } /* Returns the random variates in the Vector MN */ void multivariate_normal(Vector &mu, Matrix &Sigma, Vector &MN, Matrix &temp, Vector &z, bool *cholesky_fail=0) { /* Cholesky decomposition from Numerical Recipies in C++ */ /* Note that eigen decomposition is stabler, and might better pick up non-positive definite Sigma. If not picked up, the empirical variance-covariance matrix for the simulations will not equal Sigma. */ int i,j,k; double sum; int n = Sigma.nrows(); if(n!=Sigma.ncols()) error("multivariate_normal(): Sigma is not a square matrix"); if(n!=mu.size()) error("multivariate_normal(): mu and Sigma have incompatible sizes"); if(cholesky_fail!=0) *cholesky_fail = false; temp.resize(n,n); z.resize(n); MN.resize(n); for(i=0;i=0;k--) sum -= temp[i][k]*temp[j][k]; if(i==j) { if(sum <= 0.0) {/* Sigma, with rounding errors, is not positive definite */ if(cholesky_fail!=0) { *cholesky_fail = true; return; } printf("\nSigma = \n"); int ii,jj; for(ii=0;ii &mu, LowerTriangularMatrix &Cholesky, Vector &MN, Vector &z) { /* Cholesky decomposition from Numerical Recipies in C++ */ /* Note that eigen decomposition is stabler, and might better pick up non-positive definite Sigma. If not picked up, the empirical variance-covariance matrix for the simulations will not equal Sigma. 
*/ int i,k; int n = Cholesky.n(); if(n!=mu.size()) error("multivariate_normal(): mu and Sigma have incompatible sizes"); z.resize(n); MN.resize(n); /* Simulate MultiNormal(mu, Sigma), where Sigma is the variance-covariance matrix. Compute the Cholesky decomposition Sigma = L . L', where ' denotes the transpose. Generate a vector of i.i.d. standard normal variates Z. Then M = L' . Z + mu has the desired distribution.*/ for(i=0;i g); } else { if (xm != oldm) { oldm=xm; sq=sqrt(2.0*xm); alxm=log(xm); g=xm*alxm-gammln(xm+1.0); } do { do { y=tan(PI*ran2()); em=sq*y+xm; } while (em < 0.0); em=floor(em); t=0.9*(1.0+y*y)*exp(em*alxm-gammln(em+1.0)-g); } while (ran2() > t); } return em; } /*b=mean of full distribution t=cutoff point*/ double trunc_exponential(const double b, const double t) { return -b*log(1.0-(1.0-exp(-t/b))*ran2()); } /* truncated geometric with range 1..t. Mean of non-truncated distn would be 1/p. */ int trunc_geometric(const double p, const int t) { const double a = pow(1.-p,(double)t); return (int)ceil(log(a-(a-1.)*ran2())/log(1.-p)); } inline double U(void){return ran2();} double uniform(const double a, const double b) { double rnumber = ran2(); // continuous uniform [0,1] rnumber *= (b-a); // continuous uniform [0,b-a] rnumber += a; // continuous uniform [a,b] return rnumber; } double Z(void) { // Static members made class members 13/04/09 //static int iset=0; //static double gset; double fac,rsq,v1,v2; if (idum < 0) iset=0; if (iset == 0) { do { v1=2.0*ran2()-1.0; v2=2.0*ran2()-1.0; rsq=v1*v1+v2*v2; } while (rsq >= 1.0 || rsq == 0.0); fac=sqrt(-2.0*log(rsq)/rsq); gset=v1*fac; iset=1; return v2*fac; } else { iset=0; return gset; } } }; }; #endif ClonalFrameML-1.11/src/myutils/utils.h000066400000000000000000000024671307563374100177000ustar00rootroot00000000000000/* Copyright 2012 Daniel Wilson. * * utils.h * Part of the myutils library. 
* * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . */ /********************************************/ /* utils.h 23rd February 2005 */ /* (c) Danny Wilson. */ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_UTILS_H_ #define _MYUTILS_UTILS_H_ namespace myutils { template void SWAP(T &a, T &b) { T c = a; a = b; b = c; } template T MIN(T a, T b) { return (a T MAX(T a, T b) { return (a. */ /********************************************/ /* vector.h 23rd February 2005 */ /* (c) Danny Wilson. 
*/ /* www.danielwilson.me.uk */ /********************************************/ #ifndef _MYUTILS_VECTOR_H_ #define _MYUTILS_VECTOR_H_ #include "myutils/myerror.h" #include #include //#include namespace myutils { template class Vector { public: /*Preserve public access for back-compatibility*/ T *element; protected: int protected_size; int initialized; public: /*Default constructor*/ Vector() { initialized=0; initialize(0); } /*Constructor*/ Vector(int size) { initialize(size); } /*Constructor*/ Vector(int size, T value) { initialize(size); int i; for(i=0;i0) delete[] element; } Vector& initialize(int size) { element=new T[size]; if (!element) error("allocation failure in Vector::initialize()"); protected_size=size; initialized=1; return *this; } /*All current data is lost when the Matrix is resized*/ Vector& resize(int size) { if (!initialized) return initialize(size); if(size==protected_size)return *this; delete[] element; element=new T[size]; if (!element) error("allocation failure in Vector::resize()"); protected_size=size; return *this; } int size(){return protected_size;} int size() const {return protected_size;} /* void error(char* error_text) { printf("Run-time error in Vector::"); printf("%s%\n", error_text); printf("Exiting to system...\n"); exit(13); }*/ /*Copy constructor*/ Vector(const Vector &vec) /* Copy constructor for the following cases: Vector vec2(vec); Vector vec2=vec; and when Vector is returned from a function */ { initialize(vec.protected_size); int i; for(i=0;i& operator=(const Vector& vec) { resize(vec.size()); int i; for(i=0;i &vec) /* Copy constructor for the following cases: Vector vec2(vec); Vector vec2=vec; and when Vector is returned from a function */ { initialize(vec.size()); int i; for(i=0;i& operator=(const std::vector& vec) { resize(vec.size()); int i; for(i=0;i=protected_size) error("Vector::operator[](int pos): pos>=size()"); return element[pos]; }; /*Subscript operator*/inline const T& operator[](int pos) const { if(pos<0) 
error("Vector::operator[](int pos): pos<0"); if(pos>=protected_size) error("Vector::operator[](int pos): pos>=size()"); return element[pos]; }; #else /*Subscript operator*/inline T& operator[](int pos){return element[pos];}; /*Subscript operator*/inline const T& operator[](int pos) const {return element[pos];}; #endif }; }; #endif // _MYUTILS_VECTOR_H_ ClonalFrameML-1.11/src/powell.h000077500000000000000000000106451307563374100163340ustar00rootroot00000000000000/* Copyright 2013 Daniel Wilson. * * powell.h * * The myutils library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The myutils library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the myutils library. If not, see . * * Parts of this code are based on code in Numerical Recipes in C++ * WH Press, SA Teukolsky, WT Vetterling, BP Flannery (2002). 
* */ #ifndef _POWELL_MINIMISATION_H_ #define _POWELL_MINIMISATION_H_ #include #include #include #include "myutils/myutils.h" #include "brent.h" #pragma warning( disable : 4355 ) using namespace std; using namespace myutils; class PowellFunction { public: virtual double f(const vector& x) = 0; }; class Powell : public BrentFunction { public: PowellFunction &PowFunc; Brent brent; bool coutput; int ITMAX; // maximum number of iterations double TINY; // a small number double TOL; // tolerance int N; // number of dimensions [= p.size()] vector p; // parameter vector for minimum of PowFunc.f() Matrix xi; // Matrix of vector directions double function_minimum; // value of PowFunc.f() at its minimum int n_iterations; // number of iterations taken to find function_minimum // int BrentFunc_i; // the column in xi that is being minimized one-dimensionally vector BrentFunc_xt;// parameters to be fed into one-dimensional minimization vector BrentFunc_xi; bool fail; public: Powell(PowellFunction &PowFunc_in) : PowFunc(PowFunc_in), ITMAX(200), TINY(1.0e-25), TOL(1.0e-8), coutput(false), brent(*this) {} const vector& minimize(const vector& parameters, const double tol) { fail = false; p = parameters; n_iterations = 0; N = (int)parameters.size(); xi = Matrix(N,N,0.0); int i; for(i=0;i. 
* */ #include "myutils/DNA.h" void readXMFA(const char *filename,DNA * dna,vector * sites_to_ignore) { string unlink=string(1000,'N'); ifstream in(filename); if(!in.is_open()) { string errmsg = "readXMFA(): File "+string(filename)+" not found"; error(errmsg.c_str()); } dna->nseq = 0; int block=0; string s; getline(in,s);while (s.empty()||*s.begin()=='#') getline(in,s); if (!s.empty()&&*s.rbegin()=='\r') s.erase(s.length()-1,1); s.erase(remove(s.begin(),s.end(),' '),s.end()); s=s.substr(0,s.find(":")); if(s.length()>0 && s[0]!='>') { string errmsg = "readXMFA(): File "+string(filename)+" did not begin with '>'"; error(errmsg.c_str()); } dna->label.push_back(s.substr(1)); string newseq = ""; while(!in.eof()) { getline(in,s);if (s.empty()||*s.begin()=='#') continue; if (!s.empty()&&*s.rbegin()=='\r') s.erase(s.length()-1,1); s.erase(remove(s.begin(),s.end(),' '),s.end()); s=s.substr(0,s.find(":")); if(s.length()>0 && (s[0]=='>'||s[0]=='=')) { if (block==0) dna->sequence.push_back(""); if (dna->nseq>=0) { if (block==0) dna->sequence[dna->nseq]+=newseq; else { if (dna->nseq==0) for (int i=0;ipush_back(dna->sequence[0].length()+i); dna->sequence[dna->nseq]+=unlink+newseq;} } newseq = ""; if(s[0]=='>') {dna->nseq++;if (block==0) dna->label.push_back(s.substr(1));} else {block++;dna->nseq=-1;} } else newseq += s; } dna->nseq=dna->sequence.size(); dna->lseq=dna->sequence[0].length(); in.close(); }