plink-1.07-src/0000755000265600020320000000000011270312402012457 5ustar tilleaadminplink-1.07-src/linear.cpp0000644000265600020320000003536211264127625014464 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "linear.h" #include "helper.h" #include "options.h" #include "stats.h" LinearModel::LinearModel(Plink * p_) { P = p_; nc = 0; cluster = false; RSS = -1; } void LinearModel::setDependent() { // Set dependent variable and intercept Y.clear(); for (int i=0; in; i++) if (!miss[i]) Y.push_back( P->sample[i]->pperson->phenotype ) ; } void LinearModel::pruneY() { ////////////////////////////////// // Prune out rows that are missing vector Y2; for (int i=0; i &ia, const int mfit) { int i,j,k; int ma=ia.size(); for (i=mfit;i=0;j--) { if (ia[j]) { for (i=0;i= big) { big=fabs(a[j][k]); irow=j; icol=k; } } } ++(ipiv[icol]); if (irow != icol) { for (l=0;l=0;l--) { if (indxr[l] != indxc[l]) for (k=0;k &ia, matrix_t &covar, double &chisq, matrix_t & X) { int i,j,k,l,m,mfit=0; double ym,wt,sum,sig2i; int ndat=x.size(); int ma=a.size(); vector_t afunc(ma); matrix_t beta; sizeMatrix(beta,ma,1); for (j=0;j > temp; sizeMatrix(temp,mfit,mfit); for (j=0;j nind..."); } void LinearModel::standardise() { // Get mean and variance for all variable double sdY = sqrt(varY); for (int i=0; i wmax) wmax=w[j]; thresh=TOL*wmax; for (j=0;j sc(nc); for (int i=0; ichr << " " << setw(par::pp_maxsnp) << loc->name << " " << setw(10) << loc->bp << " " << setw(4) << loc->allele1 << " " << setw(10) << label[p] << " " << setw(8) << Y.size() << " "; if (okay) { OUT << setw(10) << coef[p] << " "; if (par::display_ci) OUT << setw(8) << se << " " << setw(8) << coef[p] - par::ci_zt * se << " " << setw(8) << coef[p] + par::ci_zt * se << " "; OUT << setw(12) << Z << " " << setw(12) << pvalue; } else { OUT << setw(10) << "NA" << " "; if (par::display_ci) OUT << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; OUT << setw(12) << "NA" << " " << setw(12) << "NA"; } OUT << "\n"; } } } double LinearModel::calculateRSS() { // Might already be calculated? if ( RSS >= 0 ) return RSS; // Calculate residual sum of squares (RSS) RSS = 0; for (int i=0; i 0 ? ( r > 1 ? 1 : r ) : 0; } double LinearModel::calculateAdjustedRSquared() { double ra = 1 - ( (double)(nind-1)/(double)(nind-np-1) ) * ( 1 - calculateRSquared() ); return ra > 0 ? ( ra > 1 ? 1 : ra ) : 0; } double LinearModel::calculateMallowC(LinearModel * submodel) { // Mallow's C = RSSm / S^2 + 2(m+1)-n // where S^2 = RSSk / (n-k-1); double Sk = calculateRSS() / ( nind - np - 1); return ( submodel->calculateRSS() / Sk ) + 2 * ( submodel->np+1)-nind; } double LinearModel::calculateFTest(LinearModel * submodel) { double RSSk = calculateRSS(); double RSSm = submodel->calculateRSS(); return ( ( RSSm - RSSk ) / (double)( np - submodel->np ) ) / ( RSSk / (double)(nind - np - 1 ) ); } double LinearModel::getPValue() { vector_t var = getVar(); bool okay = var[testParameter] < 1e-20 || !realnum(var[testParameter]) ? 
false : all_valid; if (all_valid) { double se = sqrt(var[testParameter]); double Z = coef[testParameter] / se; return pT(Z,Y.size()-np); } else return 1; } vector_t LinearModel::getPVals() { int tmp = testParameter; vector_t res; for (testParameter = 1; testParameter < np; testParameter++) res.push_back( getPValue() ); testParameter = tmp; return res; } void LinearModel::HuberWhite() { // } plink-1.07-src/legacy.cpp0000644000265600020320000006362311264127625014457 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" using namespace std; void Plink::findIBSRuns(Individual * person1, Individual * person2, ofstream & IBS) { int l=0; int lastibs=0; int lastchr=-1; int last; int nmiss = 0; int nibs0 = 0; bool run = false; bool justfinished = false; int start = 0; int end = 0; int hetcnt = 0; vector::iterator a1 = person1->one.begin(); vector::iterator a2 = person1->two.begin(); vector::iterator b1 = person2->one.begin(); vector::iterator b2 = person2->two.begin(); while ( a1 != person1->one.end() ) { // Skip haploid chromosomes, for now if ( ( par::chr_sex[locus[l]->chr] && ( person1->sex || person2->sex ) ) || par::chr_haploid[locus[l]->chr] ) { a1++; a2++; b1++; b2++; l++; continue; } bool miss = false; bool ibs0 = false; if ( *a1 == *a2 && *b1 == *b2 && *a1 != *b1 ) ibs0 = true; else if ( *a1 && !(*a2) ) miss = true; else if ( *b1 && !(*b2) ) miss = true; else if ( par::ibs_2only ) { // If we are only looking for IBS2, then make IBS1->0 if ( *a1 != *b1 || *a2 != *b2 ) ibs0 = true; } // Outside of a run? if (!run) { // A new IBS run? if ( ( ! miss) && ( ! ibs0 ) ) { start = lastibs = l; nmiss = nibs0 = 0; run=true; } } else // if already in a run, either end or increase length? { if ( ibs0 ) // ...found IBS0/error? { if (nibs0 == par::ibs_run_0) { end = lastibs; run = false; justfinished = true; } else { nibs0++; } } else if ( miss ) // ...missing genotypes? { if (nmiss == par::ibs_run_missing) { end = lastibs; run = false; justfinished = true; } else { nmiss++; } } if ( locus[l]->chr != locus[start]->chr ) // different chromosome? { end = l-1; run = false; justfinished = true; } else if ( l == (nl_all -1 ) ) // or end of all SNPs? { if ( ! ( miss || ibs0 ) ) end = l; else end = l-1; run = false; justfinished = true; } else if ( l>0 && locus[l]->chr == locus[l-1]->chr && ( locus[l]->bp - locus[l-1]->bp ) > par::ibs_inter_snp_distance ) // or too great a gap between SNPs? { end = lastibs; run = false; justfinished = true; } else // ...continue run, recording whether either locus is a het { if ( ! ( miss || ibs0 ) ) { lastibs=l; if ( *a1 != *a2 || *b1 != *b2 ) hetcnt++; } } } // Check run length? 
if (justfinished) { if ( locus[end]->bp - locus[start]->bp >= par::ibs_run_length_kb * 1000 && end - start + 1 >= par::ibs_run_length_snps ) { // Record Segment s; s.p1 = person1; s.p2 = person2; s.start = start; s.finish = end; segment.push_back(s); // Display IBS << setw(par::pp_maxfid) << person1->fid << " " << setw(par::pp_maxiid) << person1->iid << " " << setw(par::pp_maxfid) << person2->fid << " " << setw(par::pp_maxiid) << person2->iid << " " << setw(4) << locus[start]->chr << " " << setw(par::pp_maxsnp) << locus[start]->name << " " << setw(par::pp_maxsnp) << locus[end]->name << " " << setw(12) << locus[start]->bp << " " << setw(12) << locus[end]->bp << " " << setw(10) << (double)(locus[end]->bp - locus[start]->bp) /(double)1000 << " " << setw(5) << (double)(locus[end]->pos - locus[start]->pos) << " " << setw(5) << end - start +1 << " " << setw(6) << nibs0 << " " << setw(6) << (double)hetcnt/(double)(end-start+1) << " " << setw(6) << nmiss << " "; if ( lastchr == locus[start]->chr ) { IBS << setw(6) << start - last - 1 << " "; IBS << setw(6) << (double)( locus[start]->bp - locus[last]->bp ) / 1000.0 << "\n"; } else { IBS << setw(6) << "NA" << " "; IBS << setw(6) << "NA" << "\n"; lastchr = locus[start]->chr; } last = end; } ////////////////// // Clear counters start = end = nmiss = hetcnt = 0; justfinished=false; } /////////////// // Next locus a1++; a2++; b1++; b2++; l++; } } void Plink::preCalcPhenotypes() { // For binary traits: // SD = X + Y - 2XY // CP = XY - KX - KY + K^2 // = X + Y - (1/K)XY // WT = X + Y - aXY // a = (1-2K + K // Phenotype mean, and number of non-missing phenotypes int npheno=0; for (int i=0; imissing) { m_phenotype += person->phenotype; npheno++; } } // Test for no non-missing phenotypes: a problem if a subsequent // test has been specified if (npheno==0 && (par::assoc_test || par::plink || par::TDT_test || par::ibs_sharing_test || par::epistasis) ) error("No nonmissing phenotypes / available individuals"); // Option to fix the mean/variance or prevalence? 
Swap in here if (par::fix_prev) { m_phenotype = 1 + par::fixed_prev; v_phenotype = npheno * par::fixed_prev * (1-par::fixed_prev); } // Calculate mean of phenotype m_phenotype /= npheno; if (par::qt) { stringstream s2; s2 << "Phenotype mean = " << m_phenotype << "\n"; printLOG(s2.str()); } printLOG("Final analysis contains " + int2str(npheno) + " non-missing individuals\n\n"); } short Plink::calcPhenotypes(vector & l, Individual *p1, Individual *p2) { ///////////////////////////////////// // Calculate pairwise phenotype score short skip = 0; if (p1->missing || p2->missing) skip = 1; else if (par::remove_unaffected_pairs && p1->phenotype == 1 && p2->phenotype == 1) l.push_back(-999); else { if (par::SD) l.push_back( (p1->phenotype - p2->phenotype) * (p1->phenotype - p2->phenotype ) ); else if (par::CP) l.push_back(-(p1->phenotype - m_phenotype) * (p2->phenotype - m_phenotype)); } return skip; } void Plink::calcRegression(int chr) { /////////////////////////////////////////////// // For a specific chromosome, regress the SD/CP // on all pi-hat values (multi- or single-point) // Permutes individuals & recalculates pairwise phenotypes // Also, keep track of empirical p-values for the chromosome, // for subsequent minP correction // pihat[pair][position] // Number of positions int npos = pihat[0].size(); // Number of replicates int R = par::replicates; vector res; // Partial correlations, for a chromosome vector maxres; // Largest correlation per replicate vector pv(npos,0); // Empirical p-values vector pvalid(npos,0); // Number of valid partial correlations // Create phenotype list phenotype.resize(0); for (int i=0; isol // Determine number of groups int ns=0; for (int i=0; isol > ns) ns=sample[in_anal[i]]->sol; ns++; // Make 's' which records group membership // s[group][person] vector< vector > s; s.resize(ns); for (int i=0; isol].push_back(i); ////////////////////// // Start permutations for (int p=1; p<=R; p++) { cout << "Regression permutation: " << p << " of " << R << " \r"; // Vector 'in_anal' contains a list of individuals who // actually feature at least once in the main analysis // Store remapped IDs vector > indx; // Permute phenotypes, within cluster for (int k=0; k p(s[k].size()); permute(p); indx.push_back(p); } // Extract the new permuteds vector pin_anal(in_anal.size()); for (int j=0; j pall(sample.size(),-1); for (int i=0; i perm; perm.resize(0); for (int i=0; ifid // << "_" // << sample[pair1[i]]->iid // << " - " // << sample[pair2[i]]->fid // << "_" // << sample[pair2[i]]->iid // << " becomes " // << sample[pall[pair1[i]]]->fid // << "_" // << sample[pall[pair1[i]]]->iid << " - " // << sample[pall[pair2[i]]]->fid // << "_" // << sample[pall[pair2[i]]]->iid // << "\n"; calcPhenotypes(perm,sample[pall[pair1[i]]],sample[pall[pair2[i]]]); } // Re-standardise pairs phenotype in the // newly permuted list (as this will potentially // have a different mean and variance compared to the // original list, as each individual may now feature a // different number of times preCalcRegression_PHENO(perm); /////////////////// // Perform analyses // Track the maximum observed test statistic double mx=0; // Get vector of test statistics vector pres = doRegression(npos,perm); // Compare permuted test statistics against the originals double zero=0; for (int l=0; l= res[l]) pv[l]++; // Is this the maximum? 
if (pres[l]>mx) mx=pres[l]; } } // Save maximum statistic maxres.push_back(mx); // Next permutation } ////////////////////////////////////////////// // Finished permutations for this chromosome // Save max-values for genome-wide comparison for (int p=0; p= maxr2[p]) maxr2[p]=maxres[p]; cout << "\n"; ///////////////// // Output results ofstream PLO; string f = par::output_file_name + "-" + int2str(chr+1) + ".plink"; PLO.open(f.c_str(),ios::out); printLOG("Writing main PLINK results to [ " + f + " ] \n"); for (int l=0; l= res[l]) maxpv++; double p1, p2; string n1, n2; double fr = 0, fr2 = 0; //////////////////////////////// // Single and multipoint output if (m1[l]==-1) { p1 = locus[par::run_start]->pos - par::fringe; n1 = "fringe"; } else { p1 = locus[m1[l]]->pos; n1 = locus[m1[l]]->name; fr = locus[m1[l]]->freq; if (fr>0.5) fr2=1-fr; else fr2=fr; } if (m2[l]==-1) { p2 = locus[par::run_end]->pos + par::fringe; n2 = "fringe"; } else { p2 = locus[m2[l]]->pos; n2 = locus[m2[l]]->name; } if (m1[l]==-1 && m2[l]==-1) { p1 = p2 = 0; n1 = "Genomewide"; n2 = "IBD"; } double d1 = p1 + pos[l] * (p2-p1); if (res[l] != res[l]) PLO << "R " << par::run_chr << " " << n1 << " " << n2 << " " << d1 << " " << fr << " " << fr2 << " " << "NaN NaN NaN NaN\n"; else PLO << "R " << par::run_chr << " " << n1 << " " << n2 << " " << d1 << " " << fr << " " << fr2 << " " << res[l] << " " << (double)(pv[l]+1)/(double)(pvalid[l]+1) << " " << pvalid[l] << " " << (double)maxpv/(double)(pvalid[l]+1) << "\n"; } PLO.close(); } void Plink::preCalcRegression_PHENO(vector & pheno) { ///////////////////////////////////// // Precalculate means and variances // Number of pairs int N_pairs = pheno.size(); // Mean double m_pair_phenotype=0; int ntmp=0; for (int i=0; i Plink::doRegression(int npos, vector & ph) { vector rv; // Dependent variable = squared phenotype difference // = crossproduct // pihat[pair][position] // m_phenotype // v_phenotype // m_pihat[] // v_pihat[] // Partial corretion ( Phenotype ~ IBD local | IBD global ) // r_{PL|G} = ( r_{PL} - r_{PG}r_{LG} ) / sqrt( (1-r_PG^2)(1-r_LG^2) ) // r_PL : calculate as before // r_PG : calculate only once: make this the first item always // r_LG : need to also calculate this each time... // Number of pairs int N_pairs = pihat.size(); for (int l=0; l-998){ r_XY += pihat[i][l] * ph[i]; r_XZ += pihat[i][l] * pihat_G[i]; r_YZ += ph[i] * pihat_G[i]; N_actual++; } } r_XY /= N_actual-1; r_XZ /= N_actual-1; r_YZ /= N_actual-1; if (par::FIXED) r_XZ = r_YZ = 0 ; double partial = ( r_XY - r_XZ*r_YZ ) / sqrt( (1 - r_XZ*r_XZ ) * ( 1 - r_YZ*r_YZ ) ) ; // cerr << r_XZ << "\n"; // cerr << "r_XY... " // << l << "\t" // << r_XY << "\t" // << r_XZ << "\t" // << r_YZ << "\t" // << partial << "\n"; // Check that we actually had locus-specific information? // i.e. 
if the correlation between global and locus-specific // IBD is too high, then we don't want to look at this // position (set to inf) double zero=0; if ( r_XZ > par::MAX_CORR_PIHAT_PIHAT_G ) partial = 1/zero; else { // Check for +/- inf or nan status -- for now, return 0 if so if (partial != partial || partial == 1/zero || partial == -1/zero) { cerr << "WARNING: " << l << "\t" << partial << "\t" << r_XY << "\t" << r_XZ << "\t" << r_YZ << "\n"; partial = 1/zero; } } //save (minus, as default is SD) // reverse sign at permutation counting stage // if a different test rv.push_back(-partial); } // cout << "END"; return rv; } void Plink::calcAssociationWithBootstrap() { // TODO: // 1) how to handle missing phenotype and/or // genotype data in the bootstrap: i.e. ignore? // adjust by factor of n/n* ? impute? // count number of non-missing individuals in sample // int nonmiss = 0; // for (int i=0; imissing) nonmiss++; // else if (sample[i]->phenotype!=1 || // sample[i]->phenotype!=2 ) // error("Must be 1/2 coding for association test"); // } // vector pv(nl_all); // vector maxpv(nl_all); // vector original; // int aff; // int unf; // vector a1(nl_all); // vector a2(nl_all); // vector a0(nl_all); // vector odds(nl_all); // vector exp_afffreq1(nl_all); // vector exp_afffreq2(nl_all); // vector exp_unffreq1(nl_all); // vector exp_unffreq2(nl_all); // // Original association results // vector chisq = testAssoc(aff,unf, // a1,a2,a0, // odds, // exp_afffreq1, exp_afffreq2, // exp_unffreq1, exp_unffreq2, // perm); // for (int l=0; l< chisq.size(); l++) // { // // Z-score // double z = sqrt(chisq[l]); // // See direction of allele 1 as +ve // if ( odds[l] < 1) // CHECK THIS... // z *= -1; // // Make "lower" labelled allele base // if (locus[l]->allele1 > locus[l]->allele2) // z *= -1; // cout << z << " "; // } // cout << "\n"; // // Bootstrap samples // // C 00 AA GC // // U AA 00 GG // // copy original phenotypes (1/2/0) // vector orig_pheno(sample.size()); // for (int i=0; iphenotype; // // copy genotype data // vector< vector > orig_geno1(sample.size()); // vector< vector > orig_geno2(sample.size()); // for (int i=0; ione[l]; // orig_geno2[i][l] = sample[i]->two[l]; // } // } // for (int bs=0; bsphenotype = orig_pheno[pick]; // for (int l=0; l < nl_all; l++) // { // sample[i]->one[l] = orig_geno1[pick][l]; // sample[i]->two[l] = orig_geno2[pick][l]; // } // } // // Perform association tests // chisq = testAssoc(aff,unf, // a1,a2,a0, // odds, // exp_afffreq1, exp_afffreq2, // exp_unffreq1, exp_unffreq2, // perm); // // Output raw BS statistic (chisq) // for (int l=0; l< chisq.size(); l++) // { // // Z-score // double z = sqrt(chisq[l]); // // See direction of allele 1 as +ve // if ( odds[l] < 1) // CHECK THIS // z *= -1; // // Make "lower" labelled allele base // if (locus[l]->allele1 > locus[l]->allele2) // z *= -1; // cout << z << " "; // } // cout << "\n"; // } // ////////////// // // Runs of IBS // if (par::ibs_run) // { // if (par::SNP_major) // P.SNP2Ind(); // P.segment.resize(0); // ofstream IBS; // string f = par::output_file_name + ".ibs"; // if (par::ibs_2only) f += "2"; // IBS.open(f.c_str(),ios::out); // IBS.precision(4); // if (par::ibs_2only) // P.printLOG("Writing IBS(2)-run information to [ "+f+" ] \n"); // else // P.printLOG("Writing IBS(1+2)-run information to [ "+f+" ] \n"); // P.printLOG("Run defined as "+int2str(par::ibs_run_length_kb) + " kb "); // P.printLOG("and "+int2str(par::ibs_run_length_snps) + " SNPs "); // P.printLOG("(maximum gap of " // +int2str( 
(int)((double)par::ibs_inter_snp_distance/1000))+" kb)\n"); // if (par::ibs_2only) // P.printLOG("Allowing "+int2str(par::ibs_run_0)+" IBS 0/1 SNPs per run\n"); // else // P.printLOG("Allowing "+int2str(par::ibs_run_0)+" IBS 0 SNPs per run\n"); // P.printLOG("Allowing " // +int2str(par::ibs_run_missing)+" missing SNPs per run\n"); // IBS << setw(par::pp_maxfid) << "FID1" << " " // << setw(par::pp_maxiid) << "IID1" << " " // << setw(par::pp_maxfid) << "FID2" << " " // << setw(par::pp_maxiid) << "IID2" << " " // << setw(4) << "CHR" << " " // << setw(par::pp_maxsnp) << "SNP1" << " " // << setw(par::pp_maxsnp) << "SNP2" << " " // << setw(12) << "POS1" << " " // << setw(12) << "POS2" << " " // << setw(10) << "KB" << " " // << setw(5) << "CM" << " " // << setw(5) << "NSNP" << " " // << setw(6) << "NIBS0" << " " // << setw(6) << "HET" << " " // << setw(6) << "NMISS" << " " // << setw(6) << "SNPGAP" << " " // << setw(6) << "KBGAP" << "\n"; // int c=0; // for (int i1=0; i1 Plink::calcSinglePoint(vector & IBD, Z IBDg) { vector pihat; // Singlepoint individual loci for (int l=0; lname << "\t" << IBD[l].z0 << " " << IBD[l].z1 << " " << IBD[l].z2 << " => " << IBD[l].z1*0.5+IBD[l].z2 << " " << (IBDg.z1*0.5 + IBDg.z2) << "\n"; } } // Final analysis is genome-wide IBD if (!par::done_global_pihat) pihat_G.push_back( (IBDg.z1*0.5 + IBDg.z2) ); return pihat; } plink-1.07-src/haploCC.cpp0000644000265600020320000002305511264127625014517 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "phase.h" #include "helper.h" #include "stats.h" void HaploPhase::haplotypicCC(map & tests, int nt, bool display) { vector caseN(nt); vector controlN(nt); // Consider each individual for (int i=0; imissing ) { if ( person->aff ) { for (int z = 0 ; z < hap1[i].size(); z++) { map::iterator i1 = tests.find(hap1[i][z]); map::iterator i2 = tests.find(hap2[i][z]); if ( i1 != tests.end() ) { if (!ambig[i]) caseN[i1->second]++; else caseN[i1->second] += pp[i][z]; } if ( ! ( haploid || X && person->sex ) ) { if ( i2 != tests.end() ) { if (!ambig[i]) caseN[i2->second]++; else caseN[i2->second] += pp[i][z]; } } } } // Or control? else { for (int z = 0 ; z < hap1[i].size(); z++) { map::iterator i1 = tests.find(hap1[i][z]); map::iterator i2 = tests.find(hap2[i][z]); if ( i1 != tests.end() ) { if (!ambig[i]) controlN[i1->second]++; else controlN[i1->second] += pp[i][z]; } if ( ! ( haploid || X && person->sex ) ) { if ( i2 != tests.end() ) { if (!ambig[i]) controlN[i2->second]++; else controlN[i2->second] += pp[i][z]; } } } } } } // next individual //////////////////////////////////////// // Display header haplotype information? 
if ( display ) { HTEST << setw(10) << hname << " "; if (nt==2) { // Find test haplotype int hh=0; map::iterator i1 = tests.begin(); while ( i1 != tests.end() ) { if ( i1->second == 0 ) hh = i1->first; i1++; } HTEST << setw(12) << haplotypeName(hh) << " "; HTEST << setw(10) << caseN[0] / ( caseN[0] + caseN[1] ) << " " << setw(10) << controlN[0] / ( controlN[0] + controlN[1] ) << " "; } else { HTEST << setw(12) << "OMNIBUS" << " " << setw(10) << "NA" << " " << setw(10) << "NA" << " "; } } /////////////////////////////////////////////////// // Standard chi-sq statistic: use for omnibus test vector rowT(nt); double caseT = 0; double controlT = 0; for (int h=0; hname << "|"; HTEST << P.locus[S[ns-1]]->name << "\n"; } ///////////////////////////////////////////////////////// // // Adjust result based on empirical variance // ///////////////////////////////////////////////////////// if ( useEmpiricalVariance ) { set hs; map::iterator i1 = tests.begin(); while ( i1 != tests.end() ) { if ( i1->second == 0 ) hs.insert( i1->first); ++i1; } // This updates empiricalVariance in HaploPhase calculateEmpiricalVariance(hs); } // Save chi-square statistic in temporary storage result = chi2; ////////////////////////////// // Test based on proportions // Z = ( p1 - p2 ) / ( sqrt( p*(1-p)*(1/n1+1/n2) ) ) // we get p from HaploPhase (i.e. population frequency // Be careful of missing phenotypes here? // Haplotype-specific tests only if ( nt == 2 ) { /////////////////////////// // Too much of a stretch? if ( ratio < 0.01 ) { result = -1; pvalue = -9; odds = 1; case_freq = 0; control_freq = 0; return; } double p2 = caseN[0] / ( caseN[0] + caseN[1] ); double p1 = controlN[0] / ( controlN[0] + controlN[1] ); // Based on asymptotic variance double n2 = caseN[0] + caseN[1]; double n1 = controlN[0] + controlN[1]; double p = ( n1 * p1 + n2 * p2 ) / ( n1 + n2 ); // Instead of ( p2 - p1 ) / ( sqrt( p*(1-p)*(1/n1+1/n2) ) ) // use the empirical variance of dosage double Ze = ( p2 - p1 ) / ( sqrt( empiricalVariance * (1/n1+1/n2) ) ); // Chi-squared statistic based on test of proportions, using // empirical estimate of variance result = chi2 = Ze * Ze; case_freq = p2; control_freq = p1; } pvalue = chiprobP(chi2,nt-1); ////////////////////////////////// // Return odds ratio if ( nt == 2 ) { odds = ( caseN[0] * controlN[1] ) / ( controlN[0] * caseN[1] ); } } /////////////////////////////////// // Multimarker test with weighting void HaploPhase::haplotypicWeightedCC() { vector_t weights; for (int i=0; i::iterator whap = new_pred_weighted_allele[current].find( haplotypeName(i) ); if ( whap != new_pred_weighted_allele[current].end() ) { weights.push_back( whap->second ); } else { weights.push_back( 0 ); } } vector caseN(2); vector controlN(2); // Consider each individual for (int i=0; imissing ) { if (person->aff ) { for (int z = 0 ; z < hap1[i].size(); z++) { int h1 = hap1[i][z]; int h2 = hap2[i][z]; if (!ambig[i]) { caseN[0] += weights[h1]; caseN[1] += 1-weights[h1]; } else { caseN[0] += weights[h1] * pp[i][z]; caseN[1] += (1-weights[h1]) * pp[i][z]; } if ( ! ( haploid || X && person->sex ) ) { if (!ambig[i]) { caseN[0] += weights[h2]; caseN[1] += 1-weights[h2]; } else { caseN[0] += weights[h2] * pp[i][z]; caseN[1] += (1-weights[h2]) * pp[i][z]; } } } } // Or control? 
else { for (int z = 0 ; z < hap1[i].size(); z++) { int h1 = hap1[i][z]; int h2 = hap2[i][z]; if (!ambig[i]) { controlN[0] += weights[h1]; controlN[1] += 1-weights[h1]; } else { controlN[0] += weights[h1] * pp[i][z]; controlN[1] += (1-weights[h1]) * pp[i][z]; } if ( ! ( haploid || X && person->sex ) ) { if (!ambig[i]) { controlN[0] += weights[h2]; controlN[1] += 1-weights[h2]; } else { controlN[0] += weights[h2] * pp[i][z]; controlN[1] += (1-weights[h2]) * pp[i][z]; } } } } } } // next individual HTEST << setw(10) << hname << " "; // set hh to integer HTEST << setw(12) << new_map[current]->allele1 << " "; HTEST << setw(10) << caseN[0] / ( caseN[0] + caseN[1] ) << " " << setw(10) << controlN[0] / ( controlN[0] + controlN[1] ) << " "; vector rowT(2); double caseT = 0; double controlT = 0; for (int h=0; h<2; h++) { rowT[h] = caseN[h] + controlN[h]; caseT += caseN[h]; controlT += controlN[h]; } double chi2 = 0; for (int h=0; h<2; h++) { double exp = ( rowT[h] * caseT ) / (caseT + controlT); chi2 += ( ( caseN[h] - exp ) * ( caseN[h] - exp ) ) / exp ; exp = ( rowT[h] * controlT ) / (caseT + controlT); chi2 += ( ( controlN[h] - exp ) * ( controlN[h] - exp ) ) / exp ; } if ( realnum(chi2) ) { HTEST << setw(10) << chi2 << " " << setw(4) << 1 << " " << setw(10) << chiprobP(chi2,1) << " "; } else { HTEST << setw(10) << "NA" << " " << setw(4) << "NA" << " " << setw(10) << "NA" << " "; } for (int snps=0; snpsname << "|"; HTEST << P.locus[S[ns-1]]->name << "\n"; } plink-1.07-src/zfstream.cpp0000644000265600020320000003220711264127626015041 0ustar tilleaadmin/* * A C++ I/O streams interface to the zlib gz* functions * * by Ludwig Schwardt * original version by Kevin Ruland * * This version is standard-compliant and compatible with gcc 3.x. */ #include "zfstream.h" #include // for strcpy, strcat, strlen (mode strings) #include // for BUFSIZ // Internal buffer sizes (default and "unbuffered" versions) #define BIGBUFSIZE BUFSIZ #define SMALLBUFSIZE 1 /*****************************************************************************/ // Default constructor gzfilebuf::gzfilebuf() : file(NULL), io_mode(std::ios_base::openmode(0)), own_fd(false), buffer(NULL), buffer_size(BIGBUFSIZE), own_buffer(true) { // No buffers to start with this->disable_buffer(); } // Destructor gzfilebuf::~gzfilebuf() { // Sync output buffer and close only if responsible for file // (i.e. 
attached streams should be left open at this stage) this->sync(); if (own_fd) this->close(); // Make sure internal buffer is deallocated this->disable_buffer(); } // Set compression level and strategy int gzfilebuf::setcompression(int comp_level, int comp_strategy) { return gzsetparams(file, comp_level, comp_strategy); } // Open gzipped file gzfilebuf* gzfilebuf::open(const char *name, std::ios_base::openmode mode) { // Fail if file already open if (this->is_open()) return NULL; // Don't support simultaneous read/write access (yet) if ((mode & std::ios_base::in) && (mode & std::ios_base::out)) return NULL; // Build mode string for gzopen and check it [27.8.1.3.2] char char_mode[6] = "\0\0\0\0\0"; if (!this->open_mode(mode, char_mode)) return NULL; // Attempt to open file if ((file = gzopen(name, char_mode)) == NULL) return NULL; // On success, allocate internal buffer and set flags this->enable_buffer(); io_mode = mode; own_fd = true; return this; } // Attach to gzipped file gzfilebuf* gzfilebuf::attach(int fd, std::ios_base::openmode mode) { // Fail if file already open if (this->is_open()) return NULL; // Don't support simultaneous read/write access (yet) if ((mode & std::ios_base::in) && (mode & std::ios_base::out)) return NULL; // Build mode string for gzdopen and check it [27.8.1.3.2] char char_mode[6] = "\0\0\0\0\0"; if (!this->open_mode(mode, char_mode)) return NULL; // Attempt to attach to file if ((file = gzdopen(fd, char_mode)) == NULL) return NULL; // On success, allocate internal buffer and set flags this->enable_buffer(); io_mode = mode; own_fd = false; return this; } // Close gzipped file gzfilebuf* gzfilebuf::close() { // Fail immediately if no file is open if (!this->is_open()) return NULL; // Assume success gzfilebuf* retval = this; // Attempt to sync and close gzipped file if (this->sync() == -1) retval = NULL; if (gzclose(file) < 0) retval = NULL; // File is now gone anyway (postcondition [27.8.1.3.8]) file = NULL; own_fd = false; // Destroy internal buffer if it exists this->disable_buffer(); return retval; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // Convert int open mode to mode string bool gzfilebuf::open_mode(std::ios_base::openmode mode, char* c_mode) const { bool testb = mode & std::ios_base::binary; bool testi = mode & std::ios_base::in; bool testo = mode & std::ios_base::out; bool testt = mode & std::ios_base::trunc; bool testa = mode & std::ios_base::app; // Check for valid flag combinations - see [27.8.1.3.2] (Table 92) // Original zfstream hardcoded the compression level to maximum here... 
// Double the time for less than 1% size improvement seems // excessive though - keeping it at the default level // To change back, just append "9" to the next three mode strings if (!testi && testo && !testt && !testa) strcpy(c_mode, "w"); if (!testi && testo && !testt && testa) strcpy(c_mode, "a"); if (!testi && testo && testt && !testa) strcpy(c_mode, "w"); if (testi && !testo && !testt && !testa) strcpy(c_mode, "r"); // No read/write mode yet // if (testi && testo && !testt && !testa) // strcpy(c_mode, "r+"); // if (testi && testo && testt && !testa) // strcpy(c_mode, "w+"); // Mode string should be empty for invalid combination of flags if (strlen(c_mode) == 0) return false; if (testb) strcat(c_mode, "b"); return true; } // Determine number of characters in internal get buffer std::streamsize gzfilebuf::showmanyc() { // Calls to underflow will fail if file not opened for reading if (!this->is_open() || !(io_mode & std::ios_base::in)) return -1; // Make sure get area is in use if (this->gptr() && (this->gptr() < this->egptr())) return std::streamsize(this->egptr() - this->gptr()); else return 0; } // Fill get area from gzipped file gzfilebuf::int_type gzfilebuf::underflow() { // If something is left in the get area by chance, return it // (this shouldn't normally happen, as underflow is only supposed // to be called when gptr >= egptr, but it serves as error check) if (this->gptr() && (this->gptr() < this->egptr())) return traits_type::to_int_type(*(this->gptr())); // If the file hasn't been opened for reading, produce error if (!this->is_open() || !(io_mode & std::ios_base::in)) return traits_type::eof(); // Attempt to fill internal buffer from gzipped file // (buffer must be guaranteed to exist...) int bytes_read = gzread(file, buffer, buffer_size); // Indicates error or EOF if (bytes_read <= 0) { // Reset get area this->setg(buffer, buffer, buffer); return traits_type::eof(); } // Make all bytes read from file available as get area this->setg(buffer, buffer, buffer + bytes_read); // Return next character in get area return traits_type::to_int_type(*(this->gptr())); } // Write put area to gzipped file gzfilebuf::int_type gzfilebuf::overflow(int_type c) { // Determine whether put area is in use if (this->pbase()) { // Double-check pointer range if (this->pptr() > this->epptr() || this->pptr() < this->pbase()) return traits_type::eof(); // Add extra character to buffer if not EOF if (!traits_type::eq_int_type(c, traits_type::eof())) { *(this->pptr()) = traits_type::to_char_type(c); this->pbump(1); } // Number of characters to write to file int bytes_to_write = this->pptr() - this->pbase(); // Overflow doesn't fail if nothing is to be written if (bytes_to_write > 0) { // If the file hasn't been opened for writing, produce error if (!this->is_open() || !(io_mode & std::ios_base::out)) return traits_type::eof(); // If gzipped file won't accept all bytes written to it, fail if (gzwrite(file, this->pbase(), bytes_to_write) != bytes_to_write) return traits_type::eof(); // Reset next pointer to point to pbase on success this->pbump(-bytes_to_write); } } // Write extra character to file if not EOF else if (!traits_type::eq_int_type(c, traits_type::eof())) { // If the file hasn't been opened for writing, produce error if (!this->is_open() || !(io_mode & std::ios_base::out)) return traits_type::eof(); // Impromptu char buffer (allows "unbuffered" output) char_type last_char = traits_type::to_char_type(c); // If gzipped file won't accept this character, fail if (gzwrite(file, &last_char, 1) != 1) 
return traits_type::eof(); } // If you got here, you have succeeded (even if c was EOF) // The return value should therefore be non-EOF if (traits_type::eq_int_type(c, traits_type::eof())) return traits_type::not_eof(c); else return c; } // Assign new buffer std::streambuf* gzfilebuf::setbuf(char_type* p, std::streamsize n) { // First make sure stuff is sync'ed, for safety if (this->sync() == -1) return NULL; // If buffering is turned off on purpose via setbuf(0,0), still allocate one... // "Unbuffered" only really refers to put [27.8.1.4.10], while get needs at // least a buffer of size 1 (very inefficient though, therefore make it bigger?) // This follows from [27.5.2.4.3]/12 (gptr needs to point at something, it seems) if (!p || !n) { // Replace existing buffer (if any) with small internal buffer this->disable_buffer(); buffer = NULL; buffer_size = 0; own_buffer = true; this->enable_buffer(); } else { // Replace existing buffer (if any) with external buffer this->disable_buffer(); buffer = p; buffer_size = n; own_buffer = false; this->enable_buffer(); } return this; } // Write put area to gzipped file (i.e. ensures that put area is empty) int gzfilebuf::sync() { return traits_type::eq_int_type(this->overflow(), traits_type::eof()) ? -1 : 0; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // Allocate internal buffer void gzfilebuf::enable_buffer() { // If internal buffer required, allocate one if (own_buffer && !buffer) { // Check for buffered vs. "unbuffered" if (buffer_size > 0) { // Allocate internal buffer buffer = new char_type[buffer_size]; // Get area starts empty and will be expanded by underflow as need arises this->setg(buffer, buffer, buffer); // Setup entire internal buffer as put area. // The one-past-end pointer actually points to the last element of the buffer, // so that overflow(c) can safely add the extra character c to the sequence. // These pointers remain in place for the duration of the buffer this->setp(buffer, buffer + buffer_size - 1); } else { // Even in "unbuffered" case, (small?) 
get buffer is still required buffer_size = SMALLBUFSIZE; buffer = new char_type[buffer_size]; this->setg(buffer, buffer, buffer); // "Unbuffered" means no put buffer this->setp(0, 0); } } else { // If buffer already allocated, reset buffer pointers just to make sure no // stale chars are lying around this->setg(buffer, buffer, buffer); this->setp(buffer, buffer + buffer_size - 1); } } // Destroy internal buffer void gzfilebuf::disable_buffer() { // If internal buffer exists, deallocate it if (own_buffer && buffer) { // Preserve unbuffered status by zeroing size if (!this->pbase()) buffer_size = 0; delete[] buffer; buffer = NULL; this->setg(0, 0, 0); this->setp(0, 0); } else { // Reset buffer pointers to initial state if external buffer exists this->setg(buffer, buffer, buffer); if (buffer) this->setp(buffer, buffer + buffer_size - 1); else this->setp(0, 0); } } /*****************************************************************************/ // Default constructor initializes stream buffer gzifstream::gzifstream() : std::istream(NULL), sb() { this->init(&sb); } // Initialize stream buffer and open file gzifstream::gzifstream(const char* name, std::ios_base::openmode mode) : std::istream(NULL), sb() { this->init(&sb); this->open(name, mode); } // Initialize stream buffer and attach to file gzifstream::gzifstream(int fd, std::ios_base::openmode mode) : std::istream(NULL), sb() { this->init(&sb); this->attach(fd, mode); } // Open file and go into fail() state if unsuccessful void gzifstream::open(const char* name, std::ios_base::openmode mode) { if (!sb.open(name, mode | std::ios_base::in)) this->setstate(std::ios_base::failbit); else this->clear(); } // Attach to file and go into fail() state if unsuccessful void gzifstream::attach(int fd, std::ios_base::openmode mode) { if (!sb.attach(fd, mode | std::ios_base::in)) this->setstate(std::ios_base::failbit); else this->clear(); } // Close file void gzifstream::close() { if (!sb.close()) this->setstate(std::ios_base::failbit); } /*****************************************************************************/ // Default constructor initializes stream buffer gzofstream::gzofstream() : std::ostream(NULL), sb() { this->init(&sb); } // Initialize stream buffer and open file gzofstream::gzofstream(const char* name, std::ios_base::openmode mode) : std::ostream(NULL), sb() { this->init(&sb); this->open(name, mode); } // Initialize stream buffer and attach to file gzofstream::gzofstream(int fd, std::ios_base::openmode mode) : std::ostream(NULL), sb() { this->init(&sb); this->attach(fd, mode); } // Open file and go into fail() state if unsuccessful void gzofstream::open(const char* name, std::ios_base::openmode mode) { if (!sb.open(name, mode | std::ios_base::out)) this->setstate(std::ios_base::failbit); else this->clear(); } // Attach to file and go into fail() state if unsuccessful void gzofstream::attach(int fd, std::ios_base::openmode mode) { if (!sb.attach(fd, mode | std::ios_base::out)) this->setstate(std::ios_base::failbit); else this->clear(); } // Close file void gzofstream::close() { if (!sb.close()) this->setstate(std::ios_base::failbit); } plink-1.07-src/nonfounderphasing.cpp0000644000265600020320000004744111264127625016742 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "genogroup.h" #include "phase.h" #include "haplowindow.h" extern Plink * PP; void displayFamTran(map & pmap, int fi, HaploPhase * HP) { cout << "FAMILY " << fi << " : " << PP->sample[fi]->fid << "\n"; map::iterator i = pmap.begin(); cout << setw(12) << "PATERNAL" << " " << setw(12) << "MATERNAL" << " " << " -> " << setw(12) << "OFFSPRING" << " " << setw(8) << "PROB" << "\n"; while ( i != pmap.end() ) { const FamilyTransmissions * f = &(i->first); cout << setw(12) << (HP->haplotypeName( f->pt ) + "/" + HP->haplotypeName( f->pu ) ) << " " << setw(12) << (HP->haplotypeName( f->mt ) + "/" + HP->haplotypeName( f->mu ) ) << " " << " -> " << setw(12) << (HP->haplotypeName( f->pt ) + "/" + HP->haplotypeName( f->mt ) ) << " " << setw(8) << i->second << "\n"; ++i; } cout << "\n"; } void HaploPhase::validateNonfounder(int i, vector & s1, vector & s2) { // Flipping allele-coding for homozygotes for (int s=0; sone[i]; s2[s] = P.SNP[S[s]]->two[i]; } else { s1[s] = P.sample[i]->one[S[s]]; s2[s] = P.sample[i]->two[S[s]]; } if (s1[s] == s2[s]) { s1[s] = !s1[s]; s2[s] = !s2[s]; } } ////////////////////////////////////////////////////////// // Count amount of missing genotype data at this position int nm = 0; for (int s=0; s0) ambig[i] = true; // But if too much missing genotype data, then // we should not even try to phase this individual // for this region; note -- females should always be // missing all genotypes for Y, so we don't need to // worry about allowing for a special case here. if ( (double)nm/(double)ns >= par::hap_missing_geno ) { include[i] = false; } /////////////////////////////////////////////// // 2 or more hets at any loci -> ambiguous // Haploid genotypes should never be heterozygous, // so we are okay here w.r.t X chromosome int het=0; for (int s=0; s1) ambig[i] = true; return; } bool HaploPhase::consistentNonfounderPhaseGivenGenotypes(vector & s1, vector & s2, int h1, int h2) { // This function works for autosomal, haploid and sex chromosomes // Template haplotypes vector & t1 = hap[h1]; vector & t2 = hap[h2]; for (int s=0; s & s1, vector & s2, int h2) { // This function works for haploid individuals (male X offspring); // only the mother transmitted the X // Template haplotypes vector & t1 = hap[h2]; for (int s=0; ssex ) { // We should only have specified possible homozygous phases -- // therefore, if male X chr, we only need to check that it is // consistent with at least one maternal X if ( h1 == m1 || h1 == m2 ) return true; } return (h1 == p1 && h2 == m1 ) || (h1 == p1 && h2 == m2 ) || (h1 == p2 && h2 == m1 ) || (h1 == p2 && h2 == m2 ) || (h1 == m1 && h2 == p1 ) || (h1 == m1 && h2 == p2 ) || (h1 == m2 && h2 == p1 ) || (h1 == m2 && h2 == p2 ); } void HaploPhase::resolveWithKids(int i) { // Consider the founders in each family, who // have at least 1 child, and a genotyped spouse // We require a full family, with two parents // if ( ! 
f->parents ) return; // int pati = pat-> // Individual * pat = f->pat; // Individual * mat = f->mat; // A/a B/b A/a B/b -> A/A B/B // AB / AB // for (int i=0; i< P.family[f].size(); i++) // cout << P.family[f]->fid << "\t" // << P.family[f]->iid << "\t" // << P.family[f]->pat->iid << "\t" // << P.family[f]->mat->iid << "\n"; } void HaploPhase::phaseAndScoreNonfounder(int i) { ////////////////////////////////////////////// // Always try to phase this offspring include[i] = true; ////////////////////////////////////////////// // Link this individual up with their parents int father = P.sample[i]->ip; int mother = P.sample[i]->im; bool nofather = false; bool nomother = false; if (father==-1) nofather = true; else if (!include[father]) nofather = true; if (mother==-1) nomother = true; else if (!include[mother]) nomother = true; // For TDT purposes, we require both parents to be 'observed' // i.e. so we never we to consider the "AllPhases" list (so // we now do not bother generating it, i.e. enumerateAllPhases() // function call is commented out in the main loop above if ( nofather || nomother ) { include[i] = false; return; } int pat_phases = hap1[father].size(); int mat_phases = hap1[mother].size(); // Too much ambiguity? if (pat_phases * mat_phases >= par::hap_max_nf_phases ) { include[i] = false; return; } // Keep track of transmitted and non-transmitted // haplotypes if performing a TDT-type analysis vector > trans1(0); vector > untrans1(0); ///////////////////////////////////////// // Perform fill-in phasing for offspring // Step 1. Enumerate possible offspring phases // or set to not include if too much missing, // and populate s1/s2 with genotype data for // region vector s1(ns); vector s2(ns); validateNonfounder(i,s1,s2); ////////////////////////////////////////////// // Do we want to attempt to reconstruct phase? if ( ! include[i] ) { return; } //////////////////////////////////////////////// // Step 2. Joint distribution of parental phases double psum = 0; int pcnt=1; // Set offspring posterior probability list to nil pp[i].clear(); // Consider all possible pairs of parental phases and implied // possible haplotypic transmissions // If no mother or father exists, we are using the standard // ph[] enumeration of all possible haplotypes: NOT SUPPORTED // CURRENTLY, BUT WE COULD IMPLEMENT AGAIN FOR SIBLINGS vector & pathap1 = nofather ? ph_hap1 : hap1[father]; vector & pathap2 = nofather ? ph_hap2 : hap2[father]; vector & mathap1 = nomother ? ph_hap1 : hap1[mother]; vector & mathap2 = nomother ? ph_hap2 : hap2[mother]; map & pmap = phasemap[i]; for (int z1=0; z1 < pat_phases ; z1++) for (int z2=0; z2 < mat_phases ; z2++) { int p1 = pathap1[z1]; int p2 = pathap2[z1]; int m1 = mathap1[z2]; int m2 = mathap2[z2]; // Legacy code: we no longer take this approach, but // the code is left here to show how to call the function // // Obtain possible offspring phases, given offspring // // genotypes and parental haplotypes // enumerateNonfounderPhase(i, // offspring individual // s1, s2, // offspring genotypes // p1, p2, // paternal haplotypes // m1, m2, // maternal haplotypes // phap1, phap2); // return possible offspring haplotypes // Given parental phases, there are four possible autosomal // offspring tranmissions (for autosomes). We should enumerate // these and see which are consistent with the observed // offspring genotypes // Autosome(PM) Haploid* X(->female) X(->male) // 00 0 00 *0 // 01 1 01 *1 // 10 // 11 // * not implemented; i.e. for now these are skipped // (i.e. 
this function is never called) -- this will be added // in future versions; haploid genotypes are coded as // homozygous; but for the haploid case, we need special code // in place to indicate MT transmission, etc. // For the autosomal X case, when transmitting to males, we // need a special function, however, as we do not want to // consider at all the paternal (homozygous/haploid) X // genotype (i.e. Y was transmitted...) // For female offspring on the X, we do want to look at the // paternal X for concordance, but we should only look at one // copy (as the father should always be homozygous/haploid). vector offspring(4); vector possible(4,false); int npossible = 0; for (int tr_pat = 0; tr_pat < 2; tr_pat++) for (int tr_mat = 0; tr_mat < 2; tr_mat++) { // Handle special cases of non-autosomal chromosomes if ( X ) { // If boy, haploid and X must have come from mother // (0 paternal possible transmissions) // If girl, diploid, but father can only send one // possible X (only 1 possible paternal // transmission) if ( tr_pat == 1 ) continue; } // Offspring haplotypes // Store: c is 0..3 coding // first two are for maternal transmissions // so we can take a short cut and just consider // first two positions for X (where there is // no variation in paternal transmission conditional // on offspring sex) int c = tr_mat + tr_pat*2; // Paternal transmission if ( tr_pat == 0 ) { offspring[c].pt = p1; offspring[c].pu = p2; } else { offspring[c].pt = p2; offspring[c].pu = p1; } // Maternal transmission if ( tr_mat == 0 ) { offspring[c].mt = m1; offspring[c].mu = m2; } else { offspring[c].mt = m2; offspring[c].mu = m1; } // Is this offspring phase compatible with the offspring // genotypes? if ( X && P.sample[i]->sex ) { // Only consider maternal X transmission to male if ( consistentNonfounderMalePhaseGivenXGenotypes(s1,s2, offspring[c].mt) ) { possible[c] = true; npossible++; } } else if ( consistentNonfounderPhaseGivenGenotypes(s1,s2, offspring[c].pt, offspring[c].mt) ) { // Add this to list of possible offspring phases, // keeping track of frequency // If we were to revert to using absent parents, // then ph_freq[] should have been populated by // enumerateAllPhases() possible[c] = true; npossible++; } } // Next of 2/4 (max) possible parental transmissions // Need to scale these 0->4 possibilities to sum to correct // value // At least one possible phase? 
if ( npossible > 0 ) { double p = 1; if (ambig[father]) p *= pp[father][z1]; if (ambig[mother]) p *= pp[mother][z2]; // We explicitly consider both phases, so we remove this line // if (h1!=h2) // p *= 2; p /= (double)npossible; int numposs = 4; // Autosomal if ( X ) numposs = 2; // X transmission for (int j=0; j::iterator ip = pmap.find( offspring[j] ); if ( ip == pmap.end() ) pmap.insert( make_pair(offspring[j] , p) ); else ip->second += p; // Keep track of total probability psum += p; } } } // Consider next possible parental phase } ///////////////////////////////////// // Extract possible offspring phases and populate // standard metrics pp[i].clear(); hap1[i].clear(); hap2[i].clear(); /////////////////////////////////////////////////////// // Store the possible offspring phases (transmissions // only), and keep track of the probabilities map::iterator ip = pmap.begin(); include[i] = ambig[i] = true; if ( pmap.size() == 0 ) { include[i] = false; } else if ( pmap.size() == 1 ) { ambig[i] = false; int h1 = ip->first.pt; int h2 = ip->first.mt; if ( h1 < h2 ) { hap1[i].push_back( h1 ); hap2[i].push_back( h2 ); } else { hap1[i].push_back( h2 ); hap2[i].push_back( h1 ); } } else { // More than one possible phase for this offspring map mapBack; while ( ip != pmap.end() ) { int2 h; h.p1 = ip->first.pt; h.p2 = ip->first.mt; if ( h.p2 < h.p1 ) { int t = h.p1; h.p1 = h.p2; h.p2 = t; } // Have we already seen this pair of transmitted haplotypes? map::iterator im = mapBack.find(h); if ( im != mapBack.end() ) { int k = im->second; pp[i][k] += ip->second; } else { int t = pp[i].size(); mapBack.insert(make_pair(h,t)); pp[i].push_back( ip->second ); hap1[i].push_back( h.p1 ); hap2[i].push_back( h.p2 ); } // Next family transmission ip++; } } //////////////////////////// // Normalise probabilities if (ambig[i]) for (int z=0; z < pp[i].size(); z++) pp[i][z] /= psum; map::iterator itp = pmap.begin(); while ( itp != pmap.end() ) { itp->second /= psum; ++itp; } /////////////////////////////////////////////////////////// // Score haplotype transmissions for this trio, and add to // tabulation of sample T and U counts if (par::test_hap_TDT || par::proxy_TDT) transmissionCount(i,pmap); return; } void HaploPhase::transmissionCount(int i, map & pmap ) { // For debugging only: // displayFamTran(pmap,i,this); map::iterator ip = pmap.begin(); int t = subhaplotypes ? nt : nh; //////////////////////////////////// // Consider each possible phase set while ( ip != pmap.end() ) { vector t1(t,0); vector u1(t,0); FamilyTransmissions f = ip->first; double posterior = ip->second; // This function works fine for X chromosome as is. // i.e. fathers haploid/homozygous/uninformative; // son's/daughters genotype will always reflect X maternal // transmission int h1, h2, p1, p2, m1, m2; if ( subhaplotypes ) { // Collapse from a 0..nh space to a 0..nt space, via // downcoding<> We can assume the haplotype codes given here // will always be valid (i.e. 
map between 0 and nh) and that the // downcoding map will always have an appropriate key // AACCA 0 0 -AC-- // ACCAC 1 1 -XX-- // AACCC 2 0 -AC-- // CCCCC 3 1 -XX-- h1 = downcoding.find( f.pt )->second; h2 = downcoding.find( f.mt )->second; p1 = downcoding.find( f.pt )->second; p2 = downcoding.find( f.pu )->second; m1 = downcoding.find( f.mt )->second; m2 = downcoding.find( f.mu )->second; } else { h1 = f.pt; h2 = f.mt; p1 = f.pt; p2 = f.pu; m1 = f.mt; m2 = f.mu; } scoreTransmissions(h1,h2,p1,p2,m1,m2,t1,u1); /////////////////////////////////////////// // Update sample totals for each haplotype // and also accumulate the empirical variance // of the transmissions for (int h=0; hsecond; untrans[h] += u1[h] * ip->second; } // Consider next family transmission set ++ip; } } void HaploPhase::scoreTransmissions(int h1, int h2, int p1, int p2, int m1, int m2, vector & t1, vector & u1) { // Return of vector of T and U (0,1,2) for each haplotype // for this particular trio // Father heterozygous? if ( p1 != p2 ) { // Mother homozygous? if ( m1 == m2 ) { // then select a kid allele that matches, // and score the other one for transmission if ( h1 == m1 ) { t1[h2]++; if (p1==h2) u1[p2]++; else u1[p1]++; } else { t1[h1]++; if (p1==h1) u1[p2]++; else u1[p1]++; } } else { // Both parents are heterozygous, // Transmitted alleles are unambiguous t1[h1]++; t1[h2]++; // Untransmitted alleles // i.e. which two are left over // after accounting for the two // transmitted alleles bool pat_accounted = false; bool mat_accounted = false; if (p1 != h1 && p1 != h2 ) { u1[p1]++; pat_accounted = true; } else if (p2 != h1 && p2 != h2 ) { u1[p2]++; pat_accounted = true; } if (m1 != h1 && m1 != h2 ) { u1[m1]++; mat_accounted = true; } else if (m2 != h1 && m2 != h2 ) { u1[m2]++; mat_accounted = true; } // This only happens with AB x AB -> AB if ( ! (pat_accounted || mat_accounted )) { u1[h1]++; u1[h2]++; } else if ( ( ( !pat_accounted ) && mat_accounted ) || ( pat_accounted && (!mat_accounted) ) ) { // If only 1 untransmitted allele accounted for, it must // be the doubled allele that is untransmitted if (p1 == m1 || p1 == m2 ) u1[p1]++; else u1[p2]++; } // AB AB BB -- 2 accounted for // AA // AB AB AB -- 0 accounted for // AB // AB AC BA -- 1 accounted for // AC // AC AB BA -- 1 accounted for // AC // AB CD DB -- 2 accounted for // AC } } else if ( m1 != m2 ) { // Mother heterozygous, father homozygous if (h1 == p1 ) { t1[h2]++; if (m1==h2) u1[m2]++; else u1[m1]++; } else { t1[h1]++; if (m1==h1) u1[m2]++; else u1[m1]++; } } return; } plink-1.07-src/cnv.cpp0000644000265600020320000015641111264127625013777 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" #include "cnv.h" #include "crandom.h" #include "model.h" extern Plink * PP; const double EPS_OVERLAP = 1e-6; class sortedSegments { public: Segment s; bool operator< (const sortedSegments & b) const { if ( s.p1 < b.s.p1 ) return true; if ( s.p1 > b.s.p1 ) return false; if ( s.start < b.s.start ) return true; if ( s.start > b.s.start ) return false; if ( s.finish < b.s.finish ) return true; return false; } }; double probOverlap(Segment * s, set& isection) { // For this segement, calculate sum of all weights across all genes double dlen = PP->locus[s->finish]->bp - PP->locus[s->start]->bp + 1; set::iterator i = isection.begin(); double wCount = 0; while ( i != isection.end() ) { double glen = i->stop - i->start + 1; wCount += 1.0 / ( dlen + glen ); ++i; } return wCount / 1000.00 ; } void CNVIndivReport::calculateResults() { // Note -- T1, T2, T5 and T6 are based on *all* individuals // Tests T3, T4, T7 and T8 are based on all individuals with 1+ event if ( count == 0 ) count = 1; t1 /= n; t2 /= n; t3 /= count; t4 /= count; t5 /= n; t6 /= n; t7 /= count; t8 /= n; // t8 = t8 > 0 ? t5/t8 : 0 ; // Expected counts // t9 /= n; // subset // t10 /= n; // baseline geneset // t11 = t10>0 ? t9 / t10 : 0 ; // Make main metric the ratio // t8 = t11>0 ? t8 / t11 : 0 ; } void Plink::setUpForCNVList() { if ( ! par::cnv_makemap ) { /////////////////////////////////////////////// // .map file checkFileExists(par::mapfile); printLOG("Reading marker information from [ " + par::mapfile + " ]\n"); vector include; vector include_pos(0); int nl_actual=0; readMapFile(par::mapfile,include,include_pos,nl_actual); /////////////////////////////////////////////// // .fam file printLOG("Reading individual information from [ " + par::famfile + " ]\n"); checkFileExists(par::famfile); readFamFile(par::famfile); // Set some basics that we would have skipped by // doing the above manually nl_all = locus.size(); n = sample.size(); prettyPrintLengths(); } } void Plink::readCNVList() { // These should be done already, but check nl_all = locus.size(); n = sample.size(); prettyPrintLengths(); /////////////////////////////////////////////// // Intersect with a range file? set isection; set isection_baseline; map idescription; set iintersected; if ( par::cnv_intersect ) { set isubset; set ifound; if ( par::cnv_intersect_subset ) { checkFileExists( par::cnv_intersect_subset_file ); printLOG("Reading intersection subset list from [ " + par::cnv_intersect_subset_file + " ]\n"); ifstream IN(par::cnv_intersect_subset_file.c_str(), ios::in); while ( ! IN.eof() ) { string gname; IN >> gname; if ( gname=="" ) continue; isubset.insert(gname); } printLOG("Looking for subset of " + int2str( isubset.size() ) + " ranges\n"); IN.close(); } checkFileExists( par::cnv_intersect_file ); printLOG("Reading CNV intersection list from [ " + par::cnv_intersect_file + " ]\n"); ifstream IN(par::cnv_intersect_file.c_str(), ios::in); while ( ! 
IN.eof() ) { // First three fields are CHR M1, M2 // Ignore rest of the line char cline[par::MAX_LINE_LENGTH]; IN.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if ( tokens.size() < 3 ) error("Problem with line:\n" + sline ); // If we are only extracting a subset of this list, then // we require a properly defined 4th column if ( par::cnv_intersect_subset ) { if ( tokens.size() < 4 ) error("Problem with line, no region name:\n" + sline ); // Skip if this gene isn't on the list if ( isubset.find( tokens[3] ) == isubset.end() ) continue; // Add to list of found subset genes ifound.insert( tokens[3] ); } // Otherwise load this in string chr = tokens[0]; string m1 = tokens[1]; string m2 = tokens[2]; int p1,p2; Range r; if (chr=="") continue; if ( ! from_string( r.start,m1, std::dec ) ) error("Problem with position : " + m1 ); if ( ! from_string( r.stop,m2, std::dec ) ) error("Problem with position : " + m2 ); // Add any border r.start -= par::cnv_region_border; r.stop += par::cnv_region_border; // Check for consistency if ( r.start > r.stop ) error("Badly defined region:\n"+sline); r.chr = getChromosomeCode(chr); isection.insert(r); idescription.insert(make_pair(r,sline)); // Add region name to this one r.name = tokens.size() >= 4 ? tokens[3] : "REGION-"+int2str(geneList.size()+1); geneList.insert(r); // A second, global copy } IN.close(); if ( par::cnv_exclude ) printLOG("Read " + int2str( isection.size() ) + " ranges to exclude from CNV list\n"); else printLOG("Read " + int2str( isection.size() ) + " ranges to intersect with CNV list\n"); if ( ifound.size() < isubset.size() ) { printLOG("Could not find " + int2str( isubset.size() - ifound.size() ) + " ranges\n"); printLOG("Writing this list to [ " + par::output_file_name + ".notfound ] \n"); ofstream O2; O2.open( (par::output_file_name+".notfound").c_str() , ios::out); set::iterator i1 = isubset.begin(); while ( i1 != isubset.end() ) { if ( ifound.find( *i1 ) == ifound.end() ) O2 << *i1 << "\n"; ++i1; } O2.close(); } //////////////////////////////////////////////// // Read in a second list of baseline ranges? if ( par::cnv_count_baseline ) { checkFileExists( par::cnv_count_baseline_file ); printLOG("Reading CNV baseline count list from [ " + par::cnv_count_baseline_file + " ]\n"); ifstream IN1(par::cnv_count_baseline_file.c_str(), ios::in); while ( ! IN1.eof() ) { // First three fields are CHR M1, M2 // Ignore rest of the line vector tokens = tokenizeLine( IN1 ); if ( tokens.size() == 0 ) continue; if ( tokens.size() < 3 ) error("Problem with line:\n" + displayLine(tokens) ); string chr = tokens[0]; string m1 = tokens[1]; string m2 = tokens[2]; int p1,p2; Range r; if (chr=="") continue; if ( ! from_string( r.start,m1, std::dec ) ) error("Problem with position : " + m1 ); if ( ! 
from_string( r.stop,m2, std::dec ) ) error("Problem with position : " + m2 ); // Add any border r.start -= par::cnv_region_border; r.stop += par::cnv_region_border; // Check for consistency if ( r.start > r.stop ) error("Badly defined region:\n"+ displayLine( tokens )); r.chr = getChromosomeCode(chr); isection_baseline.insert(r); } IN1.close(); printLOG("Read " + int2str( isection_baseline.size() ) + " baseline count ranges from [ " + par::cnv_count_baseline_file + " ]\n"); } } /////////////////////////////////////////////// // .cnv file printLOG("\nReading segment list (CNVs) from [ " + par::cnv_listname + " ]\n"); checkFileExists( par::cnv_listname ); ifstream IN; IN.open( par::cnv_listname.c_str() , ios::in ); ofstream MOUT; if ( par::cnv_writelist ) { if ( par::output_file_name + ".cnv" == par::cnv_listname ) error("CNV input and output file names cannot be the same, " + par::cnv_listname ); } map uid; map mlocus; if ( ! par::cnv_makemap ) { makePersonMap( *this, uid ); for (int l=0; lchr; p.p2 = locus[l]->bp; mlocus.insert(make_pair(p,l)); } } map::iterator ii; map::iterator il1; map::iterator il2; int nseg=0; int n_mapped_to_person=0; int n_passed_filters=0; int n_intersects=0; int nall=0; set positions; while ( ! IN.eof() ) { string fid, iid, chr, bp1, bp2; string type, scorestr, sitesstr; // FID IID CHR BP1 BP2 TYPE SCORE SITES IN >> fid >> iid >> chr >> bp1 >> bp2 >> type >> scorestr >> sitesstr; if ( fid == "FID" || fid == "" ) continue; nall++; // Lookup person if ( ! par::cnv_makemap ) { ii = uid.find( fid + "_" + iid ); if ( ii == uid.end() ) continue; } ++n_mapped_to_person; // Correct type? int t; if ( ! from_string( t,type, std::dec ) ) error("Problem with type specifier: " + type ); if ( par::cnv_del_only && t > 1 ) continue; if ( par::cnv_dup_only && t < 3 ) continue; int p1,p2; if ( ! from_string( p1, bp1, std::dec ) ) error("Problem with first position: " + bp1 ); if ( ! from_string( p2, bp2, std::dec ) ) error("Problem with second position: " + bp2 ); if ( p1 > p2 ) error("Badly defined segment, " + bp1 + " > " + bp2); double score; int sites; if ( ! from_string( score, scorestr, std::dec ) ) error("Problem with score : " + scorestr ); if ( ! 
from_string( sites, sitesstr, std::dec ) ) error("Problem with sites : " + sitesstr ); // Filters: double kb = (double)(p2 - p1) / 1000.0; if ( par::cnv_min_sites > 0 && par::cnv_min_sites > sites ) continue; if ( par::cnv_max_sites > 0 && par::cnv_max_sites < sites ) continue; if ( par::cnv_min_score > 0 && par::cnv_min_score > score ) continue; if ( par::cnv_max_score > 0 && par::cnv_max_score < score ) continue; if ( par::cnv_min_kb > 0 && par::cnv_min_kb > kb ) continue; if ( par::cnv_max_kb > 0 && par::cnv_max_kb < kb ) continue; ++n_passed_filters; /////////////////////////////////////////////// // Intersect with range as specified from file // (optionally counting overaps, instead) int segment_contains = 0; double weighted_segment_contains = 0; if ( par::cnv_intersect ) { if ( par::cnv_count ) { segment_contains += count_intersects(isection,getChromosomeCode(chr),p1,p2); if ( par::cnv_weighted_gene_test ) weighted_segment_contains += weighted_count_intersects(isection,getChromosomeCode(chr),p1,p2); } else { if ( par::cnv_exclude && intersects(isection,iintersected,getChromosomeCode(chr),p1,p2) ) continue; if ( (!par::cnv_exclude) && !intersects(isection,iintersected,getChromosomeCode(chr),p1,p2) ) continue; } ++n_intersects; } /////////////////////////////////////////////// // Intersect with range as specified from a // second file, to act as baseline count int segment_baseline = 0; double weighted_segment_baseline = 0; if ( par::cnv_count_baseline ) { segment_baseline += count_intersects(isection_baseline,getChromosomeCode(chr),p1,p2); if ( par::cnv_weighted_gene_test ) weighted_segment_baseline += weighted_count_intersects(isection_baseline,getChromosomeCode(chr),p1,p2); } int2 p; p.p1 = getChromosomeCode( chr ); if ( par::cnv_makemap ) { int2 p; p.p1 = getChromosomeCode( chr ); // Start p.p2 = p1; positions.insert(p); // End p.p2 = p2; positions.insert(p); // One position past end p.p2++; positions.insert(p); } else { // Can we map this to an exact marker? p.p2 = p1; il1 = mlocus.find( p ); p.p2 = p2; il2 = mlocus.find( p ); if ( il1 == mlocus.end() || il2 == mlocus.end() ) continue; // Seems okay, add segment to list Segment s; s.start = il1->second; s.finish = il2->second; s.p1 = s.p2 = ii->second; s.count = segment_contains; s.baseline = par::cnv_count_baseline ? 
segment_baseline : 0 ; s.weightedCount = weighted_segment_contains; s.weightedBaseline = weighted_segment_baseline; s.type = t; s.score = score; s.sites = sites; segment.push_back(s); if ( par::verbose ) cout << "Adding " << s.count << " " << s.weightedCount << "\t" << " and " << s.baseline << " " << s.weightedBaseline << "\tphe = " << s.p1->phenotype << "\n"; } } // Determine some measure of expected weighted count, given // totality of CNVs and genes specified per individual expectedOverlap.resize(n,0); expectedOverlapBaseline.resize(n,0); if ( par::cnv_weighted_gene_test ) { map mmap; for (int i=0; i::iterator s0 = segment.begin(); while ( s0 != segment.end() ) { int j = mmap.find(s0->p1)->second; expectedOverlap[ j ] += probOverlap(&(*s0), isection ); if ( par::cnv_count_baseline ) expectedOverlapBaseline[ j ] += probOverlap(&(*s0), isection_baseline ); ++s0; } } // cout << "expected:\n"; // display(expectedOverlap); // cout << "v2\n"; // display(expectedOverlap); ///////////////////////////////////////////////////////////// // Write-back any intersected segments (or, non-intersected // segments, if in exclude mode) if ( par::cnv_intersect_writeback ) { if ( par::cnv_exclude ) printLOG("Writing back list to non-intersected regions to [ " + par::output_file_name + ".reg ]\n"); else printLOG("Writing back list to intersected regions to [ " + par::output_file_name + ".reg ]\n"); ofstream ROUT; ROUT.open( ( par::output_file_name+".reg").c_str(), ios::out ); ROUT.precision(4); // Either a simple list of the intersected regions // or a verbose report that has both the region // and then a list of the CNVs in that region set::iterator ir = isection.begin(); while ( ir != isection.end() ) { set::iterator i = iintersected.find(*ir); bool writeback = par::cnv_exclude ? i == iintersected.end() : i != iintersected.end() ; if ( writeback ) { map::iterator is = idescription.find(*ir); if ( par::cnv_intersect_writeback_verbose ) { ROUT << "RANGE (+/- " << par::cnv_region_border/1000 << "kb ) [ " << idescription[*ir] << " ]\n"; ROUT << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " "; ROUT << setw(8) << "PHE" << " "; ROUT << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(6) << "TYPE" << " " << setw(8) << "KB" << " " << setw(8) << "OLAP" << " " << setw(8) << "OLAP_U" << " " << setw(8) << "OLAP_R" << "\n"; // << setw(12) << "SCORE" << " " // << setw(8) << "SITES" << "\n"; } else ROUT << idescription[*ir] << "\n"; // Get a list of CNVs that span this range // and display in verbose mode if ( par::cnv_intersect_writeback_verbose ) { Range trange = *ir; set< Segment > segset = allSegmentsIntersecting( trange ); set::iterator s = segset.begin(); while ( s != segset.end() ) { Individual * person = s->p1; Locus * loc1 = locus[s->start]; Locus * loc2 = locus[s->finish]; ROUT << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " "; ROUT << setw(8) << person->phenotype << " "; ROUT << setw(4) << loc1->chr << " " << setw(12) << loc1->bp << " " << setw(12) << loc2->bp << " "; if ( s->type < 2 ) ROUT << setw(6) << "DEL" << " "; else ROUT << setw(6) << "DUP" << " "; // Instead of SCORE and SITES, output kb length, and overlap stats // ROUT << setw(12) << s->score << " " // << setw(8) << s->sites << "\n"; // Start and stop of CNV int p1 = loc1->bp; int p2 = loc2->bp; // Start and stop of gene double consensusStart = p1 > ir->start ? p1 : ir->start; double consensusStop = p2 < ir->stop ? 
p2 : ir->stop; double numerator = consensusStop - consensusStart + 1.0; // 1,2,3 = default, union, region overlap double denom1 = p2-p1+1.0; double unionStart = p1 < ir->start ? p1 : ir->start; double unionStop = p2 > ir->stop ? p2 : ir->stop; double denom2 = unionStop - unionStart + 1.0; double denom3 = ir->stop-ir->start+1.0; double overlap1 = numerator / denom1 ; double overlap2 = numerator / denom2 ; double overlap3 = numerator / denom3 ; ROUT << setw(8) << (loc2->bp - loc1->bp)/1000.0 << " " << setw(8) << overlap1 << " " << setw(8) << overlap2 << " " << setw(8) << overlap3 << "\n"; ++s; } ROUT << "\n"; } } ++ir; } ROUT.close(); } ///////////////////////////////////////////////////////////// // Drop segments that are above or below a certain threshold int removed_freq_filter = 0; // Old approach was to define genomic regions // and then filter based on these. // New approach (below) is to focus on each event more directly, // and ask how many others intersect with it. Populate "freq" // for each event; // Overlap definition wants to be // 1) NOT "disrupt" // 2) Inforce union overlap so that frequency groups are // symmetric // Clean up this code later (decide if we want to keep these old // definitions if ( par::cnv_freq_method2 ) { // Turn off potential write-back function for // ranges; and manually set the intersect function // to work as needed here par::cnv_intersect_writeback = false; par::cnv_disrupt = false; par::cnv_union_overlap = true; par::cnv_region_overlap = false; for (int s1=0; s1start ]->chr; int bp1 = locus[ seg1->start ]->bp; int bp2 = locus[ seg1->finish ]->bp; // Count overlap with self, once ++seg1->freq; for (int s2=s1+1; s2start ]->chr != chr ) continue; // Do these overlap? int t1 = locus[seg2->start ]->bp; int t2 = locus[seg2->finish ]->bp; // This seg ends before the other starts? if ( t2 < bp1 ) continue; // Or does this seg start after the other ends? if ( t1 > bp2 ) continue; // Calculate overlap (union) double consensusStart = t1 > bp1 ? t1 : bp1; double consensusStop = t2 < bp2 ? t2 : bp2; double numerator = consensusStop - consensusStart + 1.0; double unionStart = t1 < bp1 ? t1 : bp1; double unionStop = t2 > bp2 ? t2 : bp2; double denom = unionStop - unionStart + 1.0; double overlap = numerator / denom ; overlap += EPS_OVERLAP; if ( overlap > par::cnv_freq_method2_threshold ) { seg1->freq++; seg2->freq++; } } } if ( par::cnv_freq_include ) { printLOG("Filtering segments based on frequencies\n"); vector::iterator s = segment.begin(); while ( s != segment.end() ) { //cout << "seg count = " << s->freq << "\n"; if ( par::cnv_freq_include_exact && s->freq != par::cnv_freq_include_cnt ) { s = segment.erase(s); ++removed_freq_filter; } else if ( par::cnv_freq_include_below && s->freq > par::cnv_freq_include_cnt ) { s = segment.erase(s); ++removed_freq_filter; } else if ( s->freq < par::cnv_freq_include_cnt ) { s = segment.erase(s); ++removed_freq_filter; } else s++; } printLOG("Will remove " + int2str( removed_freq_filter ) + " CNVs based on frequency (after other filters)\n"); } } //////////////////////////// // Old frequency filter code if ( ! 
par::cnv_freq_method2 ) { if ( par::cnv_freq_include || par::cnv_unique ) { if ( par::cnv_unique ) printLOG("Filtering segments unique to cases/controls\n"); if ( par::cnv_freq_include ) printLOG("Filtering segments based on frequencies\n"); // Clear any existing segments (we are done with gene lists, etc) // by now isection.clear(); // 1) Find common regions // 2) intersect or exclude as is fit // 3) remove from list vector caseCount = segmentCountCaseControls(this,true); vector controlCount = segmentCountCaseControls(this,false); // Determine regions to exclude bool inRegion = false; Range r; for (int l=0; lchr != locus[l]->chr ) { endOfChromosome = true; } int count = caseCount[l] + controlCount[l]; bool uniq = ! ( caseCount[l] == 0 || controlCount[l] == 0 ); // cout << "dets " << locus[l]->chr << "\t" << locus[l]->bp << "\t" << uniq << "\t" << caseCount[l] << " : " // << controlCount[l] << "\t"; bool iregion; if ( par::cnv_freq_include_exact ) { // Define the regions we want to exclude iregion = count != par::cnv_freq_include_cnt; } else { // Define the regions we want to exclude // Use inclusive thresholds for inclusion // --cnv-freq-exclude-above X if ( par::cnv_freq_include_below ) // inclusive thresholds iregion = count > par::cnv_freq_include_cnt; else // --cnv-freq-exclude-below X iregion = count < par::cnv_freq_include_cnt; } // cout << iregion << "\t" << count << "\n"; ///////////////////////////////////////////////////////// // Filter based on uniquenes to either cases or controls if ( inRegion ) { // End of an iregion or unique region? if ( ( ( par::cnv_unique && ! uniq ) || ( ! par::cnv_unique ) ) && ( ( par::cnv_freq_include && ! iregion ) || ( ! par::cnv_freq_include ) ) ) { // Range goes up to the position just before this region r.stop = locus[l]->bp-1; if ( r.stop < 0 ) r.stop = 0; isection.insert(r); inRegion = false; } else if ( endOfChromosome ) // of we just have to stop anyway? { inRegion = false; r.stop = locus[l]->bp; isection.insert(r); continue; } } // ...or, the start of a new region? else { if ( ( par::cnv_unique && uniq ) || ( par::cnv_freq_include && iregion ) ) { r.start = locus[l]->bp; r.chr = locus[l]->chr; inRegion = true; } // But could also be the end of the chromosome if ( inRegion && endOfChromosome ) { inRegion = false; r.stop = locus[l]->bp; isection.insert(r); continue; } } } // Next SNP ///////////////////////////////////////////////////////////// // We now have a list of "iregion sections" -- intersect // based on these set::iterator i2 = isection.begin(); // cout << "List of bad regions, N = " << isection.size() << "\n"; // while (i2 != isection.end() ) // { // cout << "regs = " << i2->chr << "\t" << i2->start << " -> " << i2->stop << "\n"; // ++i2; // } // cout << "\n"; // Turn off potential write-back function for // ranges par::cnv_intersect_writeback = false; vector::iterator s = segment.begin(); while ( s != segment.end() ) { bool doesIntersect = intersects(isection, iintersected, locus[s->start]->chr, locus[s->start]->bp, locus[s->finish]->bp); if ( par::cnv_freq_include_exact_exclude ) { doesIntersect = ! doesIntersect; } bool willErase = doesIntersect; // bool willErase = par::cnv_freq_include_exact ? // ! doesIntersect : doesIntersect ; // Remove this segment or keep? if ( willErase ) { s = segment.erase(s); ++removed_freq_filter; } else ++s; } printLOG("Will remove " + int2str( removed_freq_filter ) + " CNVs based on frequency (after other filters)\n"); } } // Drop individuals for whom we do not see any segments of this // variety? 
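//////////////////////////////////////////////////////////////////////
// Editor's note -- illustrative sketch only, not part of the original
// PLINK code.  The --cnv-freq-method2 grouping above scores each pair
// of events by reciprocal (union) overlap:
//
//     overlap = consensus length / union length
//
// e.g. two hypothetical CNVs on one chromosome,
//     seg1 = 100,000-200,000 bp    seg2 = 150,000-250,000 bp
//     consensus = 150,000-200,000  ->  50,001 bp
//     union     = 100,000-250,000  -> 150,001 bp
//     overlap   =  50,001 / 150,001 ~= 0.333
// so with a threshold of 0.5 neither event counts towards the other's
// frequency, whereas with 0.25 both frequencies are incremented.
// A stand-alone helper along these lines (hypothetical, for clarity):
//
//   double unionOverlap(int a1, int a2, int b1, int b2)
//   {
//     double num   = std::min(a2,b2) - std::max(a1,b1) + 1.0;
//     double denom = std::max(a2,b2) - std::min(a1,b1) + 1.0;
//     return num <= 0 ? 0 : num / denom;
//   }
//////////////////////////////////////////////////////////////////////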
if ( par::cnv_drop_no_segment ) { vector indel(n,false); indivSegmentSummaryCalc(segmentCount, segmentLength, true, true); for ( int i = 0; i < n; i++) { indivPair t; t.p1 = t.p2 = sample[i]; map::iterator ic = segmentCount.find(t); map::iterator il = segmentLength.find(t); if ( ic != segmentCount.end() ) { if ( ic->second == 0 ) indel[i] = true; } else indel[i] =true; } int n_removed = deleteIndividuals(indel); printLOG("Removed " + int2str(n_removed) + " individuals with <1 CNV\n"); } /////////////////////////////////////////////////////////// // Check that CNVs do not overlap at all (within a person) if ( par::cnv_check_overlap ) { set sorted; vector::iterator s1 = segment.begin(); while ( s1 != segment.end() ) { sortedSegments ss; ss.s = *s1; sorted.insert( ss ); ++s1; } // Look for overlap set olap; set::iterator s = sorted.begin(); while ( s != sorted.end() ) { set::iterator snext = s; ++snext; if ( snext == sorted.end() ) break; if ( s->s.p1 != snext->s.p1 ) { ++s; continue; } if ( snext->s.start <= s->s.finish ) { olap.insert( (Segment*)&(s->s) ); olap.insert( (Segment*)&(snext->s) ); } ++s; } if ( olap.size() > 0 ) { printLOG("Within-individual CNV overlap detected, involving " + int2str( olap.size() ) + " CNVs\n"); printLOG("Writing list to [ " + par::output_file_name + ".cnv.overlap ]\n"); ofstream O( ( par::output_file_name + ".cnv.overlap").c_str() , ios::out ); O << setw( par::pp_maxfid ) << "FID" << " " << setw( par::pp_maxfid ) << "IID" << " " << setw( 4 ) << "CHR" << " " << setw( 12 ) << "BP1" << " " << setw( 12 ) << "BP2" << "\n"; set::iterator i = olap.begin(); while ( i != olap.end() ) { O << setw( par::pp_maxfid ) << (*i)->p1->fid << " " << setw( par::pp_maxfid ) << (*i)->p1->iid << " " << setw( 4 ) << locus[ (*i)->start ]->chr << " " << setw( 12 ) << locus[ (*i)->start ]->bp << " " << setw( 12 ) << locus[ (*i)->finish ]->bp << "\n"; ++i; } O.close(); } else { printLOG("No overlapping samples found\n"); } } //////////////////////////////////////////////////////// // Get full typeCount; for case/control data, split out // by cases and controls map typeCount; map typeCaseCount; vector::iterator s = segment.begin(); while ( s != segment.end() ) { ////////////////////////////////// // Record in full 0/1, 3/4 space map & myCount = par::bt && s->p1->aff ? typeCaseCount : typeCount; map::iterator it = myCount.find( s->type ); if ( it == myCount.end() ) { myCount.insert(make_pair(s->type,1)); } else { ++(it->second); } // Next segment ++s; } ////////////////////////////// // Make a new map file if ( ! par::cnv_makemap ) { printLOG(int2str( n_mapped_to_person ) + " mapped to a person, of which " + int2str( n_passed_filters) + " passed filters\n"); if ( par::cnv_intersect ) { if ( par::cnv_exclude ) printLOG( int2str( n_intersects ) + " kept after excluding specific regions\n"); else printLOG( int2str( n_intersects ) + " intersected with one or more specified region\n"); } int t = par::cnv_intersect ? 
n_intersects : n_passed_filters; t -= removed_freq_filter; if ( t - segment.size() > 0 ) printLOG( int2str( t - segment.size() ) + " did not map to any marker\n"); printLOG(int2str( segment.size() ) + " of " + int2str(nall) + " mapped as valid segments\n"); map::iterator it1 = typeCaseCount.begin(); map::iterator it2 = typeCount.begin(); set obsCounts; while ( it1 != typeCaseCount.end() ) { obsCounts.insert( it1->first ); ++it1; } while ( it2 != typeCount.end() ) { obsCounts.insert( it2->first ); ++it2; } stringstream s2; if ( par::bt ) s2 << setw(6) << "CopyN" << " " << setw(12) << "Case/Control" << "\n"; else s2 << setw(6) << "CopyN" << " " << setw(8) << "Count" << "\n"; printLOG( s2.str() ); s2.clear(); set::iterator it = obsCounts.begin(); while ( it != obsCounts.end() ) { map::iterator i1 = typeCaseCount.find( *it ); map::iterator i2 = typeCount.find( *it ); int n1 = 0, n2 = 0; if ( i1 != typeCaseCount.end() ) n1 = i1->second; if ( i2 != typeCount.end() ) n2 = i2->second; stringstream s; if ( par::bt ) s << setw(6) << *it << " " << setw(12) << (int2str(n1)+" / "+int2str(n2)) << "\n"; else s << setw(6) << *it << " " << setw(8) << n2 << "\n"; printLOG( s.str() ); ++it; } printLOG("\n"); } IN.close(); //////////////////////////////// // Write a CNV list file? if ( par::cnv_writelist ) { printLOG("Writing new CNV list to [ " + par::output_file_name + ".cnv ]\n"); MOUT.open( ( par::output_file_name + ".cnv").c_str() , ios::out ); MOUT << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " "; if (par::dump_covar_with_phenotype ) MOUT << setw(8) << "PHE" << " "; MOUT << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(6) << "TYPE" << " " << setw(12) << "SCORE" << " " << setw(8) << "SITES" << " "; if ( par::cnv_write_freq ) MOUT << setw(8) << "FREQ" << " "; MOUT << "\n"; vector::iterator s = segment.begin(); while ( s != segment.end() ) { Individual * person = s->p1; Locus * loc1 = locus[s->start]; Locus * loc2 = locus[s->finish]; MOUT << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " "; if (par::dump_covar_with_phenotype ) MOUT << setw(8) << person->phenotype << " "; MOUT << setw(4) << loc1->chr << " " << setw(12) << loc1->bp << " " << setw(12) << loc2->bp << " " << setw(6) << s->type << " " << setw(12) << s->score << " " << setw(8) << s->sites << " "; if ( par::cnv_write_freq ) MOUT << setw(8) << s->freq << " "; MOUT << "\n"; ++s; } MOUT.close(); printLOG("Writing new FAM file to [ " + par::output_file_name + ".fam ]\n"); MOUT.open( ( par::output_file_name + ".fam").c_str() , ios::out ); for (int i=0;ifid << " " << person->iid << " " << person->pat << " " << person->mat << " " << person->sexcode << " "; if (par::bt) MOUT << (int)person->phenotype << "\n"; else MOUT << person->phenotype << "\n"; } MOUT.close(); } ///////////////////////////////////// // Collapse type (0/1 -> 2, 3/4 -> 2) s = segment.begin(); while ( s != segment.end() ) { s->type = s->type < 2 ? 1 : 2; ++s; } //////////////////////////////// // Write a new MAP file out? 
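//////////////////////////////////////////////////////////////////////
// Editor's note -- clarifying sketch only, not part of the original
// PLINK code.  Under --cnv-make-map each CNV contributed three entries
// to the 'positions' set earlier: its start, its end, and one bp past
// the end, so every segment boundary exists as a marker in the new
// map written below.  e.g. a single hypothetical event at
// chr1:10,000-20,000 yields
//     1   p1-10000   0   10000
//     1   p1-20000   0   20000
//     1   p1-20001   0   20001
// Duplicate positions across CNVs collapse automatically because the
// set is keyed on (chromosome, bp).
//////////////////////////////////////////////////////////////////////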
if ( par::cnv_makemap ) { ofstream MOUT; printLOG("Writing new MAP file to [ " + par::output_file_name + ".cnv.map ]\n"); MOUT.open( ( par::output_file_name + ".cnv.map").c_str() , ios::out ); set::iterator ip = positions.begin(); int nseg = 1; while ( ip != positions.end() ) { MOUT << ip->p1 << "\t" << "p"+int2str(ip->p1)+"-"+int2str(ip->p2) << "\t" << "0\t" << ip->p2 << "\n"; ++ip; } MOUT.close(); printLOG("Wrote " + int2str( positions.size() ) + " unique positions to file\n"); } } void Plink::processCNVList() { if ( par::cnv_writelist || par::cnv_makemap ) return; // Per-individual summaries printLOG("Writing per-individual summary to [ " + par::output_file_name + ".cnv.indiv ]\n"); indivSegmentSummary(); if ( par::cnv_enrichment_test ) { glmCNVBurdenModel(*pperm,true); } // Display convenient segment view if (par::display_segment_long) displaySegmentsLong(); // Display convenient BED/UCSC track if (par::display_cnv_track) displaySegmentsBED(); // Find overlap in segments? if (par::segment_overlap) summariseHomoRuns(); ///////////////////////////////// // Association/burden mapping /////////////////////////////////////////////////////// // Set up scaffold based on gene positions, if needed set::iterator g = geneList.begin(); map minpos; map maxpos; while ( g != geneList.end() ) { if ( minpos.find( g->chr ) == minpos.end() ) { minpos.insert(make_pair( g->chr, g->start ) ); maxpos.insert(make_pair( g->chr, g->stop ) ); } else { if ( minpos[g->chr] > g->start ) minpos[g->chr] = g->start; if ( maxpos[g->chr] < g->stop ) maxpos[g->chr] = g->stop; } ++g; } map::iterator i = minpos.begin(); while ( i != minpos.end() ) { CInfo cdet; cdet.bpstart = i->second; cdet.bpstop = maxpos[i->first]; scaffold.insert(make_pair(i->first,cdet)); ++i; } // if ( par::cnv_glm ) // runTestCNVwithGLM(*pperm); if ( par::qt ) runTestCNVwithQT(*pperm); else { // Disease trait tests // Utilise existing tests for homozygous segments if ( par::seg_test_region ) { printLOG("Writing positional summary to [ " + par::output_file_name + ".cnv.regional.summary ]\n"); initialiseGeneCountAssociation(*pperm); } else { printLOG("Writing positional summary to [ " + par::output_file_name + ".cnv.summary ]\n"); summaryIBSsegments(*pperm); } } } bool intersects(set& isection , set & iintersects, int chr, int p1, int p2) { // Does this particular CNV intersect with atleast one range? // Potentially allowing for fractional overlaps) // Either consider all ranges and report back true when // first intersection is seen // Or; consider all ranges, no matter what, keeping track // of which ranges have been intersected (i.e. if we have // overlapping ranges, for example) // Intersect: default -- if any of segment overlaps // param 0.2 -- if at least20% of segment overlaps // Alternatively, "disrupt" mode simply asks whether the start // or end of a CNV is within the region bool doesIntersect = false; set::iterator ir = isection.begin(); while ( ir != isection.end() ) { if ( ir->chr != chr ) { ++ir; continue; } // Either use disrupt mode, or standard // intersect mode (which might include // and overlap) if ( par::cnv_disrupt ) { if ( ( p1 >= ir->start && p1 <= ir->stop ) || ( p2 >= ir->start && p2 <= ir->stop ) ) { if ( par::cnv_intersect_writeback ) { doesIntersect = true; iintersects.insert(*ir); } else return true; } } else // ... 
intersect or exclude mode { if ( p1 <= ir->stop && p2 >= ir->start ) { if ( par::cnv_overlap < 0 ) { if ( par::cnv_intersect_writeback ) { doesIntersect = true; iintersects.insert(*ir); } else return true; } ///////////////////////////// // Overlap-based comparison // The CNV spans p1 to p2 // Region spans ir->start/stop // Denominator either of CNV itself, // or is union double consensusStart = p1 > ir->start ? p1 : ir->start; double consensusStop = p2 < ir->stop ? p2 : ir->stop; double numerator = consensusStop - consensusStart + 1.0; double denom; if ( par::cnv_union_overlap ) { double unionStart = p1 < ir->start ? p1 : ir->start; double unionStop = p2 > ir->stop ? p2 : ir->stop; denom = unionStop - unionStart + 1.0; } else if ( par::cnv_region_overlap ) denom = ir->stop-ir->start+1.0; else denom = p2-p1+1.0; double overlap = numerator / denom ; overlap += EPS_OVERLAP; if ( overlap >= par::cnv_overlap ) { if ( par::cnv_intersect_writeback ) { doesIntersect = true; iintersects.insert(*ir); } else return true; } } } // Consider next range ++ir; } return doesIntersect; } int count_intersects(set& isection ,int chr,int p1,int p2) { // How many ranges does this CNV intersect with? // Potentially allowing for fractional overlaps) // Intersect: default -- if any of segment overlaps // param 0.2 -- if at least20% of segment overlaps int iCount = 0; set::iterator ir = isection.begin(); while ( ir != isection.end() ) { if ( ir->chr != chr ) { ++ir; continue; } // Either use disrupt mode, or standard // intersect mode (which might include // and overlap) if ( par::cnv_disrupt ) { if ( ( p1 >= ir->start && p1 <= ir->stop ) || ( p2 >= ir->start && p2 <= ir->stop ) ) { ++iCount; } } else // ... intersect or exclude mode { if ( p1 < ir->stop && p2 > ir->start ) { if ( par::cnv_overlap < 0 ) ++iCount; else { // The CNV spans p1 to p2 double consensusStart = p1 > ir->start ? p1 : ir->start; double consensusStop = p2 < ir->stop ? p2 : ir->stop; double numerator = consensusStop - consensusStart + 1.0; double denom; if ( par::cnv_union_overlap ) { double unionStart = p1 < ir->start ? p1 : ir->start; double unionStop = p2 > ir->stop ? p2 : ir->stop; denom = unionStop - unionStart + 1.0; } else if ( par::cnv_region_overlap ) denom = ir->stop-ir->start+1.0; else denom = p2-p1+1.0; double overlap = numerator / denom ; overlap += EPS_OVERLAP; if ( overlap >= par::cnv_overlap ) ++iCount; } } } ++ir; } return iCount; } double weighted_count_intersects(set& isection ,int chr,int p1,int p2) { // For simple overlap or disruption statistics, calculate // weighted score // Currently, no support for fractional overlaps double wCount = 0; int cnt = 0; set::iterator ir = isection.begin(); while ( ir != isection.end() ) { if ( ir->chr != chr ) { ++ir; continue; } // Either use disrupt mode, or standard // intersect mode (which might include // and overlap) if ( par::cnv_disrupt ) { if ( ( p1 >= ir->start && p1 <= ir->stop ) || ( p2 >= ir->start && p2 <= ir->stop ) ) { // if ( D < G ) then W = D + G // if ( D >= G ) then W = D + G - ( D - G + 1 ) double dlen = p2 - p1 + 1; double glen = ir->stop - ir->start + 1; double score = dlen < glen ? dlen + glen : dlen + glen - ( dlen-glen+1); wCount += 1.0 / score; } } else // ... intersect or exclude mode { if ( p1 < ir->stop && p2 > ir->start ) { double dlen = p2 - p1 + 1; double glen = ir->stop - ir->start + 1; wCount += 1.0 / ( dlen + glen ); //Make 'weight' the intersection // double consensusStart = p1 > ir->start ? p1 : ir->start; // double consensusStop = p2 < ir->stop ? 
p2 : ir->stop; // wCount += consensusStop - consensusStart + 1.0; // cout.precision(8); // cout << "wCount = " << wCount << " from " << dlen << " " << glen << "\n"; // if ( ++cnt == 2 ) cout << "TWO!\n"; } } ++ir; } // In 1 / KB units wCount *= 1000; return wCount; } vector segmentCountCaseControls(Plink * P, bool countCases) { // Helper function to count segments (for CNVs, homozygous segments // only) vector count(P->nl_all,0); vector::iterator s = P->segment.begin(); while ( s != P->segment.end() ) { if ( s->p1->pperson->aff == countCases ) for (int l = s->start ; l <= s->finish; l++) count[l]++; s++; } return count; } set allSegmentsIntersecting(Range & r) { set sset; set dummySet; set rangeSet; rangeSet.insert(r); par::cnv_intersect_writeback = false; vector::iterator s = PP->segment.begin(); while ( s != PP->segment.end() ) { bool testIntersection = intersects(rangeSet, dummySet, PP->locus[ s->start ]->chr, PP->locus[ s->start ]->bp, PP->locus[ s->finish ]->bp ); // Record this CNV if ( ( par::cnv_exclude && ! testIntersection ) || ( testIntersection && ! par::cnv_exclude ) ) { sset.insert( *s ); } ++s; } return sset; } void Plink::countCNVPerRegion(vector & caseCount, vector & controlCount ) { // Return the number of case and control CNVs in each gene/region // These values are calculated first time, then stored in // gene2segment for subsequent lookup int nGenes = geneList.size(); caseCount.clear(); controlCount.clear(); caseCount.resize(nGenes,0); controlCount.resize(nGenes,0); int gCount = 0; set::iterator g = geneList.begin(); while ( g != geneList.end() ) { Range tr = *g; // Either calculate, of if already done, just lookup the CNVs that // fall in this range (i.e. this function will be called many times // when using permutation) map >::iterator i = gene2segment.find( tr ); if ( i == gene2segment.end() ) { set s = allSegmentsIntersecting( tr ); gene2segment.insert(make_pair( tr, s ) ); i = gene2segment.find( tr ); } set::iterator si = i->second.begin(); while ( si != i->second.end() ) { if ( si->p1->pperson->missing ) continue; if ( si->p1->pperson->aff ) ++caseCount[ gCount ]; else ++controlCount[ gCount ]; ++si; } ++gCount; ++g; } return; } ////////////////////////////////////////////// // General segmental permutation test routine void Plink::initialiseGeneCountAssociation( Perm & perm ) { printLOG("Performing region-based association mapping\n"); int nt = geneList.size(); vector caseCount(nt,0); vector controlCount(nt,0); // Count up number of CNVs in each gene/region countCNVPerRegion(caseCount, controlCount); ////////////////////////////// // And display string f = par::output_file_name + ".cnv.regional.summary"; ofstream SIBS; SIBS.open( f.c_str() , ios::out ); SIBS << setw(4) << "CHR" << " " << setw(16) << "REGION" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(8) << "AFF" << " " << setw(8) << "UNAFF" << "\n"; set::iterator ri = geneList.begin(); int rCount = 0; while ( ri != geneList.end() ) { SIBS << setw(4) << ri->chr << " " << setw(16) << ri->name << " " << setw(12) << ri->start << " " << setw(12) << ri->stop << " " << setw(8) << caseCount[rCount] << " " << setw(8) << controlCount[rCount] << "\n"; ++ri; ++rCount; } SIBS.close(); ///////////////////// // Permutation test? 
if (!par::permute) return; ///////////////////////////////////// // Treat CNVs as homozygous segments par::homo_run = true; homozygousSegmentPermutationTest(perm,f,caseCount,controlCount); return; } void Plink::positionPermuteSegments() { // Based on the global geneList set of genes, // create a permuted version and rescore each segment count set permGeneList = geneList; // Permute on a within-chromosome basis // cout << scaffold.size() << "\n"; vector offset( scaffold.size() , 0 ); for (int c = 0 ; c < scaffold.size(); c++) { offset[c] = (int) ( CRandom::rand() * ( scaffold[c].bpstop - scaffold[c].bpstart ) ); } set::iterator g = permGeneList.begin(); while ( g != permGeneList.end() ) { Range * pg = (Range*)&(*g); // cout << "original = " << pg->name << " : " << pg->start << " to " << pg->stop << " "; // cout << "offset = " << offset[pg->chr] << " ; "; pg->start += offset[pg->chr]; pg->stop += offset[pg->chr]; // Did we fall off the edge? if ( pg->stop >= scaffold[pg->chr].bpstop ) { // cout << " ** "; int gsize = pg->stop - pg->start + 1; pg->start = scaffold[pg->chr].bpstart + ( pg->stop - scaffold[pg->chr].bpstop ); pg->stop = pg->start + gsize; } // cout << "now " << pg->start << " to " << pg->stop << "\n"; // Shuffle next gene ++g; } // i.e. update the "count" variable (simple # of genes intersected) if ( ! par::cnv_count ) error("This function requires --cnv-count"); // Update each segment vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( ! par::cnv_weighted_gene_test ) { s->count = count_intersects(permGeneList , locus[s->start]->chr, locus[s->start]->bp, locus[s->finish]->bp ); } else s->weightedCount = weighted_count_intersects(permGeneList , locus[s->start]->chr, locus[s->start]->bp, locus[s->finish]->bp ); ++s; } } vector_t Plink::glmCNVBurdenModel(Perm & perm, bool print ) { vector_t results; string f = par::output_file_name + ".cnv.burden"; if ( print ) printLOG("Performing GLM-based CNV burden test, results in [ " + f + " ]\n"); /////////////////////////////////////////////////// // Set up association model bool OLD_assoc_glm_without_main_snp = par::assoc_glm_without_main_snp; bool OLD_clist = par::clist; par::assoc_glm_without_main_snp = true; par::clist = true; // Model 0 no covariates // 1 CNV count // 2 CNV avg size // 3 CNV total kb // 4 CNV count + CNV avg size bool noCovar = false; if ( par::cnv_en_model == 0 ) noCovar == true; ///////////////////////////////////////////////////// // First calculate: // What is mean CNV size for all observed CNVs? // Do we see variation in CNV count in the dataset? double cnt = 0; double avg = 0; set num; for ( int i = 0; i < n; i++) { indivPair t; t.p1 = t.p2 = sample[i]; map::iterator ic = segmentCount.find(t); map::iterator il = segmentLength.find(t); // Insert correct # of terms here: if ( ic != segmentCount.end() ) { num.insert( ic->second ); cnt += ic->second; avg += il->second; } else num.insert( 0 ); } avg /= cnt; if ( cnt == 0 ) return results; // If equal burden, then do not include COUNT covariate bool equalBurden = num.size() == 1 ? 
true : false; // The number of additional terms we want to add go here (at the end of the clist) int totalTerms = 2; // gene count + covariate if ( par::cnv_en_model == 4 && equalBurden ) par::cnv_en_model = 2; if ( par::cnv_en_model == 0 ) noCovar = true; if ( par::cnv_en_model == 1 && equalBurden ) noCovar = true; if ( par::cnv_en_model == 4 ) totalTerms = 3; if ( noCovar ) totalTerms = 1; par::clist_number += totalTerms; for (int i=0; iclist.resize( par::clist_number ); // If we want to use permutation, record this guy // And make this the first term int testTerm = par::clist_number - totalTerms; // Fill in label forms clistname.resize( par::clist_number ); clistname[ testTerm ] = "GCNT"; if ( ! noCovar ) { if ( par::cnv_en_model == 1 ) clistname[ testTerm + 1 ] = "NSEG"; else if ( par::cnv_en_model == 2 ) clistname[ testTerm + 1 ] = "AVGKB"; else if ( par::cnv_en_model == 3 ) clistname[ testTerm + 1 ] = "TOTKB"; else if ( par::cnv_en_model == 4 ) { clistname[ testTerm + 1 ] = "NSEG"; clistname[ testTerm + 2 ] = "AVGKB"; } } // Now these are part of Plink class // map segmentCount; // map segmentLength; // Generate basic summary file for all people indivSegmentSummaryCalc(segmentCount, segmentLength, true, true); if ( ! par::cnv_count ) error("This function requires a --cnv-count set to be specified"); //////////////////////////////////////// // Put relevant variables in clist slots for ( int i = 0; i < n; i++) { indivPair t; t.p1 = t.p2 = sample[i]; map::iterator ic = segmentCount.find(t); map::iterator il = segmentLength.find(t); map::iterator ic2 = segmentCount2.find(t); indivPair p = ic->first; // Insert correct # of terms here: if ( ic != segmentCount.end() ) { double countCNV = ic->second; double totalKB = il->second; double avgKB = il->second / (double)ic->second; double geneCount = ic2->second; // Model = Y ~ gene-count + cnv-count + avg-cnv-size sample[i]->clist[ testTerm ] = geneCount; if ( ! noCovar ) { if ( par::cnv_en_model == 1 ) sample[i]->clist[ testTerm + 1 ] = countCNV; else if ( par::cnv_en_model == 2 ) sample[i]->clist[ testTerm + 1 ] = avgKB; else if ( par::cnv_en_model == 3 ) sample[i]->clist[ testTerm + 1 ] = totalKB; else if ( par::cnv_en_model == 4 ) { sample[i]->clist[ testTerm + 1 ] = countCNV; sample[i]->clist[ testTerm + 2 ] = avgKB; } } } else { // No CNVs at all sample[i]->clist[ testTerm ] = 0; if ( ! noCovar ) { if ( par::cnv_en_model == 1 ) sample[i]->clist[ testTerm + 1 ] = 0; else if ( par::cnv_en_model == 2 ) sample[i]->clist[ testTerm + 1 ] = avg; else if ( par::cnv_en_model == 3 ) sample[i]->clist[ testTerm + 1 ] = 0; else if ( par::cnv_en_model == 4 ) { sample[i]->clist[ testTerm + 1 ] = 0; sample[i]->clist[ testTerm + 2 ] = avg; } } } } // Next individual ////////////////////////////////////////// // Set up potential permutation test perm.setTests( totalTerms ); perm.setPermClusters(*this); perm.originalOrder(); vector_t original( totalTerms ); /////////////////////////////////////////// // Perform association glmAssoc(false,*pperm); /////////////////////////////////////////// // Report results ofstream OUTF; bool valid = model->isValid(); vector_t b = model->getCoefs(); vector_t chisq(1,model->getStatistic()); vector_t pval = model->getPVals(); // NOTE: b includes intercept; pval doesn't double statistic = valid ? 
model->getStatistic() : 0; double pvalue = pval[ pval.size()-1 ]; double beta = b[ b.size()-1 ]; delete model; if ( print ) { OUTF.open( f.c_str() , ios::out ); OUTF << setw(12) << "TEST" << " " << setw(12) << "BETA" << " " << setw(12) << "P" << "\n"; for (int c = 0; c < par::clist_number; c++) { OUTF << setw(12) << clistname[c] << " " << setw(12) << b[c+1] << " " << setw(12) << pval[c] << "\n"; } OUTF.close(); } // Make test 1-sided, based on signed coef. for (int c = 0; c < par::clist_number; c++) original[c] = b[c+1]; //////////////////////////////////////////////////// // Run permutations if ( par::permute ) { bool finished = false; int nv = 0; double z = 0; double unreal = 1/z; while(!finished) { perm.permuteInCluster(); glmAssoc(false,*pperm); bool valid = model->isValid(); if ( ! valid ) { ++nv; } vector_t b = model->getCoefs(); vector_t pr( par::clist_number ); for (int c = 0; c < par::clist_number; c++) pr[c] = valid ? b[c+1] : unreal ; finished = perm.update(pr,original); delete model; } if (nv>0) cout << nv << " invalid models during permutation\n"; if (!par::silent) cout << "\n\n"; ///////////////////////////////////////////////// // Display permuted results f += ".mperm"; printLOG("Writing CNV burden permutation results to [ "+f+" ]\n"); ofstream FOUT; FOUT.open( f.c_str() , ios::out ); FOUT.precision(4); FOUT << setw(8) << "TEST" << " " << setw(12) << "EMP1" << "\n"; for (int c = 0; c < par::clist_number; c++) { FOUT << setw(8) << clistname[c] << " " << setw(12) << perm.pvalue( c ) << "\n"; } FOUT.close(); } /////////////////////////////////////// // Some final tidying up par::assoc_glm_without_main_snp = OLD_assoc_glm_without_main_snp; par::clist = OLD_clist; par::clist_number -= totalTerms;; clistname.resize( par::clist_number ); for (int i=0; iclist.resize( par::clist_number ); shutdown(); return chisq; } plink-1.07-src/cfamily.h0000644000265600020320000000135111264127626014273 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __CFAMILY_H__ #define __CFAMILY_H__ #include "plink.h" class Individual; double genrel(Individual * a, Individual * b); #endif plink-1.07-src/genoerr.cpp0000644000265600020320000002270411264127625014647 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "genogroup.h" #include "phase.h" #include "haplowindow.h" extern ofstream LOG; using namespace std; void HaploPhase::queryGenotype(int s) { // Map within-region SNP coding back to within-genome SNP coding int l = S[s]; // Right now, let's elect not to handle non-autosomal SNPs if ( par::chr_haploid[ P.locus[l]->chr ] || par::chr_sex[ P.locus[l]->chr ] ) error("--proxy-error not yet set up for non-autosomal SNPs"); ////////////////////////////////////////////////////// // Calculate sample genotype frequencies for test SNP int g0 = 0, g1 = 0, g2 = 0; for (int i = 0; ione[i] : P.sample[i]->one[l]; bool s2 = par::SNP_major ? P.SNP[l]->two[i] : P.sample[i]->two[l]; if( s1 ) { if ( s2 ) ++g0; } else { if ( s2 ) ++g1; else ++g2; } } vector_t geno_freq(3); int total = g0 + g1 + g2; geno_freq[0] = (double)g0 / (double)total; geno_freq[1] = (double)g1 / (double)total; geno_freq[2] = (double)g2 / (double)total; ////////////////////////////////////////////////////// // Check each genotype of each individual over region for (int i=0; ione[i] : P.sample[i]->one[l]; bool s2 = par::SNP_major ? P.SNP[l]->two[i] : P.sample[i]->two[l]; int g; if ( s1 ) { if ( s2 ) g = 0; else g = -9; } else { if ( s2 ) g = 1; else g = 2; } /////////////////////////////////////////////////////////// // Determine most likely genotype given flanking haplotypes queryThisGenotype(i, s, g, geno_freq); } } } void HaploPhase::queryThisGenotype(int i, int s, int g, vector_t & geno_freq ) { // Do not do anything if the actual genotype is missing -- i.e. the // focus here is on correcting genotyping error rather than // imputation if ( g<0 ) return; // For a given individual, for a given phased region and set of // sample haplotype frequencies, determine a) the possible set of // phased haplotypes consistent with region if the test SNP were in // fact missing, b) the relative probability that the observed // genotype is the true genotype given the new phases. //////////////////////////////////////////////////////////////// // These are the new possible phases, if the test genotype were // missing vector new_hap1; vector new_hap2; vector_t newpp; double psum = 0; int z2 = 0; // Consider // AAA-C-GGT / ACA-C-GGT // Becomes up to three possible states: // AAA-C-GGT / ACA-C-GGT // AAA-C-GGT / ACA-T-GGT // AAA-T-GGT / ACA-T-GGT for (int z=0; z posshap1; vector posshap2; vector h1 = hap[hap1[i][z]]; vector h2 = hap[hap2[i][z]]; vector h1_flip = hap[hap1[i][z]]; vector h2_flip = hap[hap2[i][z]]; // flip bit at SNP position s h1_flip[s] = !h1[s]; h2_flip[s] = !h2[s]; // Do we observed these possible haplotypes in the sample as a // whole? If not, no need to consider, as the posterior // probability will be 0. 
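//////////////////////////////////////////////////////////////////////
// Editor's note -- worked example only, not part of the original
// PLINK code; the haplotypes and frequencies below are hypothetical.
// Suppose the phased region is A-?-G and the sample haplotype
// frequencies are
//     A-C-G : 0.60      A-T-G : 0.10
// For an individual currently phased A-C-G / A-C-G, flipping the test
// SNP in either haplotype gives four candidate phase pairs with
// unnormalised weights f(h1)*f(h2):
//     C/C : 0.36    C/T : 0.06    T/C : 0.06    T/T : 0.01
// After normalising (sum = 0.49) the posterior genotype probabilities
// are P(CC) ~ 0.735, P(CT) ~ 0.245, P(TT) ~ 0.020, which is what the
// gh[] vector holds when compared against the observed genotype.
//////////////////////////////////////////////////////////////////////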
if( hapmapb.find(h1) != hapmapb.end() ) posshap1.push_back(hapmapb[h1]); if( hapmapb.find(h2) != hapmapb.end() ) posshap2.push_back(hapmapb[h2]); if( hapmapb.find(h1_flip) != hapmapb.end() ) posshap1.push_back(hapmapb[h1_flip]); if( hapmapb.find(h2_flip) != hapmapb.end() ) posshap2.push_back(hapmapb[h2_flip]); // get new probabilities for each possible new phasing for( int a = 0; a < posshap1.size(); a++ ) for( int b = 0; b < posshap2.size(); b++ ) { newpp.push_back(f[posshap1[a]] * f[posshap2[b]]); new_hap1.push_back(posshap1[a]); new_hap2.push_back(posshap2[b]); // We are already considering both explicitly // if (posshap1[a] != posshap2[b]) // newpp[z2] *= 2; psum += newpp[z2]; z2++; } } // adjust to sum to 1 for (int z=0; zname << " " << setw(par::pp_maxfid) << P.sample[i]->fid << " " << setw(par::pp_maxiid) << P.sample[i]->iid << " " << setw(6) << genotype(P, i, l) << " " << setw(8) << geno_freq[g] << " " << setw(8) << gh[0] << " " << setw(8) << gh[1] << " " << setw(8) << gh[2] << " "; int ng = -9; for ( int j=0; j<=2; j++) if ( gh[j] > impute_threshold ) ng = j; if ( ng==0 ) HTEST << setw(6) << P.locus[S[s]]->allele1+"/" +P.locus[S[s]]->allele1 << "\n"; else if ( ng==1 ) HTEST << setw(6) << P.locus[S[s]]->allele1+"/" +P.locus[S[s]]->allele2 << "\n"; else if ( ng==2 ) HTEST << setw(6) << P.locus[S[s]]->allele2+"/" +P.locus[S[s]]->allele2 << "\n"; else HTEST << setw(6) << par::missing_genotype+"/" +par::missing_genotype << "\n"; } //////////////////////////////////////////////////////////// // Only report dodgy looking genotypes even in verbose mode if ( par::proxy_full_report && gh[g] < threshold ) { HTEST << "Individual " << P.sample[i]->fid << " " << P.sample[i]->iid << " ; locus " << P.locus[S[s]]->name << "\n"; HTEST << "Observed genotype is " << genotype(P, i, S[s]) << " g = " << g << "\n"; HTEST << "Prior genotypes probabilities = " << geno_freq[0] << " " << geno_freq[1] << " " << geno_freq[2] << endl; HTEST << "Posterior genotypes probabilities = " << gh[0] << " " << gh[1] << " " << gh[2] << endl; HTEST << "\nOriginal phases\n"; for (int z = 0; z < hap1[i].size(); z++) { HTEST << setw(par::pp_maxfid) << P.sample[i]->fid<< " " << setw(par::pp_maxiid) << P.sample[i]->iid<< " " << setw(4) << z << " " << setw(10) << haplotypeName(hap1[i][z]) << " " << setw(10) << haplotypeName(hap2[i][z]) << "\t"; if (ambig[i]) { HTEST << setw(12) << pp[i][z]<< " "; int max_z = 0; for (int z2=0; z2 pp[i][max_z] ? z2 : max_z ; if (max_z == z) HTEST << setw(6) << 1<< " "<< " "; else HTEST << setw(6) << 0<< " "<< " "; } else HTEST << setw(12) << 1<< " "<< setw(6) << 1<< " "<< " "; // Genotypes for (int j=0; jfid<< " " << setw(par::pp_maxiid) << P.sample[i]->iid<< " " << setw(4) << z << " " << setw(10) << haplotypeName(new_hap1[z]) << " " << setw(10) << haplotypeName(new_hap2[z]) << "\t"; HTEST << setw(12) << newpp[z]<< " "; int max_z = 0; for (int z2=0; z2 newpp[max_z] ? 
z2 : max_z ; if (max_z == z) HTEST << setw(6) << 1<< " "<< " "; else HTEST << setw(6) << 0<< " "<< " "; // Genotypes for (int j=0; j #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "sockets.h" using namespace std; extern string PVERSION; extern string PREL; #define PORT_NUM 80 #define IP_ADDR "132.183.161.22" #define GET_STRING "GET /~purcell/plink/version2.txt HTTP/1.1\nHost: pngu.mgh.harvard.edu\nConnection: close\n\n" void Plink::webcheck(CArgs & a) { #ifdef SKIP printLOG("Web-check not implemented on this system...\n"); return; #else ////////////////////////////////////////// // First look for a local .pversion file in // the local directory // Get today's date time_t curr=time(0); string tdstamp = (string)ctime(&curr); string buf; stringstream ss(tdstamp); vector date_tokens; while (ss >> buf) date_tokens.push_back(buf); string thisDate = date_tokens[0] + date_tokens[1] + date_tokens[2]; bool hasRecord = doesFileExist(".pversion"); //////////////////////////////////////////////////////// // Web-based message (but may be cached in local file) vector tokens; bool connect2web = true; printLOG("Web-based version check ( --noweb to skip )\n"); //////////////////////////////////////////// // If we have a record, are we up-to-date? if ( hasRecord ) { ifstream VER; VER.open(".pversion",ios::in); string oldDay, oldMonth, oldDate, webVersion; VER >> oldDay >> oldMonth >> oldDate; if ( thisDate == oldDay+oldMonth+oldDate ) { printLOG("Recent cached web-check found..."); connect2web = false; // Read rest of cached web message while ( ! VER.eof() ) { string t; VER >> t; if (t=="") break; tokens.push_back(t); } } VER.close(); } if ( connect2web ) { //printLOG("Connecting to web to get version...\n"); tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING); } bool print = false; bool print2 = false; bool version_okay = true; for (int i=0; i #include "plink.h" #include "model.h" using namespace std; class LogisticModel : public Model { public: LogisticModel(Plink *); ~LogisticModel() { }; void setDependent(); void fitLM(); void fitUnivariateLM() { }; void reset(); void pruneY(); vector_t getCoefs(); vector_t getVar(); vector_t getSE(); void displayResults(ofstream &, Locus *); double getLnLk(); vector_t getPVals(); double getPValue(); void HuberWhite(); private: vector_t p; vector Y; vector_t V; // diagonal p(1-p) double chisq; }; #endif plink-1.07-src/informative.cpp0000644000265600020320000001650311264127624015530 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "plink.h" #include "options.h" #include "helper.h" class Pair { public: Individual * p1; Individual * p2; bool operator< (const Pair & b) const { return ( ( p1 < b.p1 ) || ( p1 == b.p1 && p2 < b.p2 ) ); } }; int Plink::readInformative() { skip_pair.clear(); //////////////////////////////////// // Request to read from .genome file int c=0; checkFileExists(par::ibd_file); printLOG("Reading genome-wide IBD estimates from [ "+par::ibd_file+" ] \n"); ifstream INC; INC.open(par::ibd_file.c_str(), ios::in); map mperson; vector::iterator person = sample.begin(); while ( person != sample.end() ) { mperson.insert(make_pair( (*person)->fid+"_"+(*person)->iid , *person )); person++; } map mpair; // Read in .genome file -- from header, get field values for: // FID1 -- IID2 and // z0, z1, z2 int z0_code = -1; int z1_code = -1; int z2_code = -1; int col_length = 0; vector tokens = tokenizeLine(INC); col_length = tokens.size(); if ( tokens.size() < 4 || tokens[0] != "FID1" || tokens[1] != "IID1" || tokens[2] != "FID2" || tokens[3] != "IID2" ) error("Problem with header row of .genome file"); for ( int i = 4; i tokens = tokenizeLine(INC); if ( tokens.size() == 0 ) continue; if ( col_length != tokens.size() ) { string strmsg = ""; for (int i=0;i( z.z0 , z0 , std::dec) && from_string( z.z1 , z1 , std::dec) && from_string( z.z2 , z2 , std::dec) ) ) { z.z0 = 1; z.z1 = 0; z.z2 = 0; } if ( par::debug ) cerr << "Read from file: " << fid1 << " " << iid1 << ", " << fid2 << " " << iid2 << ", " << z.z0 << " " << z.z1 <<" " << z.z2 << "\n"; /////////////////////////////////// // Range of Genome-Wide IBD okay? bool val = true; double pihat = z.z1/2 + z.z2; if ( pihat < par::MIN_PIHAT ) { if (par::include_all_pairs) { z.z0 = 1-par::include_all_z1; z.z1 = par::include_all_z1; z.z2 = 0; } else val = false; } // Above IBD threshold? if ( z.z1/2 + z.z2 > par::MAX_PIHAT ) val = false; // Not a parent-offspring pair? if (z.z1 > 0.9 ) val = false; // Need to nudge? if (par::nudge && ( pihat * pihat ) < z.z2 ) { z.z0 = ( 1 - pihat) * ( 1 - pihat); z.z1 = 2 * pihat * (1-pihat); z.z2 = pihat * pihat; } if (val) { map::iterator person1 = mperson.find(fid1+"_"+iid1); map::iterator person2 = mperson.find(fid2+"_"+iid2); if ( person1 == mperson.end() || person2 == mperson.end() ) continue; Pair p; p.p1 = person1->second; p.p2 = person2->second; mpair.insert(make_pair(p,z)); } } INC.close(); // Now we've finished reading in all known genome-wide IBD values, // and we've saved those that are in the desired range. // Consider all pairs for (int i1=0; i1::iterator i = mpair.find(p); // Pair either not found... if ( i == mpair.end() ) { skip_pair.push_back(true); } else // ... or was in .genome with valid IBD { skip_pair.push_back(false); c++; saved_IBDg.push_back(i->second); // And related enough for the --genome-test? if ( par::genome_test ) { Z ibd = i->second; if ( ibd.z1/2 + ibd.z2 >= par::genome_test_threshold ) { int2 pair; pair.p1 = i1; pair.p2 = i2; related.insert(pair); } } } } stringstream s2; s2 << "\n" << c << " pairs are informative ( " << par::MIN_PIHAT << " <= pihat <= " << par::MAX_PIHAT << " )\n"; printLOG(s2.str()); return c; } int Plink::calcInformative() { ////////////////////////////// // Precalculate to get list of // individuals to skip in subsequent // sections int c=0; int c0=0; printLOG("Preprocessing to assess number of informative pairs ... 
\n"); for (int i1=0; i1missing && !p2->missing) { Z IBSg; Z IBDg; // Calculate or fix if (!par::FIXED) { IBSg = calcGenomeIBS(p1,p2); IBDg = calcGenomeIBD(p1,p2,IBSg); } else { IBDg = par::FIX_IBD; } bool val = true; ////////////////////////////// // Do we meet criteria? // Range of Genome-Wide IBD okay? if (IBDg.z1/2 + IBDg.z2 < par::MIN_PIHAT ) { if (par::include_all_pairs) { IBDg.z0 = 1-par::include_all_z1; IBDg.z1 = par::include_all_z1; IBDg.z2 = 0; } else val = false; } // Above IBD threshold? if ( IBDg.z1/2 + IBDg.z2 > par::MAX_PIHAT ) val = false; // Not a parent-offspring pair? if (IBDg.z1 > 0.9 ) val = false; // Affected-only pair analysis? if (val) { saved_IBDg.push_back(IBDg); skip_pair.push_back(false); ++c; cout << c << " pairs extracted; " << c0 << " processed of " << np << " \r"; cout.flush(); } else skip_pair.push_back(true); } else skip_pair.push_back(true); } stringstream s2; s2 << "\n" << c << " pairs are informative ( " << par::MIN_PIHAT << " <= pihat <= " << par::MAX_PIHAT << " )\n"; printLOG(s2.str()); return c; } void Plink::writeInformative() { // Replaced by standard .genome output } plink-1.07-src/logistic.cpp0000644000265600020320000001743011264127625015023 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "logistic.h" #include "plink.h" #include "helper.h" #include "options.h" #include "stats.h" LogisticModel::LogisticModel(Plink * p_) { P = p_; nc = 0; cluster = false; } void LogisticModel::setDependent() { // Set phenotype to 'aff' variable Y.clear(); for (int i=0; in; i++) { if ( !miss[i] ) { if ( P->sample[i]->pperson->aff ) Y.push_back( 1 ) ; else Y.push_back( 0 ) ; } } nind = Y.size(); p.resize(nind); V.resize(nind); } void LogisticModel::pruneY() { ////////////////////////////////// // Prune out rows that are missing if ( miss.size() != Y.size() ) error("Internal error: bad call to Model::pruneY()"); vector Y2; for (int i=0; ichr << " " << setw(par::pp_maxsnp) << loc->name << " " << setw(10) << loc->bp << " " << setw(4) << loc->allele1 << " " << setw(10) << label[p] << " " << setw(8) << Y.size() << " "; if (okay) { if ( par::return_beta ) OUT << setw(10) << coef[p] << " "; else OUT << setw(10) << exp(coef[p]) << " "; if (par::display_ci) { OUT << setw(8) << se << " "; if ( par::return_beta ) OUT << setw(8) << coef[p] - par::ci_zt * se << " " << setw(8) << coef[p] + par::ci_zt * se << " "; else OUT << setw(8) << exp(coef[p] - par::ci_zt * se) << " " << setw(8) << exp(coef[p] + par::ci_zt * se) << " "; } OUT << setw(12) << Z << " " << setw(12) << pvalue; } else { OUT << setw(10) << "NA" << " "; if (par::display_ci) OUT << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; OUT << setw(12) << "NA" << " " << setw(12) << "NA"; } OUT << "\n"; } } } double LogisticModel::getPValue() { vector_t var = getVar(); bool okay = var[testParameter] < 1e-20 || !realnum(var[testParameter]) ? 
false : all_valid; if (all_valid) { double se = sqrt(var[testParameter]); double Z = coef[testParameter] / se; return chiprobP(Z*Z,1); } else return 1; } vector_t LogisticModel::getPVals() { int tmp = testParameter; vector_t res; for ( testParameter = 1; testParameter < np; testParameter++) res.push_back( getPValue() ); testParameter = tmp; return res; } double LogisticModel::getLnLk() { // Return -2 * sample log-likelihood // We assume the model is fit, and all Y's are either 0 or 1 double lnlk = 0; for (int i=0; i sc(nc); for (int i=0; i #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "stats.h" #include "cfamily.h" #include "zed.h" void Plink::calcStratifiedAlleleFreqs() { // w/ Modification to output counts by John Novembre // Assume SNP-major data if (!par::SNP_major) Ind2SNP(); // This is called *after* any filters are applied // (i.e. unlike the original --freq command) // This means that various things will have been fixed already, // such as het haploid calls, and which allele is the minor one if (par::summ_nonfounders) printLOG("Writing stratified allele frequencies (all individuals) to [ " + par::output_file_name + ".frq.strat ] \n"); else printLOG("Writing stratified allele frequencies (founders-only) to [ " + par::output_file_name + ".frq.strat ] \n"); ofstream FRQ; string f = par::output_file_name + ".frq.strat"; FRQ.open(f.c_str(), ifstream::out); FRQ.precision(4); FRQ << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(8) << "CLST" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(8) << "MAF" << " " << setw(6) << "MAC" << " " << setw(8) << "NCHROBS" // Modified by JN 5/24/07 << "\n"; ////////////////////////////////////////// // Calculate allele frequencies, and FST vector::iterator loc = locus.begin(); vector::iterator s = SNP.begin(); while ( loc != locus.end() ) { // Track Fst vector_t het(nk); double tothet = 0; // Consider each cluster for (int k=0; kchr << " " << setw(par::pp_maxsnp) << (*loc)->name << " "; (*loc)->freq = 0; // count 1 per allele, for frequency double nmfreq = 0; double totfreq = 0; double count = 0; // count 1 per genotype, for missingness int geno_nm = 0; bool X = false; bool haploid = false; // Determine type of SNP if (par::chr_sex[(*loc)->chr]) X=true; else if (par::chr_haploid[(*loc)->chr]) haploid=true; /////////////////////////////// // Iterate over each individual vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator person = sample.begin(); while ( person != sample.end() ) { bool s1 = *i1; bool s2 = *i2; // For allele frequencies // only consider founders? 
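//////////////////////////////////////////////////////////////////////
// Editor's note -- illustrative sketch only, not part of the original
// PLINK code.  Counting convention used in this loop, for a
// hypothetical cluster of 10 autosomal founders at one SNP with
//     1 x hom(11), 4 x het(12), 5 x hom(22):
//     A1 allele count (MAC)          = 2*1 + 1*4 = 6
//     non-missing chromosomes (NCHROBS) = 2*10   = 20
//     MAF                            = 6 / 20    = 0.30
// Haploid and male-X genotypes contribute a single chromosome each,
// and a heterozygous call there adds nothing to either count.
//////////////////////////////////////////////////////////////////////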
if ( par::summ_nonfounders || (*person)->founder ) { if ( (*person)->sol == k ) { if ( haploid || ( X && (*person)->sex ) ) { ////////////////// // Haploid counts // "1" allele count // Possible count of 1 allele totfreq++; if ( (!s1) && (!s2) ) // FF = hom(11) { (*loc)->freq++; nmfreq++; } else if ( s1 && s2 ) // TT = hom(22) { nmfreq++; } } else { ////////////////// // Autosomal count // "1" allele count // Possible count of 2 alleles totfreq+=2; if (!s1) { if (!s2) // 00 = hom(11) { (*loc)->freq+=2; nmfreq+=2; } else // 01 = het(12) { (*loc)->freq+=1; nmfreq+=2; } } else if ( s2 ) // 11 = hom(22) { nmfreq+=2; } } } } // Next individual person++; i1++; i2++; } if (nmfreq>0){ count=(*loc)->freq; // Added by JN 5/24/07 (*loc)->freq /= (double)nmfreq; } string a1 = (*loc)->allele1; if (a1=="") a1="0"; FRQ << setw(8) << kname[k] << " " << setw(4) << a1 << " " << setw(4) << (*loc)->allele2 << " " << setw(8) << (*loc)->freq << " " << setw(6) << count << " " << setw(8) << nmfreq << " " // Modified by JN 5/24/07 << "\n"; } // Next SNP loc++; s++; } FRQ.close(); shutdown(); } void Plink::findMissRuns(Individual * person, ofstream & RUN) { int l=0; int nmiss = 0; // now means 'not missing' bool run = false; int start = 0; int end = 0; while ( l < nl_all ) { // Outside of a run? if (!run) { // A new run? (missing) if (person->one[l] && (!person->two[l])) { start = l; nmiss=1; run=true; } } else // if already in a run, either end or increase length? { // found a non-missing? 00, 11, 01 if ( person->one[l] == person->two[l] || person->two[l]) { nmiss++; // Average non-missing rate now too high?, given we have at least // a certain number of SNPs in run // (l-start) = number of SNPs in run currently if ((double)nmiss / (double)(l-start) >= par::miss_run_level && (l-start) >= par::miss_run_length) { end = l-1; run = false; } } else if ( locus[l]->chr != locus[start]->chr ) // different chromosome? { end = l-1; run = false; } else if ( l == (nl_all -1) ) // or end of all SNPs? { end = l; run = false; } } // Check run length? 
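//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  The stratified
// frequency code above counts one observed chromosome per haploid /
// male-X genotype and two per autosomal genotype, then divides the
// A1 allele count (MAC) by the number of non-missing chromosomes
// (NCHROBS) to obtain the frequency.  A condensed version of that
// bookkeeping for a single autosomal SNP, using a hypothetical
// 0/1/2 dosage coding (-1 = missing) rather than PLINK's two-bit
// genotypes.
//------------------------------------------------------------------
#include <vector>

static double alleleFreqAutosomal(const std::vector<int> & dose,
                                  double & mac, double & nchrobs)
{
  mac = nchrobs = 0;
  for (size_t i = 0; i < dose.size(); i++)
    {
      if (dose[i] < 0) continue;  // missing genotype: contributes nothing
      mac     += dose[i];         // 0, 1 or 2 copies of allele A1
      nchrobs += 2;               // two chromosomes observed per person
    }
  return nchrobs > 0 ? mac / nchrobs : 0;  // allele frequency
}
//------------------------------------------------------------------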
if (!run) { if (par::miss_run_length_kb) { if ( locus[end]->bp - locus[start]->bp >= par::miss_run_length * 1000 ) RUN << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(8) << person->phenotype << " " << setw(4) << locus[start]->chr << " " << setw(par::pp_maxsnp) << locus[start]->name << " " << setw(par::pp_maxsnp) << locus[end]->name << " " << setw(10) << (double)(locus[end]->bp - locus[start]->bp)/(double)1000 << " " << setw(10) << end - start +1 << " " << setw(10) << (double)nmiss/(double)(end - start + 1) << "\n"; } else { if ( end - start +1 >= par::miss_run_length ) RUN << person->fid << "\t" << person->iid << "\t" << locus[start]->chr << "\t" << locus[start]->name << "\t" << locus[end]->name << "\t" << (double)(locus[end]->bp - locus[start]->bp)/(double)1000 << "\t" << end - start + 1 << "\t" << (double)nmiss/(double)(end - start + 1) << "\n"; } ////////////////// // Clear counters start = end = nmiss = 0; } /////////////// // Next locus l++; } } void Plink::sexCheck() { // Get range of X chromosome markers if (par::SNP_major) SNP2Ind(); ofstream HET; string f = par::output_file_name + ".sexcheck"; HET.open(f.c_str(),ios::out); HET.precision(4); printLOG("Writing X-chromosome sex check results to [ "+f+" ] \n"); HET << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(12) << "PEDSEX" << " " << setw(12) << "SNPSEX" << " " << setw(12) << "STATUS" << " " << setw(12) << "F" << "\n"; for (int i1=0; i1chr] || par::chr_haploid[locus[l]->chr] ) skip = true; if ( locus[l]->nm <= 1 || locus[l]->freq < 1e-8 ) skip = true; if ( skip ) continue; double f = 0; vector_t hs(nk); double ht = 2 * locus[l]->freq * ( 1 - locus[l]->freq ); // Calculate Fst here // double avg_hs = 0; // for (int k = 0; k < nk ; k++ ) // avg_hs += het[k]; // avg_hs /= (double)nk; // double f = ( hs - ht ) / ht; for ( int k = 0 ; k < nk ; k++ ) { // FST << setw(4) << locus[l]->chr << " " // << setw(par::pp_maxsnp) << locus[l]->name << " " // << setw(20) << kname[k] << " " // << setw(8) << kind[k] << " " // << setw(12) << het[k] << "\n"; } FST << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(20) << "_FST_" << " " << setw(8) << locus[l]->nm << " " << setw(12) << f << "\n"; } FST.close(); return; } double Plink::calcInbreeding(Individual * p1, int m1, int m2, ofstream & HET) { // P(Homo) = F + (1-F)P(Homo by chance) // P(Homo by chance) = p^2+q^2 for a biallelic locus. // For an individual with N genotyped loci, we // 1. count the total observed number of loci which are homozygous (O), // 2. calculate the total expected number of loci homozygous by chance (E) // Then, using the method of moments, we have // O = NF + (1-F)E // Which rearranges to give // F = (O-E)/(N-E) // Count of nonmissing loci double N=0; double O=0; double E=0; // Consider all loci for (int l=m1; l<=m2;l++) { //////////////////////////////////////////////// // Skip X and haploid chromosome markers, or not if ( par::check_sex ) // For sex-checks { // only consider the X chromosome if ( ! 
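//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  calcInbreeding()
// above uses the method of moments: with O homozygous loci observed,
// E the number expected homozygous by chance and N non-missing loci,
// O = N*F + (1-F)*E, hence F = (O-E)/(N-E).  A minimal stand-alone
// version with hypothetical inputs (per-locus allele frequency
// freq[l] and nm[l] = number of non-missing alleles used to estimate
// it).
//------------------------------------------------------------------
#include <vector>

static double inbreedingF(const std::vector<bool>   & hom,
                          const std::vector<bool>   & missing,
                          const std::vector<double> & freq,
                          const std::vector<double> & nm)
{
  double N = 0, O = 0, E = 0;
  for (size_t l = 0; l < hom.size(); l++)
    {
      if (missing[l]) continue;
      N++;
      if (hom[l]) O++;                       // observed homozygote
      // expected P(hom) = 1 - 2pq * 2N/(2N-1)  (Nei's unbiased estimator)
      E += 1 - 2 * freq[l] * (1 - freq[l]) * (nm[l] / (nm[l] - 1));
    }
  return (O - E) / (N - E);                  // method-of-moments F-hat
}
//------------------------------------------------------------------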
par::chr_sex[locus[l]->chr] ) continue; } else // normal heterozygosity calculation { // so skip haploid markers if ( par::chr_sex[locus[l]->chr] || par::chr_haploid[locus[l]->chr] ) continue; } // Skip monomorphic markers, uninformative markers if ( locus[l]->nm <= 1 || locus[l]->freq < 1e-8 ) continue; ////////////////// // Observed data // check not missing: if (!(p1->one[l] && (!p1->two[l]))) { // homozygous non-missing loci if (p1->one[l] == p1->two[l]) O++; // non-missing loci N++; ///////////////////////// // Expected homozygousity // E = 2pq . 2N/(2N-1) // (Using Nei's unbiased estimator) E += 1 - ( 2 * locus[l]->freq * ( 1 - locus[l]->freq ) * ( locus[l]->nm / ( locus[l]->nm - 1 ) ) ); } } double F = (O-E)/(N-E); if ( par::check_sex) { HET << setw(par::pp_maxfid) << p1->fid << " " << setw(par::pp_maxiid) << p1->iid << " " << setw(12) << p1->sexcode << " "; if ( F > par::sex_threshold_male ) { HET << setw(12) << 1 << " "; if (p1->sexcode == "1") HET << setw(12) << "OK" << " "; else { HET << setw(12) << "PROBLEM" << " "; if (par::impute_sex) { p1->sexcode = "1"; p1->sex = true; } } } else if ( F < par::sex_threshold_female ) { HET << setw(12) << 2 << " "; if (p1->sexcode == "2") HET << setw(12) << "OK" << " "; else { HET << setw(12) << "PROBLEM" << " "; if (par::impute_sex) { p1->sexcode = "2"; p1->sex = false; } } } else { HET << setw(12) << 0 << " " << setw(12) << "PROBLEM" << " "; if (par::impute_sex) { p1->sexcode = "0"; p1->sex = false; if (!par::ignore_missing_sex) p1->missing = true; } } HET << setw(12) << F << "\n"; } else HET << setw(par::pp_maxfid) << p1->fid << " " << setw(par::pp_maxiid) << p1->iid << " " << setw(12) << (int)O << " " << setw(12) << E << " " << setw(12) << (int)N << " " << setw(12) << F << "\n"; return F; } Z Plink::calcGenomeIBS(Individual * p1, Individual * p2) { // Vector of average genome-wide IBS Z IBSg; // Count of nonmissing loci int cnt=0; // Other metrics ( in Plink:: ) pvIBS0 = 0; pvIBS2het = 0; int last_chr = -1; int last_bp = -1; // Consider all autosomal loci vector::iterator ia1 = p1->one.begin(); vector::iterator ia2 = p1->two.begin(); vector::iterator ib1 = p2->one.begin(); vector::iterator ib2 = p2->two.begin(); int l=0; while ( ia1 != p1->one.end() ) { // Skip X and haploid chromosome markers if ( par::chr_sex[locus[l]->chr] || par::chr_haploid[locus[l]->chr] ) { l++; ia1++; ia2++; ib1++; ib2++; continue; } // Only count if both genotypes nonmissing bool a1 = *ia1; bool a2 = *ia2; if (a1 && !a2) { l++; ia1++; ia2++; ib1++; ib2++; continue; } bool b1 = *ib1; bool b2 = *ib2; if (b1 && !b2) { l++; ia1++; ia2++; ib1++; ib2++; continue; } // Calculate IBS from genotypes // 10 = missing // 00 = 11hom // 01 = 12het // 11 = 22hom if ( a1 == b1 && a2 == b2 ) IBSg.z2++; // IBS 2 else if ( a1 != b1 && a2 != b2 ) IBSg.z0++; // IBS 0 else IBSg.z1++; // IBS 1 cnt++; // Also calculate p-value binomial test if ( ! ( par::matrix || par::cluster || par::genome_output ) ) { ia1++; ia2++; ib1++; ib2++; l++; continue; } if ( a1 != b1 && a2 != b2 ) // IBS 0 hom/hom { // Can we count this? if (locus[l]->chr != last_chr || locus[l]->bp > last_bp + par::ibstest_gap) { pvIBS0++; last_chr = locus[l]->chr; last_bp = locus[l]->bp; } } else if ( a1 != a2 && b1 != b2 ) // IBS 2 het/het { // Can we count this? 
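//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  calcGenomeIBS()
// above classifies each SNP for a pair using the two-bit genotype
// coding noted in its comments (00 = hom A1A1, 01 = het, 11 = hom
// A2A2, 10 = missing): identical bit pairs share both alleles, fully
// opposite bit pairs are opposite homozygotes, anything else shares
// exactly one allele.  The same rule in isolation:
//------------------------------------------------------------------
static int ibsState(bool a1, bool a2, bool b1, bool b2)
{
  // the caller is assumed to have skipped missing genotypes (pattern 1,0)
  if (a1 == b1 && a2 == b2) return 2;   // same genotype        -> IBS 2
  if (a1 != b1 && a2 != b2) return 0;   // opposite homozygotes -> IBS 0
  return 1;                             // share one allele     -> IBS 1
}
//------------------------------------------------------------------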
if (locus[l]->chr != last_chr || locus[l]->bp > last_bp + par::ibstest_gap) { pvIBS2het++; last_chr = locus[l]->chr; last_bp = locus[l]->bp; } } // Next SNP ia1++; ia2++; ib1++; ib2++; l++; } if (cnt==0) { string msg = "No nonmissing markers for individuals " + p1->fid + " " + p1->iid + " - " + p2->fid + " " + p2->iid; error(msg); } // Standard genetic distance (proportion IBS 0 to 1) if ( par::cluster_euclidean ) dst = sqrt((IBSg.z1*0.5 + IBSg.z2*2)/(IBSg.z0+IBSg.z1+IBSg.z2*2)); else dst = (IBSg.z1*0.5 + IBSg.z2)/(IBSg.z0+IBSg.z1+IBSg.z2); // Also calculate p-value binomial test if ( par::cluster_missing || ! ( par::matrix || par::cluster || par::genome_output ) ) return IBSg; // Calculate p-value for IBS test // IBS0 : IBS2(het) in 1:2 ratio // n.b. 0.2222222 = 0.666*(1-0.666) double z = (pvIBS2het/(pvIBS0+pvIBS2het)-0.666666) / (sqrt(0.2222222/(pvIBS0+pvIBS2het))); // Store p-value in Plink::pv pv = normdist(z); // Return counts return IBSg; } void Plink::preCalcGenomeIBD() { // Take pairwise IBS and information on allele frequencies // to generate genome-wide IBD estimates // Expected IBS given IBD // Bias-corrected versison of estimator for expected proportion of // IBS SNP pairs given IBD status (cf. Nei bias-corrected // heterozygosity estimator). // All possible permutations of taking 4 alleles from 2N = 2N(2N-1)(2N-2)(2N-3) // x = count of allele 1 // y = count of allele 2 // p = x/2N, q=y/2N // Of these, // x(x-1)y(y-1) will be of order 11 22, // y(y-1)x(x-1) will be of order 22 11 // Therefore the probability of IBS 0 given IBS 0 is // E00 = 2 p*p*q*q * ( (x-1)/x * (y-1)/y * (2N/(2N-1)) * (2N/(2N-2)) * (2N/(2N-3)) ) // and so on for E01, etc // E(IBD)(IBS) // IBS >= IBD E00=E10=E20=E01=E11=E21=E02=E12=E22=0; int cnt = 0; for (int l=0; lchr] || par::chr_haploid[locus[l]->chr] ) continue; double p = locus[l]->freq; double q = 1 - p; double Na = locus[l]->nm; // = # alleles = 2N where N is number of individuals double x = p * Na; double y = q * Na; // Original, non bias-corrected versions // E00 += 2*p*p*q*q; // E01 += 4*p*p*p*q+4*p*q*q*q; // E02 += q*q*q*q + p*p*p*p + 4*p*p*q*q; // E11 += 2*p*p*q + 2*p*q*q; // E12 += p*p*p + q*q*q + p*p*q + p*q*q; double a00 = 2*p*p*q*q * ( (x-1)/x * (y-1)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); double a01 = 4*p*p*p*q * ( (x-1)/x * (x-2)/x * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ) + 4*p*q*q*q * ( (y-1)/y * (y-2)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); double a02 = q*q*q*q * ( (y-1)/y * (y-2)/y * (y-3)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ) + p*p*p*p * ( (x-1)/x * (x-2)/x * (x-3)/x * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ) + 4*p*p*q*q * ( (x-1)/x * (y-1)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); double a11 = 2*p*p*q * ( (x-1)/x * Na/(Na-1) * Na/(Na-2) ) + 2*p*q*q * ( (y-1)/y * Na/(Na-1) * Na/(Na-2) ); double a12 = p*p*p * ((x-1)/x * (x-2)/x * Na/(Na-1) * Na/(Na-2)) + q*q*q * ( (y-1)/y * (y-2)/y * Na/(Na-1) * Na/(Na-2)) + p*p*q * ( (x-1)/x * Na/(Na-1) * Na/(Na-2) ) + p*q*q * ((y-1)/y * Na/(Na-1) * Na/(Na-2)); if ( realnum(a00) && realnum(a01) && realnum(a02) && realnum(a11) && realnum(a12) ) { E00 += a00; E01 += a01; E02 += a02; E11 += a11; E12 += a12; cnt++; } } E00 /= cnt; E10 = 0; E20 = 0; E01 /= cnt; E11 /= cnt; E21 = 0; E02 /= cnt; E12 /= cnt; E22 = 1; if (par::verbose) { cout << "P(IBS|IBD) -- IBS row; IBD col\n"; cout << E00 << "\t" << E10 << "\t" << E20 << "\n"; cout << E01 << "\t" << E11 << "\t" << E21 << "\n"; cout << E02 << "\t" << E12 << "\t" << E22 << "\n"; cout << "\n"; } } Z 
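//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  The PPC test
// above treats the distance-thinned counts of IBS-0 (hom/hom) and
// IBS-2 (het/het) SNP pairs as a binomial sample in which het/het
// outcomes are expected with probability 2/3 for an unrelated pair
// (hence the constants 0.666666 and 0.2222222 = 2/3 * 1/3), and
// converts the z score with the normal CDF.  A self-contained
// version (function name hypothetical):
//------------------------------------------------------------------
#include <cmath>

static double ppcTest(double nIBS0, double nIBS2het)
{
  double n    = nIBS0 + nIBS2het;
  double phat = nIBS2het / n;                    // observed het/het share
  double z    = (phat - 2.0 / 3.0)
              / std::sqrt((2.0 / 9.0) / n);      // binomial normal approx.
  // lower-tail standard normal probability, Phi(z) = erfc(-z/sqrt(2))/2
  return 0.5 * std::erfc(-z / std::sqrt(2.0));
}
//------------------------------------------------------------------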
Plink::calcGenomeIBD(Individual * p1, Individual * p2, Z IBSg) { Z z; double S = IBSg.z0 + IBSg.z1 + IBSg.z2; // E_IBS[row=IBS][col=IBD] // E(IBD)(IBS) double e00 = E00*S; double e10 = E10*S; double e20 = E20*S; double e01 = E01*S; double e11 = E11*S; double e21 = E21*S; double e02 = E02*S; double e12 = E12*S; double e22 = E22*S; z.z0 = IBSg.z0 / e00; z.z1 = (IBSg.z1 - z.z0*e01) / e11; z.z2 = (IBSg.z2 - z.z0*e02 - z.z1*e12) / e22; if (par::debug) cout << "DEBUG\t" << z.z0 << " " << z.z1 << " " << z.z2 << "\n"; // Bound IBD estimates to sum to 1 // and fall within 0-1 range if (par::bound) { if (z.z0>1) { z.z0=1; z.z1=z.z2=0; } if (z.z1>1) { z.z1=1; z.z0=z.z2=0; } if (z.z2>1) { z.z2=1; z.z0=z.z1=0; } if (z.z0<0) { double S=z.z1+z.z2; z.z1/=S; z.z2/=S; z.z0=0; } if (z.z1<0) { double S=z.z0+z.z2; z.z0/=S; z.z2/=S; z.z1=0; } if (z.z2<0) { double S=z.z0+z.z1; z.z0/=S; z.z1/=S; z.z2=0; } } // Possibly constrain IBD estimates to within possible triangle // i.e. 0.5 0.0 0.5 is invalid // // For purposes of sample checks, etc, we do not automatically do this (--genome) // For PLINK analysis we do (--plink) // // Constraint : z1^2 - 4 z0 z2 >= 0 // : x^2 - 2 pi x + z2 = 0 // // where pi = (z1 + 2 z2) / 2 // // So the constaint can also be written as // // pi^2 >= z2 double pihat = z.z1/2 + z.z2 ; double impossible = false; if ( ( pihat * pihat ) < z.z2 ) { impossible = true; // find new value for z1 (z1*) which satisfies the equation // // (z1* + 2 pi^2) / 2 = pi // // this gives // // z1* = 2pi(1-pi) // the transformed IBD probabilities would be // 1 - 2pi(1-pi) - pi^2, 2pi(1-pi), pi^2 if (par::nudge) { z.z0 = ( 1 - pihat) * ( 1 - pihat); z.z1 = 2 * pihat * (1-pihat); z.z2 = pihat * pihat; } } if ( par::genome_output_minimal ) { ZOUTFILE << dbl2str_fixed(dst,6) << " " << dbl2str(pv) << " " << dbl2str(z.z1/2 + z.z2) << "\n"; } else if (par::genome_output) { if ( (!par::pihat_filter) || ( pihat >= par::MIN_PIHAT && pihat <= par::MAX_PIHAT) ) { ZOUTFILE << sw( p1->fid, par::pp_maxfid) << sw( p1->iid, par::pp_maxiid) << sw( p2->fid, par::pp_maxfid) << sw( p2->iid, par::pp_maxiid); string rt = relType(p1,p2); ZOUTFILE << sw(rt,3); if ( rt == "UN" ) ZOUTFILE << sw("NA", 6); else ZOUTFILE << sw( genrel(p1,p2) , 6 ); if (par::show_impossible_IBD || !impossible) ZOUTFILE << sw(z.z0, 4,8) << sw(z.z1, 4,8) << sw(z.z2, 4,8); else ZOUTFILE << sw( -z.z0, 8) << sw( -z.z1, 8) << sw( -z.z2, 8); ZOUTFILE << sw( z.z1/2 + z.z2, 4, 8); if (par::bt) { if ( (!p1->aff) && (!p2->aff) ) ZOUTFILE << sw("-1", 4); else if ( p1->aff && p2->aff ) ZOUTFILE << sw("1",4); else if ((!p1->aff) && p2->aff) ZOUTFILE << sw("0",4); else if (p1->aff && !p2->aff) ZOUTFILE << sw("0",4); else ZOUTFILE << sw("NA",4); } else ZOUTFILE << sw("NA",4); ZOUTFILE << sw(dst,6,10); ZOUTFILE << sw(pv,4,8); double ov = (double)pvIBS2het / (double)pvIBS0; if ( realnum(ov) ) ZOUTFILE << sw(ov,4,8); else ZOUTFILE << sw("NA",8); if ( par::genome_output_full ) ZOUTFILE << sw((int)IBSg.z0, 8) << sw((int)IBSg.z1, 8) << sw((int)IBSg.z2, 8) << sw(pvIBS0, 4,8) << sw(pvIBS2het, 4, 8); ZOUTFILE << "\n"; } } return z; } void Plink::displayGenomeWideInfo() { /////////////////////////////////////// // This is an individual-mode analysis if (par::SNP_major) SNP2Ind(); string f = par::output_file_name + ".genome"; if ( par::genome_output_minimal ) f += ".min"; if ( par::compress_genome ) f += ".gz"; if ( par::genome_output_minimal ) printLOG("Writing minimal-format IBS information to [ " + f + " ] \n"); else printLOG("Writing whole genome IBS/IBD information to [ " + f 
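//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  calcGenomeIBD()
// above recovers the IBD proportions by the method of moments: the
// observed IBS counts are peeled off in order using the precomputed
// P(IBS|IBD) expectations (E10 = E20 = E21 = 0 because IBS >= IBD),
// and if the result lands outside the possible triangle
// (pihat^2 < z2) the --nudge option replaces it with the
// Hardy-Weinberg-consistent point (1-pi)^2, 2pi(1-pi), pi^2.  A
// condensed version of that arithmetic; the bounding of each
// estimate into [0,1] done above is omitted here.
//------------------------------------------------------------------
struct IBDest { double z0, z1, z2; };

static IBDest estimateIBD(double s0, double s1, double s2,      // IBS counts
                          double E00, double E01, double E11,
                          double E02, double E12, double E22,
                          bool nudge)
{
  double S = s0 + s1 + s2;                   // total informative SNPs
  IBDest z;
  z.z0 = s0 / (E00 * S);                                 // from IBS 0 alone
  z.z1 = (s1 - z.z0 * E01 * S) / (E11 * S);              // strip IBD-0 part
  z.z2 = (s2 - z.z0 * E02 * S - z.z1 * E12 * S) / (E22 * S);

  double pihat = z.z1 / 2 + z.z2;
  if (nudge && pihat * pihat < z.z2)         // outside the possible triangle
    {
      z.z0 = (1 - pihat) * (1 - pihat);
      z.z1 = 2 * pihat * (1 - pihat);
      z.z2 = pihat * pihat;
    }
  return z;
}
//------------------------------------------------------------------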
+ " ] \n"); ZOUTFILE.open( f , par::compress_genome ); stringstream s2; s2 << "Filtering output to include pairs with ( " << par::MIN_PIHAT << " <= PI-HAT <= " << par::MAX_PIHAT << " )\n"; printLOG(s2.str()); if ( par::genome_output_minimal ) { for (int i=0; ifid << " " << sample[i]->iid << "\n"; ZOUTFILE << "__END __END\n"; } else { ZOUTFILE << sw("FID1",par::pp_maxfid) << sw("IID1",par::pp_maxiid) << sw("FID2",par::pp_maxfid) << sw("IID2",par::pp_maxiid) << sw("RT",3) << sw("EZ",6) << sw("Z0",8) << sw("Z1",8) << sw("Z2",8) << sw("PI_HAT",8) << sw("PHE",4) << sw("DST",10) << sw("PPC",8) << sw("RATIO",8); if ( par::genome_output_full ) ZOUTFILE << sw("IBS0",8) << sw("IBS1",8) << sw("IBS2",8) << sw("HOMHOM",8) << sw("HETHET",8); ZOUTFILE << "\n"; } int c=0; int c2=0; for (int i1=0; i1= par::genome_test_threshold ) { int2 pair; pair.p1 = i1; pair.p2 = i2; related.insert(pair); } } } if (!par::silent) cout << "\n"; ZOUTFILE.close(); } void Plink::calcGenomeIBM(Individual * p1, Individual * p2) { // Individual-major mode assumed int cnt = 0; // Consider all loci vector::iterator ia1 = p1->one.begin(); vector::iterator ia2 = p1->two.begin(); vector::iterator ib1 = p2->one.begin(); vector::iterator ib2 = p2->two.begin(); while ( ia1 != p1->one.end() ) { // Count discordantly missing SNPs if ( *ia1 && !*ia2 ) { if ( ! ( *ib1 && !*ib2 ) ) cnt++; } else if ( *ib1 && !*ib2 ) cnt++; // Next SNP ia1++; ia2++; ib1++; ib2++; } // IBM similiarity metric dst = 1.0 - ( (double)cnt / (double)nl_all ); } void Plink::pruneLD() { if (!par::SNP_major) Ind2SNP(); printLOG("Performing LD-based pruning...\n"); string f = par::output_file_name + ".prune.in"; ofstream PIN(f.c_str(),ios::out); printLOG("Writing pruned-in SNPs to [ " + f + " ]\n"); f = par::output_file_name + ".prune.out"; ofstream POUT(f.c_str(),ios::out); printLOG("Writing pruned-out SNPs to [ " + f + " ]\n"); int win_start = 0; int win_end = win_start + par::prune_ld_win; //////////////////////// // Scan each chromosome vector chrs; if (par::run_chr==0) { vector r = getChromosomeRange(*this); printLOG("Scanning from chromosome "+ chromosomeName( r[0] ) +" to "+ chromosomeName( r[1] ) +"\n\n"); for (int i=r[0];i<=r[1];i++) { if (seeChromosome(*this,i)) chrs.push_back(i); } } else chrs.push_back(par::run_chr); // Inclusion or no? 
vector include(nl_all,true); // Only consider founders (set flag) vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->founder ) (*person)->flag = true; else (*person)->flag = false; person++; } // Scan each chromosome for (int i=0;i par::run_end ) s2 = par::run_end; while ( s2 <= par::run_end ) { // calc VIF and set vector nSNP(0); for (int l=s1; l<=s2; l++) if ( include[l] ) { nSNP.push_back( l ); } // Skip if we only have a single SNP left if (nSNP.size() < 2) { if ( s2 == par::run_end ) break; s1 += par::prune_ld_step; s2 += par::prune_ld_step; if (s2 > par::run_end) s2 = par::run_end; if ( s2-s1 < 1) break; continue; } vector > variance; if (!par::silent) { cout << "Pruning SNPs " << s1-par::run_start+1 << " to " << s2-par::run_start+1 << " of " << par::run_end - par::run_start+1 << " \r"; cout.flush(); } // Calculate covariance matrices variance = calcSetCovarianceMatrix(nSNP); // Calculate VIFs vector cur = vif_prune(variance,par::prune_ld_vif,nSNP); // Update main list int k=0; for (int l=s1; l<=s2; l++) { // Update main list, but do not get back // already excluded SNPs if (include[l] && !cur[k++]) include[l] = false; } // Advance window if ( s2 == par::run_end ) break; s1 += par::prune_ld_step; s2 += par::prune_ld_step; if (s2 > par::run_end) s2 = par::run_end; if ( s2-s1 < 1) break; } // next window if (!par::silent) cout << "\n"; // Record what is in, what is out int cnt_in = 0, cnt_out = 0; for (int l=par::run_start; l<=par::run_end; l++) { if (include[l]) { PIN << locus[l]->name << "\n"; cnt_in++; } else { POUT << locus[l]->name << "\n"; cnt_out++; } } printLOG("For chromosome "+int2str(par::run_chr)+ ", "+int2str(cnt_out)+ " SNPs pruned out, "+int2str(cnt_in)+" remaining\n"); } // next chromosome PIN.close(); POUT.close(); } plink-1.07-src/fisher.cpp0000644000265600020320000016342011264127625014467 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include "fisher.h" #include "plink.h" #include "helper.h" double fisher(table_t t) { int nrow = t.size(); if ( nrow == 0 ) return -9; int ncol = t[0].size(); if ( ncol == 0 ) return -9; double table[200]; int c=0; for (int j=0; j #include #include #include #define SINT_MAX INT_MAX #undef min #undef max #define max(a, b)((a) < (b) ? (b) : (a)) #define min(a, b)((a) > (b) ? 
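//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  pruneLD() above
// slides a window of par::prune_ld_win SNPs along the chromosome in
// steps of par::prune_ld_step, re-screens the SNPs still included
// against the VIF threshold (par::prune_ld_vif), and never re-admits
// a SNP once it has been pruned out.  The control flow with PLINK's
// data structures stripped away; the VIF screen itself is passed in
// as a callable and is not implemented here.
//------------------------------------------------------------------
#include <algorithm>
#include <functional>
#include <vector>

static void slidingWindowPrune(int nSNPs, int window, int step,
    std::function< std::vector<bool>(const std::vector<int>&) > vifKeep,
    std::vector<bool> & include)
{
  include.assign(nSNPs, true);
  int s1 = 0, s2 = std::min(window - 1, nSNPs - 1);
  while (true)
    {
      std::vector<int> idx;                     // SNPs still in play
      for (int l = s1; l <= s2; l++)
        if (include[l]) idx.push_back(l);

      if (idx.size() >= 2)                      // nothing to screen for < 2
        {
          std::vector<bool> keep = vifKeep(idx);
          for (size_t k = 0; k < idx.size(); k++)
            if (!keep[k]) include[idx[k]] = false;   // once out, stays out
        }

      if (s2 == nSNPs - 1) break;               // end of chromosome reached
      s1 += step;
      s2 = std::min(s2 + step, nSNPs - 1);
    }
}
//------------------------------------------------------------------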
(b) : (a)) static void f2xact(int *nrow, int *ncol, double *table, int *ldtabl, double *expect, double *percnt, double *emin, double *prt, double *pre, double *fact, int *ico, int *iro, int *kyy, int *idif, int *irn, int *key, int *ldkey, int *ipoin, double *stp, int *ldstp, int *ifrq, double *dlp, double *dsp, double *tm, int *key2, int *iwk, double *rwk); static void f3xact(int *nrow, int *irow, int *ncol,int *icol, double *dlp, int *mm, double *fact, int *ico, int *iro, int *it, int *lb, int *nr, int *nt, int *nu, int *itc, int *ist, double *stv, double *alen, const double *tol); static void f4xact(int *nrow, int *irow, int *ncol, int *icol, double *dsp, double *fact, int *icstk, int *ncstk, int *lstk, int *mstk, int *nstk, int *nrstk, int *irstk, double *ystk, const double *tol); static void f5xact(double *pastp, const double *tol, int *kval, int *key, int *ldkey, int *ipoin, double *stp, int *ldstp, int *ifrq, int *npoin, int *nr, int *nl, int *ifreq, int *itop, int *ipsh); static void f6xact(int *nrow, int *irow, int *iflag, int *kyy, int *key, int *ldkey, int *last, int *ipn); static void f7xact(int *nrow, int *imax, int *idif, int *k, int *ks, int *iflag); static void f8xact(int *irow, int *is, int *i1, int *izero, int *knew); static double f9xact(int *n, int *mm, int *ir, double *fact); static void f10act(int *nrow, int *irow, int *ncol, int *icol, double *val, int *xmin, double *fact, int *nd, int *ne, int *m); static void f11act(int *irow, int *i1, int *i2, int *knew); static void prterr(int icode, char *mes); static int iwork(int iwkmax, int *iwkpt, int number, int itype); #ifdef USING_R # define isort(n, ix)R_isort(ix, *n) # include /* -> pgamma() */ #else static void isort(int *n, int *ix); static double gammds(double *y, double *p, int *ifault); static double alogam(double *x, int *ifault); #endif /* The only public function : */ void fexact(int *nrow, int *ncol, double *table, int *ldtabl, double *expect, double *percnt, double *emin, double *prt, double *pre, int *workspace) { /* ALGORITHM 643, COLLECTED ALGORITHMS FROM ACM. THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE, VOL. 19, NO. 4, DECEMBER, 1993, PP. 484-488. ----------------------------------------------------------------------- Name: FEXACT Purpose: Computes Fisher's exact test probabilities and a hybrid approximation to Fisher exact test probabilities for a contingency table using the network algorithm. Usage: CALL FEXACT (NROW, NCOL, TABLE, LDTABL, EXPECT, PERCNT, EMIN, PRT, PRE) Arguments: NROW - The number of rows in the table.(Input) NCOL - The number of columns in the table.(Input) TABLE - NROW by NCOL matrix containing the contingency table.(Input) LDTABL - Leading dimension of TABLE exactly as specified in the dimension statement in the calling program.(Input) EXPECT - Expected value used in the hybrid algorithm for deciding when to use asymptotic theory probabilities.(Input) If EXPECT <= 0.0 then asymptotic theory probabilities are not used and Fisher exact test probabilities are computed. Otherwise, if PERCNT or more of the cells in the remaining table have estimated expected values of EXPECT or more, with no remaining cell having expected value less than EMIN, then asymptotic chi-squared probabilities are used. See the algorithm section of the manual document for details. Use EXPECT = 5.0 to obtain the 'Cochran' condition. 
PERCNT - Percentage of remaining cells that must have estimated expected values greater than EXPECT before asymptotic probabilities can be used.(Input) See argument EXPECT for details. Use PERCNT = 80.0 to obtain the 'Cochran' condition. EMIN - Minimum cell estimated expected value allowed for asymptotic chi-squared probabilities to be used.(Input) See argument EXPECT for details. Use EMIN = 1.0 to obtain the 'Cochran' condition. PRT - Probability of the observed table for fixed marginal totals.(Output) PRE - Table p-value.(Output) PRE is the probability of a more extreme table, where `extreme' is in a probabilistic sense. If EXPECT < 0 then the Fisher exact probability is returned. Otherwise, an approximation to the Fisher exact probability is computed based upon asymptotic chi-squared probabilities for ``large'' table expected values. The user defines ``large'' through the arguments EXPECT, PERCNT, and EMIN. Remarks: 1. For many problems one megabyte or more of workspace can be required.If the environment supports it, the user should begin by increasing the workspace used to 200,000 units. 2. In FEXACT, LDSTP = 30*LDKEY. The proportion of table space used by STP may be changed by changing the line MULT = 30 below to another value. 3. FEXACT may be converted to single precision by setting IREAL = 3, and converting all DOUBLE PRECISION specifications (except the specifications for RWRK, IWRK, and DWRK) to REAL.This will require changing the names and specifications of the intrinsic functions ALOG, AMAX1, AMIN1, EXP, and REAL. In addition, the machine specific constants will need to be changed, and the name DWRK will need to be changed to RWRK in the call to F2XACT. 4. Machine specific constants are specified and documented in F2XACT. A missing value code is specified in both FEXACT and F2XACT. 5. Although not a restriction, is is not generally practical to call this routine with large tables which are not sparse and in which the 'hybrid' algorithm has little effect. For example, although it is feasible to compute exact probabilities for the table 1 8 5 4 4 2 2 5 3 3 4 3 1 0 10 1 4 0 0 0 0, computing exact probabilities for a similar table which has been enlarged by the addition of an extra row (or column) may not be feasible. ----------------------------------------------------------------------- */ /* CONSTANT Parameters : */ /* To increase the length of the table of paste path lengths relative to the length of the hash table, increase MULT. */ const int mult = 30; /* AMISS is a missing value indicator which is returned when the probability is not defined. */ const double amiss = -12345.; /* Set IREAL = 4 for DOUBLE PRECISION Set IREAL = 3 for SINGLE PRECISION */ #define i_real 4 #define i_int 2 /* System generated locals */ int ikh; /* Local variables */ int nco, nro, ntot, numb, iiwk, irwk; int i, j, k, kk, ldkey, ldstp, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10; int i3a, i3b, i3c, i9a, iwkmax, iwkpt; /* Workspace Allocation (freed at end) */ double *equiv; iwkmax = 2 * (int) (*workspace / 2); // equiv = (double *) R_alloc(iwkmax / 2, sizeof(double)); equiv = (double *) calloc(iwkmax / 2, sizeof(double)); /* The check could never happen with Calloc! 
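//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  fexact() above
// is the single public entry point of this file: the table is passed
// column-major with leading dimension LDTABL (>= NROW); EXPECT <= 0
// requests the fully exact network algorithm, while EXPECT = 5,
// PERCNT = 80, EMIN = 1 give the 'Cochran' hybrid described in the
// comments.  A hedged usage example for a 2 x 3 table, with the
// 200,000-unit workspace suggested in the Remarks; the helper name
// is hypothetical.
//------------------------------------------------------------------
static double fisher2x3ExamplePValue()
{
  int nrow = 2, ncol = 3, ldtabl = 2;
  double table[6] = { 3, 2,    // column 1 of the 2 x 3 table
                      1, 6,    // column 2
                      4, 1 };  // column 3
  double expect = -1;          // <= 0 : exact, no asymptotic shortcut
  double percnt = 100, emin = 0;
  double prt = 0, pre = 0;     // outputs: P(observed table), p-value
  int    workspace = 200000;
  fexact(&nrow, &ncol, table, &ldtabl,
         &expect, &percnt, &emin, &prt, &pre, &workspace);
  return pre;                  // Fisher exact p-value (PRE above)
}
//------------------------------------------------------------------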
equiv = Calloc(iwkmax / 2, double); if (!equiv) { prterr(0, "Can not allocate specified workspace"); } */ #define dwrk (equiv) #define iwrk ((int *)equiv) #define rwrk ((float *)equiv) /* Parameter adjustments */ table -= *ldtabl + 1; /* Function Body */ iwkpt = 0; if (*nrow > *ldtabl) prterr(1, "NROW must be less than or equal to LDTABL."); ntot = 0; for (i = 1; i <= *nrow; ++i) { for (j = 1; j <= *ncol; ++j) { if (table[i + j * *ldtabl] < 0.) prterr(2, "All elements of TABLE must be positive."); ntot = (int) (ntot + table[i + j * *ldtabl]); } } if (ntot == 0) { prterr(3, "All elements of TABLE are zero.\n" "PRT and PRE are set to missing values."); *prt = amiss; *pre = amiss; goto L_End; } nco = max(*nrow, *ncol); nro = *nrow + *ncol - nco;/* = min(*nrow, *ncol) */ k = *nrow + *ncol + 1; kk = k * nco; ikh = ntot + 1; i1 = iwork(iwkmax, &iwkpt, ikh, i_real); i2 = iwork(iwkmax, &iwkpt, nco, i_int); i3 = iwork(iwkmax, &iwkpt, nco, i_int); i3a = iwork(iwkmax, &iwkpt, nco, i_int); i3b = iwork(iwkmax, &iwkpt, nro, i_int); i3c = iwork(iwkmax, &iwkpt, nro, i_int); ikh = max(k * 5 + (kk << 1), nco * 7 + 800); iiwk= iwork(iwkmax, &iwkpt, ikh, i_int); ikh = max(nco + 401, k); irwk= iwork(iwkmax, &iwkpt, ikh, i_real); /* NOTE: What follows below splits the remaining amount iwkmax - iwkpt of (int) workspace into hash tables as follows. type size index INT 2 * ldkey i4 i5 i11 REAL 2 * ldkey i8 i9 i10 REAL 2 * ldstp i6 INT 6 * ldstp i7 Hence, we need ldkey times 3 * 2 + 3 * 2 * s + 2 * mult * s + 6 * mult chunks of integer memory, where s = sizeof(REAL) / sizeof(INT). If doubles are used and are twice as long as ints, this gives 18 + 10 * mult so that the value of ldkey can be obtained by dividing available (int) workspace by this number. In fact, because iwork() can actually s * n + s - 1 int chunks when allocating a REAL, we use ldkey = available / numb - 1. FIXME: Can we always assume that sizeof(double) / sizeof(int) is 2? */ if (i_real == 4) {/* Double precision reals */ numb = 18 + 10 * mult; } else {/* Single precision reals */ numb = (mult << 3) + 12; } ldkey = (iwkmax - iwkpt) / numb - 1; ldstp = mult * ldkey; ikh = ldkey << 1;i4 = iwork(iwkmax, &iwkpt, ikh, i_int); ikh = ldkey << 1;i5 = iwork(iwkmax, &iwkpt, ikh, i_int); ikh = ldstp << 1;i6 = iwork(iwkmax, &iwkpt, ikh, i_real); ikh = ldstp * 6;i7 = iwork(iwkmax, &iwkpt, ikh, i_int); ikh = ldkey << 1;i8 = iwork(iwkmax, &iwkpt, ikh, i_real); ikh = ldkey << 1;i9 = iwork(iwkmax, &iwkpt, ikh, i_real); ikh = ldkey << 1;i9a = iwork(iwkmax, &iwkpt, ikh, i_real); ikh = ldkey << 1;i10 = iwork(iwkmax, &iwkpt, ikh, i_int); /* To convert to double precision, change RWRK to DWRK in the next CALL. */ f2xact(nrow, ncol, &table[*ldtabl + 1], ldtabl, expect, percnt, emin, prt, pre, dwrk + i1, iwrk + i2, iwrk + i3, iwrk + i3a, iwrk + i3b, iwrk + i3c, iwrk + i4, &ldkey, iwrk + i5, dwrk + i6, &ldstp, iwrk + i7, dwrk + i8, dwrk + i9, dwrk + i9a, iwrk + i10, iwrk + iiwk, dwrk + irwk); L_End: /* Free(equiv); */ free(equiv); return; } #undef rwrk #undef iwrk #undef dwrk /* ----------------------------------------------------------------------- Name:F2XACT Purpose:Computes Fisher's exact test for a contingency table, routine with workspace variables specified. 
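//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  The workspace
// note above splits the leftover integer workspace into three
// 2*LDKEY int arrays, three 2*LDKEY double arrays, one 2*LDSTP
// double array and one 6*LDSTP int array with LDSTP = MULT*LDKEY;
// counting each double as two ints gives 18 + 10*MULT ints per unit
// of LDKEY, hence LDKEY = remaining/NUMB - 1.  In isolation (names
// hypothetical):
//------------------------------------------------------------------
static void sizeHashTables(int remainingInts, int mult,
                           int & ldkey, int & ldstp)
{
  int numb = 18 + 10 * mult;           // ints consumed per unit of ldkey
  ldkey = remainingInts / numb - 1;    // e.g. 190000 / 318 - 1 = 596
  ldstp = mult * ldkey;                // e.g. 30 * 596        = 17880
}
//------------------------------------------------------------------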
Usage:F2XACT (NROW, NCOL, TABLE, LDTABL, EXPECT, PERCNT, EMIN, PRT, PRE, FACT, ICO, IRO, KYY, IDIF, IRN, KEY, LDKEY, IPOIN, STP, LDSTP, IFRQ, DLP, DSP, TM, KEY2, IWK, RWK) ----------------------------------------------------------------------- */ void f2xact(int *nrow, int *ncol, double *table, int *ldtabl, double *expect, double *percnt, double *emin, double *prt, double *pre, double *fact, int *ico, int *iro, int *kyy, int *idif, int *irn, int *key, int *ldkey, int *ipoin, double *stp, int *ldstp, int *ifrq, double *dlp, double *dsp, double *tm, int *key2, int *iwk, double *rwk) { /* IMAX is the largest representable int on the machine. */ const int imax = SINT_MAX; /* AMISS is a missing value indicator which is returned when the probability is not defined. */ const double amiss = -12345.; /* TOL is chosen as the square root of the smallest relative spacing. */ #ifndef Macintosh const static double tol = 3.45254e-7; #else static double tol = 3.45254e-7; #endif /* EMX is a large positive value used in comparing expected values. */ const static double emx = 1e30; /* Local variables {{any really need to be static ???}} */ static int kval, kmax, jkey, last, ipsh, itmp, itop, jstp, ntot, jstp2, jstp3, jstp4, i, ii, j, k, n, iflag, ncell, ifreq, chisq, ikkey, ikstp, ikstp2, k1, kb, kd, ks, i31, i32, i33, i34, i35, i36, i37, i38, i39, i41, i42, i43, i44, i45, i46, i47, i48, i310, i311, nco, nrb, ipn, ipo, itp, nro, nro2; static double dspt, dd, df,ddf, drn,dro, emn, obs, obs2, obs3, pastp, pv, tmp; double d1; #ifndef USING_R double d2; static int ifault; #endif bool nr_gt_nc; /* Parameter adjustments */ table -= *ldtabl + 1; --ico; --iro; --kyy; --idif; --irn; --key; --ipoin; --stp; --ifrq; --dlp; --dsp; --tm; --key2; --iwk; --rwk; /* Check table dimensions */ if (*nrow > *ldtabl) prterr(1, "NROW must be less than or equal to LDTABL."); if (*ncol <= 1) prterr(4, "NCOL must be at least 2"); /* Initialize KEY array */ for (i = 1; i <= *ldkey << 1; ++i) { key[i] = -9999; key2[i] = -9999; } /* Initialize parameters */ *pre = 0.; itop = 0; if (*expect > 0.) emn = *emin; else emn = emx; nr_gt_nc = *nrow > *ncol; /* nco := max(nrow, ncol) : */ if(nr_gt_nc) nco = *nrow; else nco = *ncol; /* Initialize pointers for workspace */ /* f3xact */ i31 = 1; i32 = i31 + nco; i33 = i32 + nco; i34 = i33 + nco; i35 = i34 + nco; i36 = i35 + nco; i37 = i36 + nco; i38 = i37 + nco; i39 = i38 + 400; i310 = 1; i311 = 401; /* f4xact */ k = *nrow + *ncol + 1; i41 = 1; i42 = i41 + k; i43 = i42 + k; i44 = i43 + k; i45 = i44 + k; i46 = i45 + k; i47 = i46 + k * nco; i48 = 1; /* Compute row marginals and total */ ntot = 0; for (i = 1; i <= *nrow; ++i) { iro[i] = 0; for (j = 1; j <= *ncol; ++j) { if (table[i + j * *ldtabl] < -1e-4) prterr(2, "All elements of TABLE must be positive."); iro[i] += (int) table[i + j * *ldtabl]; } ntot += iro[i]; } if (ntot == 0) { prterr(3, "All elements of TABLE are zero.\n" "PRT and PRE are set to missing values."); *pre = *prt = amiss; return; } /* Column marginals */ for (i = 1; i <= *ncol; ++i) { ico[i] = 0; for (j = 1; j <= *nrow; ++j) ico[i] += (int) table[j + i * *ldtabl]; } /* sort marginals */ isort(nrow, &iro[1]); isort(ncol, &ico[1]); /*Determine row and column marginals. 
Define max(nrow,ncol) =: nco >= nro := min(nrow,ncol) nco is defined above Swap marginals if necessary toico[1:nco] & iro[1:nro] */ if (nr_gt_nc) { nro = *ncol; /* Swap marginals */ for (i = 1; i <= nco; ++i) { itmp = iro[i]; if (i <= nro) iro[i] = ico[i]; ico[i] = itmp; } } else nro = *nrow; /* Get multiplers for stack */ kyy[1] = 1; for (i = 2; i <= nro; ++i) { /* Hash table multipliers */ if (iro[i - 1] + 1 <= imax / kyy[i - 1]) { kyy[i] = kyy[i - 1] * (iro[i - 1] + 1); j /= kyy[i - 1]; } else goto L_ERR_5; } /* Maximum product */ if (iro[nro - 1] + 1 <= imax / kyy[nro - 1]) { kmax = (iro[nro] + 1) * kyy[nro - 1]; } else { L_ERR_5: prterr(5, "The hash table key cannot be computed because " "the largest key\n" "is larger than the largest representable int.\n" "The algorithm cannot proceed.\n" "Reduce the workspace size, or use `exact = FALSE'."); return; } /* Compute log factorials */ fact[0] = 0.; fact[1] = 0.; if(ntot >= 2) fact[2] = log(2.); /* MM: old code assuming log() to be SLOW */ for (i = 3; i <= ntot; i += 2) { fact[i] = fact[i - 1] + log((double) i); j = i + 1; if (j <= ntot) fact[j] = fact[i] + fact[2] + fact[j / 2] - fact[j / 2 - 1]; } /* Compute obs := observed path length */ obs = tol; ntot = 0; for (j = 1; j <= nco; ++j) { dd = 0.; for (i = 1; i <= nro; ++i) { if (nr_gt_nc) { dd += fact[(int) table[j + i * *ldtabl]]; ntot += (int) table[j + i * *ldtabl]; } else { dd += fact[(int) table[i + j * *ldtabl]]; ntot += (int) table[i + j * *ldtabl]; } } obs += fact[ico[j]] - dd; } /* Denominator of observed table: DRO */ dro = f9xact(&nro, &ntot, &iro[1], fact); *prt = exp(obs - dro); /* Initialize pointers */ k = nco; last = *ldkey + 1; jkey = *ldkey + 1; jstp = *ldstp + 1; jstp2 = *ldstp * 3 + 1; jstp3 = (*ldstp << 2) + 1; jstp4 = *ldstp * 5 + 1; ikkey = 0; ikstp = 0; ikstp2 = *ldstp << 1; ipo = 1; ipoin[1] = 1; stp[1] = 0.; ifrq[1] = 1; ifrq[ikstp2 + 1] = -1; Outer_Loop: kb = nco - k + 1; ks = 0; n = ico[kb]; kd = nro + 1; kmax = nro; /* IDIF is the difference in going to the daughter */ for (i = 1; i <= nro; ++i) idif[i] = 0; /* Generate the first daughter */ do { --kd; ntot = min(n, iro[kd]); idif[kd] = ntot; if (idif[kmax] == 0) --kmax; n -= ntot; } while (n > 0 && kd != 1); if (n != 0) { goto L310; } k1 = k - 1; n = ico[kb]; ntot = 0; for (i = kb + 1; i <= nco; ++i) ntot += ico[i]; L150: /* Arc to daughter length=ICO(KB) */ for (i = 1; i <= nro; ++i) irn[i] = iro[i] - idif[i]; /* Sort irn */ if (k1 > 1) { if (nro == 2) { if (irn[1] > irn[2]) { ii = irn[1]; irn[1] = irn[2]; irn[2] = ii; } } else if (nro == 3) { ii = irn[1]; if (ii > irn[3]) { if (ii > irn[2]) { if (irn[2] > irn[3]) { irn[1] = irn[3]; irn[3] = ii; } else { irn[1] = irn[2]; irn[2] = irn[3]; irn[3] = ii; } } else { irn[1] = irn[3]; irn[3] = irn[2]; irn[2] = ii; } } else if (ii > irn[2]) { irn[1] = irn[2]; irn[2] = ii; } else if (irn[2] > irn[3]) { ii = irn[2]; irn[2] = irn[3]; irn[3] = ii; } } else { for (j = 2; j <= nro; ++j) { i = j - 1; ii = irn[j]; while (ii < irn[i]) { irn[i + 1] = irn[i]; --i; if (i == 0) break; } irn[i + 1] = ii; } } /* Adjust start for zero */ for (i = 1; i <= nro; ++i) { if (irn[i] != 0) break; } nrb = i; nro2 = nro - i + 1; } else { nrb = 1; nro2 = nro; } /* Some table values */ ddf = f9xact(&nro, &n, &idif[1], fact); drn = f9xact(&nro2, &ntot, &irn[nrb], fact) - dro + ddf; /* Get hash value */ if (k1 > 1) { kval = irn[1] + irn[2] * kyy[2]; for (i = 3; i <= nro; ++i) { kval += irn[i] * kyy[i]; } /* Get hash table entry */ i = kval % (*ldkey << 1) + 1; /* Search for unused location */ 
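//------------------------------------------------------------------
// [Editor's sketch -- not part of the PLINK source]  The quantity
// PRT computed above is the conditional (multiple hypergeometric)
// probability of the observed table given its margins,
// prod(r_i!) prod(c_j!) / ( N! prod(n_ij!) ): obs accumulates
// sum_j ( log c_j! - sum_i log n_ij! ) from the log-factorial table
// and dro = log N! - sum_i log r_i!, so PRT = exp(obs - dro), up to
// the tiny tolerance seeded into obs.  The same probability computed
// directly with lgamma:
//------------------------------------------------------------------
#include <cmath>
#include <vector>

static double tableProbability(const std::vector< std::vector<int> > & n)
{
  double logp = 0;
  int N = 0;
  std::vector<int> colsum(n[0].size(), 0);
  for (size_t i = 0; i < n.size(); i++)
    {
      int rowsum = 0;
      for (size_t j = 0; j < n[i].size(); j++)
        {
          rowsum    += n[i][j];
          colsum[j] += n[i][j];
          logp      -= std::lgamma(n[i][j] + 1.0);   // - log n_ij!
        }
      N    += rowsum;
      logp += std::lgamma(rowsum + 1.0);             // + log r_i!
    }
  for (size_t j = 0; j < colsum.size(); j++)
    logp += std::lgamma(colsum[j] + 1.0);            // + log c_j!
  logp -= std::lgamma(N + 1.0);                      // - log N!
  return std::exp(logp);
}
//------------------------------------------------------------------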
for (itp = i; itp <= *ldkey << 1; ++itp) { ii = key2[itp]; if (ii == kval) { goto L240; } else if (ii < 0) { key2[itp] = kval; dlp[itp] = 1.; dsp[itp] = 1.; goto L240; } } for (itp = 1; itp <= i - 1; ++itp) { ii = key2[itp]; if (ii == kval) { goto L240; } else if (ii < 0) { key2[itp] = kval; dlp[itp] = 1.; goto L240; } } /* KH prterr(6, "LDKEY is too small.\n" "It is not possible to give the value of LDKEY required,\n" "but you could try doubling LDKEY (and possibly LDSTP)."); */ prterr(6, "LDKEY is too small for this problem.\n" "Try increasing the size of the workspace."); } L240: ipsh = (1); /* Recover pastp */ ipn = ipoin[ipo + ikkey]; pastp = stp[ipn + ikstp]; ifreq = ifrq[ipn + ikstp]; /* Compute shortest and longest path */ if (k1 > 1) { obs2 = obs - fact[ico[kb + 1]] - fact[ico[kb + 2]] - ddf; for (i = 3; i <= k1; ++i) { obs2 -= fact[ico[kb + i]]; } if (dlp[itp] > 0.) { dspt = obs - obs2 - ddf; /* Compute longest path */ dlp[itp] = 0.; f3xact(&nro2, &irn[nrb], &k1, &ico[kb + 1], &dlp[itp], &ntot, fact, &iwk[i31], &iwk[i32], &iwk[i33], &iwk[i34], &iwk[i35], &iwk[i36], &iwk[i37], &iwk[i38], &iwk[i39], &rwk[i310], &rwk[i311], &tol); dlp[itp] = min(0., dlp[itp]); /* Compute shortest path */ dsp[itp] = dspt; f4xact(&nro2, &irn[nrb], &k1, &ico[kb + 1], &dsp[itp], fact, &iwk[i47], &iwk[i41], &iwk[i42], &iwk[i43], &iwk[i44], &iwk[i45], &iwk[i46], &rwk[i48], &tol); dsp[itp] = min(0., dsp[itp] - dspt); /* Use chi-squared approximation? */ if ((irn[nrb] * ico[kb + 1]) > ntot * emn) { ncell = 0; for (i = 0; i < nro2; ++i) for (j = 1; j <= k1; ++j) if (irn[nrb + i] * ico[kb + j] >= ntot * *expect) ncell++; if (ncell * 100 >= k1 * nro2 * *percnt) { tmp = 0.; for (i = 0; i < nro2; ++i) tmp += (fact[irn[nrb + i]] - fact[irn[nrb + i] - 1]); tmp *= k1 - 1; for (j = 1; j <= k1; ++j) tmp += (nro2 - 1) * (fact[ico[kb + j]] - fact[ico[kb + j] - 1]); df = (double) ((nro2 - 1) * (k1 - 1)); tmp += df * 1.83787706640934548356065947281; tmp -= (nro2 * k1 - 1) * (fact[ntot] - fact[ntot - 1]); tm[itp] = (obs - dro) * -2. - tmp; } else { /* tm(itp) set to a flag value */ tm[itp] = -9876.; } } else { tm[itp] = -9876.; } } obs3 = obs2 - dlp[itp]; obs2 -= dsp[itp]; if (tm[itp] == -9876.) { chisq = (0); } else { chisq = (1); tmp = tm[itp]; } } else { obs2 = obs - drn - dro; obs3 = obs2; } L300: /* Process node with new PASTP */ if (pastp <= obs3) { /* Update pre */ *pre += (double) ifreq * exp(pastp + drn); } else if (pastp < obs2) { if (chisq) { df = (double) ((nro2 - 1) * (k1 - 1)); #ifdef USING_R pv = pgamma(fmax2(0., tmp + (pastp + drn) * 2.) / 2., df / 2., /*scale = */ 1., /*lower_tail = */FALSE, /*log_p = */ FALSE); #else d1 = max(0., tmp + (pastp + drn) * 2.) / 2.; d2 = df / 2.; pv = 1. 
- gammds(&d1, &d2, &ifault); #endif *pre += (double) ifreq * exp(pastp + drn) * pv; } else { /* Put daughter on queue */ d1 = pastp + ddf; f5xact(&d1, &tol, &kval, &key[jkey], ldkey, &ipoin[jkey], &stp[jstp], ldstp, &ifrq[jstp], &ifrq[jstp2], &ifrq[jstp3], &ifrq[jstp4], &ifreq, &itop, &ipsh); ipsh = (0); } } /* Get next PASTP on chain */ ipn = ifrq[ipn + ikstp2]; if (ipn > 0) { pastp = stp[ipn + ikstp]; ifreq = ifrq[ipn + ikstp]; goto L300; } /* Generate a new daughter node */ f7xact(&kmax, &iro[1], &idif[1], &kd, &ks, &iflag); if (iflag != 1) { goto L150; } L310: /* Go get a new mother from stage K */ do { iflag = 1; f6xact(&nro, &iro[1], &iflag, &kyy[1], &key[ikkey + 1], ldkey, &last, &ipo); /* Update pointers */ if (iflag != 3) goto Outer_Loop; /* else iflag == 3 : no additional nodes to process */ --k; itop = 0; ikkey = jkey - 1; ikstp = jstp - 1; ikstp2 = jstp2 - 1; jkey = *ldkey - jkey + 2; jstp = *ldstp - jstp + 2; jstp2 = (*ldstp << 1) + jstp; for (i = 1; i <= *ldkey << 1; ++i) key2[i] = -9999; } while (k >= 2); } /* ----------------------------------------------------------------------- Name: F3XACT Purpose: Computes the shortest path length for a given table. Usage: F3XACT (NROW, IROW, NCOL, ICOL, DLP, MM, FACT, ICO, IRO, IT, LB, NR, NT, NU, ITC, IST, STV, ALEN, TOL) Arguments: NROW - The number of rows in the table.(Input) IROW - Vector of length NROW containing the row sums for the table.(Input) NCOL - The number of columns in the table.(Input) ICOL - Vector of length K containing the column sums for the table.(Input) DLP - The longest path for the table.(Output) MM - The total count in the table.(Output) FACT - Vector containing the logarithms of factorials.(Input) ICO - Work vector of length MAX(NROW,NCOL). IRO - Work vector of length MAX(NROW,NCOL). IT - Work vector of length MAX(NROW,NCOL). LB - Work vector of length MAX(NROW,NCOL). NR - Work vector of length MAX(NROW,NCOL). NT - Work vector of length MAX(NROW,NCOL). NU - Work vector of length MAX(NROW,NCOL). ITC - Work vector of length 400. IST - Work vector of length 400. STV - Work vector of length 400. ALEN - Work vector of length MAX(NROW,NCOL). 
TOL - Tolerance.(Input) ----------------------------------------------------------------------- */ void f3xact(int *nrow, int *irow, int *ncol, int *icol, double *dlp, int *mm, double *fact, int *ico, int *iro, int *it, int *lb, int *nr, int *nt, int *nu, int *itc, int *ist, double *stv, double *alen, const double *tol) { /* Initialized data */ static int ldst = 200; static int nst = 0; static int nitc = 0; /* Local variables */ static int xmin; static int i, k; static double v; static int n11, n12, ii, nn, ks, ic1, ic2, nc1, nn1; static int nr1, nco; static double val; static int nct, ipn, irl, key, lev, itp, nro; static double vmn; static int nrt, kyy, nc1s; /* Parameter adjustments */ --stv; --ist; --itc; --nu; --nt; --nr; --lb; --it; --iro; --ico; --icol; --irow; /* Function Body */ for (i = 0; i <= *ncol; ++i) { alen[i] = 0.; } for (i = 1; i <= 400; ++i) { ist[i] = -1; } /* nrow is 1 */ if (*nrow <= 1) { if (*nrow > 0) { *dlp -= fact[icol[1]]; for (i = 2; i <= *ncol; ++i) { *dlp -= fact[icol[i]]; } } return; } /* ncol is 1 */ if (*ncol <= 1) { if (*ncol > 0) { *dlp = *dlp - fact[irow[1]] - fact[irow[2]]; for (i = 3; i <= *nrow; ++i) { *dlp -= fact[irow[i]]; } } return; } /* 2 by 2 table */ if (*nrow * *ncol == 4) { n11 = (irow[1] + 1) * (icol[1] + 1) / (*mm + 2); n12 = irow[1] - n11; *dlp = *dlp - fact[n11] - fact[n12] - fact[icol[1] - n11] - fact[icol[2] - n12]; return; } /* Test for optimal table */ val = 0.; xmin = (0); if (irow[*nrow] <= irow[1] + *ncol) { f10act(nrow, &irow[1], ncol, &icol[1], &val, &xmin, fact, &lb[1], &nu[1], &nr[1]); } if (! xmin) { if (icol[*ncol] <= icol[1] + *nrow) { f10act(ncol, &icol[1], nrow, &irow[1], &val, &xmin, fact, &lb[1], &nu[1], &nr[1]); } } if (xmin) { *dlp -= val; return; } /* Setup for dynamic programming */ nn = *mm; /* Minimize ncol */ if (*nrow >= *ncol) { nro = *nrow; nco = *ncol; for (i = 1; i <= *nrow; ++i) { iro[i] = irow[i]; } ico[1] = icol[1]; nt[1] = nn - ico[1]; for (i = 2; i <= *ncol; ++i) { ico[i] = icol[i]; nt[i] = nt[i - 1] - ico[i]; } } else { nro = *ncol; nco = *nrow; ico[1] = irow[1]; nt[1] = nn - ico[1]; for (i = 2; i <= *nrow; ++i) { ico[i] = irow[i]; nt[i] = nt[i - 1] - ico[i]; } for (i = 1; i <= *ncol; ++i) iro[i] = icol[i]; } /* Initialize pointers */ vmn = 1e10; nc1s = nco - 1; irl = 1; ks = 0; k = ldst; kyy = ico[nco] + 1; LnewNode: /* Setup to generate new node */ lev = 1; nr1 = nro - 1; nrt = iro[irl]; nct = ico[1]; lb[1] = (int) ((double) ((nrt + 1) * (nct + 1)) / (double) (nn + nr1 * nc1s + 1) - *tol) - 1; nu[1] = (int) ((double) ((nrt + nc1s) * (nct + nr1)) / (double) (nn + nr1 + nc1s)) - lb[1] + 1; nr[1] = nrt - lb[1]; LoopNode: /* Generate a node */ --nu[lev]; if (nu[lev] == 0) { if (lev == 1) goto L200; --lev; goto LoopNode; } ++lb[lev]; --nr[lev]; L120: alen[lev] = alen[lev - 1] + fact[lb[lev]]; if (lev < nc1s) { nn1 = nt[lev]; nrt = nr[lev]; ++lev; nc1 = nco - lev; nct = ico[lev]; lb[lev] = (int) ((double) ((nrt + 1) * (nct + 1)) / (double) (nn1 + nr1 * nc1 + 1) - *tol); nu[lev] = (int) ((double) ((nrt + nc1) * (nct + nr1)) / (double) (nn1 + nr1 + nc1) - lb[lev] + 1); nr[lev] = nrt - lb[lev]; goto L120; } alen[nco] = alen[lev] + fact[nr[lev]]; lb[nco] = nr[lev]; v = val + alen[nco]; if (nro == 2) { /* Only 1 row left */ v = v + fact[ico[1] - lb[1]] + fact[ico[2] - lb[2]]; for (i = 3; i <= nco; ++i) { v += fact[ico[i] - lb[i]]; } if (v < vmn) { vmn = v; } } else if (nro == 3 && nco == 2) { /* 3 rows and 2 columns */ nn1 = nn - iro[irl] + 2; ic1 = ico[1] - lb[1]; ic2 = ico[2] - lb[2]; n11 = (iro[irl + 1] + 1) * 
(ic1 + 1) / nn1; n12 = iro[irl + 1] - n11; v = v + fact[n11] + fact[n12] + fact[ic1 - n11] + fact[ic2 - n12]; if (v < vmn) { vmn = v; } } else { /* Column marginals are new node */ for (i = 1; i <= nco; ++i) { it[i] = ico[i] - lb[i]; } /* Sort column marginals */ if (nco == 2) { if (it[1] > it[2]) { ii = it[1]; it[1] = it[2]; it[2] = ii; } } else if (nco == 3) { ii = it[1]; if (ii > it[3]) { if (ii > it[2]) { if (it[2] > it[3]) { it[1] = it[3]; it[3] = ii; } else { it[1] = it[2]; it[2] = it[3]; it[3] = ii; } } else { it[1] = it[3]; it[3] = it[2]; it[2] = ii; } } else if (ii > it[2]) { it[1] = it[2]; it[2] = ii; } else if (it[2] > it[3]) { ii = it[2]; it[2] = it[3]; it[3] = ii; } } else { isort(&nco, &it[1]); } /* Compute hash value */ key = it[1] * kyy + it[2]; for (i = 3; i <= nco; ++i) { key = it[i] + key * kyy; } if(key < 0) error("Bug in FEXACT: gave negative key \n"); /* Table index */ ipn = key % ldst + 1; /* Find empty position */ for (itp = ipn, ii = ks + ipn; itp <= ldst; ++itp, ++ii) { if (ist[ii] < 0) { goto L180; } else if (ist[ii] == key) { goto L190; } } for (itp = 1, ii = ks + 1; itp <= ipn - 1; ++itp, ++ii) { if (ist[ii] < 0) { goto L180; } else if (ist[ii] == key) { goto L190; } } error("Stack length exceeded in f3xact"); L180: /* Push onto stack */ ist[ii] = key; stv[ii] = v; ++nst; ii = nst + ks; itc[ii] = itp; goto LoopNode; L190: /* Marginals already on stack */ stv[ii] = min(v, stv[ii]); } goto LoopNode; L200: /* Pop item from stack */ if (nitc > 0) { /* Stack index */ itp = itc[nitc + k] + k; --nitc; val = stv[itp]; key = ist[itp]; ist[itp] = -1; /* Compute marginals */ for (i = nco; i >= 2; --i) { ico[i] = key % kyy; key /= kyy; } ico[1] = key; /* Set up nt array */ nt[1] = nn - ico[1]; for (i = 2; i <= nco; ++i) nt[i] = nt[i - 1] - ico[i]; /* Test for optimality (L90) */ xmin = (0); if (iro[nro] <= iro[irl] + nco) { f10act(&nro, &iro[irl], &nco, &ico[1], &val, &xmin, fact, &lb[1], &nu[1], &nr[1]); } if (!xmin && ico[nco] <= ico[1] + nro) f10act(&nco, &ico[1], &nro, &iro[irl], &val, &xmin, fact, &lb[1], &nu[1], &nr[1]); if (xmin) { if (vmn > val) vmn = val; goto L200; } else goto LnewNode; } else if (nro > 2 && nst > 0) { /* Go to next level */ nitc = nst; nst = 0; k = ks; ks = ldst - ks; nn -= iro[irl]; ++irl; --nro; goto L200; } *dlp -= vmn; } /* ----------------------------------------------------------------------- Name: F4XACT Purpose: Computes the longest path length for a given table. Usage: CALL F4XACT (NROW, IROW, NCOL, ICOL, DSP, FACT, ICSTK, NCSTK, LSTK, MSTK, NSTK, NRSTK, IRSTK, YSTK, TOL) Arguments: NROW - The number of rows in the table.(Input) IROW - Vector of length NROW containing the row sums for the table. (Input) NCOL - The number of columns in the table. (Input) ICOL - Vector of length K containing the column sums for the table. (Input) DSP - The shortest path for the table.(Output) FACT - Vector containing the logarithms of factorials. (Input) ICSTK - NCOL by NROW+NCOL+1 work array. NCSTK - Work vector of length NROW+NCOL+1. LSTK - Work vector of length NROW+NCOL+1. MSTK - Work vector of length NROW+NCOL+1. NSTK - Work vector of length NROW+NCOL+1. NRSTK - Work vector of length NROW+NCOL+1. IRSTK - NROW by MAX(NROW,NCOL) work array. YSTK - Work vector of length NROW+NCOL+1. TOL - Tolerance. 
(Input) ----------------------------------------------------------------------- */ void f4xact(int *nrow, int *irow, int *ncol, int *icol, double *dsp, double *fact, int *icstk, int *ncstk, int *lstk, int *mstk, int *nstk, int *nrstk, int *irstk, double *ystk, const double *tol) { /* System generated locals */ int ikh; /* Local variables */ int i, j, k, l, m, n, mn, ic1, ir1, ict, irt, istk, nco, nro; double y, amx; /* Parameter adjustments */ irstk -= *nrow + 1; --irow; icstk -= *ncol + 1; --icol; --ncstk; --lstk; --mstk; --nstk; --nrstk; --ystk; /* Function Body */ /* Take care of the easy cases first */ if (*nrow == 1) { for (i = 1; i <= *ncol; ++i) { *dsp -= fact[icol[i]]; } return; } if (*ncol == 1) { for (i = 1; i <= *nrow; ++i) { *dsp -= fact[irow[i]]; } return; } if (*nrow * *ncol == 4) { if (irow[2] <= icol[2]) { *dsp = *dsp - fact[irow[2]] - fact[icol[1]] - fact[icol[2] - irow[2]]; } else { *dsp = *dsp - fact[icol[2]] - fact[irow[1]] - fact[irow[2] - icol[2]]; } return; } /* initialization before loop */ for (i = 1; i <= *nrow; ++i) { irstk[i + *nrow] = irow[*nrow - i + 1]; } for (j = 1; j <= *ncol; ++j) { icstk[j + *ncol] = icol[*ncol - j + 1]; } nro = *nrow; nco = *ncol; nrstk[1] = nro; ncstk[1] = nco; ystk[1] = 0.; y = 0.; istk = 1; l = 1; amx = 0.; /* First LOOP */ do { ir1 = irstk[istk * *nrow + 1]; ic1 = icstk[istk * *ncol + 1]; if (ir1 > ic1) { if (nro >= nco) { m = nco - 1;n = 2; } else { m = nro;n = 1; } } else if (ir1 < ic1) { if (nro <= nco) { m = nro - 1;n = 1; } else { m = nco;n = 2; } } else { if (nro <= nco) { m = nro - 1;n = 1; } else { m = nco - 1;n = 2; } } L60: if (n == 1) { i = l; j = 1; } else { i = 1; j = l; } irt = irstk[i + istk * *nrow]; ict = icstk[j + istk * *ncol]; mn = irt; if (mn > ict) { mn = ict; } y += fact[mn]; if (irt == ict) { --nro; --nco; f11act(&irstk[istk * *nrow + 1], &i, &nro, &irstk[(istk + 1) * *nrow + 1]); f11act(&icstk[istk * *ncol + 1], &j, &nco, &icstk[(istk + 1) * *ncol + 1]); } else if (irt > ict) { --nco; f11act(&icstk[istk * *ncol + 1], &j, &nco, &icstk[(istk + 1) * *ncol + 1]); ikh = irt - ict; f8xact(&irstk[istk * *nrow + 1], &ikh, &i, &nro, &irstk[(istk + 1) * *nrow + 1]); } else { --nro; f11act(&irstk[istk * *nrow + 1], &i, &nro, &irstk[(istk + 1) * *nrow + 1]); ikh = ict - irt; f8xact(&icstk[istk * *ncol + 1], &ikh, &j, &nco, &icstk[(istk + 1) * *ncol + 1]); } if (nro == 1) { for (k = 1; k <= nco; ++k) { y += fact[icstk[k + (istk + 1) * *ncol]]; } break; } if (nco == 1) { for (k = 1; k <= nro; ++k) { y += fact[irstk[k + (istk + 1) * *nrow]]; } break; } lstk[istk] = l; mstk[istk] = m; nstk[istk] = n; ++istk; nrstk[istk] = nro; ncstk[istk] = nco; ystk[istk] = y; l = 1; } while(1);/* end do */ /* L90:*/ if (y > amx) { amx = y; if (*dsp - amx <= *tol) { *dsp = 0.; return; } } L100: --istk; if (istk == 0) { *dsp -= amx; if (*dsp - amx <= *tol) { *dsp = 0.; } return; } l = lstk[istk] + 1; /* L110: */ for(;; ++l) { if (l > mstk[istk])goto L100; n = nstk[istk]; nro = nrstk[istk]; nco = ncstk[istk]; y = ystk[istk]; if (n == 1) { if (irstk[l + istk * *nrow] < irstk[l - 1 + istk * *nrow])goto L60; } else if (n == 2) { if (icstk[l + istk * *ncol] < icstk[l - 1 + istk * *ncol])goto L60; } } } /* ----------------------------------------------------------------------- Name: F5XACT Purpose: Put node on stack in network algorithm. Usage: CALL F5XACT (PASTP, TOL, KVAL, KEY, LDKEY, IPOIN, STP, LDSTP, IFRQ, NPOIN, NR, NL, IFREQ, ITOP, IPSH) Arguments: PASTP - The past path length.(Input) TOL - Tolerance for equivalence of past path lengths. 
(Input) KVAL - Key value. (Input) KEY - Vector of length LDKEY containing the key values.(in/out) LDKEY - Length of vector KEY. (Input) IPOIN - Vector of length LDKEY pointing to the linked list of past path lengths. (in/out) STP - Vector of length LSDTP containing the linked lists of past path lengths. (in/out) LDSTP - Length of vector STP. (Input) IFRQ - Vector of length LDSTP containing the past path frequencies. (in/out) NPOIN - Vector of length LDSTP containing the pointers to the next past path length. (in/out) NR - Vector of length LDSTP containing the right object pointers in the tree of past path lengths. (in/out) NL - Vector of length LDSTP containing the left object pointers in the tree of past path lengths. (in/out) IFREQ - Frequency of the current path length. (Input) ITOP - Pointer to the top of STP. (Input) IPSH - Option parameter. (Input) If IPSH is true, the past path length is found in the table KEY. Otherwise the location of the past path length is assumed known and to have been found in a previous call. ==>>>>> USING "static" variables ----------------------------------------------------------------------- */ void f5xact(double *pastp, const double *tol, int *kval, int *key, int *ldkey, int *ipoin, double *stp, int *ldstp, int *ifrq, int *npoin, int *nr, int *nl, int *ifreq, int *itop, int *ipsh) { /* Local variables */ static int itmp, ird, ipn, itp; double test1, test2; /* Parameter adjustments */ --nl; --nr; --npoin; --ifrq; --stp; --ipoin; --key; /* Function Body */ if (*ipsh) { /* Convert KVAL to int in range 1, ..., LDKEY. */ ird = *kval % *ldkey + 1; /* Search for an unused location */ for (itp = ird; itp <= *ldkey; ++itp) { if (key[itp] == *kval) { goto L40; } if (key[itp] < 0) { goto L30; } } for (itp = 1; itp <= ird - 1; ++itp) { if (key[itp] == *kval) { goto L40; } if (key[itp] < 0) { goto L30; } } /* Return if KEY array is full */ /* KH prterr(6, "LDKEY is too small for this problem.\n" "It is not possible to estimate the value of LDKEY " "required,\n" "but twice the current value may be sufficient."); */ prterr(6, "LDKEY is too small for this problem.\n" "Try increasing the size of the workspace."); /* Update KEY */ L30: key[itp] = *kval; ++(*itop); ipoin[itp] = *itop; /* Return if STP array full */ if (*itop > *ldstp) { /* KH prterr(7, "LDSTP is too small for this problem.\n" "It is not possible to estimate the value of LDSTP " "required,\n" "but twice the current value may be sufficient."); */ prterr(7, "LDSTP is too small for this problem.\n" "Try increasing the size of the workspace."); } /* Update STP, etc. 
*/ npoin[*itop] = -1; nr[*itop] = -1; nl[*itop] = -1; stp[*itop] = *pastp; ifrq[*itop] = *ifreq; return; } /* Find location, if any, of pastp */ L40: ipn = ipoin[itp]; test1 = *pastp - *tol; test2 = *pastp + *tol; L50: if (stp[ipn] < test1) { ipn = nl[ipn]; if (ipn > 0) { goto L50; } } else if (stp[ipn] > test2) { ipn = nr[ipn]; if (ipn > 0) { goto L50; } } else { ifrq[ipn] += *ifreq; return; } /* Return if STP array full */ ++(*itop); if (*itop > *ldstp) { /* prterr(7, "LDSTP is too small for this problem.\n" "It is not possible to estimate the value of LDSTP " "required,\n" "but twice the current value may be sufficient."); */ prterr(7, "LDSTP is too small for this problem.\n" "Try increasing the size of the workspace."); return; } /* Find location to add value */ ipn = ipoin[itp]; itmp = ipn; L60: if (stp[ipn] < test1) { itmp = ipn; ipn = nl[ipn]; if (ipn > 0) { goto L60; } else { nl[itmp] = *itop; } } else if (stp[ipn] > test2) { itmp = ipn; ipn = nr[ipn]; if (ipn > 0) { goto L60; } else { nr[itmp] = *itop; } } /* Update STP, etc. */ npoin[*itop] = npoin[itmp]; npoin[itmp] = *itop; stp[*itop] = *pastp; ifrq[*itop] = *ifreq; nl[*itop] = -1; nr[*itop] = -1; } /* ----------------------------------------------------------------------- Name: F6XACT Purpose: Pop a node off the stack. Usage: CALL F6XACT (NROW, IROW, IFLAG, KYY, KEY, LDKEY, LAST, IPN) Arguments: NROW - The number of rows in the table.(Input) IROW - Vector of length nrow containing the row sums on output.(Output) IFLAG - Set to 3 if there are no additional nodes to process. (Output) KYY - Constant mutlipliers used in forming the hash table key.(Input) KEY - Vector of length LDKEY containing the hash table keys.(In/out) LDKEY - Length of vector KEY.(Input) LAST - Index of the last key popped off the stack.(In/out) IPN - Pointer to the linked list of past path lengths.(Output) ----------------------------------------------------------------------- */ void f6xact(int *nrow, int *irow, int *iflag, int *kyy, int *key, int *ldkey, int *last, int *ipn) { int kval, j; /* Parameter adjustments */ --key; --kyy; --irow; /* Function Body */ L10: ++(*last); if (*last <= *ldkey) { if (key[*last] < 0) { goto L10; } /* Get KVAL from the stack */ kval = key[*last]; key[*last] = -9999; for (j = *nrow; j >= 2; --j) { irow[j] = kval / kyy[j]; kval -= irow[j] * kyy[j]; } irow[1] = kval; *ipn = *last; } else { *last = 0; *iflag = 3; } return; } /* ----------------------------------------------------------------------- Name: F7XACT Purpose: Generate the new nodes for given marinal totals. Usage: CALL F7XACT (NROW, IMAX, IDIF, K, KS, IFLAG) Arguments: NROW - The number of rows in the table.(Input) IMAX - The row marginal totals.(Input) IDIF - The column counts for the new column.(in/out) K - Indicator for the row to decrement.(in/out) KS - Indicator for the row to increment.(in/out) IFLAG - Status indicator.(Output) If IFLAG is zero, a new table was generated. For IFLAG = 1, no additional tables could be generated. 
----------------------------------------------------------------------- */ void f7xact(int *nrow, int *imax, int *idif, int *k, int *ks, int *iflag) { int i, m, k1, mm; /* Parameter adjustments */ --idif; --imax; /* Function Body */ *iflag = 0; /* Find node which can be incremented, ks */ if (*ks == 0) do { ++(*ks); } while (idif[*ks] == imax[*ks]); /* Find node to decrement (>ks) */ if (idif[*k] > 0 && *k > *ks) { --idif[*k]; do { --(*k); } while(imax[*k] == 0); m = *k; /* Find node to increment (>=ks) */ while (idif[m] >= imax[m]) { --m; } ++idif[m]; /* Change ks */ if (m == *ks) { if (idif[m] == imax[m]) { *ks = *k; } } } else { Loop: /* Check for finish */ for (k1 = *k + 1; k1 <= *nrow; ++k1) { if (idif[k1] > 0) { goto L70; } } *iflag = 1; return; L70: /* Reallocate counts */ mm = 1; for (i = 1; i <= *k; ++i) { mm += idif[i]; idif[i] = 0; } *k = k1; do { --(*k); m = min(mm, imax[*k]); idif[*k] = m; mm -= m; } while (mm > 0 && *k != 1); /* Check that all counts reallocated */ if (mm > 0) { if (k1 != *nrow) { *k = k1; goto Loop; } *iflag = 1; return; } /* Get ks */ --idif[k1]; *ks = 0; do { ++(*ks); if (*ks > *k) { return; } } while (idif[*ks] >= imax[*ks]); } } /* ----------------------------------------------------------------------- Name: F8XACT Purpose: Routine for reducing a vector when there is a zero element. Usage: CALL F8XACT (IROW, IS, I1, IZERO, NEW) Arguments: IROW - Vector containing the row counts.(Input) IS - Indicator.(Input) I1 - Indicator.(Input) IZERO - Position of the zero.(Input) NEW - Vector of new row counts.(Output) ----------------------------------------------------------------------- */ void f8xact(int *irow, int *is, int *i1, int *izero, int *knew) { int i; /* Parameter adjustments */ --knew; --irow; /* Function Body */ for (i = 1; i < *i1; ++i) knew[i] = irow[i]; for (i = *i1; i <= *izero - 1; ++i) { if (*is >= irow[i + 1]) break; knew[i] = irow[i + 1]; } knew[i] = *is; for(;;) { ++i; if (i > *izero) return; knew[i] = irow[i]; } } /* ----------------------------------------------------------------------- Name: F9XACT Purpose: Computes the log of a multinomial coefficient. Usage: F9XACT(N, MM, IR, FACT) Arguments: N - Length of IR.(Input) MM - Number for factorial in numerator.(Input) IR - Vector of length N containing the numbers for the denominator of the factorial.(Input) FACT - Table of log factorials.(Input) F9XACT - The log of the multinomial coefficient.(Output) ----------------------------------------------------------------------- */ double f9xact(int *n, int *mm, int *ir, double *fact) { double d; int k; d = fact[*mm]; for (k = 0; k < *n; k++) d -= fact[ir[k]]; return d; } /* ----------------------------------------------------------------------- Name: F10ACT Purpose: Computes the shortest path length for special tables. Usage: F10ACT (NROW, IROW, NCOL, ICOL, VAL, XMIN, FACT, ND, NE, M) Arguments: NROW - The number of rows in the table.(Input) IROW - Vector of length NROW containing the row totals.(Input) NCOL - The number of columns in the table. (Input) ICOL - Vector of length NCOL containing the column totals.(Input) VAL - The shortest path. (Output) XMIN - Set to true if shortest path obtained. (Output) FACT - Vector containing the logarithms of factorials. 
(Input) ND - Workspace vector of length NROW.(Input) NE - Workspace vector of length NCOL.(Input) M - Workspace vector of length NCOL.(Input) Chapter: STAT/LIBRARY Categorical and Discrete Data Analysis ----------------------------------------------------------------------- */ void f10act(int *nrow, int *irow, int *ncol, int *icol, double *val, int *xmin, double *fact, int *nd, int *ne, int *m) { /* Local variables */ int i, is, ix, nrw1; /* Parameter adjustments */ --m; --ne; --nd; --icol; --irow; /* Function Body */ for (i = 1; i <= *nrow - 1; ++i) nd[i] = 0; is = icol[1] / *nrow; ix = icol[1] - *nrow * is; ne[1] = is; m[1] = ix; if (ix != 0) ++nd[ix]; for (i = 2; i <= *ncol; ++i) { ix = icol[i] / *nrow; ne[i] = ix; is += ix; ix = icol[i] - *nrow * ix; m[i] = ix; if (ix != 0) ++nd[ix]; } for (i = *nrow - 2; i >= 1; --i) nd[i] += nd[i + 1]; ix = 0; nrw1 = *nrow + 1; for (i = *nrow; i >= 2; --i) { ix = ix + is + nd[nrw1 - i] - irow[i]; if (ix < 0) return; } for (i = 1; i <= *ncol; ++i) { ix = ne[i]; is = m[i]; *val = *val + is * fact[ix + 1] + (*nrow - is) * fact[ix]; } *xmin = (1); return; } /* ----------------------------------------------------------------------- Name: F11ACT Purpose: Routine for revising row totals. Usage: CALL F11ACT (IROW, I1, I2, KNEW) Arguments: IROW - Vector containing the row totals.(Input) I1 - Indicator.(Input) I2 - Indicator. (Input) KNEW - Vector containing the row totals.(Output) ----------------------------------------------------------------------- */ void f11act(int *irow, int *i1, int *i2, int *knew) { int i; /* Parameter adjustments */ --knew; --irow; for (i = 1; i <= (*i1 - 1); ++i)knew[i] = irow[i]; for (i = *i1; i <= *i2; ++i)knew[i] = irow[i + 1]; return; } void prterr(int icode, char *mes) { error("FEXACT error "+int2str(icode)); } /* ----------------------------------------------------------------------- Name: iwork Purpose: Routine for allocating workspace. Usage: iwork (iwkmax, iwkpt, number, itype) Arguments: iwkmax - Maximum (int) amount of workspace.(Input) iwkpt - Amount of (int) workspace currently allocated.(in/out) number - Number of elements of workspace desired.(Input) itype - Workspace type.(Input) ITYPE TYPE 2 integer 3 float 4 double iwork(): Index in rwrk, dwrk, or iwrk of the beginning of the first free element in the workspace array.(Output) ----------------------------------------------------------------------- */ int iwork(int iwkmax, int *iwkpt, int number, int itype) { int i; i = *iwkpt; if (itype == 2 || itype == 3) *iwkpt += number; else { /* double */ if (i % 2 != 0) ++i; *iwkpt += (number << 1); i /= 2; } if (*iwkpt > iwkmax) prterr(40, "Out of workspace."); return i; } #ifndef USING_R void isort(int *n, int *ix) { /* ----------------------------------------------------------------------- Name: ISORT Purpose: Shell sort for an int vector. 
Usage: CALL ISORT (N, IX) Arguments: N - Length of vector IX.(Input) IX - Vector to be sorted.(in/out) ----------------------------------------------------------------------- */ static int ikey, i, j, m, il[10], kl, it, iu[10], ku; /* Parameter adjustments */ --ix; /* Function Body */ m = 1; i = 1; j = *n; L10: if (i >= j) { goto L40; } kl = i; ku = j; ikey = i; ++j; /* Find element in first half */ L20: ++i; if (i < j) { if (ix[ikey] > ix[i]) { goto L20; } } /* Find element in second half */ L30: --j; if (ix[j] > ix[ikey]) { goto L30; } /* Interchange */ if (i < j) { it = ix[i]; ix[i] = ix[j]; ix[j] = it; goto L20; } it = ix[ikey]; ix[ikey] = ix[j]; ix[j] = it; /* Save upper and lower subscripts of the array yet to be sorted */ if (m < 11) { if (j - kl < ku - j) { il[m - 1] = j + 1; iu[m - 1] = ku; i = kl; --j; } else { il[m - 1] = kl; iu[m - 1] = j - 1; i = j + 1; j = ku; } ++m; goto L10; } else { prterr(20, "This should never occur."); } /* Use another segment */ L40: --m; if (m == 0) { return; } i = il[m - 1]; j = iu[m - 1]; goto L10; } double gammds(double *y, double *p, int *ifault) { /* ----------------------------------------------------------------------- Name: GAMMDS Purpose: Cumulative distribution for the gamma distribution. Usage: PGAMMA (Q, ALPHA,IFAULT) Arguments: Q - Value at which the distribution is desired. (Input) ALPHA - Parameter in the gamma distribution. (Input) IFAULT - Error indicator.(Output) IFAULT DEFINITION 0 No error 1 An argument is misspecified. 2 A numerical error has occurred. PGAMMA - The cdf for the gamma distribution with parameter alpha evaluated at Q. (Output) ----------------------------------------------------------------------- Algorithm AS 147 APPL. Statist. (1980) VOL. 29, P. 113 Computes the incomplete gamma integral for positive parameters Y, P using an infinite series. */ static double a, c, f, g; static int ifail; /* Checks for the admissibility of arguments and value of F */ *ifault = 1; g = 0.; if (*y <= 0. || *p <= 0.) { return g; } *ifault = 2; /* ALOGAM is natural log of gamma function no need to test ifail as an error is impossible */ a = *p + 1.; f = exp(*p * log(*y) - alogam(&a, &ifail) - *y); if (f == 0.) { return g; } *ifault = 0; /* Series begins */ c = 1.; g = 1.; a = *p; L10: a += 1.; c = c * *y / a; g += c; if (c / g > 1e-6) { goto L10; } g *= f; return g; } /* ----------------------------------------------------------------------- Name: ALOGAM Purpose: Value of the log-gamma function. Usage: ALOGAM (X, IFAULT) Arguments: X - Value at which the log-gamma function is to be evaluated. (Input) IFAULT - Error indicator. (Output) IFAULT DEFINITION 0 No error 1 X < 0 ALGAMA - The value of the log-gamma function at XX. (Output) ----------------------------------------------------------------------- Algorithm ACM 291, Comm. ACM. (1966) Vol. 9, P. 684 Evaluates natural logarithm of gamma(x) for X greater than zero. */ double alogam(double *x, int *ifault) { /* Initialized data */ static double a1 = .918938533204673; static double a2 = 5.95238095238e-4; static double a3 = 7.93650793651e-4; static double a4 = .002777777777778; static double a5 = .083333333333333; /* Local variables */ static double f, y, z; *ifault = 1; if (*x < 0.) { return(0.); } *ifault = 0; y = *x; f = 0.; if (y >= 7.) { goto L30; } f = y; L10: y += 1.; if (y >= 7.) { goto L20; } f *= y; goto L10; L20: f = -log(f); L30: z = 1. 
/ (y * y); return(f + (y - .5) * log(y) - y + a1 + (((-a2 * z + a3) * z - a4) * z + a5) / y); } #endif /* not USING_R */ plink-1.07-src/options.cpp0000644000265600020320000007705011264127624014704 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include "options.h" using namespace std; // Temporary dummy function bool par::myfunction = false; Options par::opt; bool par::verbose = false; bool par::flag = false; bool par::dumpped = false; bool par::debug = false; bool par::dummy = false; int par::dummy_nind = 0; int par::dummy_nsnp = 0; bool par::web_check = true; bool par::tucc = false; bool par::do_not_load_snps = false; double const par::epsilon = 1e-12; long unsigned int par::random_seed = 0; int par::simul_ncases = 1000; int par::simul_ncontrols = 1000; string par::simul_label = ""; double par::simul_prevalence = 0.01; bool par::simul = false; string par::simul_file = ""; bool par::simul_tags = false; bool par::simul_haps = false; bool par::simul_qt = false; double par::simul_qt_var = 0.05; bool par::lookup = false; bool par::lookup_single_snp = false; bool par::lookup_to_file = false; bool par::lookup_gene = false; bool par::lookup_multiple_genes = false; string par::lookup_gene_name = "GENE1"; string par::lookup_snp = "rs1234"; int par::lookup_gene_kb_window = 20; int par::lookup_snp_kb_window = 100; bool par::lookup2 = false; string par::lookup2_cmd = ""; bool par::idhelp = false; string par::idhelp_output_delimit = " "; bool par::idhelp_dump_from_dict = false; string par::idhelp_dump_from_dict_cmd = ""; string par::idhelp_dictionary = ""; bool par::idhelp_auto_alias = false; bool par::idhelp_lookup = false; string par::idhelp_lookup_string = ""; bool par::idhelp_subset = false; string par::idhelp_subset_string =""; bool par::idhelp_replace = false; string par::idhelp_replace_string = ""; bool par::idhelp_match = false; vector par::idhelp_match_string; bool par::idhelp_no_dict = false; bool par::idhelp_list_aliases = false; bool par::idhelp_alias_update = true; bool par::run_R_script = false; bool par::run_R_write_script = false; string par::R_script = "script.R"; bool par::run_R_chisq = false; bool par::run_R_z = false; int par::run_R_nsnps = 100; int par::R_port = 6311; bool par::recode = false; bool par::recode_transpose = false; bool par::recode_long = false; bool par::recode_long_ref = false; bool par::recode_mutlist = false; bool par::recode_12 = false; bool par::recode_AD = false; bool par::recode_AD_fixed = false; bool par::recode_AD_Aonly = false; bool par::recode_allele_coding = false; string par::recode_allele_coding_file = "file.lst"; string par::recode_delimit = " "; string par::recode_indelimit = " "; bool par::recode_HV = false; bool par::recode_whap = false; bool par::recode_fastphase = false; bool par::recode_structure = false; bool par::recode_bimbam = false; bool par::recode_1234 = false; bool par::recode_ACGT = false; bool par::set_reference_allele = false; string par::set_reference_allele_file = "dummy.file"; bool par::lfile_allele_count = false; bool par::preserve_all_genotypes = false; bool par::preserve_mendel_errors = false; bool par::zero_cluster = false; string par::zero_cluster_filename = "plink.zero"; bool par::oblig_missing = false; string 
par::oblig_missing_filename = "plink.zero"; string par::oblig_clusters_filename = "plink.clst"; bool par::loop_over = false; string par::loop_over_label = ""; int par::loop_counter = 0; string par::loop_over_filename = "plink.clst"; bool par::list_by_allele = false; bool par::list_twolocus = false; string par::twolocus_snp1 = ""; string par::twolocus_snp2 = ""; bool par::indiv_report = false; string par::indiv_report_fid = "fid1"; string par::indiv_report_iid = "iid1"; bool par::plist = false; string par::plist_fid1 = ""; string par::plist_iid1 = ""; string par::plist_fid2 = ""; string par::plist_iid2 = ""; bool par::merge_data = false; bool par::merge_force_strand = false; int par::merge_mode = 1; bool par::merge_binary = false; bool par::merge_list = false; string par::merge_list_filename = "merge.list"; string par::merge_pedfile = "merge.ped"; string par::merge_mapfile = "merge.map"; string par::merge_bedfile = "merge.bed"; string par::merge_bimfile = "merge.bim"; string par::merge_famfile = "merge.fam"; bool par::write_snplist = false; bool par::update_map = false; bool par::update_cm = false; bool par::update_chr = false; bool par::update_name = false; bool par::update_ids = false; string par::update_ids_file = ""; bool par::update_sex = false; string par::update_sex_file = ""; bool par::update_parents = false; string par::update_parents_file = ""; bool par::update_pheno = false; string par::update_pheno_file = ""; string par::update_mapfile = "new.map"; string par::range_delimiter = "-"; bool par::update_alleles = false; string par::update_allele_file = "dummy"; bool par::compound_genotype_code = false; string par::tpedfile = "plink.tped"; string par::tfamfile = "plink.tfam"; bool par::tfile_input = false; string par::lpedfile = "plink.lgen"; bool par::lfile_input = false; bool par::ref_file = false; string par::ref_file_name = ""; bool par::gvar = false; bool par::gvar_write = false; bool par::gvar_to_standard = false; bool par::load_gvar = false; bool par::gvar_include_all_variants = false; bool par::gvar_verbose_association = false; string par::gmapfile = "plink.map"; string par::gfamfile = "plink.fam"; string par::gvarfile = "plink.gvar"; bool par::gvar_full_report = false; bool par::flip_strand = false; string par::flip_file = "plink.flip"; bool par::flip_subset = false; string par::flip_subset_file = "plink.file"; bool par::compress_file = false; bool par::uncompress_file = false; string par::compress_filename = ""; bool par::read_ped = false; string par::pedfile = "plink.ped"; string par::mapfile = "plink.map"; string par::fileroot = "plink"; bool par::ped_from_stdin = false; bool par::map3 = false; bool par::liability = false; bool par::ped_skip_sex = false; bool par::ped_skip_parents = false; bool par::ped_skip_fid = false; bool par::ped_skip_pheno = false; bool par::SNP_major = true; bool par::out_SNP_major = true; string par::output_file_name = "plink"; bool par::silent = false; bool par::gplink = false; bool par::cli = false; bool par::fast_binary = false; string par::bitfilename = "plink.bed"; string par::famfile = "plink.fam"; string par::bitfilename_map = "plink.bim"; bool par::write_bitfile = false; bool par::read_bitfile = false; bool par::pheno_file = false; bool par::covar_file = false; bool par::clist = false; bool par::no_show_covar = false; bool par::dump_covar = false; bool par::dump_covar_with_phenotype = false; bool par::dump_covar_dummy_coding = false; bool par::filter_on_covar = false; int par::clist_number = 0; int par::plist_number = 0; bool 
par::snp_attrib_filter = false; string par::snp_attrib_value = ""; string par::snp_attrib_file = ""; bool par::ind_attrib_filter = false; string par::ind_attrib_value = ""; string par::ind_attrib_file = ""; bool par::multiple_phenotypes = false; string par::multiple_phenotype_file = ""; string par::make_pheno_filename = ""; string par::make_pheno_value = ""; bool par::make_pheno = false; bool par::make_pheno_present = false; bool par::dump_clst = false; bool par::clist_selection = false; bool par::clist_selection_name = false; bool par::clist_selection_number = false; string par::clist_selection_string = ""; bool par::plist_selection = false; bool par::plist_selection_name = false; bool par::plist_selection_number = false; string par::plist_selection_string = ""; int par::mult_pheno = 1; string par::name_pheno = ""; bool par::all_pheno = false; int par::mult_covar = 1; int par::mult_clst = 1; int par::mult_filter = 1; string par::filter_value = "1"; string par::number_list_string = ""; bool par::number_list_positive = true; string par::pheno_filename = "plink.phe"; string par::covar_filename = "plink.cov"; string par::clist_filename = "plink.cov"; string par::filter_filename = "plink.cov"; string par::missing_genotype = "0"; string par::missing_phenotype = "-9"; string par::out_missing_genotype = "0"; string par::out_missing_phenotype = "-9"; bool par::missing_phenotype_explicit = false; bool par::missing_genotype_explicit = false; bool par::ignore_missing_sex = false; bool par::cm_map = false; double par::grid = 0.005; // 0.5cM = 500kb grid double par::fringe = .01; // 1 cM fringe bool par::singlepoint = false; int par::inter_grid = 2; bool par::done_global_pihat = false; bool par::summ_nonfounders = false; bool par::make_founders = false; bool par::has_nonfounders = false; bool par::make_missing_parents = false; bool par::report_missing = false; bool par::test_missing = false; bool par::mishap_test = false; int par::mishap_window = 1; bool par::calcFst = false; bool par::score_risk = false; string par::score_risk_file = "plink.risk"; bool par::score_risk_ranges = false; string par::score_risk_ranges_file = "plink.ranges"; int par::score_risk_ranges_min = 0; bool par::score_impute_expected = true; bool par::score_risk_on_qrange = false; string par::score_qrange_file = ""; string par::score_qfile = ""; bool par::score_test = false; bool par::profile_sets = false; bool par::proxy_assoc = false; bool par::proxy_glm = false; bool par::proxy_all = false; bool par::proxy_full_report = false; bool par::proxy_impute = false; bool par::proxy_impute_replace = false; bool par::proxy_impute_preserve_genotyped = false; bool par::proxy_impute_genotypic_concordance = false; bool par::proxy_record_dosage = false; bool par::proxy_error = false; bool par::proxy_leave_out = false; bool par::proxy_include_reference = false; bool par::proxy_CC = false; bool par::proxy_TDT = false; string par::proxy_assoc_snp = "rs1234"; bool par::proxy_list = false; string par::proxy_list_file = "proxy.hap"; bool par::proxy_all_list = false; string par::proxy_all_list_file = "proxy.list"; bool par::proxy_list_proxies = false; bool par::proxy_exclude = false; string par::proxy_exclude_list = "pexclude.list"; bool par::proxy_exclude_from_file = false; bool par::proxy_reference_only = false; int par::proxy_maxhap = 3; double par::proxy_r2 = 0.5; double par::proxy_info_threshold = 0.5; bool par::impute_verbose = false; double par::proxy_maf = 0.005; double par::proxy_mhf = 0.01; double par::proxy_geno = 0.2; double 
par::proxy_impute_threshold = 0.9; bool par::make_minor_allele = true; double par::proxy_planB_threshold = 0.1; double par::proxy_kb_planB = 500; int par::proxy_window_planB = 30; int par::proxy_snp_filter_planB = 10; double par::proxy_r2_filter_A_planB = 0.00; double par::proxy_r2_filter_B_planB = 0.01; double par::proxy_r2_filter_C_planB = 0.50; double par::proxy_kb_planA = 250; int par::proxy_window_planA = 15; int par::proxy_snp_filter_planA = 5; double par::proxy_r2_filter_A_planA = 0.00; double par::proxy_r2_filter_B_planA = 0.25; double par::proxy_r2_filter_C_planA = 0.50; double par::proxy_kb = 250; int par::proxy_window = 15; int par::proxy_snp_filter = 5; bool par::proxy_r2_filter = true; double par::proxy_r2_filter_A = 0.00; double par::proxy_r2_filter_B = 0.05; double par::proxy_r2_filter_C = 0.50; bool par::greport = false; string par::greport_results = "file1"; string par::greport_gene_list = "file2"; bool par::greport_subset = false; string par::greport_subset_file = "file3"; bool par::greport_display_empty = false; bool par::annot_file = false; string par::annot_filename = ""; bool par::meta_analysis = false; vector par::meta_files; bool par::set_screen = false; string par::set_screen_resultfile = ""; bool par::gettag_mode = false; bool par::gettag_mode1 = true; bool par::gettag_mode2 = false; string par::gettag_file = ""; double par::gettag_r2 = 0.8; int par::gettag_kb = 250000; // 250kb default, in BP bool par::gettag_listall = false; bool par::clumpld = false; bool par::clumpld_best = false; string par::clumpld_results = "plink.assoc"; string par::clumpld_column = "P"; bool par::clumpld_verbose = false; bool par::clumpld_indep = true; int par::clumpld_kb = 250000; double par::clumpld_r2 = 0.5; double par::clumpld_p1 = 1e-4; double par::clumpld_p2 = 1e-2; bool par::clumpld_index1 = false; bool par::clumpld_only_show_replications = false; bool par::clumpld_only_show_replications_list = false; bool par::clumpld_annot = false; string par::clumpld_annot_fields = ""; string par::clumpld_range_file = "range.list"; bool par::clumpld_range_annotate = false; int par::clumpld_min = 0; // NOT USED double par::min_af = 0.01; double par::max_af = 1; // max minor allele freq double par::min_hf = 0.01; double par::max_hf = 1; int par::min_geno_cell = 5; double par::rarer_maf_threshold = 0.1; double par::rarer_dist_threshold = 100000; // 100 kb int par::rarer_interval = 100; // in bp bool par::rare_test = false; bool par::rare_test_weight1 = false; bool par::rare_test_print_details = false; string par::rare_test_print_details_snp = ""; bool par::elf_pcmode = false; bool par::elf_pcmode_2sided = false; bool par::elf_baseline = false; bool par::rare_test_score_range = false; double par::rare_test_score_range_threshold = 0.01; string par::rare_test_score_results_file = ""; string par::rare_test_score_range_file = ""; bool par::rare_test_summary_controls = false; vector par::chr_haploid(0); vector par::chr_sex(0); vector par::chr_Y(0); vector par::chr_code(0); map par::chr_map; bool par::species_dog = false; bool par::species_cow = false; bool par::species_horse = false; bool par::species_sheep = false; bool par::species_rice = false; bool par::species_mouse = false; int par::run_chr = 0; int par::run_start = 0; int par::run_end = 0; string par::m1 = ""; string par::m2 = ""; double par::window = 0; // kb bool par::position_window = false; int par::from_window = 0; // bp int par::to_window = 0; // bp bool par::mk_datfile = false; bool par::qt = false; bool par::bt = true; bool par::coding01 = 
false; bool par::ignore_phenotypes = true; bool par::filter_cases = false; bool par::filter_controls = false; bool par::filter_males = false; bool par::filter_females = false; bool par::filter_founders = false; bool par::filter_nonfounders = false; bool par::segment_haplotrack = false; string par::segment_haplotrack_fid1 = "1"; string par::segment_haplotrack_iid1 = "1"; string par::segment_haplotrack_fid2 = "2"; string par::segment_haplotrack_iid2 = "2"; bool par::segment_output = false; bool par::segment_minimal = false; bool par::segment_silently_return_groups = false; int par::segment_current_focal_snp = -1; bool par::segment_overlap = false; bool par::segment_verbose = false; bool par::segment_validate = false; bool par::segment_test_individual = false; bool par::segment_test_specific_segs = false; bool par::segment_test_fisher = false; bool par::segment_test_1sided = true; bool par::segment_test_force_1sided = false; bool par::segment_test_ignore_discordant = false; int par::segment_snp1 = -1; int par::segment_snp2 = -1; string par::segment_m1 = ""; string par::segment_m2 = ""; bool par::force_span = false; int par::segment_length = 1000000; // 1000kb default min length int par::segment_snp = 100; // 100 SNPs bool par::segment_output_started = false; bool par::read_segment_file = false; string par::read_segment_filename = ""; int par::segment_inter_snp_distance = 1000; //unit = kb bool par::multi_output = false; bool par::gmulti_output = false; bool par::pihat_filter = true; bool par::genome_output = false; bool par::compress_genome = false; bool par::genome_only_check_rels = false; bool par::genome_output_minimal = false; bool par::genome_output_full = false; bool par::genome_2sets = false; string par::genome_setlist1 = "plink.set1"; string par::genome_setlist2 = "plink.set2"; bool par::genome_test = false; double par::genome_test_threshold = 0.01; int par::genome_test_min_snp = 20; bool par::ibs_test = false; int par::ibs_test_min_snp = 20; bool par::ibs_test_method2 = false; bool par::summary_ibd_output = false; double par::IBD_threshold = 0.2; double par::segment_threshold_start = 0.25; double par::segment_threshold_finish = 0.25; bool par::nudge = false; bool par::bound = true; bool par::show_impossible_IBD = true; bool par::IBD_within = false; bool par::SD = true; bool par::CP = false; bool par::affpair = false; bool par::remove_unaffected_pairs = false; bool par::fix_prev = false; double par::fixed_prev = 0; bool par::sol_family = false; string par::tagfile = "plink.tag"; string par::mapfile_impute = "plink.impute.map"; bool par::impute_tags = false; bool par::sliding_window = false; string par::sliding_window_size = "2"; bool par::make_blocks = false; bool par::meta_large_phase = false; bool par::phase_snps = false; bool par::phase_hap_all = false; double par::hap_post_prob = 0.8; double par::hap_missing_geno = 0.5; int par::hap_max_nf_phases = 1024; double par::hap_min_phase_prob = 1e-2; bool par::display_hap_freqs = false; bool par::haplo_plem_verbose = false; bool par::haplo_plem_follow = false; int par::haplo_plem_follow_ind = -1; string par::haplo_plem_follow_fid = "FID1"; string par::haplo_plem_follow_iid = "IID1"; int par::haplo_plem_window = 6; int par::haplo_plem_overlap = 2; int par::haplo_plem_original_overlap = 2; int par::haplo_plem_iter = 20; int par::haplo_plem_likelihood_iter = 5; double par::haplo_plem_window_prune_phase = 1e-10; double par::haplo_plem_window_tol = 1e-4; double par::haplo_plem_zero_threshold = -1; bool par::haplo_plem_nonzero_threshold = true; 
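// Defaults below are for the second ("meta") stage of the windowed E-M haplotype phasing; the haplo_plem_* values above apply to each individual window. 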
int par::haplo_plem_meta_window = 2; double par::haplo_plem_meta_prune_haplotype = 1e-6; double par::haplo_plem_meta_prune_phase = 0.01; int par::haplo_plem_meta_iter = 200; int par::haplo_plem_meta_likelihood_iter = 5; double par::haplo_plem_meta_tol = 1e-4; bool par::test_hap_CC = false; bool par::test_hap_TDT = false; bool par::test_hap_QTL = false; bool par::test_hap_GLM = false; bool par::test_hap_GLM_omnibus = false; bool par::test_hap_only = false; bool par::display_phase_probs = false; bool par::display_phase_probs_wide = false; bool par::weighted_mm = false; bool par::chap_test = false; bool par::chap_sole_variant = false; bool par::chap_independent_effect = false; bool par::chap_sole_variant_specific_alleles = false; string par::chap_sole_variant_specific_allele_list = ""; bool par::chap_haplotype_specific = false; string par::chap_entity = ""; bool par::chap_specified_groups = false; bool par::chap_specified_snps = false; string par::chap_model1 = ""; string par::chap_model0 = ""; bool par::chap_drop_snps = false; string par::chap_drop_snps_list = ""; bool par::chap_add_grp_specifics = false; bool par::assoc_test = false; bool par::assoc_counts = false; bool par::assoc_glm = false; bool par::standard_beta = false; bool par::assoc_glm_without_main_snp = false; bool par::assoc_test_alt_perm = false; bool par::full_model_assoc = false; bool par::trend_only = false; bool par::fisher_test = false; bool par::return_beta = false; bool par::hap_specific_snps = false; string par::hap_specific_snps_list = ""; bool par::qt_means = false; bool par::conditioning_snp_single = false; string par::conditioning_snp_name = "rs1234"; bool par::conditioning_snps = false; string par::conditioning_snps_file = "plink.list"; int par::xchr_model = 1; bool par::glm_sex_effect = false; bool par::glm_no_auto_sex_effect = false; bool par::glm_dominant = false; bool par::glm_recessive = false; double par::vif_threshold = 50; bool par::twoDFmodel = false; bool par::twoDFmodel_hethom = false; bool par::test_full_model = false; bool par::simple_interaction = false; vector par::parameter_list(0); vector par::test_list(0); bool par::glm_user_test = false; bool par::glm_user_parameters = false; bool par::qt_with_covariates = false; bool par::model_perm_best = false; bool par::model_perm_gen = false; bool par::model_perm_dom = false; bool par::model_perm_rec = false; bool par::model_perm_trend = false; bool par::output_pheno_perm = false; bool par::assoc_gxe = false; bool par::QTDT_test = false; bool par::QFAM_total = false; bool par::QFAM_between = false; bool par::QFAM_within1 = false; bool par::QFAM_within2 = false; bool par::QFAM_adaptive = false; bool par::TDT_test = false; bool par::sibTDT_test = false; bool par::mating_tests = false; bool par::dfam_tdt = true; bool par::dfam_sibs = true; bool par::dfam_unrelateds = true; bool par::perm_TDT_basic = true; bool par::perm_TDT_parent = false; bool par::discordant_parents = false; bool par::parent_of_origin = false; bool par::perm_POO_poo = true; bool par::perm_POO_pat = false; bool par::perm_POO_mat = false; bool par::perm_POO_best = false; bool par::built_families = false; bool par::HWD_test = false; bool par::HWD_report = false; double par::HWD_limit = 0.001; bool par::HWD_standard = false; bool par::HWD_filter_on_all = false; bool par::MENDEL_test = false; bool par::MENDEL_report = false; double par::MENDEL_ind = 0.1; double par::MENDEL_snp = 0.1; bool par::CMH_test_1 = false; bool par::CMH_test_2 = false; bool par::CMH_test_ORD = false; bool par::breslowday = 
false; bool par::OR_homog_test = false; double par::ci_level = 0.95; double par::ci_zt = 0; bool par::display_ci = false; bool par::pfilter = false; double par::pfvalue = 1e-5; bool par::multtest = false; bool par::use_GC = false; bool par::fix_lambda = false; double par::lambda = 1; bool par::qq_plot = false; bool par::logscale = false; bool par::ibs_sharing_test = false; string par::keep_file = "plink.list"; string par::remove_file = "plink.list"; bool par::extract_set = false; bool par::exclude_set = false; string par::exclude_file = "plink.list"; string par::extract_file = "plink.list"; bool par::snp_range_list = false; bool par::thin_snps = false; double par::thin_param = 0; bool par::read_snp_qual = false; string par::snp_qual_file = "dummy"; double par::snp_qual_min = 0; double par::snp_qual_max = 1; bool par::read_geno_qual = false; string par::geno_qual_file = "dummy"; double par::geno_qual_min = 0; double par::geno_qual_max = 1; bool par::make_set = false; string par::make_set_file = "plink.set"; int par::make_set_border = 0; bool par::make_set_collapse = false; bool par::make_set_ignore_group = false; string par::make_set_collapse_label = "SET"; bool par::make_set_complement = false; bool par::write_set = false; bool par::read_set = false; bool par::drop_sets = true; bool par::snp_include_from_cl = false; string par::snp_include_range = ""; bool par::dump_gene = false; string par::dump_genename = ""; bool par::permute = false; bool par::perm_count = false; bool par::mperm_save_all = false; bool par::mperm_save_best = false; bool par::mperm_rank = false; int par::replicates = 1000; bool par::adaptive_perm = true; int par::adaptive_min = 5; int par::adaptive_max = 1000000; int par::adaptive_interval = 1; double par::adaptive_interval2= 0.001; double par::adaptive_alpha = 0.00; double par::adaptive_ci = 0.0001; bool par::perm_genedrop = false; bool par::perm_genedrop_and_swap = false; bool par::perm_genedrop_unrel = false; bool par::perm_genedrop_parents = false; bool par::perm_genedrop_sibships = false; bool par::hotel = false; bool par::set_test = false; bool par::set_p2 = false; int par::set_min = -1; int par::set_max = 5; double par::set_r2_val = 0.5; bool par::set_r2 = false; bool par::set_r2_phase = false; double par::set_chisq_threshold = 3.84146; bool par::set_r2_write = false; bool par::set_r2_read = false; string par::set_r2_read_file = "plink.ldset"; string par::subsetfile = "dummy.file"; bool par::use_subset = false; string par::setfile = "plink.set"; bool par::set_score = false; double par::set_score_p = 1; bool par::set_table = false; double par::set_step_in = 0.05; bool par::set_step = false; bool par::permute_within_sol = false; bool par::boot = false; bool par::disp_r1 = false; bool par::disp_r2 = false; bool par::disp_r_window = false; int par::disp_r_window_snp = 10; int par::disp_r_window_kb = 1000000; double par::disp_r_window_r2 = 0.2; bool par::ld_anchor = false; bool par::ld_anchor_list = false; bool par::flip_scan = false; double par::flip_scan_threshold = 0.5; bool par::flip_scan_verbose = false; bool par::prune_ld = false; bool par::prune_ld_pairwise = false; bool par::prune_ld_pairwise_maf = true; double par::prune_ld_vif = 2; double par::prune_ld_r2 = 1 - 1e-6; int par::prune_ld_win = 100; int par::prune_ld_step = 50; bool par::prune_r2_prefer = false; string par::prune_r2_prefer_list = "dummy"; bool par::prune_r2_fixed = false; string par::prune_r2_fixed_list = "dummy"; bool par::calc_SNPSNP_LD = false; string par::ld_SNP1 = ""; string 
par::ld_SNP1_file =""; string par::ld_SNP2 = ""; double par::epi_alpha1 = 0.0001; double par::epi_alpha2 = 0.01; bool par::epi_filter = true; bool par::set_by_set = true; bool par::epistasis = false; bool par::fast_epistasis = false; bool par::epi_caseonly = false; double par::epi_caseonly_kb_gap = 1000; // 1Mb default gap in SNPxSNP tests bool par::epi_genebased = false; bool par::epi_quickscan = false; bool par::inbreeding = false; bool par::check_sex = false; bool par::impute_sex = false; double par::sex_threshold_male = 0.8; double par::sex_threshold_female = 0.2; bool par::homo_run = false; bool par::homo_run_snps = false; bool par::homo_run_kb = false; double par::homo_run_density = 50; int par::homo_run_gap = 1000; // 1Mb bool par::homo_miss_as_hom = false; int par::homo_run_length_snps= 100; int par::homo_run_length_kb = 1000; // 1Mb in kb int par::homo_run_het = 1; int par::homo_windowSize = 50; // SNPs int par::homo_windowKB = 5000; // int par::homo_windowAllowedHet = 1; // 1 SNP per 20 int par::homo_windowAllowedMissing = 5; double par::homo_threshold = 0.05; bool par::homo_summary_allelic_match = false; bool par::homo_run_consensus_match = false; double par::fuzzy_homo = 0.99; bool par::homozyg_verbose = false; int par::pool_size_min = 2; bool par::ibs_run = false; int par::ibs_run_length_snps = 100; int par::ibs_run_length_kb = 100; double par::ibs_run_density = 0.01; // 1 SNP per 100kb average int par::ibs_inner_run_length_kb = 100; int par::ibs_inner_run_length_snp = 20; int par::ibs_join_kb = 100; int par::ibs_join_snp = 1; int par::ibs_run_missing = 2; int par::ibs_run_0 = 1; int par::ibs_inter_snp_distance = 1000000; // units=bp, 1=Mb bool par::ibs_2only = false; bool par::miss_run = false; int par::miss_run_length = 100; bool par::miss_run_length_kb = false; double par::miss_run_level = 0.80; bool par::FIXED = false; bool par::FIXED_p = false; Z par::FIX_IBD; double par::FIX_p = 0.5; bool par::matrix = false; bool par::distance_matrix = false; bool par::cluster = false; bool par::cluster_euclidean = false; bool par::cluster_group_avg = false; bool par::force_initial_cluster = false; bool par::cluster_plot = false; int par::cluster_mds_dim = 2; bool par::mds_by_individual = true; bool par::genome_groups = false; bool par::cluster_ibm_constraint = false; double par::cluster_ibm_constraint_value = 0; bool par::cluster_missing = false; bool par::cluster_selcon = false; string par::cluster_selcon_file = "plink.clst"; int par::max_cluster_N = -1; double par:: merge_p = 0; int par::ibstest_gap = 500000; // 500 kb int par::max_cluster_size = 0; int par::max_cluster_case = 0; int par::max_cluster_control = 0; bool par::cluster_on_phenotype = false; bool par::cluster_on_mcc = false; int par::min_neighbour = 1; int par::max_neighbour = 10; bool par::outlier_detection = false; bool par::bmatch = false; bool par::bmatch_usertype = false; bool par::qmatch = false; string par::bmatch_filename = "plink.bmatch"; string par::qmatch_filename = "plink.qmatch"; string par::bmatch_direction_filename = "plink.bm"; string par::qmatch_threshold_filename = "plink.qt"; bool par::include_cluster = false; bool par::include_cluster_from_file = false; string par::include_cluster_filename = "plink.clst"; int par::analyse_cluster = 0; bool par::af_write = false; bool par::af_count = false; bool par::af_read = false; bool par::ibd_read = false; string par::ibd_file = "plink.genome"; bool par::ibd_read_minimal = false; bool par::ibd_read_list = false; string par::ibd_file_list = "plink.genome.list"; bool 
par::inc_write = false; bool par::inc_read = false; string par::inc_file = "plink.inc"; string par::af_file = "plink.frq"; bool par::locked = false; bool par::include_all_pairs = false; double par::include_all_z1 = 0.001; double par::MIN_PIHAT = 0.0025; double par::MAX_PIHAT = 1.0000; double par::MAX_CORR_PIHAT_PIHAT_G = 0.9; double par::MAX_GENO_MISSING = 0.1; double par::MAX_IND_MISSING = 0.1; int par::MAX_LINE_LENGTH = 1000000; bool par::remove_indiv = false; bool par::keep_indiv = false; bool par::extract_before_exclude = true; bool par::remove_before_keep = true; string par::remove_indiv_list = "plink.list"; string par::keep_indiv_list = "plink.list"; int par::pp_maxsnp = 6; int par::pp_maxfid = 6; int par::pp_maxiid = 6; int par::BATCH_SIZE = 500000; bool par::plink = false; bool par::display_segment_long = false; bool par::cnv_makemap = false; bool par::cnv_writelist = false; bool par::cnv_list = false; bool par::display_cnv_track = false; int par::cnv_col = 0; string par::cnv_listname = "plink.cnv"; int par::cnv_min_kb = -1; double par::cnv_min_score = -1; int par::cnv_min_sites = -1; int par::cnv_max_kb = -1; double par::cnv_max_score = -1; int par::cnv_max_sites = -1; bool par::cnv_del_only = false; bool par::cnv_dup_only = false; int par::cnv_type = -1; bool par::cnv_intersect = false; bool par::cnv_exclude = false; string par::cnv_intersect_file = "plink.file"; bool par::cnv_intersect_subset = false; string par::cnv_intersect_subset_file = "plink.file"; double par::cnv_overlap = -1; bool par::cnv_count = false; bool par::cnv_defined_overlap = false; bool par::cnv_indiv_perm = false; bool par::cnv_pos_perm = false; bool par::cnv_drop_no_segment = false; bool par::cnv_freq_method2 = false; double par::cnv_freq_method2_threshold = 0.8; bool par::cnv_write_freq = false; bool par::cnv_freq_include = false; bool par::cnv_freq_include_below = true; bool par::cnv_freq_include_exact = false; bool par::cnv_freq_include_exact_exclude = false; int par::cnv_freq_include_cnt = -1; bool par::cnv_unique = false; bool par::cnv_intersect_writeback = false; bool par::cnv_intersect_writeback_verbose = false; bool par::cnv_disrupt = false; int par::cnv_region_border = 0; // kb bool par::cnv_union_overlap = false; bool par::cnv_region_overlap = false; bool par::cnv_check_overlap = false; bool par::cnv_count_baseline = false; string par::cnv_count_baseline_file = ""; bool par::cnv_weighted_gene_test = false; bool par::cnv_enrichment_test = false; int par::cnv_en_model = 4; bool par::cnv_glm = false; bool par::seg_test_window = false; double par::seg_test_window_bp = 100000; bool par::seg_test_region = false; bool par::dosage_assoc = false; string par::dosage_file = ""; bool par::dosage_hard_call = false; double par::dosage_hard_call_thresh = 0.99; int par::dosage_hard_call_thresh2 = 0; bool par::dosage_hasMap = false; bool par::write_dosage = false; plink-1.07-src/parse.cpp0000644000265600020320000035616711264127625014335 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "options.h" #include "helper.h" #include "stats.h" #include "nlist.h" void getOutputFilename(CArgs & a) { //////////////////////////////////////// // Insert commands from a file if (a.find("--script")) { string f = a.value("--script"); a.fromScript(f); } if (a.find("--rerun")) { string f = a.value("--rerun"); a.fromPriorLog(f); } //////////////////////////////////////////////////// // Start processing commands if (a.find("--out")) { par::output_file_name = a.value("--out"); } if (a.find("--gplink")) { if ( par::cli ) error("Cannot specify --gplink and --interactive"); par::silent = true; par::gplink = true; } if (a.find("--silent")) { if ( par::cli ) error("Cannot specify --silent and --interactive"); par::silent = true; } if (a.find("--interactive")) { if ( a.find("--script") ) error("Cannot specify --script and --interactive"); par::cli = true; } } void setOptions(CArgs & a) { ///////////////////////////////////// // Web-based functions if (a.find("--noweb")) par::web_check = false; if (a.find("--lookup")) { par::lookup = true; par::lookup_single_snp = true; par::lookup_snp = a.value("--lookup"); } if (a.find("--lookup-list")) { par::lookup = true; par::lookup_to_file = true; par::lookup_single_snp = false; par::lookup_snp = a.value("--lookup-list"); } if (a.find("--lookup-save")) par::lookup_to_file = true; if (a.find("--lookup-gene")) { par::lookup = true; par::lookup_to_file = true; par::lookup_gene = true; par::lookup_gene_name = a.value("--lookup-gene"); } if (a.find("--lookup-gene-list")) { par::lookup = true; par::lookup_to_file = true; par::lookup_multiple_genes = true; par::lookup_gene = true; par::lookup_gene_name = a.value("--lookup-gene-list"); } if (a.find("--lookup-gene-kb")) { par::lookup_gene_kb_window = a.value_int("--lookup-gene-kb"); } if (a.find("--lookup-kb")) { par::lookup_snp_kb_window = a.value_int("--lookup-kb"); } if (a.find("--lookup2")) { par::lookup2 = true; par::lookup2_cmd = a.value("--lookup2"); } if (a.find("--lookup-gene2")) { par::lookup2 = true; par::lookup_gene = true; par::lookup2_cmd = a.value("--lookup-gene2"); } if (a.find("--id-dict")) { par::idhelp = true; par::idhelp_dictionary = a.value("--id-dict"); if ( a.find("--id-auto-alias") ) par::idhelp_auto_alias = true; // Just print each exact line matching field=value if ( a.find("--id-dump") ) { par::idhelp_dump_from_dict = true; par::idhelp_dump_from_dict_cmd = a.value("--id-dump"); } // Default is to dump whole table if ( a.find("--id-table")) { par::idhelp_subset = "dump_subtable"; par::idhelp_subset_string = a.value("--id-table"); } if ( a.find("--id-lookup")) { par::idhelp_lookup = true; par::idhelp_lookup_string = a.value("--id-lookup"); } if ( a.find("--id-replace")) { par::idhelp_replace = true; vector s = a.value("--id-replace",3); par::idhelp_replace_string = s[0] + "," + "," + s[1] + "," + s[2]; } if ( a.find("--id-match")) { if ( a.find("--id-replace")) error("You cannot ID match and ID replace together"); par::idhelp_match = true; par::idhelp_match_string = a.varValue("--id-match"); } if (a.find("--id-delimit")) { par::idhelp_output_delimit = a.value("--id-delimit"); if ( par::idhelp_output_delimit == "tab" ) par::idhelp_output_delimit = "\t"; if ( par::idhelp_output_delimit == "bar" ) par::idhelp_output_delimit = "|"; if ( par::idhelp_output_delimit == "semi-colon" ) par::idhelp_output_delimit = ";"; if ( 
par::idhelp_output_delimit == "comma" ) par::idhelp_output_delimit = ","; if ( par::idhelp_output_delimit == "space" ) par::idhelp_output_delimit = " "; } if ( a.find("--id-alias")) par::idhelp_list_aliases = true; if ( par::idhelp_match && par::idhelp_replace ) error("Cannot specify --id-replace and --id-match together"); if ( par::idhelp_replace && par::idhelp_lookup ) error("Cannot specify --id-replace and --id-lookup together"); if ( par::idhelp_replace && par::idhelp_subset ) error("Cannot specify --id-replace and --id-table together"); if ( par::idhelp_match && par::idhelp_lookup ) error("Cannot specify --id-match and --id-lookup together"); } if ( a.find("--id-match") && !a.find("--id-dict") ) { par::idhelp = true; par::idhelp_match = true; par::idhelp_no_dict = true; par::idhelp_match_string = a.varValue("--id-match"); } if (a.find("--R")) { par::run_R_script = true; par::R_script = a.value("--R"); } if (a.find("--R-port")) par::R_port = a.value_int("--R-port"); if (a.find("--R-debug")) par::run_R_write_script = true; if (a.find("--R-chisq")) par::run_R_chisq = true; if (a.find("--R-z")) par::run_R_z = true; if (a.find("--R-nsnps")) par::run_R_nsnps = a.value_int("--R-nsnps"); if (a.find("--seed")) par::random_seed = a.value_lui("--seed"); //////////////////////////////////////// // A data-generation option? bool makedata = false; if (a.find("--recode")) { makedata = true; par::recode = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode12")) { makedata = true; par::recode_12 = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recodeHV")) { makedata = true; par::recode_HV = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-lgen")) { makedata = true; par::recode_long = true; if (a.find("--with-reference")) par::recode_long_ref = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-rlist")) { makedata = true; par::recode_mutlist = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-whap")) { makedata = true; par::recode_whap = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-fastphase")) { makedata = true; par::recode_fastphase = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-structure")) { makedata = true; par::recode_structure = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) 
par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-bimbam")) { makedata = true; par::recode_bimbam = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recodeAD")) { makedata = true; par::recode_AD = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recodeA")) { makedata = true; par::recode_AD = true; par::recode_AD_Aonly = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--recode-allele")) { if ( ! par::recode_AD ) error("You need to specify --recodeA or --recodeAD also"); par::recode_allele_coding = true; par::recode_allele_coding_file = a.value("--recode-allele"); } if (a.find("--make-bed")) { makedata = true; par::write_bitfile = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } ////////////////////////////////////////////// // Modifiers of main data generation commands if (a.find("--alleleACGT") || a.find("--allele-ACGT")) par::recode_ACGT = true; if (a.find("--allele1234") || a.find("--allele-1234")) par::recode_1234 = true; if (a.find("--reference-allele")) { par::set_reference_allele = true; par::set_reference_allele_file = a.value("--reference-allele"); par::make_minor_allele = false; } if (a.find("--transpose")) { if ( ! ( a.find("--recode") || a.find("--recode12") ) ) error("--transpose requires --recode or --recode12"); par::recode_transpose = true; } if (a.find("--tab")) { if ( ! makedata ) error("You can only specify --tab with a --recode* option"); par::recode_delimit = "\t"; } //////////////////////////////////////////////////// // Convenience functions and obligatory missingness // Zero out genotypes for a particular SNP / cluster if ( a.find("--zero-cluster") ) { if ( ! a.find("--within") ) error("You must specify --within with --zero-cluster"); par::zero_cluster = true; par::zero_cluster_filename = a.value("--zero-cluster"); checkFileExists(par::zero_cluster_filename); } // Perform genotyping rate calculations, but allow for // obligatory missingness if ( a.find("--oblig-missing") ) { if ( ! a.find("--oblig-clusters") ) error("You must specify --oblig-clusters with --oblig-missing"); par::oblig_missing = true; par::oblig_missing_filename = a.value("--oblig-missing"); checkFileExists(par::oblig_missing_filename); } if ( a.find("--oblig-clusters") ) { if ( ! 
a.find("--oblig-missing") ) error("You must specify --oblig-missing with --oblig-clusters"); par::oblig_clusters_filename = a.value("--oblig-clusters"); checkFileExists(par::oblig_clusters_filename ); } // Multiple category phenotype if ( a.find("--loop-assoc") ) { if ( a.find("--within") ) error("You cannot specify --within with --loop-assoc"); par::all_pheno = true; par::assoc_test = true; par::loop_over = true; par::include_cluster_from_file = true; par::include_cluster_filename = a.value("--loop-assoc"); checkFileExists(par::include_cluster_filename); } //////////////////////////////////////// // TODO list if (a.find("--todo")) { cout << "TODO list\n" << " * Make --min/--max thresholds apply w/ read-segment\n" << " * Odds ratio for DFAM\n" << " * Check Hotelling's T2 imputation of missing genotypes\n" << " * Add improved familial phasing to haplotype tests\n" << " * Add OR and CI to haplotype tests\n" << " * Add CMH and/or permutation to haplotype tests\n" << " \n"; exit(0); } /////////////////////////////////////// // Output options if (a.find("--flag")) par::flag = true; if (a.find("--verbose")) par::verbose = true; if (a.find("--pedigree")) par::dumpped = true; if (a.find("--tucc")) par::tucc = true; if (a.find("--debug")) par::debug = true; /////////////////////////////////////// // IBD analyses, output options if (a.find("--multi")) par::multi_output = true; if (a.find("--gmulti")) par::gmulti_output = true; if (a.find("--summarise-ibd")) par::summary_ibd_output = true; if (a.find("--genome") || a.find("--Z-genome") || a.find("--genome-minimal") ) { // By default, include everybody if (!a.find("--min")) { par::MIN_PIHAT = 0; par::pihat_filter = false; } par::genome_output = true; if (a.find("--Z-genome")) par::compress_genome = true; if (a.find("--genome-minimal")) par::genome_output_minimal = true; else if (a.find("--genome-full")) par::genome_output_full = true; } if (a.find("--rel-check")) par::genome_only_check_rels = true; if (a.find("--impossible")) { if (!a.find("--genome")) error("Can only specify --impossible with --genome"); par::show_impossible_IBD = false; } if (a.find("--nudge")) { if (!a.find("--genome")) error("Can only specify --nudge with --genome"); if (a.find("--impossible")) error("Cannot specify --impossible and --nudge together"); par::nudge = true; } if (a.find("--unbounded")) { par::bound = false; par::MIN_PIHAT = -1; par::MAX_PIHAT = 1; } if (a.find("--genome-lists")) { if ( ! 
( a.find("--genome") || a.find("--segment") ) ) error("Must specify --genome or --segment with --genome-lists"); par::genome_2sets = true; vector s = a.value("--genome-lists",2); par::genome_setlist1 = s[0]; par::genome_setlist2 = s[1]; } if (a.find("--segment-within")) { if ( !a.find("--within")) error("You need to specify a --within {clusterfile}"); par::IBD_within = true; } if (a.find("--genome-test")) { par::genome_test = true; par::genome_test_threshold = a.value_double("--genome-test"); par::plink = true; par::permute = true; par::adaptive_perm = false; par::replicates = 100000; } if (a.find("--ibs-test")) { par::ibs_test = true; par::permute = true; par::adaptive_perm = false; par::replicates = 100000; } if (a.find("--ibs-test2")) { par::ibs_test = true; par::ibs_test_method2 = true; par::permute = true; par::adaptive_perm = false; par::replicates = 100000; } if (a.find("--segment-match-snp")) par::genome_test_min_snp = a.value_int("--segment-match-snp"); /////////////////////////////////////////// // WGAS main options: summary stats and QC if (a.find("--missing")) { par::report_missing = true; } if (a.find("--mendel")) { par::MENDEL_report = true; } if (a.find("--hardy")) par::HWD_report = true; if (a.find("--hardy2")) par::HWD_report = par::HWD_standard = true; if (a.find("--het")) par::inbreeding = true; if (a.find("--Fst") || a.find("--fst")) { if ( ! a.find("--within") ) error("Need to specify --within with --Fst"); par::calcFst = true; } if (a.find("--check-sex")) par::check_sex = true; if (a.find("--impute-sex")) par::check_sex = par::impute_sex = true; if (a.find("--test-missing")) { par::test_missing = true; } if (a.find("--test-mishap")) { par::mishap_test = true; } if (a.find("--mishap-window")) { par::mishap_test = true; par::mishap_window = a.value_int("--mishap-window"); } if (a.find("--score")) { par::score_risk = true; par::score_risk_file = a.value("--score"); if (a.find("--score-no-mean-imputation")) par::score_impute_expected = false; if (a.find("--score-ranges")) { par::score_risk_ranges = true; par::score_risk_ranges_file = a.value("--score-ranges"); } if (a.find("--score-ranges-min")) par::score_risk_ranges_min = a.value_int("--score-ranges-min"); if (a.find("--score-ranges-border")) par::make_set_border = a.value_int("--score-ranges-border") * 1000; if (a.find("--q-score-file")) { if ( ! 
a.find("--q-score-range") ) error("Must specify --q-score-range with --q-score-file"); par::score_risk_on_qrange = true; par::score_qrange_file = a.value("--q-score-range"); par::score_qfile = a.value("--q-score-file"); } if ( a.find("--score-test") ) { par::score_test = true; } if ( a.find("--set") || a.find("--make-set") ) par::profile_sets = true; } //////////////////////////////////////////////////////////////// // Proxy association methods if (a.find("--proxy-assoc")) { par::proxy_assoc = true; par::proxy_CC = true; par::proxy_assoc_snp = a.value("--proxy-assoc"); if ( par::proxy_assoc_snp == "all" || par::proxy_assoc_snp == "ALL" ) par::proxy_all = true; if ( a.find("--proxy-glm") ) par::proxy_glm = true; } if (a.find("--proxy-drop")) { if ( a.find("--proxy-impute")) error("Cannot have --proxy-drop and --proxy-impute (already implied)"); par::proxy_leave_out = true; } if (a.find("--proxy-exclude")) { par::proxy_exclude = true; par::proxy_exclude_from_file = false; par::proxy_exclude_list = a.value("--proxy-exclude"); } else if (a.find("--proxy-exclude-list")) { par::proxy_exclude = true; par::proxy_exclude_from_file = true; par::proxy_exclude_list = a.value("--proxy-exclude-list"); } if (a.find("--proxy-include-reference")) par::proxy_include_reference = true; if (a.find("--proxy-impute")) { par::proxy_assoc = true; par::proxy_impute = true; par::proxy_assoc_snp = a.value("--proxy-impute"); if ( par::proxy_assoc_snp == "all" || par::proxy_assoc_snp == "ALL" ) par::proxy_all = true; if ( a.find("--proxy-dosage") ) par::proxy_record_dosage = true; if ( a.find("--proxy-replace") ) par::proxy_impute_replace = true; // if ( a.find("--proxy-preserve") ) // par::proxy_impute_preserve_genotyped = true; if ( a.find("--proxy-genotypic-concordance") ) par::proxy_impute_genotypic_concordance = true; } if (a.find("--proxy-error")) par::proxy_error = true; if (a.find("--proxy-impute-threshold")) par::proxy_impute_threshold = a.value_double("--proxy-impute-threshold"); if (a.find("--proxy-tdt")) { if (a.find("--hap-tdt")) error("Cannot specify --proxy-tdt and --hap together"); par::proxy_assoc = true; par::proxy_TDT = true; par::proxy_assoc_snp = a.value("--proxy-tdt"); if ( par::proxy_assoc_snp == "all" || par::proxy_assoc_snp == "ALL" ) par::proxy_all = true; } if ( par::proxy_assoc ) { // Reset default missing rate per individual // per haplotype (more relevant if the window // size is small (i.e. to goal is not to discard // too many / any individuals, but rely on E-M // phasing to reconstruct missing genotypes. 
Normally // the default value here is 0.5; it can be modified as // --hap-miss option is below this one par::hap_missing_geno = 0.9; } if (a.find("--proxy-verbose")) par::proxy_full_report = true; if (a.find("--proxy-flanking")) { if ( par::proxy_all ) error("Cannt specify --proxy-flanking with >1 reference SNP"); par::proxy_list = true; par::proxy_list_file = a.value("--proxy-flanking"); } if (a.find("--proxy-list")) { if ( a.find("--proxy-flanking")) error("Cannt specify --proxy-flanking with >1 reference SNP"); par::proxy_all_list = true; par::proxy_all_list_file = a.value("--proxy-list"); } if (a.find("--proxy-sub-r2")) par::proxy_r2 = a.value_double("--proxy-sub-r2"); if (a.find("--proxy-maf")) par::proxy_maf = a.value_double("--proxy-maf"); if (a.find("--proxy-geno")) par::proxy_geno = a.value_double("--proxy-geno"); if (a.find("--proxy-mhf")) par::proxy_mhf = a.value_double("--proxy-mhf"); if (a.find("--proxy-sub-maxsnp")) par::proxy_maxhap = a.value_int("--proxy-sub-maxsnp"); if (a.find("--proxy-no-r2-filter")) par::proxy_r2_filter = false; if (a.find("--proxy-show-proxies")) par::proxy_list_proxies = true; if (a.find("--proxy-r2-reference-only")) par::proxy_reference_only = true; ////////////////////////////////////// // Frequency dependent proxy filters // Plans A and B if (a.find("--proxy-r2")) { par::proxy_r2_filter = true; vector s = a.value("--proxy-r2",3); par::proxy_r2_filter_A_planA = getDouble(s[0], "--proxy-r2"); par::proxy_r2_filter_B_planA = getDouble(s[1], "--proxy-r2"); par::proxy_r2_filter_C_planA = getDouble(s[2], "--proxy-r2"); } if (a.find("--proxy-maxsnp")) par::proxy_snp_filter_planA = a.value_int("--proxy-maxsnp"); if (a.find("--proxy-window")) par::proxy_window_planA = a.value_int("--proxy-window"); if (a.find("--proxy-kb")) par::proxy_kb_planA = a.value_double("--proxy-kb"); // And plan B if (a.find("--proxy-b-threshold")) par::proxy_planB_threshold = a.value_double("--proxy-b-threshold"); if (a.find("--proxy-b-r2")) { par::proxy_r2_filter = true; vector s = a.value("--proxy-b-r2",3); par::proxy_r2_filter_A_planB = getDouble(s[0], "--proxy-b-r2"); par::proxy_r2_filter_B_planB = getDouble(s[1], "--proxy-b-r2"); par::proxy_r2_filter_C_planB = getDouble(s[2], "--proxy-b-r2"); } if (a.find("--proxy-b-maxsnp")) par::proxy_snp_filter_planB = a.value_int("--proxy-b-maxsnp"); if (a.find("--proxy-b-window")) par::proxy_window_planB = a.value_int("--proxy-b-window"); if (a.find("--proxy-b-kb")) par::proxy_kb_planB = a.value_double("--proxy-b-kb"); //////////////////////////////////////// // Segmental options // --segment-match-snp // --homozyg // --homozyg-window-kb // --homozyg-window-snp // --homozyg-window-het // --homozyg-window-missing // --homozyg-snp // --homozyg-kb // --homozyg-density // --homozyg-gap // --homozyg-group // --homozyg-match // --homozyg-het // --homozyg-verbose // --segment // --segment-gap // --segment-length // --segment-thresholds // --segment-minimal // --segment-group // --segment-spanning // --segment-from // --segment-to // --segment-force // --segment-match // --segment-verbose // --pool-size // Read segments back in // --read-segment // --read-segment-minimal ////////////////////////// // CNV/other segment types if (a.find("--cnv-list")) { par::cnv_list = true; par::cnv_listname = a.value("--cnv-list"); } if (a.find("--cfile")) { if ( a.find("--cnv-list")) error("Cannot specify --cfile and --cnv-list together"); if ( a.find("--fam")) error("Cannot specify --cfile and --fam together"); if ( a.find("--map")) error("Cannot specify --cfile 
and --map together"); par::cnv_list = true; par::fileroot = a.value("--cfile"); par::cnv_listname = par::fileroot + ".cnv"; par::famfile = par::fileroot + ".fam"; par::mapfile = par::fileroot + ".cnv.map"; } if ( par::cnv_list ) { // Remove any missing individuals if (!a.find("--cnv-missing-phenotypes")) par::ignore_phenotypes = false; if (a.find("--cnv-make-map")) par::cnv_makemap = true; if (a.find("--cnv-write")) { par::cnv_writelist = true; if (a.find("--with-phenotype")) par::dump_covar_with_phenotype = true; } if ( a.find("--cnv-disrupt" ) ) { if (a.find("--cnv-overlap") || a.find("--cnv-union-overlap") || a.find("--cnv-region-overlap")) error("Cannot specify --cnv-overlap and --cnv-disrupt together"); par::cnv_disrupt = true; } if (a.find("--cnv-intersect")) { if ( a.find("--cnv-exclude") || a.find("--cnv-count") ) error("Cannot specify --cnv-count/intersect/exclude/disrupt together"); par::cnv_intersect = true; par::cnv_intersect_file = a.value("--cnv-intersect"); } if (a.find("--cnv-subset")) { if ( ! ( a.find("--cnv-exclude") || a.find("--cnv-count") || a.find("--cnv-intersect")) ) error("Must use --cnv-intersect/exclude/count with --cnv-subset"); par::cnv_intersect_subset = true; par::cnv_intersect_subset_file = a.value("--cnv-subset"); } if (a.find("--cnv-check-no-overlap")) par::cnv_check_overlap = true; if (a.find("--cnv-border")) { par::cnv_region_border = 1000 * a.value_int("--cnv-border"); } if (a.find("--cnv-exclude")) { if ( a.find("--cnv-count") || a.find("--cnv-intersect")) error("Cannot specify --cnv-count/intersect/exclude/disrupt together"); par::cnv_intersect = par::cnv_exclude = true; par::cnv_intersect_file = a.value("--cnv-exclude"); } if (a.find("--cnv-count")) { if ( a.find("--cnv-exclude") || a.find("--cnv-intersect")) error("Cannot specify --cnv-count/intersect/exclude together"); par::cnv_intersect = par::cnv_count = true; par::cnv_intersect_file = a.value("--cnv-count"); if ( a.find("--cnv-count-baseline")) { par::cnv_count_baseline = true; par::cnv_count_baseline_file = a.value("--cnv-count-baseline"); } if ( a.find("--cnv-weighted-count")) par::cnv_weighted_gene_test = true; } if (a.find("--cnv-freq-method2")) { par::cnv_freq_method2 = true; par::cnv_freq_method2_threshold = a.value_double("--cnv-freq-method2"); if (a.find("--cnv-unique")) error("Cannot specify --cnv-unique and --cnv-method2 together"); if (a.find("--cnv-write-freq")) par::cnv_write_freq = true; } if (a.find("--cnv-freq-exclude-above")) { par::cnv_freq_include = true; par::cnv_freq_include_below = true; par::cnv_freq_include_cnt = a.value_int("--cnv-freq-exclude-above"); } if (a.find("--cnv-freq-exclude-below")) { par::cnv_freq_include = true; par::cnv_freq_include_below = false; par::cnv_freq_include_cnt = a.value_int("--cnv-freq-exclude-below"); } if (a.find("--cnv-freq-include-exact")) { par::cnv_freq_include = true; par::cnv_freq_include_exact = true; par::cnv_freq_include_cnt = a.value_int("--cnv-freq-include-exact"); } if (a.find("--cnv-freq-exclude-exact")) { par::cnv_freq_include = true; par::cnv_freq_include_exact = true; par::cnv_freq_include_exact_exclude = true; par::cnv_freq_include_cnt = a.value_int("--cnv-freq-exclude-exact"); } if (a.find("--cnv-unique")) { par::cnv_unique = true; } if (a.find("--cnv-report-regions") ) { if ( ! 
( a.find("--cnv-intersect") || a.find("--cnv-exclude") || a.find("--cnv-disrupt") ) ) error("Must specify --cnv-intersect/exclude/disrupt with --cnv-report-regions"); par::cnv_intersect_writeback = true; } else if (a.find("--cnv-verbose-report-regions") ) { if ( ! ( a.find("--cnv-intersect") || a.find("--cnv-exclude") || a.find("--cnv-disrupt") ) ) error("Must specify --cnv-intersect/exclude/disrupt with --cnv-report-regions"); par::cnv_intersect_writeback = true; par::cnv_intersect_writeback_verbose = true; } if (a.find("--cnv-overlap")) { par::cnv_defined_overlap = true; par::cnv_overlap = a.value_double("--cnv-overlap"); } else if (a.find("--cnv-union-overlap")) { par::cnv_defined_overlap = true; par::cnv_overlap = a.value_double("--cnv-union-overlap"); par::cnv_union_overlap = true; } else if (a.find("--cnv-region-overlap")) { par::cnv_defined_overlap = true; par::cnv_overlap = a.value_double("--cnv-region-overlap"); par::cnv_region_overlap = true; } if (a.find("--cnv-kb")) par::cnv_min_kb = a.value_int("--cnv-kb"); if (a.find("--cnv-score")) par::cnv_min_score = a.value_double("--cnv-score"); if (a.find("--cnv-sites")) par::cnv_min_sites = a.value_int("--cnv-sites"); if (a.find("--cnv-max-kb")) par::cnv_max_kb = a.value_int("--cnv-max-kb"); if (a.find("--cnv-max-score")) par::cnv_max_score = a.value_double("--cnv-max-score"); if (a.find("--cnv-max-sites")) par::cnv_max_sites = a.value_int("--cnv-max-sites"); if (a.find("--cnv-del")) par::cnv_del_only = true; if (a.find("--cnv-dup")) par::cnv_dup_only = true; if (a.find("--cnv-test-window")) { par::seg_test_window = true; par::seg_test_window_bp = a.value_double("--cnv-test-window") * 1000; } if (a.find("--cnv-test-region")) { if ( par::seg_test_window ) error("Cannot specify both --cnv-test-window and --cnv-test-region"); par::seg_test_region = true; } if (a.find("--cnv-test-2sided")) par::segment_test_1sided = false; if (a.find("--cnv-test-1sided")) par::segment_test_force_1sided = true; // if (a.find("--cnv-glm")) // par::cnv_glm = true; if (a.find("--cnv-indiv-perm")) par::cnv_indiv_perm = true; if (a.find("--cnv-enrichment-test")) { if ( ! 
a.find("--cnv-count") ) error("The --cnv-enrichment-test option requires --cnv-count"); par::cnv_enrichment_test = true; if ( a.find("--cnv-model") ) par::cnv_en_model = a.value_int("--cnv-model"); } if (a.find("--cnv-position-perm")) par::cnv_pos_perm = true; if (a.find("--cnv-seglist")) par::display_segment_long = true; if (a.find("--cnv-track")) par::display_cnv_track = true; if (a.find("--cnv-blue")) par::cnv_col = 1; if (a.find("--cnv-green")) par::cnv_col = 2; if (a.find("--cnv-red")) par::cnv_col = 3; if (a.find("--cnv-brown")) par::cnv_col = 4; if (a.find("--cnv-drop-no-segment")) par::cnv_drop_no_segment = true; if (a.find("--merge") || a.find("--bmerge")) error("Cannot specify --cnv-list and merge SNP data together"); if (a.find("--exclude") || a.find("--extract")) error("Cannot specify --cnv-list and exclude/extract markers SNP data together"); if (a.find("--maf") || a.find("--geno") || a.find("--mind")) error("Cannot specify --cnv-list and filter on SNP data together"); } /////////////////////// // Runs of homozygosity if (a.find("--homozyg")) { par::homo_run = true; } if (a.find("--read-homozyg")) { par::homo_run = true; par::read_segment_filename = a.value("--read-homozyg"); par::read_segment_file = true; } if (a.find("--homozyg-snp")) { par::homo_run = true; par::homo_run_snps = true; par::homo_run_length_snps = a.value_int("--homozyg-snp"); } if (a.find("--homozyg-kb")) { par::homo_run = true; par::homo_run_kb = true; par::homo_run_length_kb = a.value_int("--homozyg-kb"); } if (a.find("--homozyg-density")) { par::homo_run = true; par::homo_run_density = a.value_double("--homozyg-density"); } if (a.find("--homozyg-gap")) { par::homo_run = true; par::homo_run_gap = a.value_int("--homozyg-gap"); } if (a.find("--homozyg-window-snp")) { par::homo_run = true; par::homo_windowSize = a.value_int("--homozyg-window-snp"); } if (a.find("--homozyg-window-kb")) { par::homo_run = true; par::homo_windowKB = a.value_int("--homozyg-window-kb"); } if (a.find("--homozyg-window-het")) { par::homo_run = true; par::homo_windowAllowedHet = a.value_int("--homozyg-window-het"); } if (a.find("--homozyg-window-missing")) { par::homo_run = true; par::homo_windowAllowedMissing = a.value_int("--homozyg-window-missing"); } if (a.find("--homozyg-window-threshold")) { par::homo_run = true; par::homo_threshold = a.value_double("--homozyg-window-threshold"); } if (a.find("--homozyg-group")) { par::homo_summary_allelic_match = true; par::fuzzy_homo = 0.95; } if (a.find("--homozyg-match")) { par::homo_summary_allelic_match = true; par::fuzzy_homo = a.value_double("--homozyg-match"); } if (a.find("--homozyg-het")) { if (! 
( a.find("--homozyg-snp") || a.find("--homozyg-kb") ) ) error("Must specify --homozyg-snp or --homozyg-kb with --homozyg-het"); par::homo_run_het = a.value_int("--homozyg-het"); } if (a.find("--homozyg-verbose")) par::homozyg_verbose = true; if (a.find("--consensus-match")) { par::homo_run_consensus_match = true; } if (a.find("--homozyg-include-missing")) { par::homo_miss_as_hom = true; } if (a.find("--pool-size")) { par::pool_size_min = a.value_int("--pool-size"); } if (a.find("--ibs")) par::ibs_run = true; if (a.find("--ibs2")) par::ibs_run = par::ibs_2only = true; if (a.find("--ibs-density")) { par::ibs_run = true; par::ibs_run_density = a.value_double("--ibs-density"); } if (a.find("--ibs-kb")) { par::ibs_run = true; par::ibs_run_length_kb = a.value_int("--ibs-kb"); } if (a.find("--ibs-snp")) { par::ibs_run = true; par::ibs_run_length_snps = a.value_int("--ibs-snp"); } // static int ibs_inner_run_length_kb; // if (a.find("--ibs-join-snp")) // { // par::ibs_run = true; // par::ibs_join_snp = a.value_int("--ibs-join-snp"); // } // if (a.find("--ibs-join-snp")) // { // par::ibs_run = true; // par::ibs_join_snp = a.value_int("--ibs-join-snp"); // } // if (a.find("--ibs-join-kb")) // { // par::ibs_run = true; // par::ibs_join_kb = a.value_int("--ibs-join-kb") * 1000; // } if (a.find("--ibs-gap")) { par::ibs_run = true; par::ibs_inter_snp_distance = a.value_int("--ibs-gap") * 1000; } if (a.find("--ibs-miss")) { par::ibs_run = true; par::ibs_run_missing = a.value_int("--ibs-miss"); } if (a.find("--ibs-err")) { par::ibs_run = true; par::ibs_run_0 = a.value_int("--ibs-err"); } if (a.find("--miss-run-snps")) { par::miss_run = true; par::miss_run_length = a.value_int("--miss-run-snps"); } if (a.find("--miss-run")) { cerr << "\n*** WARNING -- use --miss-run-snps N option instead\n\n"; par::miss_run = true; par::miss_run_length_kb = true; par::miss_run_length = a.value_int("--miss-run"); } if (a.find("--miss-run-level")) { par::miss_run = true; par::miss_run_level = a.value_double("--miss-run-level"); } /////////////////////////// // Segmental sharing tests // if (a.find("--plink")) // { // par::plink = true; // par::nudge = true; // } if (a.find("--segment")) { par::plink = true; par::nudge = true; par::segment_output = true; } if (a.find("--segment-ibs")) { par::segment_validate = true; } if (a.find("--segment-minimal")) { par::plink = true; par::nudge = true; par::segment_output = true; par::segment_minimal = true; } if (a.find("--segment-test-individual")) { // Instead of pairwise Case-Case versus non-Case-Case, // just count the number of segs that any one individual // has, and compare count in cases to count in controls par::segment_test_individual = true; if (a.find("--specific")) par::segment_test_specific_segs = true; } if (a.find("--segment-test-ignore-discordant")) { // Pairwise Case-Case versus Control-Control par::segment_test_ignore_discordant = true; } if (a.find("--segment-test-fisher")) { par::segment_test_fisher = true; } if (a.find("--segment-test-2sided")) par::segment_test_1sided = false; if (a.find("--segment-group")) { // Use HBD segment match routine, but without // allelic identity option par::segment_overlap = true; par::homo_summary_allelic_match = false; par::fuzzy_homo = 0.95; } if (a.find("--segment-spanning")) { // if (!a.find("--segment-group")) // error("Must specify --segment-group for --segment-spanning\n"); par::segment_overlap = true; par::homo_summary_allelic_match = false; par::fuzzy_homo = 0.95; par::segment_m1 = par::segment_m2 = 
a.value("--segment-spanning"); } if (a.find("--segment-from")) { if (!a.find("--segment-group")) error("Must specify --segment-group for --segment-from\n"); par::segment_m1 = a.value("--segment-from"); } if (a.find("--segment-to")) { if (!a.find("--segment-group")) error("Must specify --segment-group for --segment-to\n"); par::segment_m2 = a.value("--segment-to"); } if (a.find("--segment-force")) { if (! ( a.find("--segment-from") && a.find("--segment-to") ) ) error("Can only use --segment-force with --segment-from/to \n"); par::force_span = true; } if (a.find("--segment-match")) { // Use HBD segment match routine, but without // allelic identity option par::segment_overlap = true; par::homo_summary_allelic_match = true; par::fuzzy_homo = a.value_double("--segment-match"); } if (a.find("--segment-verbose")) par::segment_verbose = true; if (a.find("--read-segment")) { if (par::segment_output) error("Cannot specify both --segment and --read-segment\n"); par::read_segment_filename = a.value("--read-segment"); par::read_segment_file = true; par::plink = true; par::segment_output = true; } if (a.find("--read-segment-minimal")) { if (par::segment_output) error("Cannot specify both --segment and --read-segment\n"); par::read_segment_filename = a.value("--read-segment-minimal"); par::read_segment_file = true; par::plink = true; par::segment_output = true; par::segment_minimal = true; } if (a.find("--segment-gap")) { par::segment_output = true; par::segment_inter_snp_distance = a.value_int("--segment-gap"); } if (a.find("--segment-length")) { // Min length in kb, convert to bp par::segment_length = a.value_int("--segment-length") * 1000; } if (a.find("--segment-snp")) { // Min length in SNPs par::segment_snp = a.value_int("--segment-snp"); } if (a.find("--segment-thresholds")) { par::segment_output = true; vector s = a.value("--segment-thresholds",2); par::segment_threshold_start = getDouble(s[0], "--segment-thresholds"); par::segment_threshold_finish = getDouble(s[1], "--segment-thresholds"); } ////////////////////////// // Misc, external functions if (a.find("--elf-test")) { vector s = a.value("--elf-test",2); par::rarer_maf_threshold = getDouble(s[0], "--elf-test"); if ( par::rarer_maf_threshold < 0 || par::rarer_maf_threshold > 1 ) error("Frequency thresholds not valid for --elf-test"); par::rarer_dist_threshold = getDouble(s[1], "--elf-test") * 1000; par::rare_test = true; if (a.find("--elf-weight")) par::rare_test_weight1 = true; if (a.find("--elf-interval")) par::rarer_interval = a.value_int("--elf-interval"); if (a.find("--elf-eigen")) par::elf_pcmode = true; if (a.find("--elf-2sided")) par::elf_pcmode_2sided = true; } if (a.find("--elf-baseline")) { par::rarer_maf_threshold = a.value_double("--elf-baseline"); par::rare_test = true; par::elf_baseline = true; } if (a.find("--elf-summary")) { par::rare_test_score_range = true; vector s = a.value("--elf-summary",2); par::rare_test_score_range_threshold = getDouble(s[0], "--elf-summary"); par::rare_test_score_results_file = s[1]; if (a.find("--elf-range")) { par::rare_test_score_range_file = a.value("--elf-range"); } else error("Need to specify a --elf-range file also"); if ( a.find("--elf-controls") ) par::rare_test_summary_controls = true; } if (a.find("--elf-details")) { par::rare_test_print_details = true; par::rare_test_print_details_snp = a.value("--elf-details"); } ////////////////////////// // Association testing if (a.find("--assoc")) { par::assoc_test = true; if ( a.find("--counts")) par::assoc_counts = true; } if 
(a.find("--qt-means")) { if ( ! a.find("--assoc")) error("Can only specify --qt-means with --assoc for a QTL test"); par::qt_means = true; } if (a.find("--logistic")) par::assoc_test = par::assoc_glm = true; if (a.find("--beta")) par::return_beta = true; if (a.find("--linear")) par::assoc_test = par::assoc_glm = true; if (a.find("--standard-beta")) { if ( ! a.find("--linear") ) error("Must specify --linear with --standard-beta"); par::standard_beta = true; } if (a.find("--no-snp")) par::assoc_glm_without_main_snp = true; if (a.find("--vif")) par::vif_threshold = a.value_double("--vif"); if (a.find("--genotypic")) { if ( a.find("--hethom")) par::twoDFmodel_hethom = true; par::twoDFmodel = true; } if (a.find("--interaction")) par::simple_interaction = true; if (a.find("--parameters")) { string ilist = a.value("--parameters"); // NList nl(0); // par::parameter_list = nl.deparseNumberList(ilist); par::parameter_list = parse2int(ilist); par::glm_user_parameters = true; } if (a.find("--tests")) { string ilist = a.value("--tests"); // NList nl(0); // par::test_list = nl.deparseNumberList(ilist); par::test_list = parse2int(ilist); par::glm_user_test = true; } if (a.find("--condition")) { if (! ( a.find("--linear") || a.find("--logistic") || a.find("--proxy-glm") || a.find("--hap-linear") || a.find("--hap-logistic") || a.find("--elf-test") || a.find("--chap")) ) error("Can only use --condition with --linear, --logistic, --proxy-glm, --chap, --elf-test, --hap-linear or --hap-logistic"); par::conditioning_snp_single = true; par::conditioning_snps = true; par::conditioning_snp_name = a.value("--condition"); } if (a.find("--condition-list")) { if ( ! ( a.find("--linear") || a.find("--logistic") || a.find("--hap-linear") || a.find("--hap-logistic") || a.find("--chap")) ) error("Can only use --condition-list with --linear, --logistic, --hap-linear, --hap-logistic or --chap"); par::conditioning_snps = true; par::conditioning_snps_file = a.value("--condition-list"); } if (a.find("--test-all")) par::test_full_model = true; if (a.find("--sex")) par::glm_sex_effect = true; if (a.find("--no-x-sex")) par::glm_no_auto_sex_effect = true; if (a.find("--xchr-model")) { par::xchr_model = a.value_int("--xchr-model"); if (par::xchr_model < 0 || par::xchr_model >4) error("--xchr-model must have a value between 1 and 4"); } if (a.find("--dominant")) { if ( ! ( a.find("--linear") || a.find("--logistic")) ) error("Can only use --condition-list with --linear or --logistic"); if (a.find("--genotypic")) error("Cannot specify --dominant and --genotypic together"); par::glm_dominant = true; // Only consider autosomes par::xchr_model = 0; } if (a.find("--recessive")) { if ( ! 
( a.find("--linear") || a.find("--logistic")) ) error("Can only use --condition-list with --linear or --logistic"); if (a.find("--genotypic")) error("Cannot specify --recessive and --genotypic together"); if (a.find("--dominant")) error("Cannot specify --recessive and --dominant together"); par::glm_recessive = true; // Only consider autosomes par::xchr_model = 0; } if (a.find("--qfam")) { if (a.find("--within")) error("Cannot specify --within and --qfam together"); if (a.find("--family")) error("Cannot specify --family and --qfam together"); if (a.find("--genedrop")) error("Cannot specify --genedrop and --qfam together"); par::QTDT_test = true; par::assoc_test = true; par::QFAM_within1 = true; } if (a.find("--qfam-total")) { if (a.find("--within")) error("Cannot specify --within and --qfam-total together"); if (a.find("--family")) error("Cannot specify --family and --qfam-total together"); if (a.find("--genedrop")) error("Cannot specify --genedrop and --qfam-total together"); par::QTDT_test = true; par::assoc_test = true; par::QFAM_total = true; } if (a.find("--qfam-between")) { if (a.find("--within")) error("Cannot specify --within and --qfam-between together"); if (a.find("--family")) error("Cannot specify --family and --qfam-between together"); if (a.find("--genedrop")) error("Cannot specify --genedrop and --qfam-between together"); par::QTDT_test = true; par::assoc_test = true; par::QFAM_between = true; } if (a.find("--qfam-parents")) { if (a.find("--within")) error("Cannot specify --within and --qfam-parents together"); if (a.find("--family")) error("Cannot specify --family and --qfam-parents together"); if (a.find("--genedrop")) error("Cannot specify --genedrop and --qfam-parents together"); par::QTDT_test = true; par::assoc_test = true; par::QFAM_within2 = true; } if (a.find("--mh") || a.find("--cmh") || a.find("--mh1")) par::CMH_test_1 = par::assoc_test = true; if (a.find("--mh-ord")) par::CMH_test_ORD = par::assoc_test = true; if (a.find("--mh2")) par::CMH_test_2 = par::assoc_test = true; if (a.find("--bd")) { par::CMH_test_1 = par::assoc_test = true; par::breslowday = true; } if (a.find("--homog")) par::assoc_test = par::OR_homog_test = true; if (a.find("--model") || a.find("--trend") || a.find("--model-dom") || a.find("--model-rec") || a.find("--model-trend") || a.find("--model-gen") ) { par::assoc_test = true; par::full_model_assoc = true; if ( a.find("--trend") ) { par::trend_only = true; par::model_perm_trend = true; } else { if (a.find("--model-gen")) par::model_perm_gen = true; else if (a.find("--model-dom")) par::model_perm_dom = true; else if (a.find("--model-rec")) par::model_perm_rec = true; else if (a.find("--model-trend")) par::model_perm_trend = true; else par::model_perm_best = true; if ( (par::model_perm_gen || par::model_perm_best ) && a.find("--adjust") ) error("Cannot specific --model-gen or --model-best with --adjust\n" "Add --model-dom, --model-rec or --model-trend instead"); } } if (a.find("--fisher")) { par::assoc_test = true; par::fisher_test = true; par::min_geno_cell = 0; } if (a.find("--cell")) par::min_geno_cell = a.value_int("--cell"); if (a.find("--gxe")) { if (!a.find("--covar")) error ("--covar {filename} must be specified with --gxe"); par::assoc_gxe = true; } if (a.find("--tdt")) { par::TDT_test = true; par::perm_TDT_basic = true; par::perm_TDT_parent = false; } if (a.find("--mating")) { par::TDT_test = true; par::mating_tests = true; } if (a.find("--dfam")) { if ( a.find("--family") ) error("Cannot specify --family with --dfam\n"); par::TDT_test 
= true; par::sibTDT_test = true; } if (a.find("--dfam-no-tdt")) par::dfam_tdt = false; if (a.find("--dfam-no-sibs")) par::dfam_sibs = false; if (a.find("--dfam-no-unrelateds")) par::dfam_unrelateds = false; if (a.find("--parentdt1")) { par::TDT_test = true; par::perm_TDT_basic = false; par::perm_TDT_parent = true; } if (a.find("--parentdt2")) { par::TDT_test = true; par::perm_TDT_basic = false; par::perm_TDT_parent = false; } if (a.find("--poo")) { if (!a.find("--tdt")) error("Parent-of-origin analysis requires --tdt option"); if (a.find("--hap-tdt")) error("Parent-of-origin analysis not yet implemented for haplotypic TDT"); par::TDT_test = true; par::parent_of_origin = true; // flavour of permutation? if (a.find("--pat")) { par::perm_POO_poo = false; par::perm_POO_pat = true; } else if (a.find("--mat")) { par::perm_POO_poo = false; par::perm_POO_mat = true; } else if (a.find("--best")) { par::perm_POO_poo = false; par::perm_POO_best = true; } } if (a.find("--sharing")) par::ibs_sharing_test = true; if (a.find("--boot")) par::boot = true; if (a.find("--blocks")) { par::make_blocks = true; // This default can be over-ridden below (200kb) par::disp_r_window_kb = 200 * 1000; } if (a.find("--ld")) { par::calc_SNPSNP_LD = true; vector s = a.value("--ld",2); par::ld_SNP1 = s[0]; par::ld_SNP2 = s[1]; } if (a.find("--ld-snp")) { par::disp_r2 = true; par::ld_anchor = true; par::ld_SNP1 = a.value("--ld-snp"); } else if (a.find("--ld-snp-list")) { par::disp_r2 = true; par::ld_anchor = true; par::ld_anchor_list = true; par::ld_SNP1_file = a.value("--ld-snp-list"); checkFileExists(par::ld_SNP1_file); } if (a.find("--ld-window")) { par::disp_r2 = true; if (a.find("--matrix")) error("Cannot specify --matrix and --ld-window together\n"); par::disp_r_window = true; par::disp_r_window_snp = a.value_int("--ld-window"); } if (a.find("--ld-window-kb")) { par::disp_r2 = true; if (a.find("--matrix")) error("Cannot specify --matrix and --ld-window together\n"); par::disp_r_window = true; // Store in base-pair units par::disp_r_window_kb = a.value_int("--ld-window-kb") * 1000; } if (a.find("--ld-window-r2")) { par::disp_r2 = true; if (a.find("--matrix")) error("Cannot specify --matrix and --ld-window together\n"); par::disp_r_window_r2 = a.value_double("--ld-window-r2"); } if (a.find("--r2")) { par::disp_r2 = true; if ( ! ( a.find("--matrix") || a.find("--inter-chr") ) ) par::disp_r_window = true; } // By default, we assume r^2 is often interest, but here at the end, if // specified we can swap to make the basic correlation if (a.find("--r")) { par::disp_r1 = true; par::disp_r2 = false; if ( ! 
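// Illustrative pairwise-LD report settings for the block above (example values):
//   plink --bfile mydata --r2 --ld-window 10 --ld-window-kb 250 --ld-window-r2 0.2
// --ld-window-kb is given in kb and stored in bp (250 -> 250000); pairs are
// limited to the SNP-count and kb windows and filtered at the given r^2.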
( a.find("--matrix") || a.find("--inter-chr") ) ) par::disp_r_window = true; } if (a.find("--flip-scan")) { par::flip_scan = true; if ( a.find("--flip-scan-threshold") ) par::flip_scan_threshold = a.value_double("--flip-scan-threshold"); if ( a.find("--flip-scan-verbose") ) par::flip_scan_verbose = true; } if (a.find("--indep")) { if (makedata) { string msg = "Cannot specify --indep with --make-bed or --recode\n"; msg += " use --extract/--exclude with *.prune.in/*.prune.out files"; error(msg); } par::prune_ld = true; vector s = a.value("--indep",3); par::prune_ld_win = getInt(s[0].c_str(),"--indep"); par::prune_ld_step = getInt(s[1].c_str(),"--indep"); par::prune_ld_vif = getDouble(s[2].c_str(),"--indep"); if (par::prune_ld_win<2) error("Cannot have a window size < 2 for --indep {window} {step} {VIF}"); if (par::prune_ld_step<1) error("Cannot have a window step < 1 for --indep {window} {step} {VIF}"); if (par::prune_ld_vif<1) error("Cannot have a VIF threshold < 1 for --indep {window} {step} {VIF}"); } if (a.find("--indep-pairwise")) { if (makedata) { string msg = "Cannot specify --indep with --make-bed or --recode\n"; msg += " use --extract/--exclude with *.prune.in/*.prune.out files"; error(msg); } par::prune_ld = true; par::prune_ld_pairwise = true; vector s = a.value("--indep-pairwise",3); par::prune_ld_win = getInt(s[0].c_str(),"--indep-pairwise"); par::prune_ld_step = getInt(s[1].c_str(),"--indep-pairwise"); par::prune_ld_r2 = getDouble(s[2].c_str(),"--indep-pairwise"); if (par::prune_ld_win<2) error("Cannot have a window size < 2 for --indep-pairwise {window} {step} {R^2}"); if (par::prune_ld_step<1) error("Cannot have a window step < 1 for --indep-pairwise {window} {step} {R^2}"); if (par::prune_ld_r2<0) error("Cannot have an R^2 threshold < 0 for --indep-pairwise {window} {step} {R^2}"); if (par::prune_ld_r2>1) error("Cannot have an R^2 threshold > 1 for --indep-pairwise {window} {step} {R^2}"); // Compare to correlation par::prune_ld_r2 = sqrt(par::prune_ld_r2); if (a.find("--indep-prefer")) { par::prune_r2_prefer = true; par::prune_r2_prefer_list = a.value("--indep-prefer"); } if (a.find("--indep-fixed")) { par::prune_r2_fixed = true; par::prune_r2_fixed_list = a.value("--indep-fixed"); } } if (a.find("--T2") || a.find("--t2") ) { error("This command is disabled in v1.04"); if ( a.find("--set-test") || a.find("--assoc") ) error("Cannot specify --T2 and other association commands"); par::set_test = true; par::hotel = true; } if (a.find("--set")) { par::read_set = true; par::setfile = a.value("--set"); } if (a.find("--set-test")) { par::set_test = true; // Force use of LD-aware test par::set_r2 = true; if ( (!a.find("--gene")) && ( a.find("--assoc") || a.find("--tdt") ) && ( ! ( a.find("--mperm") || a.find("--perm") ) ) ) error("Must use --mperm N or --perm with set association tests"); } if (a.find("--set-screen")) { if ( ! 
( a.find("--set") || a.find("--make-set") ) ) error("Must specify a --set or --make-set\n"); par::set_screen = true; par::set_screen_resultfile = a.value("--set-screen"); } if (a.find("--set-step")) { par::set_test = true; par::set_step = true; par::set_step_in = a.value_double("--set-step"); } if (a.find("--set-score")) { par::set_test = true; par::set_score = true; par::set_score_p = a.value_double("--set-score"); par::set_max = par::set_min = 1; if ( a.find("--set-min") || a.find("--set-max") ) error("Cannot specify --set-min or --set-max with --set-score"); } if (a.find("--set-p2")) { par::set_p2 = true; } if (a.find("--set-min")) { if (!a.find("--set")) error ("You need to specify --set also"); par::set_min = a.value_int("--set-min"); } if (a.find("--set-max")) { if (! ( a.find("--set") || a.find("--make-set"))) error ("You need to specify --set or --make-set also"); if (!a.find("--set-test")) error ("You need to specify --set-test also"); par::set_max = a.value_int("--set-max"); } if (a.find("--set-r2")) { if (! ( a.find("--set") || a.find("--make-set"))) error ("You need to specify --set or --make-set also"); if (!a.find("--set-test")) error ("You need to specify --set-test also"); if (a.find("--set-r2-phase")) par::set_r2_phase = true; par::set_r2_val = a.value_double("--set-r2"); par::set_r2 = true; } if (a.find("--write-set-r2")) { par::set_r2_write = true; } if (a.find("--read-set-r2")) { par::set_r2_read = true; par::set_r2_read_file = a.value("--read-set-r2"); } if (a.find("--set-p")) { if (!a.find("--set-test")) error ("You need to specify --set-test also"); double p = a.value_double("--set-p"); if ( p <= 0 || p > 1 ) error("P-value for --set-p must be 0 s = a.value("--homozyg-haplo-track",2); par::segment_haplotrack_fid1 = s[0]; par::segment_haplotrack_iid1 = s[1]; par::segment_haplotrack_fid2 = par::segment_haplotrack_fid1; par::segment_haplotrack_iid2 = par::segment_haplotrack_iid1; } if (a.find("--segment-haplo-track")) { if ( ! 
a.find("--hap-window")) error("The 'haplo-track' option requires --hap-window {s} to be specified"); par::segment_haplotrack = true; vector s = a.value("--segment-haplo-track",4); par::segment_haplotrack_fid1 = s[0]; par::segment_haplotrack_iid1 = s[1]; par::segment_haplotrack_fid2 = s[2]; par::segment_haplotrack_iid2 = s[3]; } ///////////////////////////////// // EM Phasing options if (a.find("--em-verbose")) par::haplo_plem_verbose = true; if (a.find("--em-follow")) { par::haplo_plem_follow = true; vector s = a.value("--em-follow",2); par::haplo_plem_follow_fid = s[0]; par::haplo_plem_follow_iid = s[1]; } if (a.find("--em-window")) par::haplo_plem_window = a.value_int("--em-window"); if (a.find("--em-overlap")) par::haplo_plem_original_overlap = par::haplo_plem_overlap = a.value_int("--em-overlap"); if (a.find("--em-window-iter")) par::haplo_plem_iter = a.value_int("--em-window-iter"); if (a.find("--em-window-prune-phase")) par::haplo_plem_window_prune_phase = a.value_double("--em-window-prune-phase"); if (a.find("--em-window-likelihood")) par::haplo_plem_likelihood_iter = a.value_int("--em-window-likilood"); if (a.find("--em-window-tol")) par::haplo_plem_window_tol = a.value_double("--em-window-tol"); if (a.find("--em-window-prune-haplotype")) { par::haplo_plem_zero_threshold = a.value_double("--em-window-prune-haplotype"); if ( par::haplo_plem_zero_threshold == 0 ) par::haplo_plem_nonzero_threshold = false; else par::haplo_plem_nonzero_threshold = true; } ////////////////////////// // Meta EM parameters if (a.find("--em-meta-window")) { par::haplo_plem_meta_window = a.value_int("--em-meta-window"); if ( par::haplo_plem_meta_window < 2 ) error("--em-meta-window must be >1"); } if (a.find("--em-meta-prune-haplotype")) par::haplo_plem_meta_prune_haplotype = a.value_double("--em-meta-prune-haplotype"); if (a.find("--em-meta-prune-phase")) par::haplo_plem_meta_prune_phase = a.value_double("--em-meta-prune-phase"); if (a.find("--em-meta-iter")) par::haplo_plem_meta_iter = a.value_int("--em-meta-iter"); if (a.find("--em-meta-likilood")) par::haplo_plem_meta_likelihood_iter = a.value_int("--em-meta-likilood"); if (a.find("--em-meta-tol")) par::haplo_plem_meta_tol = a.value_double("--em-meta-tol"); ///////////////////////////// // Other haplotype options if (a.find("--hap-all")) { if (!a.find("--hap")) error("--hap-all modifies --hap, but you have not specified --hap"); par::phase_hap_all = true; } if (a.find("--whap")) { par::phase_snps = true; par::tagfile = a.value("--whap"); par::weighted_mm = true; } if (a.find("--hap-pp")) { par::hap_post_prob = a.value_double("--hap-pp"); } if (a.find("--hap-miss")) { par::hap_missing_geno = a.value_double("--hap-miss"); } if (a.find("--hap-freq")) { if ( par::impute_tags ) error("Cannot specify --hap-impute and --hap-freq\n"); par::display_hap_freqs = true; } if (a.find("--hap-assoc")) { if ( par::impute_tags ) error("Cannot specify --hap-impute and --hap-assoc\n"); par::test_hap_CC = true; } if ( a.find("--hap-logistic") || a.find("--hap-linear") ) { par::test_hap_GLM = true; if ( a.find("--hap-omnibus")) par::test_hap_GLM_omnibus = true; if ( a.find("--perm")) error("Cannot currently use --perm and --hap-logistic or --hap-linear. Use --mperm N instead"); } if (a.find("--chap")) { if ( ! 
a.find("--hap-snps")) error("--chap requires --hap-snps"); par::chap_test = true; if ( a.find("--null-group") ) { par::chap_specified_groups = true; par::chap_model0 = a.value("--null-group"); } if ( a.find("--alt-group") ) { par::chap_specified_groups = true; par::chap_model1 = a.value("--alt-group"); } if ( a.find("--null-snp") ) { if ( par::chap_specified_groups ) error("Cannot specify SNPs and groups for --chap tests"); par::chap_specified_snps = true; par::chap_model0 = a.value("--null-snp"); } if ( a.find("--alt-snp") ) { if ( par::chap_specified_groups ) error("Cannot specify SNPs and groups for --chap tests"); par::chap_specified_snps = true; par::chap_model1 = a.value("--alt-snp"); } if ( a.find("--control") ) { if ( par::chap_specified_groups || par::chap_specified_snps ) error("Cannot use --control and other --chap options"); par::chap_sole_variant = true; par::chap_entity = a.value("--control"); if ( a.find("--control-alleles")) { par::chap_sole_variant_specific_alleles = true; par::chap_sole_variant_specific_allele_list = a.value("--control-alleles"); } } else if ( a.find("--independent-effect") ) { if ( par::chap_specified_groups || par::chap_specified_snps ) error("Cannot use --independent-effect and other --chap options"); par::chap_independent_effect = true; par::chap_entity = a.value("--independent-effect"); } else if ( a.find("--specific-haplotype") ) { if ( par::chap_specified_groups || par::chap_specified_snps ) error("Cannot use --specific-haplotype and other --chap options"); par::chap_haplotype_specific = true; par::chap_entity = a.value("--specific-haplotype"); } if ( a.find("--test-snp") ) { if ( ! ( a.find("--condition") || a.find("--condition-list") ) ) error("You must first specify conditioning SNPs"); par::chap_drop_snps = true; par::chap_drop_snps_list = a.value("--test-snp"); } if ( a.find("--each-vs-others") ) par::chap_add_grp_specifics = true; if ( a.find("--each-versus-others") ) par::chap_add_grp_specifics = true; } if (a.find("--hap-snps")) { if ( a.find("--hap") || a.find("--hap-window")) error("Cannot specify --hap-snps with --hap or --hap-window"); par::phase_snps = true; par::hap_specific_snps = true; par::hap_specific_snps_list = a.value("--hap-snps"); par::phase_hap_all = true; } if (a.find("--hap-tdt")) { if ( par::impute_tags ) error("Cannot specify --hap-impute and --hap-tdt\n"); par::test_hap_TDT = true; } if (a.find("--hap-phase")) { if ( par::impute_tags ) error("Cannot specify --hap-impute and --hap-phase\n"); par::display_phase_probs = true; } if (a.find("--hap-phase-wide")) { if ( par::impute_tags ) error("Cannot specify --hap-impute and --hap-phase-wide\n"); par::display_phase_probs = par::display_phase_probs_wide = true; } if (a.find("--hap-only")) { par::test_hap_only = true; } if (a.find("--hap-max-phase")) { par::hap_max_nf_phases = a.value_int("--hap-max-phase"); if ( par::hap_max_nf_phases < 1 ) error("Invalid --hap-max-phase value (should be >0)"); } if (a.find("--hap-min-phase-prob")) { par::hap_min_phase_prob = a.value_double("--hap-min-phase-prob"); if ( par::hap_min_phase_prob < 0 || par::hap_min_phase_prob > 1 ) error("Invalid --hap-min-phase-prob value, should be between 0 and 1"); } ////////////////////////// // Epistasis if (a.find("--epistasis")) { if (a.find("--set") || a.find("--make-set")) par::set_test = true; par::epistasis = true; } // Use odds-ratio test as default fast-epistasis method if (a.find("--fast-epistasis")) { par::fast_epistasis = par::epistasis = true; if ( a.find("--set") || a.find("--make-set")) 
par::set_test = true; } if (a.find("--case-only")) { par::epistasis = true; par::fast_epistasis = true; par::epi_caseonly = true; if (a.find("--epistasis")) error("--case-only requires --fast-epistasis"); } if (a.find("--gap")) { if (!a.find("--case-only")) error("--gap option only valid when --caseonly is in effect"); par::epi_caseonly_kb_gap = a.value_double("--gap"); } if (a.find("--nop")) par::epi_quickscan = true; if (a.find("--set-by-all")) par::set_by_set = par::drop_sets = false; if (a.find("--epi1")) par::epi_alpha1 = a.value_double("--epi1"); if (par::epi_alpha1 > 1) par::epi_filter = false; if (a.find("--epi2")) par::epi_alpha2 = a.value_double("--epi2"); if (a.find("--twolocus")) { par::list_twolocus = true; vector s = a.value("--twolocus",2); par::twolocus_snp1 = s[0]; par::twolocus_snp2 = s[1]; } if (a.find("--genepi")) { par::set_test = true; par::epi_genebased = true; } /////////////////////////////////////// // Gene X environment / heterogeneity ////////////////////////////////////// // File output options if (a.find("--freq")) { par::af_write = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 // if (!a.find("--maf")) par::min_af = 0.0; // if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; // if (!a.find("--mind")) par::MAX_IND_MISSING = 1; // display MAF counts instead of freqs? if (a.find("--counts")) { if (a.find("--within")) error("Cannot specify --counts and --within together\n"); par::af_count = true; } } if (a.find("--nonfounders")) par::summ_nonfounders = true; if (a.find("--make-founders")) par::make_founders = true; if (a.find("--allow-no-sex")) par::ignore_missing_sex = true; if (a.find("--must-have-sex")) { if ( ! makedata ) error("Can only specify --must-have-sex with a data generation command"); par::ignore_missing_sex = false; } else if ( makedata ) { par::ignore_missing_sex = true; } if (a.find("--read-freq")) { par::af_read = true; par::af_file = a.value("--read-freq"); } if (a.find("--read-genome")) { par::ibd_read = true; par::ibd_file = a.value("--read-genome"); checkFileExists( par::ibd_file ); } else if (a.find("--read-genome-list")) { par::ibd_read = par::ibd_read_list = true; par::ibd_file_list = a.value("--read-genome-list"); checkFileExists( par::ibd_file_list ); } if (a.find("--Z-genome")) par::compress_genome = true; if ( a.find("--genome-groups") ) { if ( ! a.find("--genome-groups")) error("You must specify a .genome file with --read-genome"); if ( ! 
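// Illustrative allele-frequency output request for the block above:
//   plink --bfile mydata --freq --counts
// writes allele counts rather than frequencies; --counts cannot be combined
// with --within, as checked above.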
a.find("--within")) error("You must specify a cluster file with --within"); par::genome_groups = true; } if (a.find("--read-genome-minimal")) { if (a.find("--read-genome")) error("Cannot specify both --read-genome and --read-genome-minimal"); par::ibd_read = true; par::ibd_read_minimal = true; par::ibd_file = a.value("--read-genome-minimal"); checkFileExists( par::ibd_file ); } if (a.find("--list")) { par::list_by_allele = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--report")) { par::indiv_report = true; vector s = a.value("--report",2); par::indiv_report_fid = s[0]; par::indiv_report_iid = s[1]; if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; } if (a.find("--plist")) { par::plist = true; // and unless otherwise specified, set GENO = 1 and MAF = 0 if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; vector s = a.value("--plist",4); par::plist_fid1 = s[0]; par::plist_iid1 = s[1]; par::plist_fid2 = s[2]; par::plist_iid2 = s[3]; } if (a.find("--fix-allele")) { if (! (a.find("--recodeAD") || a.find("--recodeA")) ) error("--fix-allele option only works with --recodeA/--recodeAD options"); else par::recode_AD_fixed = true; } if (a.find("--merge")) { if (a.find("--merge-list") || a.find("--bmerge") ) error("Can only specify --merge or --bmerge or --merge-list"); par::merge_data = true; vector s = a.value("--merge",2); par::merge_pedfile = s[0]; par::merge_mapfile = s[1]; checkFileExists( par::merge_pedfile ); checkFileExists( par::merge_mapfile ); } if (a.find("--bmerge")) { if (a.find("--merge-list") || a.find("--merge") ) error("Can only specify --bmerge or --merge or --merge-list"); par::merge_data = true; par::merge_binary = true; vector s = a.value("--bmerge",3); par::merge_bedfile = s[0]; par::merge_bimfile = s[1]; par::merge_famfile = s[2]; checkFileExists( par::merge_bedfile ); checkFileExists( par::merge_bimfile ); checkFileExists( par::merge_famfile ); } if (a.find("--merge-flip")) par::merge_force_strand = true; if (a.find("--merge-list")) { if (a.find("--merge") || a.find("--bmerge") ) error("Can only specify --merge or --bmerge or --merge-list"); par::merge_data = true; par::merge_list = true; par::merge_list_filename = a.value("--merge-list"); } if (a.find("--merge-mode")) { if (! 
(a.find("--merge") || a.find("--merge-list") || a.find("--bmerge") ) ) error("Can only specify --merge-mode when --merge or --bmerge or --merge-list is used"); par::merge_mode = a.value_int("--merge-mode"); if (par::merge_mode < 1 || par::merge_mode > 7) error("--merge-mode N, where N must be between 1 and 7"); if (par::merge_list && par::merge_mode >= 6) error("Can not specify --merge-mode 6/7 (diff) and --merge-list"); } if (a.find("--flip")) { par::flip_strand = true; par::flip_file = a.value("--flip"); if (a.find("--flip-subset")) { par::flip_subset = true; par::flip_subset_file = a.value("--flip-subset"); checkFileExists( par::flip_subset_file ); } } // Remove these individuals from a file if (a.find("--remove")) { par::remove_indiv = true; par::remove_indiv_list = a.value("--remove"); checkFileExists( par::remove_indiv_list ); } // Keep only these individuals from a file if (a.find("--keep")) { par::keep_indiv = true; par::keep_indiv_list = a.value("--keep"); checkFileExists( par::keep_indiv_list ); } // By default, remove then keep if (a.find("--keep-before-remove")) par::remove_before_keep = false; // Extract only these SNPs from a file if (a.find("--extract")) { par::extract_set = true; par::extract_file = a.value("--extract"); checkFileExists( par::extract_file ); } // Exclude these SNPs from a file if (a.find("--exclude")) { par::exclude_set = true; par::exclude_file = a.value("--exclude"); checkFileExists( par::exclude_file ); } if (a.find("--thin")) { par::thin_snps = true; par::thin_param = a.value_double("--thin"); } // Select a set of SNPs based on different physical // positions: this modifies the behavior of --extract // and --exclude if (a.find("--range")) { if ( ! ( a.find("--extract") || a.find("--exclude") ) ) error("Must specify --extract or --exclude with --range"); par::snp_range_list = true; } if (a.find("--border")) { par::make_set_border = a.value_int("--border") * 1000; } if (a.find("--make-set")) { par::make_set = true; par::make_set_file = a.value("--make-set"); if ( a.find("--make-set-border")) par::make_set_border = a.value_int("--make-set-border") * 1000; if ( a.find("--make-set-collapse-group")) { par::make_set_collapse = true; } else if ( a.find("--make-set-collapse-all")) { par::make_set_collapse = true; par::make_set_collapse_label = a.value("--make-set-collapse-all"); par::make_set_ignore_group = true; } else if ( a.find("--make-set-complement-group") ) { par::make_set_complement = true; } else if ( a. find("--make-set-complement-all")) { par::make_set_complement = true; par::make_set_ignore_group = true; par::make_set_collapse_label = a.value("--make-set-complement-all"); } } if (a.find("--write-set")) { if ( ! ( a.find("--set") || a.find("--make-set"))) error("Must specify --set or --make-set with --write-set"); par::write_set = true; } // By default, extract before exclude if (a.find("--exclude-before-extract")) par::extract_before_exclude = false; // Extract SNPs in a certain GENE (specified in the SET file) if (a.find("--gene")) { if (! 
(a.find("--set") || a.find("--make-set"))) error("You must also specify --set or --make-set with --gene\n"); par::extract_set = true; par::dump_gene = true; par::dump_genename = a.value("--gene"); } if (a.find("--ind-major")) { if (!a.find("--make-bed")) error("You can only specify --ind-major when --make-bed is in effect"); par::out_SNP_major = false; } if (a.find("--include")) par::inc_write = true; if (a.find("--read-include")) { par::inc_read = true; par::inc_file = a.value("--read-include"); } //////////////////////////////// // Genotype quality score files if (a.find("--qual-scores")) { par::read_snp_qual = true; par::snp_qual_file = a.value("--qual-scores"); if (a.find("--qual-threshold")) par::snp_qual_min = a.value_double("--qual-threshold"); if (a.find("--qual-max-threshold")) par::snp_qual_max = a.value_double("--qual-max-threshold"); } if (a.find("--qual-geno-scores")) { par::read_geno_qual = true; par::geno_qual_file = a.value("--qual-geno-scores"); if (a.find("--qual-geno-threshold")) par::geno_qual_min = a.value_double("--qual-geno-threshold"); if (a.find("--qual-geno-max-threshold")) par::geno_qual_max = a.value_double("--qual-geno-max-threshold"); } /////////////////////////// // Plink phenotype definition // // Squared differences (quantitative trait) // if (a.find("--SD")) { par::SD = true; par::CP = false; } // // Cross-product (quantitative trait) // if (a.find("--CP")) { par::CP = true; par::SD = false; } // // Fix the prevalence of a binary trait // if (a.find("--prev")) { // par::fix_prev=true; // par::fixed_prev = a.value_double("--prev"); // } // // Remove unaffected pairs from analysis // // i.e. so only discordant versus affected concordant // // i.e. need at least 1 affected "1A" // if (a.find("--1aff")) { par::remove_unaffected_pairs = true; } /////////////////////////// // Some basic filters if (a.find("--prune")) par::ignore_phenotypes = false; if (a.find("--filter-cases")) par::filter_cases = true; if (a.find("--filter-controls")) par::filter_controls = true; if (a.find("--filter-females")) par::filter_females = true; if (a.find("--filter-males")) par::filter_males = true; if (a.find("--filter-founders")) par::filter_founders = true; if (a.find("--filter-nonfounders")) par::filter_nonfounders = true; if (a.find("--filter-cases") && a.find("--filter-controls")) error("Cannot filter on both cases and controls"); if (a.find("--filter-males") && a.find("--filter-females")) error("Cannot filter on both males and females"); if (a.find("--filter-founders") && a.find("--filter-nonfounders")) error("Cannot filter on both founders and nonfounders"); if (a.find("--attrib")) { par::snp_attrib_filter = true; vector s = a.value("--attrib",2); par::snp_attrib_file = s[0]; par::snp_attrib_value = s[1]; } if (a.find("--attrib-indiv")) { par::ind_attrib_filter = true; vector s = a.value("--attrib-indiv",2); par::ind_attrib_file = s[0]; par::ind_attrib_value = s[1]; } ///////////////////////////////// // Basic input file processing if (a.find("--dummy")) { vector s = a.value("--dummy",2); par::dummy = true; par::dummy_nind = getInt(s[0].c_str(),"--dummy"); par::dummy_nsnp = getInt(s[1].c_str(),"--dummy"); } if (a.find("--simulate")) { par::simul = true; par::simul_file = a.value("--simulate"); } else if (a.find("--simulate-qt")) { par::simul = par::simul_qt = true; par::simul_file = a.value("--simulate-qt"); if (a.find("--simulate-n")) par::simul_ncases = a.value_int("--simulate-n"); } if (a.find("--simulate-label")) par::simul_label = a.value("--simulate-label"); if 
(a.find("--simulate-tags")) { if (!a.find("--simulate")) error("Requires --simulate for --simulate-tags"); par::simul_tags = true; } if (a.find("--simulate-haps")) { if (!a.find("--simulate")) error("Requires --simulate for --simulate-haps"); par::simul_tags = true; par::simul_haps = true; } if (a.find("--simulate-ncases")) par::simul_ncases = a.value_int("--simulate-ncases"); if (a.find("--simulate-ncontrols")) par::simul_ncontrols = a.value_int("--simulate-ncontrols"); if (a.find("--simulate-prevalence")) par::simul_prevalence = a.value_double("--simulate-prevalence"); //////////////////////////////////////////////////// // Main file input options if (a.find("--compress")) { par::compress_file = true; par::compress_filename = a.value("--compress"); } if (a.find("--decompress")) { par::uncompress_file = true; par::compress_filename = a.value("--decompress"); } if (a.find("--file")) { if (a.find("--map") || a.find("--ped") ) error("Use either --file {root} OR --ped {name} --map {name}"); par::read_ped = true; par::fileroot = a.value("--file"); par::pedfile = par::fileroot + ".ped"; par::mapfile = par::fileroot + ".map"; } if (a.find("--tfile")) { if (a.find("--tfam") || a.find("--tped") ) error("Use either --tfile {root} OR --tped {name} --tfam {name}"); par::tfile_input = true; par::fileroot = a.value("--tfile"); par::tpedfile = par::fileroot + ".tped"; par::tfamfile = par::fileroot + ".tfam"; } if (a.find("--tped")) { par::tpedfile = a.value("--tped"); if (par::tpedfile == "-") par::ped_from_stdin = true; par::tfile_input = true; } if (a.find("--tfam")) { par::tfamfile = a.value("--tfam"); par::tfile_input = true; } // Long file format if (a.find("--lfile")) { if (a.find("--fam") || a.find("--lgen") || a.find("--map") ) error("Use either --lfile {root} OR --lgen {name} --map (name) --fam {name}"); par::lfile_input = true; par::fileroot = a.value("--lfile"); par::lpedfile = par::fileroot + ".lgen"; par::famfile = par::fileroot + ".fam"; par::mapfile = par::fileroot + ".map"; } if (a.find("--lgen")) { par::lpedfile = a.value("--lgen"); if (par::lpedfile == "-") par::ped_from_stdin = true; par::lfile_input = true; } //////////////////////// // Reference allele file if (a.find("--reference")) { par::ref_file = true; par::ref_file_name = a.value("--reference"); } ///////////////////// // Generic variant if ( a.find("--gfile") || a.find("--gvar") ) { // Analyse generic variants par::gvar = true; // Primarily load these; this will be changed // if another load is not specified par::load_gvar = false; if ( a.find("--gfile") ) { par::fileroot = a.value("--gfile"); par::gvarfile = par::fileroot + ".gvar"; par::gmapfile = par::fileroot + ".map"; par::gfamfile = par::fileroot + ".fam"; } else { par::gvarfile = a.value("--gvar"); } if (a.find("--gvar-verbose")) par::gvar_verbose_association = true; if (a.find("--gvar-all")) par::gvar_include_all_variants = true; if (a.find("--gvar-convert")) par::gvar_to_standard = true; if (a.find("--gvar-verbose")) par::gvar_full_report = true; } if (a.find("--gvar-write")) par::gvar_write = true; // Text-file modifiers if (a.find("--map3")) par::map3 = true; if (a.find("--no-sex")) par::ped_skip_sex = true; if (a.find("--no-parents")) par::ped_skip_parents = true; if (a.find("--no-fid")) par::ped_skip_fid = true; if (a.find("--no-pheno")) par::ped_skip_pheno = true; if (a.find("--liability")) par::liability = true; if (a.find("--bfile")) { if (a.find("--bim") || a.find("--bed") || a.find("--fam") ) error("Use either --bfile {root} OR --bed {name} --bim {name} 
--fam {name}"); par::read_bitfile = true; par::fileroot = a.value("--bfile"); par::bitfilename = par::pedfile = par::fileroot + ".bed"; par::bitfilename_map = par::mapfile = par::fileroot + ".bim"; par::famfile = par::fileroot + ".fam"; } if (a.find("--no-snps")) par::do_not_load_snps = true; if (a.find("--bfile-faster")) par::fast_binary = true; if (a.find("--ped")) { par::read_ped = true; par::pedfile = a.value("--ped"); if (par::pedfile == "-") par::ped_from_stdin = true; } if (a.find("--map")) par::mapfile = a.value("--map"); if (a.find("--bed")) { par::read_bitfile = true; par::bitfilename = par::pedfile = a.value("--bed"); } if (a.find("--fam")) { par::famfile = a.value("--fam"); } if (a.find("--bim")) { par::read_bitfile = true; par::bitfilename_map = par::mapfile = a.value("--bim"); } // Single phenotype in file specified if (a.find("--pheno")) { par::pheno_file = true; par::pheno_filename = a.value("--pheno"); } // Multiple phenotypes if (a.find("--mult-pheno")) { par::multiple_phenotypes = true; par::multiple_phenotype_file = a.value("--mult-pheno"); } if (a.find("--mult-pheno-number")) { par::plist_selection = par::plist_selection_number = true; par::plist_selection_string = a.value("--mult-pheno-number"); } if ( a.find("--mult-pheno-name") ) { par::plist_selection = par::plist_selection_name = true; par::plist_selection_string = a.value("--mult-pheno-name"); } // Get phenotype from a cluster file if (a.find("--make-pheno")) { par::make_pheno = true; if ( a.find("--pheno") || a.find("--all-pheno")) error("Incompatible phenotype selection commands specified"); vector s = a.value("--make-pheno",2); par::make_pheno_filename = s[0]; par::make_pheno_value = s[1]; if ( par::make_pheno_value == "*" ) par::make_pheno_present = true; } // Binary 0/1 coding instead of 1/2 if (a.find("--1")) par::coding01 = true; // Multiple phenotypes in a file specified if (a.find("--mpheno")) { if (!a.find("--pheno")) error("You need to specify --pheno {file} with --mpheno {N}"); par::mult_pheno = a.value_int("--mpheno"); } // Select phenotype ny name if (a.find("--pheno-name")) { if (!a.find("--pheno")) error("You need to specify --pheno {file} with --pheno-name {name}"); if (a.find("--mpheno")) error("You cannot specify --mpheno and --pheno-name together"); par::name_pheno = a.value("--pheno-name"); } if (a.find("--values")) { par::number_list_string = a.value("--values"); } if (a.find("--valueless")) { par::number_list_string = a.value("--valueless"); par::number_list_positive = false; } // Loop over all phenotypes if (a.find("--all-pheno")) { if (!a.find("--pheno")) error("You need to specify --pheno {file} with --all-pheno"); if (a.find("--mpheno")) error("You cannot specify --mpheno {N} with --all-pheno"); par::mult_pheno = 1; par::all_pheno = true; } // Single covariate in file specified if (a.find("--covar")) { par::covar_file = true; if ( a.find("--gxe") ) { par::covar_filename = a.value("--covar"); checkFileExists(par::covar_filename); par::clist = false; } else { // Aside from old "--gxe" method, all other // options now use the new clist format // Multiple covariates in file specified, read all of them par::clist = true; par::clist_filename = a.value("--covar"); checkFileExists(par::clist_filename); // Request to dump back out all covariates if (makedata) { par::clist = true; if (a.find("--dummy-coding")) par::dump_covar_dummy_coding = true; } if (a.find("--with-phenotype")) par::dump_covar_with_phenotype = true; } } // Multiple covariates in file specified, select one if 
(a.find("--mcovar")) { if (!a.find("--covar")) error("You need to specify --covar {file} with --mcovar {N}"); par::mult_covar = a.value_int("--mcovar"); } // Selection fields for covariates if (a.find("--covar-number")) { par::clist_selection = par::clist_selection_number = true; par::clist_selection_string = a.value("--covar-number"); } if ( a.find("--covar-name") ) { par::clist_selection = par::clist_selection_name = true; par::clist_selection_string = a.value("--covar-name"); } // Request to dump back out all covariates if (a.find("--write-covar")) { if (makedata) error("No need to specify --write-covar separately"); if (!a.find("--covar")) error("You must specify a --covar {file} with --write-covar"); if (a.find("--with-phenotype")) par::dump_covar_with_phenotype = true; if (a.find("--dummy-coding")) par::dump_covar_dummy_coding = true; par::dump_covar = true; par::clist = true; } // Request to dump back out all covariates if (a.find("--write-cluster")) { if (!a.find("--within")) error("You must specify a --within {file} with --write-covar"); par::dump_clst = true; } if (a.find("--write-snplist")) par::write_snplist = true; if (a.find("--update-map")) { if (a.find("--update-cm")) par::update_cm = true; else if (a.find("--update-chr")) par::update_chr = true; else if ( a.find("--update-name")) par::update_name = true; par::update_map = true; par::update_mapfile = a.value("--update-map"); } if (a.find("--update-ids")) { par::update_ids = true; par::update_ids_file = a.value("--update-ids"); } if (a.find("--update-sex")) { if ( a.find("--update-ids")) error("Cannot --update-ids at same time as --update-sex"); par::update_sex = true; par::update_sex_file = a.value("--update-sex"); } if (a.find("--update-parents")) { if ( a.find("--update-ids")) error("Cannot --update-ids at same time as --update-parents"); par::update_parents = true; par::update_parents_file = a.value("--update-parents"); } if (a.find("--update-pheno")) { if ( a.find("--update-ids")) error("Cannot --update-ids at same time as --update-pheno"); par::update_pheno = true; par::update_pheno_file = a.value("--update-pheno"); } if (a.find("--update-alleles")) { if ( a.find("--update-name")) error("Cannot --update-alleles at same time as --update-name"); par::update_alleles = true; par::update_allele_file = a.value("--update-alleles"); } // Examine only a subset of the data? if (a.find("--filter")) { par::filter_on_covar = true; vector s = a.value("--filter",2); par::filter_filename = s[0]; par::filter_value = s[1]; checkFileExists( par::filter_filename ); } if (a.find("--mfilter")) { if (!a.find("--filter")) error("You can only specify --mfilter with --filter\n"); par::mult_filter = a.value_int("--mfilter"); } // Different species other than human? // i.e. 
alters chromosome definitions if (a.find("--dog")) par::species_dog = true; if (a.find("--cow")) par::species_cow = true; if (a.find("--mouse")) par::species_mouse = true; if (a.find("--sheep")) par::species_sheep = true; if (a.find("--horse")) par::species_horse = true; if (a.find("--rice")) par::species_rice = true; ////////////////////////////////// // Multipoint and singlepoint // if (a.find("--singlepoint")) par::singlepoint = true; if (a.find("--fringe")) { par::singlepoint = false; par::fringe = a.value_double("--fringe"); } if (a.find("--grid")) { par::singlepoint = false; par::grid = a.value_double("--grid"); par::inter_grid = 0; } if (a.find("--step")) { par::singlepoint = false; par::inter_grid = a.value_int("--step"); } if (a.find("--cm")) par::cm_map = true; if (a.find("--ci")) { par::display_ci = true; par::ci_level = a.value_double("--ci"); if ( par::ci_level < 0.01 || par::ci_level >= 1 ) error("CI level (--ci) must be between 0 and 1\n"); par::ci_zt = ltqnorm( 1 - (1 - par::ci_level) / 2 ); } if (a.find("--pfilter")) { par::pfilter = true; par::pfvalue = a.value_double("--pfilter"); } if (a.find("--hide-covar")) par::no_show_covar = true; if (a.find("--meta-analysis")) { par::meta_analysis = true; par::meta_files = a.varValue("--meta-analysis"); } if (a.find("--annotate")) { par::annot_file = true; par::annot_filename = a.value("--annotate"); } if (a.find("--gene-report")) { par::greport = true; par::greport_results = a.value("--gene-report"); if (!a.find("--gene-list") ) error("You must specify a --gene-list"); else par::greport_gene_list = a.value("--gene-list"); if ( a.find("--gene-list-border") ) par::make_set_border = a.value_int("--gene-list-border") * 1000; if ( a.find("--gene-subset") ) { par::greport_subset = true; par::greport_subset_file = a.value("--gene-subset"); } if ( a.find("--gene-report-empty") ) par::greport_display_empty = true; } if (a.find("--show-tags")) { par::gettag_mode = true; par::gettag_file = a.value("--show-tags"); if (a.find("--list-all") || par::gettag_file == "all" ) par::gettag_listall = true; if ( a.find("--tag-mode2") ) { par::gettag_mode1 = false; par::gettag_mode2 = true; } if ( a.find("--tag-r2")) par::gettag_r2 = a.value_double("--tag-r2"); if ( a.find("--tag-kb")) par::gettag_kb = a.value_int("--tag-kb") * 1000; } if (a.find("--clump")) { par::clumpld = true; par::clumpld_results = a.value("--clump"); if (a.find("--clump-best")) par::clumpld_best = true; if (a.find("--clump-field")) par::clumpld_column = a.value("--clump-field"); if (a.find("--clump-verbose")) par::clumpld_verbose = true; if (a.find("--clump-p1")) par::clumpld_p1 = a.value_double("--clump-p1"); if (a.find("--clump-p2")) par::clumpld_p2 = a.value_double("--clump-p2"); if (a.find("--clump-r2")) par::clumpld_r2 = a.value_double("--clump-r2"); if (a.find("--clump-kb")) par::clumpld_kb = a.value_int("--clump-kb") * 1000; if (a.find("--clump-index-first")) par::clumpld_index1 = true; if (a.find("--clump-replicate")) par::clumpld_only_show_replications = true; if (a.find("--clump-range")) { if ( a.find("--make-set")) error("Cannot specify --make-set and --clump-range together"); par::clumpld_range_annotate = true; par::clumpld_range_file = a.value("--clump-range"); if ( a.find("--clump-range-border")) par::make_set_border = a.value_int("--clump-range-border") * 1000; } if (a.find("--clump-only-non-index")) { par::clumpld_only_show_replications = true; par::clumpld_only_show_replications_list = true; } if (a.find("--clump-annotate")) { par::clumpld_annot = true; 
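// (--clump-annotate expects a comma-delimited list of column names taken
//  from the input results file; presumably those values are then carried
//  through to the clumped output report)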
par::clumpld_annot_fields = a.value("--clump-annotate"); } if (a.find("--clump-allow-overlap")) par::clumpld_indep = false; } if (a.find("--adjust")) { par::multtest = true; } if (a.find("--log10")) { par::logscale = true; } if (a.find("--qq-plot")) { par::qq_plot = true; } if (a.find("--lambda")) { par::fix_lambda = true; par::lambda = a.value_double("--lambda"); if ( par::lambda < 1 ) par::lambda = 1; } if (a.find("--gc")) { if (!a.find("--adjust")) error("Must specify --adjust to use --gc"); par::use_GC = true; } /////////////////////// // Permutation options // Use permutations (default is adaptive) if (a.find("--perm")) { par::permute = true; } // Return counts not p-values (i.e. number of times exceeded) if ( a.find("--perm-count") ) { par::perm_count = true; } // Specify parameters for adaptive permutation if (a.find("--aperm")) { if (a.find("--segment")) error("--segment options requires --pperm option"); if (a.find("--set")||a.find("--make-set")) error("Cannot use --aperm with SET options (use --mperm N instead)"); par::permute = true; par::adaptive_perm = true; vector s = a.value("--aperm",6); par::adaptive_min = getInt(s[0].c_str(),"--aperm"); par::adaptive_max = getInt(s[1].c_str(),"--aperm"); par::adaptive_alpha = getDouble(s[2].c_str(),"--aperm"); par::adaptive_ci = getDouble(s[3].c_str(),"--aperm"); par::adaptive_interval = getInt(s[4].c_str(),"--aperm"); par::adaptive_interval2 = getDouble(s[5].c_str(),"--aperm"); } // Non-adaptive (maxT) permutations if (a.find("--mperm")) { par::permute = true; par::adaptive_perm = false; par::replicates = a.value_int("--mperm"); if (a.find("--mperm-save")) par::mperm_save_best = true; else if (a.find("--mperm-save-all")) par::mperm_save_all = true; // But make special fix for QFAM tests if ( par::QTDT_test ) { par::adaptive_perm = true; par::adaptive_min = par::replicates; par::adaptive_max = par::replicates; par::adaptive_alpha = 0; par::adaptive_ci = 0; par::adaptive_interval = par::replicates+1; par::adaptive_interval2 = 0; par::QFAM_adaptive = true; } } if (a.find("--make-perm-pheno")) { if ( a.find("--mperm") || a.find("--perm") ) error("Cannot specify --make-perm-pheno with other permutation options"); par::output_pheno_perm = true; par::permute = true; par::adaptive_perm = false; par::replicates = a.value_int("--make-perm-pheno"); } if (a.find("--rank")) { if (! a.find("--mperm") ) error("--rank requires --mperm to be specified"); par::mperm_rank = true; } // PLINK permutations if (a.find("--pperm")) { if (! 
( a.find("--segment") || a.find("--read-segment") ) ) error("--pperm options requires --segment or --read-segment option"); par::permute = true; par::adaptive_perm = false; par::replicates = a.value_int("--pperm"); } if (a.find("--p2")) { if ((!a.find("--perm")) && (!a.find("--mperm")) && (!a.find("--aperm"))) error("--p2 option also requires--perm, --aperm or --mperm"); else if (!a.find("--assoc")) error("The --p2 option can only be specified with --assoc"); else par::assoc_test_alt_perm = true; } ///////////////////// // Gene-dropping if (a.find("--genedrop")) par::perm_genedrop = par::permute = true; if (a.find("--swap-sibs")) { if (!a.find("--genedrop")) error("--swap-sibs only makes sense when --genedrop specified"); par::perm_genedrop_sibships = true; par::perm_genedrop_and_swap = true; } if (a.find("--swap-parents")) { if (!a.find("--genedrop")) error("--swap-parents only makes sense when --genedrop specified"); par::perm_genedrop_parents = true; par::perm_genedrop_and_swap = true; } if (a.find("--swap-unrel")) { if (!a.find("--genedrop")) error("--swap-unrel only makes sense when --genedrop specified"); par::perm_genedrop_unrel = true; par::perm_genedrop_and_swap = true; } /////////////////////// // Misc. options if (a.find("--compound-genotypes")) { if ( ! ( a.find("--file") || a.find("--ped") || a.find("--lfile") || a.find("--lgen") ) ) error("--compound-genotype only works with PED/MAP or LGEN filesets currently"); par::compound_genotype_code = true; } if (a.find("--allele-count")) { if ( ! ( ( a.find("--lfile") || a.find("--lgen") ) && a.find("--reference") ) ) error("Can only use --allele-count with --lgen and --reference\n"); par::lfile_allele_count = true; // expect either 1 or 2, to indicate # of non-reference alleles (i.e. # of mutations) } if (a.find("--missing-genotype")) { par::missing_genotype = a.value("--missing-genotype"); par::out_missing_genotype = par::missing_genotype; par::missing_genotype_explicit = true; } if (a.find("--missing-phenotype")) { par::missing_phenotype = a.value("--missing-phenotype"); par::out_missing_phenotype = par::missing_phenotype; par::missing_phenotype_explicit = true; } if (a.find("--output-missing-genotype")) { par::out_missing_genotype = a.value("--output-missing-genotype"); } if (a.find("--output-missing-phenotype")) { par::out_missing_phenotype = a.value("--output-missing-phenotype"); } if (a.find("--FIX")) { par::FIXED = par::FIXED_p = true; vector p = a.value("--FIX",4); par::FIX_IBD.z0 = getDouble(p[0].c_str(),"--FIX"); par::FIX_IBD.z1 = getDouble(p[1].c_str(),"--FIX"); par::FIX_IBD.z2 = getDouble(p[2].c_str(),"--FIX"); par::FIX_p = getDouble(p[3].c_str(),"--FIX"); cout << "Fixing Z0, Z1, Z2 and p to " << par::FIX_IBD.z0 << " " << par::FIX_IBD.z1 << " " << par::FIX_IBD.z2 << " " << par::FIX_p << "\n(p must refer to '1' allele in '1/2' genotype)\n"; } if (a.find("--fix-ibd")) { par::FIXED = true; vector p = a.value("--fix-ibd",3); par::FIX_IBD.z0 = getDouble(p[0].c_str(),"--fix-ibd"); par::FIX_IBD.z1 = getDouble(p[1].c_str(),"--fix-ibd"); par::FIX_IBD.z2 = getDouble(p[2].c_str(),"--fib-ibd"); } if (a.find("--batch")) par::BATCH_SIZE = a.value_int("--batch"); if (a.find("--min")) par::MIN_PIHAT = a.value_double("--min"); if (a.find("--max")) par::MAX_PIHAT = a.value_double("--max"); if (a.find("--all-pairs")) { if (a.find("--min")) error("Cannot specify --min and --all-pairs\n"); par::include_all_pairs = true; } // if (a.find("--lock")) // { par::locked = true; } // if (a.find("--unlock")) // { par::locked = false; } 
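// Note on the filter defaults set in the block below: the "--all"
// behaviour is now unconditional ("|| true"), so by default min_af is 0
// and both the per-SNP (--geno) and per-person (--mind) missingness
// limits are 1, i.e. nothing is excluded unless the user explicitly
// supplies --maf / --max-maf / --geno / --mind.
//
// Illustrative sketch only (not PLINK code; names hypothetical): how a
// single SNP might be tested against these thresholds downstream.
//
//   bool keepSNP(double maf, double propMissing)
//   {
//     return maf >= par::min_af
//         && maf <= par::max_af
//         && propMissing <= par::MAX_GENO_MISSING;
//   }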
/////////////////////////////// // Basic filters: make this the // default now... if ( a.find("--all") || true ) { par::min_af = 0.0; par::MAX_GENO_MISSING = 1; par::MAX_IND_MISSING = 1; } if (a.find("--geno")) par::MAX_GENO_MISSING = a.value_double("--geno"); if (a.find("--mind")) par::MAX_IND_MISSING = a.value_double("--mind"); if (a.find("--maf")) par::min_af = a.value_double("--maf"); if (a.find("--max-maf")) { par::max_af = a.value_double("--max-maf"); if (par::max_af < par::min_af) error("Cannot set --max-maf less than --maf\n"); } if (a.find("--keep-allele-order")) { par::make_minor_allele = false; } if (a.find("--mhf")) par::min_hf = a.value_double("--mhf"); if (a.find("--max-mhf")) { par::max_hf = a.value_double("--max-mhf"); if (par::max_hf < par::min_hf) error("Cannot set --max-mhf less than --mhf\n"); } if (a.find("--hwe")) { par::HWD_test = true; par::HWD_limit = a.value_double("--hwe"); } if (a.find("--hwe2")) { par::HWD_test = true; par::HWD_standard = true; par::HWD_limit = a.value_double("--hwe2"); } if (a.find("--hwe-all")) { par::HWD_filter_on_all = true; } if (a.find("--me")) { par::MENDEL_test = true; vector s = a.value("--me",2); par::MENDEL_ind = getDouble(s[0],"--me"); par::MENDEL_snp = getDouble(s[1],"--me"); } ////////////////////////////////// // Reading a dosage file if (a.find("--dosage")) { par::dosage_assoc = true; par::dosage_file = a.value("--dosage"); if ( ! a.find("--fam") ) error("You need to also specify a FAM (--fam) file"); if ( a.find("--map")) par::dosage_hasMap = true; if ( a.find("--hard-call")) { if ( ! a.find("--map") ) error("Need to specify --map with --hard-call"); par::dosage_hard_call = true; vector s = a.value("--hard-call",2); par::dosage_hard_call_thresh = getDouble(s[0].c_str(),"--hard-call"); par::dosage_hard_call_thresh2 = getInt(s[1].c_str(),"--hard-call"); } else if ( a.find("--write-dosage") ) par::write_dosage = true; } ////////////////////////////////// // IBS clustering if (a.find("--cluster")) { par::cluster = true; if (a.find("--within")) par::force_initial_cluster = true; if (a.find("--group-avg") || a.find("--group-average")) { par::cluster_group_avg = true; } } if (a.find("--euclidean")) { if (!a.find("--cluster")) error("Cannot specify --euclidean without --cluster"); par::cluster_euclidean = true; } if (a.find("--pick1")) { par::cluster_selcon = true; par::cluster_selcon_file = a.value("--pick1"); } if (a.find("--cluster-missing")) { par::cluster = true; par::cluster_missing = true; par::matrix = true; if (!a.find("--maf")) par::min_af = 0.0; if (!a.find("--geno")) par::MAX_GENO_MISSING = 1; if (!a.find("--mind")) par::MAX_IND_MISSING = 1; par::merge_p = 0; if (a.find("--ppc")) error("Cannot specify --ppc with --cluster-missing"); } if (a.find("--K")) { if (!a.find("--cluster")) error("Must specify --cluster also if --K used"); par::max_cluster_N = a.value_int("--K"); } if (a.find("--neighbour")) { vector s = a.value("--neighbour",2); par::min_neighbour = getInt(s[0].c_str(),"--neighbour"); par::max_neighbour = getInt(s[1].c_str(),"--neighbour"); par::outlier_detection = true; } if (a.find("--matrix")) par::matrix = true; if (a.find("--distance-matrix")) { par::distance_matrix = par::matrix = true; } if (a.find("--mds-plot")) { par::cluster_plot = true; par::cluster_mds_dim = a.value_int("--mds-plot"); } if (a.find("--mds-cluster")) par::mds_by_individual = false; if (a.find("--pmerge")) error("--pmerge is depreciated: use --ppc instead\n"); if (a.find("--ppc")) { if (!par::cluster) error("--ppc options requires 
--cluster"); par::merge_p = a.value_double("--ppc"); } if (a.find("--pibs-gap")) error("--pibs-gap is depreciated: please use --ppc-gap\n"); if (a.find("--ppc-gap")) { par::ibstest_gap = 1000 * a.value_int("--ppc-gap"); } if (a.find("--ibm")) { if (! a.find("--cluster")) error("Can only use --ibm with --cluster\n"); par::cluster_ibm_constraint = true; par::cluster_ibm_constraint_value = a.value_double("--ibm"); } if (a.find("--mc")) { if (!par::cluster) error("--mc options requires --cluster"); par::max_cluster_size = a.value_int("--mc"); } if (a.find("--cc")) { if (!par::cluster) error("--cc options requires --cluster"); par::cluster_on_phenotype = true; } if (a.find("--mcc")) { if (!par::cluster) error("--mcc options requires --cluster"); par::cluster_on_mcc = true; vector s = a.value("--mcc",2); par::max_cluster_case = getInt(s[0].c_str(),"--mcc"); par::max_cluster_control = getInt(s[1].c_str(),"--mcc"); if (a.find("--mc") || a.find("-cc")) error("Cannot specify --mc N and/or --cc as well as --mcc N1 N2\n"); } ///////////////////////////////// // External criteria to match on // Categorical binary traits, // by default // e.g. { A, A } is a match and so are pairable // { A, B } is not // // if match-type file is also specifed, then matches // can potentially be otherwise, e.g. // { A, B } are pairable // { A, A } are not if (a.find("--match")) { par::bmatch = true; par::bmatch_filename = a.value("--match"); } if (a.find("--match-type")) { if (!a.find("--match")) error("Must specify a --match {file} with the --match-type {file} option"); par::bmatch_usertype = true; par::bmatch_direction_filename = a.value("--match-type"); } // Quantitative trait match // Based on difference exceeding a certain threshold // e.g. (X-Y)>T => no match // (X-Y)<=T => match // T is specified by including an extra individual in the qmatch file // with the Family ID and Individual ID "_T_" if (a.find("--qmatch")) { if (!a.find("--qt")) error("You need to specify a --qt file when using --qmatch"); par::qmatch_threshold_filename = a.value("--qt"); par::qmatch = true; par::qmatch_filename = a.value("--qmatch"); } ////////////////////////// // Permutation clustering if (a.find("--family")) { par::sol_family = true; par::permute_within_sol = true; par::include_cluster = true; } if (a.find("--within")) { par::permute_within_sol = par::include_cluster = true; par::include_cluster_from_file = true; par::include_cluster_filename = a.value("--within"); checkFileExists(par::include_cluster_filename); } if (a.find("--mwithin")) { if (!a.find("--within")) error("You can only specify --mwithin with --within"); par::mult_clst = a.value_int("--mwithin"); } ////////////////////////////////// // Specific scan region selected if (!a.find("--from")) { // Specify a specific chromosome string c="0"; // Default all chromosomes if (a.find("--chr")) c = a.value("--chr"); if (c=="X" || c=="x") par::run_chr = 23; else if (c=="Y" || c=="y") par::run_chr = 24; else par::run_chr = getInt(c,"--chr"); } if (a.find("--from")) { if (!a.find("--to")) error("Must also specify --to {marker} when using --from {marker}"); par::m1 = a.value("--from"); par::m2 = a.value("--to"); par::run_chr = -1; } if (a.find("--snp")) { par::m1 = a.value("--snp"); par::m2 = a.value("--snp"); par::run_chr = -1; } if (a.find("--snps")) { par::extract_set = true; par::snp_include_from_cl = true; par::snp_include_range = a.value("--snps"); if ( a.find("--snp") || a.find("--window") || a.find("--extract") || a.find("--exclude") ) error("Cannot specify multiple 
SNP-selection options with --snps"); } if ( a.find("--d") ) { par::range_delimiter = a.value("--d"); if ( par::range_delimiter.length() > 1 ) error("Range delimiter can only be 1 character"); if ( par::range_delimiter == "," ) error("Cannot set range delimiter to comma"); } if (a.find("--window")) { if (!a.find("--snp")) error("Must specify --snp with --window"); par::window = a.value_double("--window"); } if (a.find("--from-bp")) { if (!a.find("--to-bp")) error("Must specify --to-bp with --from-bp"); par::from_window = a.value_int("--from-bp"); par::position_window = true; } if (a.find("--from-kb")) { if (!a.find("--to-kb")) error("Must specify --to-kb with --from-kb"); par::from_window = int(a.value_double("--from-kb") * 1000); par::position_window = true; } if (a.find("--from-mb")) { if (!a.find("--to-mb")) error("Must specify --to-mb with --from-mb"); double v = a.value_double("--from-mb"); if (v>1000) error("Too large a value for --from-mb"); par::from_window = int(v * 1000 * 1000); par::position_window = true; } if (a.find("--to-bp")) { if (!a.find("--from-bp")) error("Must specify --from-bp with --to-bp"); par::to_window = a.value_int("--to-bp"); par::position_window = true; } if (a.find("--to-kb")) { if (!a.find("--from-kb")) error("Must specify --from-kb with --to-kb"); par::to_window = int(a.value_double("--to-kb") * 1000); par::position_window = true; } if (a.find("--to-mb")) { if (!a.find("--from-mb")) error("Must specify --from-mb with --to-mb"); double v = a.value_double("--to-mb"); if (v>1000) error("Too large a value for --to-mb"); par::to_window = int(v * 1000 * 1000); par::position_window = true; } if (par::position_window) if (!a.find("--chr")) error("You must specify which chromosome (--chr N) also"); //////////////////////////////////////////// // General warnings if ( a.find("--assoc") && a.find("--covar") ) error("Cannot specify --covar with --assoc"); if ( a.find("--model") && a.find("--covar") ) error("Cannot specify --covar with --model"); if ( a.find("--assoc") && a.find("--linear") ) error("Cannot specify --assoc with --linear"); if ( a.find("--assoc") && a.find("--logistic") ) error("Cannot specify --assoc with --logistic"); if ( a.find("--model") && a.find("--linear") ) error("Cannot specify --model with --linear"); if ( a.find("--model") && a.find("--logistic") ) error("Cannot specify --model with --logistic"); if ( a.find("--model") && a.find("--assoc") ) error("Cannot specify --model with --assoc"); ///////////////////////////////////////////// // Help -- display all options if (a.find("--help") || a.find("-h")) { cout << "\n" << "Please visit the PLINK website for a complete list of options\n" << "\n" << "A few common options are listed here:\n" << "\n"; cout << "plink --file {fileroot} Specify .ped and .map files \n" << " --bfile {fileroot} Specify .bed, .fam and .map \n" << "\n" << " --out {fileroot} Specify output root filename\n" << "\n" << " --missing-genotype {0} Missing genotype code \n" << " --missing-phenotype {-9} Missing phenotype code \n" << "\n" << " --pheno {phenofile} Specify .phe file \n" << " --within {file} Specify cluster file \n" << " --cov {covarfile} Specify .cov file \n" << "\n" << " --extract {snplist} Extract list of SNPs \n" << " --exclude {snplist} Exclude list of SNPs \n" << " --remove {indlist} Remove these individuals \n" << " --keep {indlist} Keep these individuals \n" << "\n" << " --make-bed Make .bed, .fam and .bim \n" << " --recode Output new PED and MAP files\n" << " --recode12 As above, with 1/2 alleles \n" << " 
--recodeAD As above, but: 1/0/-1, 0/1/0\n" << " --recodeA As above, but: 1/0/-1 only \n" << "\n" << " --snp {marker} Specify this single SNP \n" << " --snps {marker list} Specify list,range of SNPs \n" << " --window {kb} Select +/- kb around --snp \n" << " --chr {N} Analyse chromosome \n" << " --from-kb {KB} Start scan here (kilobase) \n" << " --to-kb {KB} End scan here \n" << "\n" << " --all Set filters to include all \n" << " --maf {0.01} Minor allele frequency \n" << " --geno {0.1} Maximum per-SNP missing \n" << " --mind {0.1} Maximum per-person missing \n" << "\n" << " --freq Output allele frequencies \n" << " --hardy Hardy-Weinberg tests \n" << " --missing Genotyping rate information \n" << " --het Individual inbreeding \n" << " --genome Genome-wide IBS/IBD \n" << " --cluster Perform IBS clustering \n" << "\n" << " --assoc Case/control, QT association\n" << " --model Full-model C/C association \n" << " --tdt Family-based TDT association\n" << " --linear Linear regression model \n" << " --logistic Logistic regression model \n" << "\n" << " --perm Apaptive permutations \n" << " --mperm {1000} max(T) permutations \n" << "\n" << " --hap {tagfilename} Multimarker predictor list \n" << " --hap-window {N} Phase sliding window \n" << " --hap-snps {snp list} Phase this set of SNPs \n" << " --hap-assoc Haplotype-based association \n" << " --hap-tdt Haplotype-based TDT \n" << " --chap Conditional haplotype tests \n" << " --hap-phase Report haplotype phases \n" << " --hap-freq Report haplotype frequencies\n" << "\n"; cout << "\nPlease visit the PLINK website for a complete list of options\n\n"; shutdown(); } // By default, most tests are SNP major par::SNP_major = true; // Exceptions are: // TDT ( family structure confuses things) // Whole genome / IBS clustering // PLINK if (par::TDT_test || par::MENDEL_test || par::MENDEL_report || par::genome_output || par::cluster || par::plink ) par::SNP_major = false; if (a.find("--ind")) par::SNP_major = false; // If recoding data, the default will be not to set heterozygous // haploid genotypes to missing. Likewise for Mendel errors. Merge // operations will also specify a recode/make-bed, so they are also // captured here. The one special case where we want to allow to // preserve males hets on the X is the --check-sex if ( par::check_sex ) par::preserve_all_genotypes = true; if ( par::write_bitfile || par::recode || par::recode_HV || par::recode_whap || par::recode_12 || par::recode_AD ) { // Unless flag given, these options will not replace haploid // heterozygotes with a missing genotype if ( a.find("--set-hh-missing") ) par::preserve_all_genotypes = false; else par::preserve_all_genotypes = true; if ( a.find("--set-me-missing") ) { par::preserve_mendel_errors = false; } else par::preserve_mendel_errors = true; } } plink-1.07-src/trio.cpp0000644000265600020320000007402411264127624014164 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "crandom.h" #include "sets.h" #include "perm.h" extern Plink * PP; string gprint(int l, bool s1, bool s2); //////////////////////////////////////////////////////// // Helper function: add individual to family, w/ checks. void addParent(Family * f, Individual * person) { // A real person? if ( person ) { // as father if ( person->sex ) { if ( f->pat == NULL ) { f->pat = person; // Do not overwrite (*see note below) if (!person->family) person->family = f; } else error("Problem pedigree structure: two fathers found : family " +person->fid); } else if (person->sexcode=="2") { // as mother if ( f->mat == NULL ) { f->mat = person; // Do not overwrite (*see note below) if (!person->family) person->family = f; } else error("Problem pedigree structure: two mothers found: family " +person->fid); } else { error("Problem with ambiguous parental sex codes for family " +person->fid); } } else { // Otherwise, add as a dummy individual if ( !f->pat ) f->pat = person; else if ( ! f->mat ) f->mat = person; else error("Internal error: allocated too many parents...\n"); } } void addPerson(Family * f, Individual * person) { // Add as child if does not already exist for (int c=0; ckid.size(); c++) if ( f->kid[c]->iid == person->iid ) error("Problem with family " +f->kid[c]->fid+" child "+f->kid[c]->iid +": offspring already exists\n"); f->kid.push_back(person); // Always set (*see note) person->family = f; } // Note on priority of setting person->family pointer: // This will be preferentially set to the family in which // the individual is the offspring (i.e. an individual can // only appear in one nuclear family as an offspring, but // multiple as a founder). Therefore, if nothing has been set // (singleton) set the family pointer for founder (i.e. so as // not to overwrite offspring family pointer) but otherwise // always overwrite if setting the offspring family. // In qfam.cpp then all family[] are used to construct B scores // (i.e. an individual's genotypes might be used in several families) // but the person->family is used to enter the person into the actual // analysis (i.e. so the individual will always appear as offspring, // or sibship (even is S=1) void Plink::parseTrios() { ///////////////////////////////////// // General check for unique FID, IIDs // and that no IID == 0 set fid_iid; vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->iid == "0" ) error("Family "+(*person)->fid+" has person with reserved 0 ID\n"); if ( (*person)->iid == (*person)->pat ) error("Family "+(*person)->fid+" has person "+(*person)->iid+" who is own father"); if ( (*person)->iid == (*person)->mat ) error("Family "+(*person)->fid+" has person "+(*person)->iid+" who is own mother"); string s = (*person)->fid+"_"+(*person)->iid; if ( fid_iid.find(s) != fid_iid.end() ) error("Duplicate individual in pedigree: " +(*person)->fid+" " +(*person)->iid+"\n"); else fid_iid.insert(s); person++; } ///////////////////////////////////// // First consider all nonfounders // with 2 parents map fam; map::iterator f; set infamily; person = sample.begin(); while ( person != sample.end() ) { // For non-founders if ( ! 
(*person)->founder ) { string fpm = (*person)->fid+ "_"+(*person)->pat+ "_"+(*person)->mat; f = fam.find(fpm); // Have we already come across this parental pairing? if ( f != fam.end() ) { addPerson(f->second,*person); infamily.insert(*person); } else { Family * nfam = new Family; // Add person to new family addPerson(nfam,*person); infamily.insert(*person); // And the parental pairing addParent(nfam,(*person)->pp); addParent(nfam,(*person)->pm); infamily.insert( (*person)->pp ); infamily.insert( (*person)->pm ); if ( (*person)->pp != NULL && (*person)->pm != NULL ) nfam->parents = true; else nfam->sibship = true; // And add it to the list fam.insert(make_pair(fpm,nfam)); } } // Consider next individual person++; } // Rescan list for singleton founders person = sample.begin(); while ( person != sample.end() ) { if ( infamily.find( *person ) == infamily.end() ) { Family * nfam = new Family; // Have we already seen a singleton founder in this family? map::iterator myf = fam.find( (*person)->fid+"_0_0" ); if ( myf != fam.end() ) nfam = myf->second; addPerson(nfam,*person); nfam->singleton = true; fam.insert(make_pair((*person)->fid+"_0_0",nfam)); } person++; } // Counts int disc_parent_cnt = 0; int with_parents_ind_cnt = 0; int with_parents_fam_cnt = 0; int aff_with_parents_trio_cnt = 0; int without_parents_ind_cnt = 0; int without_parents_fam_cnt = 0; int singleton_cnt = 0; // Assign family type flags for ( f = fam.begin() ; f != fam.end() ; f++) { Family * mf = f->second; // TDT = 2 parents + atleast 1 affected offspring if ( mf->kid.size()>=0) { if ( mf->parents ) { with_parents_fam_cnt++; for (int k=0; kkid.size(); k++) { with_parents_ind_cnt++; if (mf->kid[k]->aff) { aff_with_parents_trio_cnt++; mf->TDT = true; } } } else { without_parents_fam_cnt++; for (int k=0; kkid.size(); k++) without_parents_ind_cnt++; } // Set flag for phenotypically discordant parents mf->discordant_parents = false; if ( mf->parents ) if ( mf->pat->phenotype != mf->mat->phenotype && (!mf->pat->missing) && (!mf->mat->missing) ) { mf->discordant_parents = true; disc_parent_cnt++; } } if (mf->singleton) singleton_cnt++; family.push_back(mf); } // Report counts printLOG(int2str(family.size())+" nuclear families, "); printLOG(int2str(singleton_cnt)+" founder singletons found\n"); printLOG(int2str(with_parents_ind_cnt)+" non-founders with 2 parents in " +int2str(with_parents_fam_cnt)+" nuclear families\n"); printLOG(int2str(without_parents_ind_cnt-singleton_cnt)+" non-founders without 2 parents in " +int2str(without_parents_fam_cnt-singleton_cnt)+" nuclear families\n"); if (par::bt) printLOG(int2str(aff_with_parents_trio_cnt)+" affected offspring trios\n"); printLOG(int2str(disc_parent_cnt)+" phenotypically discordant parent pairs found\n"); if (disc_parent_cnt>0) par::discordant_parents = true; // DFAM routine has it's own dump pedigree code if ( par::dumpped && ! 
par::sibTDT_test ) { string str = par::output_file_name + ".pdump"; printLOG("Dumping pedigree information to [ " + str + " ]\n"); ofstream PD(str.c_str(),ios::out); vector::iterator f = family.begin(); while ( f != family.end() ) { if ( (*f)->singleton ) { PD << "SINGLETON(S)\t" << (*f)->kid[0]->fid << " : "; for (int k=0; k < (*f)->kid.size() ;k++) PD << (*f)->kid[k]->iid << " "; PD << "\n"; } else if ( (*f)->sibship ) { PD << "SIBSHIP \t" << (*f)->kid[0]->fid << " : "; for ( int k=0; k<(*f)->kid.size(); k++) PD << (*f)->kid[k]->iid << " "; PD << "\n"; } else if ( (*f)->parents ) { PD << "W/PARENTS\t" << (*f)->pat->fid << " : "; PD << (*f)->pat->iid << " x " << (*f)->mat->iid << " -> "; for ( int k=0; k<(*f)->kid.size(); k++) PD << (*f)->kid[k]->iid << " "; PD << "\n"; } else PD << "UNDEFINED\t" << (*f)->pat->fid << " " << (*f)->pat->iid << "\n"; // Next family f++; } PD << "\n\n"; PD << "Listing by individual: columns are (0/1 for true false) \n" << " FID\tFamily ID\n" << " IID\tIndividual ID\n" << " Phenotype\n" << " Parents?\n" << " Singleton?\n" << " Sibship?\n" << " Discordant parents?\n" << " TDT?\n\n"; // Now list by individual vector::iterator person = sample.begin(); while ( person != sample.end() ) { PD << (*person)->fid << "\t" << (*person)->iid << "\t" << (*person)->phenotype << "\t" << (*person)->family->parents << " " << (*person)->family->singleton << " " << (*person)->family->sibship << " " << (*person)->family->discordant_parents << " " << (*person)->family->TDT << "\n"; person++; } PD.close(); } } void Plink::checkMendel() { ////////////////////////////////// // Individual-major mode analysis if (par::SNP_major) SNP2Ind(); ofstream MEN; ofstream MENL; ofstream MENI; ofstream MENF; if (par::MENDEL_test) printLOG("Filtering SNPs/families for Mendel Error rates above "+ dbl2str(par::MENDEL_snp)+", "+dbl2str(par::MENDEL_ind)+"\n"); if (par::MENDEL_report) { string f = par::output_file_name+".mendel"; string fi = par::output_file_name+".imendel"; string ff = par::output_file_name+".fmendel"; string fl = par::output_file_name+".lmendel"; printLOG("Writing all Mendel errors to [ " + f +" ]\n"); printLOG("Writing per-offspring Mendel summary to [ " + fi + " ]\n"); printLOG("Writing per-family Mendel summary to [ " + ff + " ]\n"); printLOG("Writing per-locus Mendel summary to [ " + fl + " ]\n"); MEN.open(f.c_str(),ios::out); MENL.open(fl.c_str(),ios::out); MENI.open(fi.c_str(),ios::out); MENF.open(ff.c_str(),ios::out); MEN << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "KID" << " " << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(6) << "CODE" << setw(22) << "ERROR" << "\n"; MENL << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(4) << "N\n"; MENI << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(4) << "N\n"; MENF << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "PAT" << " " << setw(par::pp_maxiid) << "MAT" << " " << setw(6) << "CHLD" << " " << setw(4) << "N" << "\n"; } // Flag to remove SNP if need be vector mendel_locus(nl_all,false); // Counts per family vector mendel_family(family.size(),0); vector mendel_pat(family.size(),0); vector mendel_mat(family.size(),0); vector > mendel_indiv(family.size()); for (int f=0; fkid.size()); // Count number of trios int n_trios = 0; for (int f=0; fkid.size(); int total_m = 0; // total number of Mendel errors int l_removed = 0; // # of SNPs removed due to high ME rate // Test each locus for (int l=0; lchr]) continue; 
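// Genotype encoding used throughout this loop (paired bit-vectors
// one[]/two[] per individual): (false,false) = homozygote for allele1,
// (false,true) = heterozygote, (true,true) = homozygote for allele2,
// (true,false) = missing genotype. The mendel_type codes 1-10 assigned
// below correspond to the m1-m10 error table listed further down in
// this function.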
// Flag for X markers bool X = par::chr_sex[locus[l]->chr]; int m=0; // number of Mendel errors per locus for (int f=0; fparents ) continue; Individual * pat = family[f]->pat; Individual * mat = family[f]->mat; vector kid = family[f]->kid; mendel_indiv[f].resize(kid.size()); for (int c=0; cone[l] && !kid[c]->two[l] ) continue; int mendel_type = 0; // For autosomal markers if ( (!X) || (!kid[c]->sex) ) { if ( (!kid[c]->one[l]) && kid[c]->two[l] ) { // KID = 01 // 00x00 -> 01 (m1) // 11x11 -> 01 (m2) if ( ( (!pat->one[l]) && (!pat->two[l]) ) && ( (!mat->one[l]) && (!mat->two[l]) ) ) { if ( ! par::preserve_mendel_errors) { kid[c]->one[l] = true; kid[c]->two[l] = false; mat->one[l] = true; mat->two[l] = false; pat->one[l] = true; pat->two[l] = false; } m++; mendel_type = 1; } else if ( ( pat->one[l] && pat->two[l] ) && ( mat->one[l] && mat->two[l] ) ) { if ( ! par::preserve_mendel_errors) { kid[c]->one[l] = true; kid[c]->two[l] = false; mat->one[l] = true; mat->two[l] = false; pat->one[l] = true; pat->two[l] = false; } m++; mendel_type = 2; } } else if ( (!kid[c]->one[l]) && (!kid[c]->two[l]) ) { // KID = 00 // 00x11 -> 00 (m3) P11->00 // 01x11 -> 00 (m3) // ??x11 -> 00 (m3) // 11x00 -> 00 (m4) M11->00 // 11x01 -> 00 (m4) // 11x?? -> 00 (m4) // 11x11 -> 00 (m5) P11+M11->00 // Hom parent --> opposite hom child // rule = at least one '11' parent if ( ( pat->one[l] && pat->two[l] ) || ( mat->one[l] && mat->two[l] ) ) { if ( ! par::preserve_mendel_errors ) { kid[c]->one[l] = true; kid[c]->two[l] = false; } m++; if ( pat->one[l] && pat->two[l] && mat->one[l] && mat->two[l] ) mendel_type = 5; else if ( pat->one[l] && pat->two[l] ) { mendel_type = 3; if ( ! par::preserve_mendel_errors ) { pat->one[l] = true; pat->two[l] = false; } } else { mendel_type = 4; if ( ! par::preserve_mendel_errors ) { mat->one[l] = true; mat->two[l] = false; } } } } else { // KID = 11 // 00x01 -> 11 (m6) // 00x11 -> 11 // 00x?? -> 11 // 01x00 -> 11 (m7) // 11x00 -> 11 // ??x00 -> 11 // 00x00 -> 11 (m8) P00+M00->11 // rule = at least one '00' parent if ( ( (!pat->one[l]) && (!pat->two[l]) ) || ( (!mat->one[l]) && (!mat->two[l]) ) ) { if ( ! par::preserve_mendel_errors) { kid[c]->one[l] = true; kid[c]->two[l] = false; } m++; if ( (!pat->one[l]) && (!pat->two[l]) && (!mat->one[l]) && (!mat->two[l]) ) mendel_type = 8; else if ( (!pat->one[l]) && (!pat->two[l]) ) { mendel_type = 6; if ( ! par::preserve_mendel_errors ) { pat->one[l] = true; pat->two[l] = false; } } else { mendel_type = 7; if ( ! par::preserve_mendel_errors ) { mat->one[l] = true; mat->two[l] = false; } } } } } else { if ( kid[c]->one[l] && kid[c]->two[l] && (!mat->one[l]) && (!mat->two[l]) ) { m++; mendel_type = 9; if ( ! par::preserve_mendel_errors) { kid[c]->one[l] = true; kid[c]->two[l] = false; mat->one[l] = true; mat->two[l] = false; } } if ( (!kid[c]->one[l]) && (!kid[c]->two[l]) && mat->one[l] && mat->two[l] ) { m++; mendel_type = 10; if ( ! 
par::preserve_mendel_errors) { kid[c]->one[l] = true; kid[c]->two[l] = false; mat->one[l] = true; mat->two[l] = false; } } } // Individual counts // m1 00x00 -> 01 K / P+M // m2 11x11 -> 01 K / P+M // m3 11x** -> 00 K / P // m4 **x11 -> 00 K / M // m5 11x11 -> 00 K // m6 00x** -> 11 K / P // m7 **x00 -> 11 K / M // m8 00x00 -> 11 K // X marker errors for male offspring // m9 **x00 -> 11 K / M // m10 **x11 -> 00 K / M if (par::MENDEL_report && mendel_type>0) { MEN << setw(par::pp_maxfid) << family[f]->pat->fid << " " << setw(par::pp_maxiid) << kid[c]->iid << " " << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " ; if (mendel_type==1) { MEN << setw(6) << "1"<< setw(22); string s = locus[l]->allele1 + "/" + locus[l]->allele1 + " x " + locus[l]->allele1 + "/" + locus[l]->allele1 + " -> " + locus[l]->allele1 + "/" + locus[l]->allele2; MEN << s << "\n"; } else if (mendel_type==2) { MEN << setw(6) << "2"<< setw(22); string s = locus[l]->allele2 + "/" + locus[l]->allele2 + " x " + locus[l]->allele2 + "/" + locus[l]->allele2 + " -> " + locus[l]->allele1 + "/" + locus[l]->allele2; MEN << s << "\n"; } else if (mendel_type==3) { MEN << setw(6) << "3"<< setw(22); string s = locus[l]->allele2 + "/" + locus[l]->allele2 + " x " + "*/*" + " -> " + locus[l]->allele1 + "/" + locus[l]->allele1; MEN << s << "\n"; } else if (mendel_type==4) { MEN << setw(6) << "4"<< setw(22); string s = string("*/*") + string(" x ") + locus[l]->allele2 + "/" + locus[l]->allele2 + " -> " + locus[l]->allele1 + "/" + locus[l]->allele1; MEN << s << "\n"; } else if (mendel_type==5) { MEN << setw(6) << "5"<< setw(22); string s= locus[l]->allele2 + "/" + locus[l]->allele2 + " x " + locus[l]->allele2 + "/" + locus[l]->allele2 + " -> " + locus[l]->allele1 + "/" + locus[l]->allele1; MEN << s << "\n"; } else if (mendel_type==6) { MEN << setw(6) << "6"<< setw(22); string s = locus[l]->allele1 + "/" + locus[l]->allele1 + " x " + "*/*" + " -> " + locus[l]->allele2 + "/" + locus[l]->allele2; MEN << s << "\n"; } else if (mendel_type==7) { MEN << setw(6) << "7"<< setw(22); string s = string("*/*") + string(" x ") + locus[l]->allele1 + "/" + locus[l]->allele1 + " -> " + locus[l]->allele2 + "/" + locus[l]->allele2; MEN << s << "\n"; } else if (mendel_type==8) { MEN << setw(6) << "8" << setw(22); string s = locus[l]->allele1 + "/" + locus[l]->allele1 + " x " + locus[l]->allele1 + "/" + locus[l]->allele1 + " -> " + locus[l]->allele2 + "/" + locus[l]->allele2; MEN << s << "\n"; } else if (mendel_type==9) { MEN << setw(6) << "9" << setw(22); string s = string("*/*") + " x " + locus[l]->allele1 + "/" + locus[l]->allele1 + " -> " + locus[l]->allele2 + "/" + locus[l]->allele2; MEN << s << "\n"; } else if (mendel_type==10) { MEN << setw(6) << "10" << setw(22); string s = string("*/*") + " x " + locus[l]->allele2 + "/" + locus[l]->allele2 + " -> " + locus[l]->allele1 + "/" + locus[l]->allele1; MEN << s << "\n"; } } // Family count if (mendel_type>0) { // We may wish to add other weighting schemes here if ( mendel_type==1 || mendel_type==2 ) { mendel_indiv[f][c]++; mendel_pat[f]++; mendel_mat[f]++; } else if ( mendel_type==5 || mendel_type==8 ) mendel_indiv[f][c]++; else if ( mendel_type==3 || mendel_type==6 ) { mendel_indiv[f][c]++; mendel_pat[f]++; } else if ( mendel_type==4 || mendel_type==7 ) { mendel_indiv[f][c]++; mendel_mat[f]++; } else if ( mendel_type==9 || mendel_type==10 ) { mendel_indiv[f][c]++; mendel_mat[f]++; } } // Family count if (mendel_type>0) mendel_family[f]++; } } // Report per-SNP Mendel count if 
(par::MENDEL_report) MENL << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << m << "\n"; // Is this a bad SNP? if ( (double)m/(double)n_trios > par::MENDEL_snp) { l_removed++; mendel_locus[l] = true; } // Keep track of total number of errors total_m += m; } printLOG(int2str(total_m)+" Mendel errors detected in total\n"); if (par::MENDEL_report) { for (int f=0; fparents ) MENF << setw(par::pp_maxfid) << family[f]->pat->fid << " " << setw(par::pp_maxiid) << family[f]->pat->iid << " " << setw(par::pp_maxiid) << family[f]->mat->iid << " " << setw(6) << family[f]->kid.size() << " " << setw(4) << mendel_family[f] << "\n"; } for (int f=0; fparents ) { vector kid = family[f]->kid; MENI << setw(par::pp_maxfid) << family[f]->pat->fid << " " << setw(par::pp_maxiid) << family[f]->pat->iid << " " << setw(4) << mendel_pat[f] << "\n"; MENI << setw(par::pp_maxfid) << family[f]->mat->fid << " " << setw(par::pp_maxiid) << family[f]->mat->iid << " " << setw(4) << mendel_mat[f] << " " << "\n"; for (int c=0; cpat->fid << " " << setw(par::pp_maxiid) << kid[c]->iid << " " << setw(4) << mendel_indiv[f][c] << "\n"; } } MEN.close(); MENL.close(); MENI.close(); MENF.close(); shutdown(); } // Using Mendel error rates to automatically remove SNPs // and families if (par::MENDEL_test) { //////////////////////////////////////// // Remove selected loci from locus list, // by copying rest to a new list // People/genotypes first, then locus/map info bool any_mendel = false; for (int l=0; l badfam; int f_removed = 0; for (int f=0;f par::MENDEL_ind) { f_removed++; badfam.insert(make_pair(family[f]->pat,0)); badfam.insert(make_pair(family[f]->mat,0)); for (int c=0; ckid.size(); c++) badfam.insert(make_pair(family[f]->kid[c],0)); } } // Remove individuals as appropriate int n_removed = 0; vector indel(sample.size(),false); for (int i=0;i0) int tmp = deleteIndividuals(indel); printLOG(int2str(f_removed)+" families ( "+ int2str(n_removed)+" individuals ) removed due to Mendel errors\n"); printLOG(int2str(l_removed)+" markers removed due to Mendel errors, "+ int2str(nl_all)+" remaining\n"); //////////////////////////// // Rebuild family structure? if (f_removed+n_removed>0) { // Wipe existing family structure printLOG("Rebuilding families after filtering on Mendel errors\n"); family.clear(); map fnd; map idmap; linkRelateds(idmap, fnd); parseTrios(); } } } void Plink::makeFounders() { for (int i=0; ifounder ) { Individual * father = sample[i]->pp; Individual * mother = sample[i]->pm; if ( ! ( father && mother ) ) { person->pat = person->mat = "0"; ++cnt_f; } } } } void Plink::pseudoCaseControl() { printLOG("Writing pseudo case/control units to [ "+par::output_file_name+".tucc.ped ]\n"); // Consider each trio unit, // then all SNPs vector t1(nl_all); vector t2(nl_all); vector u1(nl_all); vector u2(nl_all); if ( par::SNP_major ) SNP2Ind(); ofstream POUT; POUT.open( (par::output_file_name+".tucc.ped").c_str(), ios::out); for (int f=0; fparents ) continue; Individual * pat = family[f]->pat; Individual * mat = family[f]->mat; vector kid = family[f]->kid; for (int c=0; ckid[c]; // Score for each SNP for (int l=0; lone[l]; bool mat2 = mat->two[l]; bool pat1 = pat->one[l]; bool pat2 = pat->two[l]; bool kid1 = child->one[l]; bool kid2 = child->two[l]; //////////////////////// // Missing ? if ( ( pat1 && ! pat2 ) || ( mat1 && ! mat2 ) || ( kid1 && ! 
kid2 ) ) { t1[l] = true; t2[l] = false; u1[l] = true; u2[l] = false; } else { bool X = par::chr_haploid[locus[l]->chr]; bool haploid = par::chr_sex[locus[l]->chr]; // Autosome if ( X || haploid ) { t1[l] = true; t2[l] = false; u1[l] = true; u2[l] = false; } else { // Transmitted alleles t1[l] = kid1; t2[l] = kid2; // Untransmitted alleles int aCount = 0; if ( pat1 ) ++aCount; if ( pat2 ) ++aCount; if ( mat1 ) ++aCount; if ( mat2 ) ++aCount; if ( kid1 ) --aCount; if ( kid2 ) --aCount; if ( aCount == 0 ) { u1[l] = false; u2[l] = false; } else if ( aCount == 1 ) { u1[l] = false; u2[l] = true; } else if ( aCount == 2 ) { u1[l] = true; u2[l] = true; } else { cout << kid1 << kid2 << " <- " << pat1 << pat2 << " " << mat2 << mat2 << "\n"; error("Internal problem in --tucc"); } } } // Next SNP } // Output two rows in PED file for this trio POUT << child->fid << " " << child->iid << "_T 0 0 " << child->sexcode << " 2 "; for (int l=0; lfid << " " << child->iid << "_U 0 0 " << child->sexcode << " 1 "; for (int l=0; llocus[l]->allele1; string a2 = par::recode_12 ? "2" : PP->locus[l]->allele2; if ( (!s1) && (!s2) ) return par::recode_delimit+a1+par::recode_indelimit+a1; else if ( (!s1) && s2 ) return par::recode_delimit+a1+par::recode_indelimit+a2; else if ( s1 && s2 ) return par::recode_delimit+a2+par::recode_indelimit+a2; else return par::recode_delimit + par::missing_genotype + par::recode_indelimit+par::missing_genotype; return "?"; } void Plink::makeMissingParents() { // Add to a separate pile of dummy people map padded; for (int i=0; ipp == NULL && person->pat != "0" ) { Individual * d; string pcode = person->fid+"_"+person->pat; map::iterator f = padded.find( pcode ); if ( f != padded.end() ) d = f->second; else { d = new Individual; d->fid = person->fid; d->iid = person->pat; d->sex = true; padded.insert( make_pair( pcode, d ) ); } person->pp = d; d->kids.push_back(person); } if ( person->pm == NULL && person->mat != "0" ) { Individual * d; string pcode = person->fid+"_"+person->mat; map::iterator f = padded.find( pcode ); if ( f != padded.end() ) d = f->second; else { d = new Individual; d->fid = person->fid; d->iid = person->mat; d->sex = false; padded.insert( make_pair( pcode, d ) ); } person->pm = d; d->kids.push_back(person); } // And make sure that the actual parents also list who is their // child (i.e. if one parent was previously missing, this might // not be the case) if ( person->pm ) { bool found = false; for ( int k=0; kpm->kids.size(); k++) if ( person->pm->kids[k] == person ) found = true; if ( ! found ) person->pm->kids.push_back( person ); } if ( person->pp ) { bool found = false; for ( int k=0; kpp->kids.size(); k++) if ( person->pp->kids[k] == person ) found = true; if ( ! found ) person->pp->kids.push_back( person ); } } if (padded.size() > 0) printLOG("Added " + int2str(padded.size()) + " dummy parents\n"); } plink-1.07-src/nlist.cpp0000644000265600020320000001212211264127625014330 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "nlist.h" vector NList::deparseStringList(string input) { return tokenize(input); } vector NList::deparseNumberList(string input) { // We have a string that should contain integers, possibily // separated by "," and "-" characters // Use 1-N coding up until the last moment, instead of // standard 0..N-1 coding firstWord = "1"; lastWord = int2str(maxcat); vector tok = tokenize(input); vector nums; for (int i=0; i( j, tok[i] , std::dec)) continue; nums.push_back(j); } } return expandNumberList(nums); } vector NList::deparseStringList(string input, map * mapping) { // Assume map contains a 0..N-1 coding in the map map::iterator im = mapping->begin(); while ( im != mapping->end() ) { int i = im->second; if ( i == 0 ) firstWord = im->first; if ( i == maxcat - 1 ) lastWord = im->first; ++im; } //////////////////////////////////////////////////////////////// // Convert string codes to numbers, then call deparseNumberList // But incrememnt to 1..N coding here, i.e. so that it is // similar process to the human-friendly 1..N coding for a // numeric list vector tok = tokenize(input); vector nums; for (int i=0; i::iterator im = mapping->find(tok[i]); if ( im != mapping->end() ) nums.push_back(im->second + 1); else if ( tok[i] == range_string ) nums.push_back(-1); else error("Cannot find value: " + tok[i] + "\n"); } return expandNumberList(nums); } vector NList::expandNumberList(vector & nlist) { // Convert vector n; ////////////////// // Check ranges for (int i=0; i nlist[i+1] ) { int tmp = nlist[i+1]; nlist[i+1] = nlist[i-1]; nlist[i-1] = tmp; } } } ////////////////// // Expand ranges for (int i=0; i0 ) { // Only add valid codes if ( nlist[i] <= maxcat ) n.push_back(nlist[i]); } else { int start = nlist[i-1]+1; int end = nlist[i+1]-1; if ( end > maxcat ) end = maxcat; for (int j=start; j<=end; j++) n.push_back(j); } } // Sort and uniquify stable_sort(n.begin(),n.end()); vector::iterator ne = unique(n.begin(),n.end()); n.erase(ne, n.end()); // Shift to 0..N-1 coding for (int i=0; i k(maxcat,false); for (int i=0; i n2 = n; n.clear(); for (int i=0; i NList::tokenize(string s) { vector t; string word = ""; bool range_set = false; bool number_given = false; for (int i=0; i&,set&,int,int,int); int count_intersects(set&,int,int,int); double weighted_count_intersects(set&,int,int,int); vector segmentCountCaseControls(Plink*,bool); set allSegmentsIntersecting(Range &); #endif plink-1.07-src/metaem.cpp0000644000265600020320000002200411264127625014447 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "genogroup.h" #include "phase.h" #include "haplowindow.h" extern ofstream LOG; using namespace std; bool HaploPhase::makeWaplotype(vector & wCounter, vector & wMax) { int j = 0; while (1) { if (wCounter[j] < wMax[j]) { ++wCounter[j]; return true; } else // if this position is a max { wCounter[j] = 0; ++j; // Or are we done if (j == actual_nw ) return false; } } return true; } void HaploPhase::enumeratePhasedWindows(int i) { // Form "waplotypes" (haplotypes of windows) vector wCounter(actual_nw, 0); vector wMax(actual_nw, 0); // Allow at least one move wCounter[0] = -1; // Clear phases for precaution hap1[i].clear(); hap2[i].clear(); for (int w = startWindow; w <= finishWindow ; w++) { int wc = w - startWindow; if (windows[w]->hap1[i].size() > 0) wMax[wc] = windows[w]->hap1[i].size() - 1; else { include[i] = false; return; } } ////////////////////////////////////// // Consider all possible combinations while ( makeWaplotype(wCounter, wMax) ) { // But is this a legal window, i.e. given stubs? // i.e. scan all intermediate windows and check bool okay = true; vector leftAlignListA; vector leftAlignListB; vector rightAlignListA; vector rightAlignListB; for (int w = startWindow; w <= finishWindow; w++) { int wc = w - startWindow; // Check left side / left window HaploWindow * currentWindow = windows[w]; int currentH1 = currentWindow->hap1[i][ wCounter[wc] ]; int currentH2 = currentWindow->hap2[i][ wCounter[wc] ]; int l1 = currentWindow->leftStub[ currentH1 ]; int l2 = currentWindow->leftStub[ currentH2 ]; int r1 = currentWindow->rightStub[ currentH1 ]; int r2 = currentWindow->rightStub[ currentH2 ]; // Left alignment? if ( w > startWindow ) { HaploWindow * leftWindow = windows[w-1]; int leftH1 = leftWindow->hap1[i][ wCounter[wc-1] ]; int leftH2 = leftWindow->hap2[i][ wCounter[wc-1] ]; int ol1 = leftWindow->rightStub[ leftH1 ]; int ol2 = leftWindow->rightStub[ leftH2 ]; bool leftAlignA = l1 == ol1 && l2 == ol2; bool leftAlignB = l1 == ol2 && l2 == ol1; if ( ! ( leftAlignA || leftAlignB ) ) { okay = false; break; } else { leftAlignListA.push_back( leftAlignA ); leftAlignListB.push_back( leftAlignB ); } } else { leftAlignListA.push_back( false ); leftAlignListB.push_back( false ); } ///////////////////// // Right alignment? if ( w < finishWindow ) { HaploWindow * rightWindow = windows[w+1]; int rightH1 = rightWindow->hap1[i][ wCounter[wc+1] ]; int rightH2 = rightWindow->hap2[i][ wCounter[wc+1] ]; int or1 = rightWindow->leftStub[ rightH1 ]; int or2 = rightWindow->leftStub[ rightH2 ]; bool rightAlignA = r1 == or1 && r2 == or2; bool rightAlignB = r1 == or2 && r2 == or1; if ( ! ( rightAlignA || rightAlignB ) ) { okay = false; break; } else // save which possible pairings align { rightAlignListA.push_back( rightAlignA ); rightAlignListB.push_back( rightAlignB ); } } else { rightAlignListA.push_back( false ); rightAlignListB.push_back( false ); } } ///////////////////////////////////////////////////////// // For legal combinations, enumerate possible waplotypes if ( okay ) { vector flipable(actual_nw,false); int nflip = 0; bool firstSkipped = false; // Consider all A/B and B/A pairings of haplotypes for // windows other than the first (i.e. 
we will be lining up // haplotypes in hap1/hap2 meaningfully then) // if ( startWindow > 0 ) firstSkipped = true; for (int w = startWindow; w <= finishWindow; w++) { int wc = w - startWindow; HaploWindow * currentWindow = windows[w]; int currentH1 = currentWindow->hap1[i][ wCounter[wc] ]; int currentH2 = currentWindow->hap2[i][ wCounter[wc] ]; if ( currentH1 != currentH2 ) { if ( firstSkipped ) { flipable[wc] = true; nflip++; } // firstSkipped = true; } } int nperm = (int)pow(2.0,(double)nflip); unsigned int h=0; while (h f1; unsigned int p=1; for (int flip=0; flip waplotype1; vector waplotype2; int f2 = 0; for (int w = startWindow; w <= finishWindow ; w++) { int wc = w - startWindow; HaploWindow * currentWindow = windows[w]; int currentH1 = currentWindow->hap1[i][ wCounter[wc] ]; int currentH2 = currentWindow->hap2[i][ wCounter[wc] ]; if ( flipable[wc] ) { if ( f1[f2++] ) { int tmp = currentH1; currentH1 = currentH2; currentH2 = tmp; } } waplotype1.push_back( currentH1 ); waplotype2.push_back( currentH2 ); } /////////////////////////////// // Test for ultimate alignment bool allAligned = true; // Skip first window for ( int w = startWindow+1; w <= finishWindow; w++) { int wc = w - startWindow; HaploWindow * currentWindow = windows[w]; int l1 = currentWindow->leftStub[ waplotype1[wc] ]; int l2 = currentWindow->leftStub[ waplotype2[wc] ]; HaploWindow * leftWindow = windows[w-1]; int ol1 = leftWindow->rightStub[ waplotype1[wc-1] ]; int ol2 = leftWindow->rightStub[ waplotype2[wc-1] ]; bool leftAlign = l1 == ol1 && l2 == ol2; if ( ! leftAlign ) allAligned = false; } if ( allAligned ) { // Have we already seen these two waplotypes? map< vector, int>::iterator wi = hapmap.find( waplotype1 ); // First waplotype if ( wi == hapmap.end() ) { hapmap.insert( make_pair( waplotype1 , nh )); vector thisHaplotype; for(int w= startWindow; w <= finishWindow; w++) { int wc = w - startWindow; HaploWindow * currentWindow = windows[w]; int start = par::haplo_plem_overlap; if (wc==0) start = 0; for (int s = start; s < currentWindow->ns; s++) thisHaplotype.push_back( currentWindow->hap[waplotype1[wc]][s]); } hap.push_back( thisHaplotype ); hapmapb.insert( make_pair( thisHaplotype, nh )); hapi.push_back( waplotype1 ); hap1[i].push_back( nh ); nh++; } else { hap1[i].push_back( wi->second ); hapmapb.insert( make_pair( hap[wi->second], wi->second )); } wi = hapmap.find( waplotype2 ); if ( wi == hapmap.end() ) { hapmap.insert( make_pair( waplotype2 , nh )); vector thisHaplotype; for(int w=startWindow; w <= finishWindow; w++) { int wc = w - startWindow; HaploWindow * currentWindow = windows[w]; int start = par::haplo_plem_overlap; if (wc==0) start = 0; for (int s = start; s < currentWindow->ns; s++) thisHaplotype.push_back( currentWindow->hap[waplotype2[wc]][s]); } hap.push_back( thisHaplotype ); hapmapb.insert( make_pair( thisHaplotype, nh )); hapi.push_back( waplotype2 ); hap2[i].push_back( nh ); nh++; } else { hap2[i].push_back( wi->second ); hapmapb.insert( make_pair( hap[wi->second], wi->second )); } } // Consider next waplotype h++; } } // end of 'add if legal' condition } // consider next possible legal set // If more than a single phased set exists declare ambigious ambig[i] = hap1[i].size() > 1; include[i] = hap1[i].size() > 0; if ( par::haplo_plem_follow && i == par::haplo_plem_follow_ind ) { VPHASE << "Added " << hap1[i].size() << " phases for followed individual\n"; } } plink-1.07-src/r.cpp0000644000265600020320000002075511264127626013454 0ustar tilleaadmin 
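// r.cpp (below) implements the R plug-in interface: phenotype, cluster,
// covariate and genotype data are pushed to a local R connection
// (Rconnection/sisocks client on 127.0.0.1, port par::R_port), the
// user-supplied Rplink() function from par::R_script is evaluated over
// batches of SNPs, and per-SNP results (or "NA" on failure) are written
// to par::output_file_name + ".auto.R"; when run_R_write_script is set,
// a standalone ".debug.R" script is written instead of connecting.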
////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "stats.h" using namespace std; #ifdef WITH_R_PLUGINS #define MAIN // we are the main program, we need to define this #define SOCK_ERRORS // we will use verbose socket errors #include "sisocks.h" #include "Rconnection.h" #endif void Plink::Rfunc() { #ifdef WITH_R_PLUGINS bool write_script = par::run_R_write_script; // Ensure SNP-major mode if ( ! par::SNP_major ) Ind2SNP(); // Are thre individuals / SNPs worth testing if ( n == 0 ) error("No individuals left for analysis"); else if ( nl_all == 0 ) error("No SNPs left for analysis"); #ifdef SKIP printLOG("R-extensions not implemented on this system...\n"); return; #else printLOG("R-extension call for script [ " + par::R_script + " ]\n"); checkFileExists( par::R_script ); ifstream RIN(par::R_script.c_str(), ios::in); ofstream ROUT; ofstream RSCRIPT; Rconnection *rc; if ( ! write_script ) { ROUT.open((par::output_file_name+".auto.R").c_str(), ios::out); printLOG("Writing results of R-extension to [ " + par::output_file_name+".auto.R ]\n"); rc = new Rconnection("127.0.0.1", par::R_port); int i=rc->connect(); if (i) { char msg[128]; sockerrorchecks(msg, 128, -1); printf("unable to connect (result=%d, socket:%s).\n", i, msg); } // Minimal output rc->eval("options(echo=F)"); } else { printLOG("Writing debug-mode R-extension to [ " + par::output_file_name+".debug.R ]\n"); RSCRIPT.open((par::output_file_name+".debug.R").c_str(), ios::out); } ///////////////////////////////////////////// // Read R script that defines function Rplink string rcommand_data, user_function, line; while(getline(RIN, line)) user_function += line + "\n"; //////////////////////////////////// // create R friendly data structures // Remove individuals with missing phenotypes removeMissingPhenotypes(*this); // We are passing 'n' individuals // and nl_all SNPs. By default, we will pass all individuals, // but run_R_nsnps-sized batches of SNPs only // Phenotypes vector_t pvec(n); double * p = &(pvec[0]); for (int i=0;iphenotype; // Cluster information, per person vector svec(n); int * s = &(svec[0]); for ( int i = 0; i < n; i++ ) s[i] = sample[i]->sol; // Covariates int x = -1; int size = n * par::clist_number; vector_t cvec(size); double * c = &(cvec[0]); // if there exists a covariate matrix if( par::clist_number > 0 ){ for( int i = 0; i < n; i++ ){ for( int j = 0; j < par::clist_number; j++ ){ c[++x] = sample[i]->clist[j]; } } } // Assign space Rinteger * rN; Rdouble * rP; Rinteger * rS; Rdouble * rCov; // Assign variables in R if ( ! 
write_script ) { rS = new Rinteger(s, n); rP = new Rdouble(p, n); rN = new Rinteger(&n, 1); rc->assign("n", rN ); rc->assign("PHENO", rP ); rc->assign("CLUSTER", rS); rc->eval("CLUSTER[CLUSTER==-1] <- NA"); // If there exists a covariate matrix if( par::clist_number > 0 ){ rCov = new Rdouble(c, size); rc->assign("c", rCov); rc->eval("COVAR<-matrix(c,nrow=n,byrow=T)"); } else{ rc->eval("COVAR<-NA"); } } else { // Write commands to file RSCRIPT << "n <- " << n << "\n"; RSCRIPT << "PHENO <- c( "; for (int i=0; i 0 && n > 0) { RSCRIPT << "c <- c( "; for (int i=0; i gvec(n*nloc); int * g = &(gvec[0]); int x = 0; for ( int i = 0; i < n; i++ ) for( int j = nstart; j <= nstop; j++ ){ bool one = SNP[j]->one[i]; bool two = SNP[j]->two[i]; if( (!one) && !two) g[x] = 2; if( (!one) && two ) g[x] = 1; if( one && (!two) ) g[x] = -1; if( one && two) g[x] = 0; x++; } Rinteger * rL = new Rinteger(&nloc,1); Rinteger * rG = new Rinteger(g, n*nloc); // Assign variables in R if ( ! write_script ) { rc->assign("l", rL ); rc->assign("g", rG ); rc->eval("GENO<-matrix(g,nrow=n,byrow=T)"); rc->eval("GENO[GENO==-1] <- NA"); } else { // Write commands to file RSCRIPT << "l <- " << nloc << "\n"; RSCRIPT << "g <- c( "; for (int i=0; ieval( user_function.c_str() ); /////////////////////////////////////////////////////////// // And call the user's function, saving vector of results Rdouble *data = (Rdouble*) rc->eval("Rplink(PHENO,GENO,CLUSTER,COVAR)"); /////////////////////////////////////////////////////////// // If everything went okay, we can get the results if (data) { // Store the results in a vector of doubles double * d = data->doubleArray(); // expect format // N { N items per SNP } int i = 0; int ct = data->length(); for (int l=nstart; l < nstart + nloc; l++) { ROUT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " "; int c = (int)d[i++]; for (int j=0;jchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " " << "NA" << "\n"; } } if ( ! write_script ) { delete rN; delete rP; delete rS; if( par::clist_number > 0 ) delete rCov; } // If in DEBUG mode, now close this file if ( write_script ) { RSCRIPT.close(); shutdown(); } if ( ! par::silent ) cout << "\n"; // Dispose the connection object, which implicitly closes the // connection delete rc; ROUT.close(); #endif #endif return; } plink-1.07-src/lookup.cpp0000644000265600020320000001526611264127625014524 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "sockets.h" using namespace std; #define PORT_NUM 80 #define IP_ADDR "132.183.161.22" void Plink::lookup() { #ifdef SKIP printLOG("Web-lookup not implemented on this system...\n"); return; #else printLOG("PLINK-SNP (WGAS SNP annotation courtesy of Patrick Sullivan)\n"); vector tokens; vector< string > gene_list; string GET_STRING; if ( par::lookup_gene ) { if ( par::lookup_multiple_genes ) { string query = ""; int c = 0; string window= int2str(par::lookup_snp_kb_window * 1000); // Read list, append to query and send checkFileExists(par::lookup_gene_name); ifstream S(par::lookup_gene_name.c_str(),ios::in); string cmd; int x = 0; GET_STRING = "GET /~purcell/cgi-bin/gene.pl?win=" + window; while(!S.eof()) { string gene; S >> gene; if (gene=="") continue; std::string s; std::stringstream out; out << x; s = out.str(); cmd += "&gene" + s + "=" + gene; x++; c++; gene_list.push_back(gene); if ( c>100 ) error("Please do not send large batch queries to PLINK-SNP"); } S.close(); GET_STRING += cmd + " HTTP/1.0 \n Content Length: 10000 \nHost: pngu.mgh.harvard.edu\nConnection: close\n\n"; tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING ); } else { string window = int2str( par::lookup_gene_kb_window * 1000 ); printLOG("Looking up gene information (and SNPs +/- " + int2str(par::lookup_gene_kb_window)+" kb)\n"); GET_STRING = "GET /~purcell/cgi-bin/gene.pl?win=" + window + "&gene=" + par::lookup_gene_name + " HTTP/1.0 \nHost: pngu.mgh.harvard.edu\nConnection: close\n\n"; tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING ); } } else if ( par::lookup_single_snp ) { string window= int2str(par::lookup_snp_kb_window * 1000); printLOG("Looking up SNP information, listing genes within " + int2str(par::lookup_snp_kb_window)+" kb\n"); GET_STRING = "GET /~purcell/cgi-bin/snp.pl?win=" + window + "&snp=" + par::lookup_snp + " HTTP/1.0 \nHost: pngu.mgh.harvard.edu\nConnection: close\n\n"; tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING ); } else { string query = ""; int c = 0; string window= int2str(par::lookup_snp_kb_window * 1000); // Read list, append to query and send checkFileExists(par::lookup_snp); ifstream S(par::lookup_snp.c_str(),ios::in); string cmd; int x = 0; GET_STRING = "GET /~purcell/cgi-bin/snp.pl?win=" + window; while(!S.eof()) { string snp; S >> snp; if (snp=="") continue; std::string s; std::stringstream out; out << x; s = out.str(); cmd += "&snp" + s + "=" + snp; x++; c++; if ( c>100 ) error("Please do not send large batch queries to PLINK-SNP"); } S.close(); GET_STRING += cmd + " HTTP/1.0\nHost: pngu.mgh.harvard.edu\nConnection: close\n\n"; tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING ); printLOG("Looking up SNP information, listing genes within " + int2str(par::lookup_snp_kb_window)+" kb)\n"); } if ( tokens.size() < 25 ) { cout << "\n\n"; if( par::lookup_single_snp ) cout << par::lookup_snp; else cout << par::lookup_gene_name; cout << " not found\n"; cout << "-----------------------------\n"; return; } if ( par::lookup_to_file ) { ofstream OUT; string f = par::output_file_name; // Need to handle both single and multiple instances if ( par::lookup_gene ) { f += ".snp.list"; if ( tokens[0] == "-1") { printLOG("\n\nCould not find gene " + par::lookup_gene_name + " in database\n"); return; } printLOG("Writing SNP details to [ 
" + f + " ]\n\n\n"); OUT.open( f.c_str(), ios::out ); bool geneInfo = true; int totalSNPs = 0; int gcount = 0; for (int i=25; i #include #include #ifdef WITH_ZLIB #include "zfstream.h" #endif using namespace std; // A lightweight wrapper around zfstream wrapper to zlib const int MAX_LINE_LENGTH = 1000000; class ZInput { string filename; bool compressed; char buf[MAX_LINE_LENGTH]; #ifdef WITH_ZLIB gzifstream zinf; #else ifstream zinf; #endif ifstream inf; public: ZInput(string, bool); ZInput(); void open(string, bool); char readChar(); string readLine(); vector tokenizeLine(); void close(); bool endOfFile(); void unbuffered(); }; class ZOutput { #ifdef WITH_ZLIB gzofstream zoutf; #else ofstream zoutf; #endif ofstream outf; string filename; bool compressed; char buf[MAX_LINE_LENGTH]; public: ZOutput(string, bool); ZOutput(); void open(string,bool); void write(string); ZOutput & operator<< (const string & s) { write(s); return *this; } void writeLine(string); void close(); void unbuffered(); }; void fileCompress(); void fileUncompress(); #endif plink-1.07-src/COPYING.txt0000645000265600020320000003600510652403213014341 0ustar tilleaadmin GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. 
Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. 
You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. 
If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. plink-1.07-src/dcdflib.cpp0000644000265600020320000070121311264127625014574 0ustar tilleaadmin#include #include #include #include "cdflib.h" /* ----------------------------------------------------------------------- COMPUTATION OF LN(GAMMA(B)/GAMMA(A+B)) WHEN B .GE. 8 -------- IN THIS ALGORITHM, DEL(X) IS THE FUNCTION DEFINED BY LN(GAMMA(X)) = (X - 0.5)*LN(X) - X + 0.5*LN(2*PI) + DEL(X). ----------------------------------------------------------------------- */ double algdiv(double *a,double *b) { static double c0 = .833333333333333e-01; static double c1 = -.277777777760991e-02; static double c2 = .793650666825390e-03; static double c3 = -.595202931351870e-03; static double c4 = .837308034031215e-03; static double c5 = -.165322962780713e-02; static double algdiv,c,d,h,s11,s3,s5,s7,s9,t,u,v,w,x,x2,T1; /* .. .. Executable Statements .. 
*/ if(*a <= *b) goto S10; h = *b/ *a; c = 1.0e0/(1.0e0+h); x = h/(1.0e0+h); d = *a+(*b-0.5e0); goto S20; S10: h = *a/ *b; c = h/(1.0e0+h); x = 1.0e0/(1.0e0+h); d = *b+(*a-0.5e0); S20: /* SET SN = (1 - X**N)/(1 - X) */ x2 = x*x; s3 = 1.0e0+(x+x2); s5 = 1.0e0+(x+x2*s3); s7 = 1.0e0+(x+x2*s5); s9 = 1.0e0+(x+x2*s7); s11 = 1.0e0+(x+x2*s9); /* SET W = DEL(B) - DEL(A + B) */ t = pow(1.0e0/ *b,2.0); w = ((((c5*s11*t+c4*s9)*t+c3*s7)*t+c2*s5)*t+c1*s3)*t+c0; w *= (c/ *b); /* COMBINE THE RESULTS */ T1 = *a/ *b; u = d*alnrel(&T1); v = *a*(log(*b)-1.0e0); if(u <= v) goto S30; algdiv = w-v-u; return algdiv; S30: algdiv = w-u-v; return algdiv; } double alngam(double *x) /* ********************************************************************** double alngam(double *x) double precision LN of the GAMma function Function Returns the natural logarithm of GAMMA(X). Arguments X --> value at which scaled log gamma is to be returned X is DOUBLE PRECISION Method If X .le. 6.0, then use recursion to get X below 3 then apply rational approximation number 5236 of Hart et al, Computer Approximations, John Wiley and Sons, NY, 1968. If X .gt. 6.0, then use recursion to get X to at least 12 and then use formula 5423 of the same source. ********************************************************************** */ { #define hln2pi 0.91893853320467274178e0 static double coef[5] = { 0.83333333333333023564e-1,-0.27777777768818808e-2,0.79365006754279e-3, -0.594997310889e-3,0.8065880899e-3 }; static double scoefd[4] = { 0.62003838007126989331e2,0.9822521104713994894e1,-0.8906016659497461257e1, 0.1000000000000000000e1 }; static double scoefn[9] = { 0.62003838007127258804e2,0.36036772530024836321e2,0.20782472531792126786e2, 0.6338067999387272343e1,0.215994312846059073e1,0.3980671310203570498e0, 0.1093115956710439502e0,0.92381945590275995e-2,0.29737866448101651e-2 }; static int K1 = 9; static int K3 = 4; static int K5 = 5; static double alngam,offset,prod,xx; static int i,n; static double T2,T4,T6; /* .. .. Executable Statements .. */ if(!(*x <= 6.0e0)) goto S70; prod = 1.0e0; xx = *x; if(!(*x > 3.0e0)) goto S30; S10: if(!(xx > 3.0e0)) goto S20; xx -= 1.0e0; prod *= xx; goto S10; S30: S20: if(!(*x < 2.0e0)) goto S60; S40: if(!(xx < 2.0e0)) goto S50; prod /= xx; xx += 1.0e0; goto S40; S60: S50: T2 = xx-2.0e0; T4 = xx-2.0e0; alngam = devlpl(scoefn,&K1,&T2)/devlpl(scoefd,&K3,&T4); /* COMPUTE RATIONAL APPROXIMATION TO GAMMA(X) */ alngam *= prod; alngam = log(alngam); goto S110; S70: offset = hln2pi; /* IF NECESSARY MAKE X AT LEAST 12 AND CARRY CORRECTION IN OFFSET */ n = fifidint(12.0e0-*x); if(!(n > 0)) goto S90; prod = 1.0e0; for(i=1; i<=n; i++) prod *= (*x+(double)(i-1)); offset -= log(prod); xx = *x+(double)n; goto S100; S90: xx = *x; S100: /* COMPUTE POWER SERIES */ T6 = 1.0e0/pow(xx,2.0); alngam = devlpl(coef,&K5,&T6)/xx; alngam += (offset+(xx-0.5e0)*log(xx)-xx); S110: return alngam; #undef hln2pi } double alnrel(double *a) /* ----------------------------------------------------------------------- EVALUATION OF THE FUNCTION LN(1 + A) ----------------------------------------------------------------------- */ { static double p1 = -.129418923021993e+01; static double p2 = .405303492862024e+00; static double p3 = -.178874546012214e-01; static double q1 = -.162752256355323e+01; static double q2 = .747811014037616e+00; static double q3 = -.845104217945565e-01; static double alnrel,t,t2,w,x; /* .. .. Executable Statements .. 
*/ if(fabs(*a) > 0.375e0) goto S10; t = *a/(*a+2.0e0); t2 = t*t; w = (((p3*t2+p2)*t2+p1)*t2+1.0e0)/(((q3*t2+q2)*t2+q1)*t2+1.0e0); alnrel = 2.0e0*t*w; return alnrel; S10: x = 1.e0+*a; alnrel = log(x); return alnrel; } double apser(double *a,double *b,double *x,double *eps) /* ----------------------------------------------------------------------- APSER YIELDS THE INCOMPLETE BETA RATIO I(SUB(1-X))(B,A) FOR A .LE. MIN(EPS,EPS*B), B*X .LE. 1, AND X .LE. 0.5. USED WHEN A IS VERY SMALL. USE ONLY IF ABOVE INEQUALITIES ARE SATISFIED. ----------------------------------------------------------------------- */ { static double g = .577215664901533e0; static double apser,aj,bx,c,j,s,t,tol; /* .. .. Executable Statements .. */ bx = *b**x; t = *x-bx; if(*b**eps > 2.e-2) goto S10; c = log(*x)+psi(b)+g+t; goto S20; S10: c = log(bx)+g+t; S20: tol = 5.0e0**eps*fabs(c); j = 1.0e0; s = 0.0e0; S30: j += 1.0e0; t *= (*x-bx/j); aj = t/j; s += aj; if(fabs(aj) > tol) goto S30; apser = -(*a*(c+s)); return apser; } double basym(double *a,double *b,double *lambda,double *eps) /* ----------------------------------------------------------------------- ASYMPTOTIC EXPANSION FOR IX(A,B) FOR LARGE A AND B. LAMBDA = (A + B)*Y - B AND EPS IS THE TOLERANCE USED. IT IS ASSUMED THAT LAMBDA IS NONNEGATIVE AND THAT A AND B ARE GREATER THAN OR EQUAL TO 15. ----------------------------------------------------------------------- */ { static double e0 = 1.12837916709551e0; static double e1 = .353553390593274e0; static int num = 20; /* ------------------------ ****** NUM IS THE MAXIMUM VALUE THAT N CAN TAKE IN THE DO LOOP ENDING AT STATEMENT 50. IT IS REQUIRED THAT NUM BE EVEN. THE ARRAYS A0, B0, C, D HAVE DIMENSION NUM + 1. ------------------------ E0 = 2/SQRT(PI) E1 = 2**(-3/2) ------------------------ */ static int K3 = 1; static double basym,bsum,dsum,f,h,h2,hn,j0,j1,r,r0,r1,s,sum,t,t0,t1,u,w,w0,z,z0, z2,zn,znm1; static int i,im1,imj,j,m,mm1,mmj,n,np1; static double a0[21],b0[21],c[21],d[21],T1,T2; /* .. .. Executable Statements .. 
*/ basym = 0.0e0; if(*a >= *b) goto S10; h = *a/ *b; r0 = 1.0e0/(1.0e0+h); r1 = (*b-*a)/ *b; w0 = 1.0e0/sqrt(*a*(1.0e0+h)); goto S20; S10: h = *b/ *a; r0 = 1.0e0/(1.0e0+h); r1 = (*b-*a)/ *a; w0 = 1.0e0/sqrt(*b*(1.0e0+h)); S20: T1 = -(*lambda/ *a); T2 = *lambda/ *b; f = *a*rlog1(&T1)+*b*rlog1(&T2); t = exp(-f); if(t == 0.0e0) return basym; z0 = sqrt(f); z = 0.5e0*(z0/e1); z2 = f+f; a0[0] = 2.0e0/3.0e0*r1; c[0] = -(0.5e0*a0[0]); d[0] = -c[0]; j0 = 0.5e0/e0*erfc1(&K3,&z0); j1 = e1; sum = j0+d[0]*w0*j1; s = 1.0e0; h2 = h*h; hn = 1.0e0; w = w0; znm1 = z; zn = z2; for(n=2; n<=num; n+=2) { hn = h2*hn; a0[n-1] = 2.0e0*r0*(1.0e0+h*hn)/((double)n+2.0e0); np1 = n+1; s += hn; a0[np1-1] = 2.0e0*r1*s/((double)n+3.0e0); for(i=n; i<=np1; i++) { r = -(0.5e0*((double)i+1.0e0)); b0[0] = r*a0[0]; for(m=2; m<=i; m++) { bsum = 0.0e0; mm1 = m-1; for(j=1; j<=mm1; j++) { mmj = m-j; bsum += (((double)j*r-(double)mmj)*a0[j-1]*b0[mmj-1]); } b0[m-1] = r*a0[m-1]+bsum/(double)m; } c[i-1] = b0[i-1]/((double)i+1.0e0); dsum = 0.0e0; im1 = i-1; for(j=1; j<=im1; j++) { imj = i-j; dsum += (d[imj-1]*c[j-1]); } d[i-1] = -(dsum+c[i-1]); } j0 = e1*znm1+((double)n-1.0e0)*j0; j1 = e1*zn+(double)n*j1; znm1 = z2*znm1; zn = z2*zn; w = w0*w; t0 = d[n-1]*w*j0; w = w0*w; t1 = d[np1-1]*w*j1; sum += (t0+t1); if(fabs(t0)+fabs(t1) <= *eps*sum) goto S80; } S80: u = exp(-bcorr(a,b)); basym = e0*t*u*sum; return basym; } double bcorr(double *a0,double *b0) /* ----------------------------------------------------------------------- EVALUATION OF DEL(A0) + DEL(B0) - DEL(A0 + B0) WHERE LN(GAMMA(A)) = (A - 0.5)*LN(A) - A + 0.5*LN(2*PI) + DEL(A). IT IS ASSUMED THAT A0 .GE. 8 AND B0 .GE. 8. ----------------------------------------------------------------------- */ { static double c0 = .833333333333333e-01; static double c1 = -.277777777760991e-02; static double c2 = .793650666825390e-03; static double c3 = -.595202931351870e-03; static double c4 = .837308034031215e-03; static double c5 = -.165322962780713e-02; static double bcorr,a,b,c,h,s11,s3,s5,s7,s9,t,w,x,x2; /* .. .. Executable Statements .. */ a = fifdmin1(*a0,*b0); b = fifdmax1(*a0,*b0); h = a/b; c = h/(1.0e0+h); x = 1.0e0/(1.0e0+h); x2 = x*x; /* SET SN = (1 - X**N)/(1 - X) */ s3 = 1.0e0+(x+x2); s5 = 1.0e0+(x+x2*s3); s7 = 1.0e0+(x+x2*s5); s9 = 1.0e0+(x+x2*s7); s11 = 1.0e0+(x+x2*s9); /* SET W = DEL(B) - DEL(A + B) */ t = pow(1.0e0/b,2.0); w = ((((c5*s11*t+c4*s9)*t+c3*s7)*t+c2*s5)*t+c1*s3)*t+c0; w *= (c/b); /* COMPUTE DEL(A) + W */ t = pow(1.0e0/a,2.0); bcorr = (((((c5*t+c4)*t+c3)*t+c2)*t+c1)*t+c0)/a+w; return bcorr; } double betaln(double *a0,double *b0) /* ----------------------------------------------------------------------- EVALUATION OF THE LOGARITHM OF THE BETA FUNCTION ----------------------------------------------------------------------- E = 0.5*LN(2*PI) -------------------------- */ { static double e = .918938533204673e0; static double betaln,a,b,c,h,u,v,w,z; static int i,n; static double T1; /* .. .. Executable Statements .. */ a = fifdmin1(*a0,*b0); b = fifdmax1(*a0,*b0); if(a >= 8.0e0) goto S100; if(a >= 1.0e0) goto S20; /* ----------------------------------------------------------------------- PROCEDURE WHEN A .LT. 1 ----------------------------------------------------------------------- */ if(b >= 8.0e0) goto S10; T1 = a+b; betaln = gamln(&a)+(gamln(&b)-gamln(&T1)); return betaln; S10: betaln = gamln(&a)+algdiv(&a,&b); return betaln; S20: /* ----------------------------------------------------------------------- PROCEDURE WHEN 1 .LE. A .LT. 
8 ----------------------------------------------------------------------- */ if(a > 2.0e0) goto S40; if(b > 2.0e0) goto S30; betaln = gamln(&a)+gamln(&b)-gsumln(&a,&b); return betaln; S30: w = 0.0e0; if(b < 8.0e0) goto S60; betaln = gamln(&a)+algdiv(&a,&b); return betaln; S40: /* REDUCTION OF A WHEN B .LE. 1000 */ if(b > 1000.0e0) goto S80; n = int(a-1.0e0); w = 1.0e0; for(i=1; i<=n; i++) { a -= 1.0e0; h = a/b; w *= (h/(1.0e0+h)); } w = log(w); if(b < 8.0e0) goto S60; betaln = w+gamln(&a)+algdiv(&a,&b); return betaln; S60: /* REDUCTION OF B WHEN B .LT. 8 */ n = int(b-1.0e0); z = 1.0e0; for(i=1; i<=n; i++) { b -= 1.0e0; z *= (b/(a+b)); } betaln = w+log(z)+(gamln(&a)+(gamln(&b)-gsumln(&a,&b))); return betaln; S80: /* REDUCTION OF A WHEN B .GT. 1000 */ n = int(a-1.0e0); w = 1.0e0; for(i=1; i<=n; i++) { a -= 1.0e0; w *= (a/(1.0e0+a/b)); } betaln = log(w)-(double)n*log(b)+(gamln(&a)+algdiv(&a,&b)); return betaln; S100: /* ----------------------------------------------------------------------- PROCEDURE WHEN A .GE. 8 ----------------------------------------------------------------------- */ w = bcorr(&a,&b); h = a/b; c = h/(1.0e0+h); u = -((a-0.5e0)*log(c)); v = b*alnrel(&h); if(u <= v) goto S110; betaln = -(0.5e0*log(b))+e+w-v-u; return betaln; S110: betaln = -(0.5e0*log(b))+e+w-u-v; return betaln; } double bfrac(double *a,double *b,double *x,double *y,double *lambda, double *eps) /* ----------------------------------------------------------------------- CONTINUED FRACTION EXPANSION FOR IX(A,B) WHEN A,B .GT. 1. IT IS ASSUMED THAT LAMBDA = (A + B)*Y - B. ----------------------------------------------------------------------- */ { static double bfrac,alpha,an,anp1,beta,bn,bnp1,c,c0,c1,e,n,p,r,r0,s,t,w,yp1; /* .. .. Executable Statements .. */ bfrac = brcomp(a,b,x,y); if(bfrac == 0.0e0) return bfrac; c = 1.0e0+*lambda; c0 = *b/ *a; c1 = 1.0e0+1.0e0/ *a; yp1 = *y+1.0e0; n = 0.0e0; p = 1.0e0; s = *a+1.0e0; an = 0.0e0; bn = anp1 = 1.0e0; bnp1 = c/c1; r = c1/c; S10: /* CONTINUED FRACTION CALCULATION */ n += 1.0e0; t = n/ *a; w = n*(*b-n)**x; e = *a/s; alpha = p*(p+c0)*e*e*(w**x); e = (1.0e0+t)/(c1+t+t); beta = n+w/s+e*(c+n*yp1); p = 1.0e0+t; s += 2.0e0; /* UPDATE AN, BN, ANP1, AND BNP1 */ t = alpha*an+beta*anp1; an = anp1; anp1 = t; t = alpha*bn+beta*bnp1; bn = bnp1; bnp1 = t; r0 = r; r = anp1/bnp1; if(fabs(r-r0) <= *eps*r) goto S20; /* RESCALE AN, BN, ANP1, AND BNP1 */ an /= bnp1; bn /= bnp1; anp1 = r; bnp1 = 1.0e0; goto S10; S20: /* TERMINATION */ bfrac *= r; return bfrac; } void bgrat(double *a,double *b,double *x,double *y,double *w, double *eps,int *ierr) /* ----------------------------------------------------------------------- ASYMPTOTIC EXPANSION FOR IX(A,B) WHEN A IS LARGER THAN B. THE RESULT OF THE EXPANSION IS ADDED TO W. IT IS ASSUMED THAT A .GE. 15 AND B .LE. 1. EPS IS THE TOLERANCE USED. IERR IS A VARIABLE THAT REPORTS THE STATUS OF THE RESULTS. ----------------------------------------------------------------------- */ { static double bm1,bp2n,cn,coef,dj,j,l,lnx,n2,nu,p,q,r,s,sum,t,t2,u,v,z; static int i,n,nm1; static double c[30],d[30],T1; /* .. .. Executable Statements .. 
*/ bm1 = *b-0.5e0-0.5e0; nu = *a+0.5e0*bm1; if(*y > 0.375e0) goto S10; T1 = -*y; lnx = alnrel(&T1); goto S20; S10: lnx = log(*x); S20: z = -(nu*lnx); if(*b*z == 0.0e0) goto S70; /* COMPUTATION OF THE EXPANSION SET R = EXP(-Z)*Z**B/GAMMA(B) */ r = *b*(1.0e0+gam1(b))*exp(*b*log(z)); r *= (exp(*a*lnx)*exp(0.5e0*bm1*lnx)); u = algdiv(b,a)+*b*log(nu); u = r*exp(-u); if(u == 0.0e0) goto S70; grat1(b,&z,&r,&p,&q,eps); v = 0.25e0*pow(1.0e0/nu,2.0); t2 = 0.25e0*lnx*lnx; l = *w/u; j = q/r; sum = j; t = cn = 1.0e0; n2 = 0.0e0; for(n=1; n<=30; n++) { bp2n = *b+n2; j = (bp2n*(bp2n+1.0e0)*j+(z+bp2n+1.0e0)*t)*v; n2 += 2.0e0; t *= t2; cn /= (n2*(n2+1.0e0)); c[n-1] = cn; s = 0.0e0; if(n == 1) goto S40; nm1 = n-1; coef = *b-(double)n; for(i=1; i<=nm1; i++) { s += (coef*c[i-1]*d[n-i-1]); coef += *b; } S40: d[n-1] = bm1*cn+s/(double)n; dj = d[n-1]*j; sum += dj; if(sum <= 0.0e0) goto S70; if(fabs(dj) <= *eps*(sum+l)) goto S60; } S60: /* ADD THE RESULTS TO W */ *ierr = 0; *w += (u*sum); return; S70: /* THE EXPANSION CANNOT BE COMPUTED */ *ierr = 1; return; } double bpser(double *a,double *b,double *x,double *eps) /* ----------------------------------------------------------------------- POWER SERIES EXPANSION FOR EVALUATING IX(A,B) WHEN B .LE. 1 OR B*X .LE. 0.7. EPS IS THE TOLERANCE USED. ----------------------------------------------------------------------- */ { static double bpser,a0,apb,b0,c,n,sum,t,tol,u,w,z; static int i,m; /* .. .. Executable Statements .. */ bpser = 0.0e0; if(*x == 0.0e0) return bpser; /* ----------------------------------------------------------------------- COMPUTE THE FACTOR X**A/(A*BETA(A,B)) ----------------------------------------------------------------------- */ a0 = fifdmin1(*a,*b); if(a0 < 1.0e0) goto S10; z = *a*log(*x)-betaln(a,b); bpser = exp(z)/ *a; goto S100; S10: b0 = fifdmax1(*a,*b); if(b0 >= 8.0e0) goto S90; if(b0 > 1.0e0) goto S40; /* PROCEDURE FOR A0 .LT. 1 AND B0 .LE. 1 */ bpser = pow(*x,*a); if(bpser == 0.0e0) return bpser; apb = *a+*b; if(apb > 1.0e0) goto S20; z = 1.0e0+gam1(&apb); goto S30; S20: u = *a+*b-1.e0; z = (1.0e0+gam1(&u))/apb; S30: c = (1.0e0+gam1(a))*(1.0e0+gam1(b))/z; bpser *= (c*(*b/apb)); goto S100; S40: /* PROCEDURE FOR A0 .LT. 1 AND 1 .LT. B0 .LT. 8 */ u = gamln1(&a0); m = int(b0-1.0e0); if(m < 1) goto S60; c = 1.0e0; for(i=1; i<=m; i++) { b0 -= 1.0e0; c *= (b0/(a0+b0)); } u = log(c)+u; S60: z = *a*log(*x)-u; b0 -= 1.0e0; apb = a0+b0; if(apb > 1.0e0) goto S70; t = 1.0e0+gam1(&apb); goto S80; S70: u = a0+b0-1.e0; t = (1.0e0+gam1(&u))/apb; S80: bpser = exp(z)*(a0/ *a)*(1.0e0+gam1(&b0))/t; goto S100; S90: /* PROCEDURE FOR A0 .LT. 1 AND B0 .GE. 8 */ u = gamln1(&a0)+algdiv(&a0,&b0); z = *a*log(*x)-u; bpser = a0/ *a*exp(z); S100: if(bpser == 0.0e0 || *a <= 0.1e0**eps) return bpser; /* ----------------------------------------------------------------------- COMPUTE THE SERIES ----------------------------------------------------------------------- */ sum = n = 0.0e0; c = 1.0e0; tol = *eps/ *a; S110: n += 1.0e0; c *= ((0.5e0+(0.5e0-*b/n))**x); w = c/(*a+n); sum += w; if(fabs(w) > tol) goto S110; bpser *= (1.0e0+*a*sum); return bpser; } void bratio(double *a,double *b,double *x,double *y,double *w, double *w1,int *ierr) /* ----------------------------------------------------------------------- EVALUATION OF THE INCOMPLETE BETA FUNCTION IX(A,B) -------------------- IT IS ASSUMED THAT A AND B ARE NONNEGATIVE, AND THAT X .LE. 1 AND Y = 1 - X. 
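(Worked example, illustration only: with A = 2, B = 3 and X = Y = 0.5, the ratio IX(A,B) = 11/16 = 0.6875, so W = 0.6875 and W1 = 0.3125.)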
BRATIO ASSIGNS W AND W1 THE VALUES W = IX(A,B) W1 = 1 - IX(A,B) IERR IS A VARIABLE THAT REPORTS THE STATUS OF THE RESULTS. IF NO INPUT ERRORS ARE DETECTED THEN IERR IS SET TO 0 AND W AND W1 ARE COMPUTED. OTHERWISE, IF AN ERROR IS DETECTED, THEN W AND W1 ARE ASSIGNED THE VALUE 0 AND IERR IS SET TO ONE OF THE FOLLOWING VALUES ... IERR = 1 IF A OR B IS NEGATIVE IERR = 2 IF A = B = 0 IERR = 3 IF X .LT. 0 OR X .GT. 1 IERR = 4 IF Y .LT. 0 OR Y .GT. 1 IERR = 5 IF X + Y .NE. 1 IERR = 6 IF X = A = 0 IERR = 7 IF Y = B = 0 -------------------- WRITTEN BY ALFRED H. MORRIS, JR. NAVAL SURFACE WARFARE CENTER DAHLGREN, VIRGINIA REVISED ... NOV 1991 ----------------------------------------------------------------------- */ { static int K1 = 1; static double a0,b0,eps,lambda,t,x0,y0,z; static int ierr1,ind,n; static double T2,T3,T4,T5; /* .. .. Executable Statements .. */ /* ****** EPS IS A MACHINE DEPENDENT CONSTANT. EPS IS THE SMALLEST FLOATING POINT NUMBER FOR WHICH 1.0 + EPS .GT. 1.0 */ eps = spmpar(&K1); *w = *w1 = 0.0e0; if(*a < 0.0e0 || *b < 0.0e0) goto S270; if(*a == 0.0e0 && *b == 0.0e0) goto S280; if(*x < 0.0e0 || *x > 1.0e0) goto S290; if(*y < 0.0e0 || *y > 1.0e0) goto S300; z = *x+*y-0.5e0-0.5e0; if(fabs(z) > 3.0e0*eps) goto S310; *ierr = 0; if(*x == 0.0e0) goto S210; if(*y == 0.0e0) goto S230; if(*a == 0.0e0) goto S240; if(*b == 0.0e0) goto S220; eps = fifdmax1(eps,1.e-15); if(fifdmax1(*a,*b) < 1.e-3*eps) goto S260; ind = 0; a0 = *a; b0 = *b; x0 = *x; y0 = *y; if(fifdmin1(a0,b0) > 1.0e0) goto S40; /* PROCEDURE FOR A0 .LE. 1 OR B0 .LE. 1 */ if(*x <= 0.5e0) goto S10; ind = 1; a0 = *b; b0 = *a; x0 = *y; y0 = *x; S10: if(b0 < fifdmin1(eps,eps*a0)) goto S90; if(a0 < fifdmin1(eps,eps*b0) && b0*x0 <= 1.0e0) goto S100; if(fifdmax1(a0,b0) > 1.0e0) goto S20; if(a0 >= fifdmin1(0.2e0,b0)) goto S110; if(pow(x0,a0) <= 0.9e0) goto S110; if(x0 >= 0.3e0) goto S120; n = 20; goto S140; S20: if(b0 <= 1.0e0) goto S110; if(x0 >= 0.3e0) goto S120; if(x0 >= 0.1e0) goto S30; if(pow(x0*b0,a0) <= 0.7e0) goto S110; S30: if(b0 > 15.0e0) goto S150; n = 20; goto S140; S40: /* PROCEDURE FOR A0 .GT. 1 AND B0 .GT. 
1 */ if(*a > *b) goto S50; lambda = *a-(*a+*b)**x; goto S60; S50: lambda = (*a+*b)**y-*b; S60: if(lambda >= 0.0e0) goto S70; ind = 1; a0 = *b; b0 = *a; x0 = *y; y0 = *x; lambda = fabs(lambda); S70: if(b0 < 40.0e0 && b0*x0 <= 0.7e0) goto S110; if(b0 < 40.0e0) goto S160; if(a0 > b0) goto S80; if(a0 <= 100.0e0) goto S130; if(lambda > 0.03e0*a0) goto S130; goto S200; S80: if(b0 <= 100.0e0) goto S130; if(lambda > 0.03e0*b0) goto S130; goto S200; S90: /* EVALUATION OF THE APPROPRIATE ALGORITHM */ *w = fpser(&a0,&b0,&x0,&eps); *w1 = 0.5e0+(0.5e0-*w); goto S250; S100: *w1 = apser(&a0,&b0,&x0,&eps); *w = 0.5e0+(0.5e0-*w1); goto S250; S110: *w = bpser(&a0,&b0,&x0,&eps); *w1 = 0.5e0+(0.5e0-*w); goto S250; S120: *w1 = bpser(&b0,&a0,&y0,&eps); *w = 0.5e0+(0.5e0-*w1); goto S250; S130: T2 = 15.0e0*eps; *w = bfrac(&a0,&b0,&x0,&y0,&lambda,&T2); *w1 = 0.5e0+(0.5e0-*w); goto S250; S140: *w1 = bup(&b0,&a0,&y0,&x0,&n,&eps); b0 += (double)n; S150: T3 = 15.0e0*eps; bgrat(&b0,&a0,&y0,&x0,w1,&T3,&ierr1); *w = 0.5e0+(0.5e0-*w1); goto S250; S160: n = (int)b0; b0 -= (double)n; if(b0 != 0.0e0) goto S170; n -= 1; b0 = 1.0e0; S170: *w = bup(&b0,&a0,&y0,&x0,&n,&eps); if(x0 > 0.7e0) goto S180; *w += bpser(&a0,&b0,&x0,&eps); *w1 = 0.5e0+(0.5e0-*w); goto S250; S180: if(a0 > 15.0e0) goto S190; n = 20; *w += bup(&a0,&b0,&x0,&y0,&n,&eps); a0 += (double)n; S190: T4 = 15.0e0*eps; bgrat(&a0,&b0,&x0,&y0,w,&T4,&ierr1); *w1 = 0.5e0+(0.5e0-*w); goto S250; S200: T5 = 100.0e0*eps; *w = basym(&a0,&b0,&lambda,&T5); *w1 = 0.5e0+(0.5e0-*w); goto S250; S210: /* TERMINATION OF THE PROCEDURE */ if(*a == 0.0e0) goto S320; S220: *w = 0.0e0; *w1 = 1.0e0; return; S230: if(*b == 0.0e0) goto S330; S240: *w = 1.0e0; *w1 = 0.0e0; return; S250: if(ind == 0) return; t = *w; *w = *w1; *w1 = t; return; S260: /* PROCEDURE FOR A AND B .LT. 1.E-3*EPS */ *w = *b/(*a+*b); *w1 = *a/(*a+*b); return; S270: /* ERROR RETURN */ *ierr = 1; return; S280: *ierr = 2; return; S290: *ierr = 3; return; S300: *ierr = 4; return; S310: *ierr = 5; return; S320: *ierr = 6; return; S330: *ierr = 7; return; } double brcmp1(int *mu,double *a,double *b,double *x,double *y) /* ----------------------------------------------------------------------- EVALUATION OF EXP(MU) * (X**A*Y**B/BETA(A,B)) ----------------------------------------------------------------------- */ { static double Const = .398942280401433e0; static double brcmp1,a0,apb,b0,c,e,h,lambda,lnx,lny,t,u,v,x0,y0,z; static int i,n; /* ----------------- CONST = 1/SQRT(2*PI) ----------------- */ static double T1,T2,T3,T4; /* .. .. Executable Statements .. */ a0 = fifdmin1(*a,*b); if(a0 >= 8.0e0) goto S130; if(*x > 0.375e0) goto S10; lnx = log(*x); T1 = -*x; lny = alnrel(&T1); goto S30; S10: if(*y > 0.375e0) goto S20; T2 = -*y; lnx = alnrel(&T2); lny = log(*y); goto S30; S20: lnx = log(*x); lny = log(*y); S30: z = *a*lnx+*b*lny; if(a0 < 1.0e0) goto S40; z -= betaln(a,b); brcmp1 = esum(mu,&z); return brcmp1; S40: /* ----------------------------------------------------------------------- PROCEDURE FOR A .LT. 1 OR B .LT. 1 ----------------------------------------------------------------------- */ b0 = fifdmax1(*a,*b); if(b0 >= 8.0e0) goto S120; if(b0 > 1.0e0) goto S70; /* ALGORITHM FOR B0 .LE. 1 */ brcmp1 = esum(mu,&z); if(brcmp1 == 0.0e0) return brcmp1; apb = *a+*b; if(apb > 1.0e0) goto S50; z = 1.0e0+gam1(&apb); goto S60; S50: u = *a+*b-1.e0; z = (1.0e0+gam1(&u))/apb; S60: c = (1.0e0+gam1(a))*(1.0e0+gam1(b))/z; brcmp1 = brcmp1*(a0*c)/(1.0e0+a0/b0); return brcmp1; S70: /* ALGORITHM FOR 1 .LT. B0 .LT. 
8 */ u = gamln1(&a0); n = int(b0-1.0e0); if(n < 1) goto S90; c = 1.0e0; for(i=1; i<=n; i++) { b0 -= 1.0e0; c *= (b0/(a0+b0)); } u = log(c)+u; S90: z -= u; b0 -= 1.0e0; apb = a0+b0; if(apb > 1.0e0) goto S100; t = 1.0e0+gam1(&apb); goto S110; S100: u = a0+b0-1.e0; t = (1.0e0+gam1(&u))/apb; S110: brcmp1 = a0*esum(mu,&z)*(1.0e0+gam1(&b0))/t; return brcmp1; S120: /* ALGORITHM FOR B0 .GE. 8 */ u = gamln1(&a0)+algdiv(&a0,&b0); T3 = z-u; brcmp1 = a0*esum(mu,&T3); return brcmp1; S130: /* ----------------------------------------------------------------------- PROCEDURE FOR A .GE. 8 AND B .GE. 8 ----------------------------------------------------------------------- */ if(*a > *b) goto S140; h = *a/ *b; x0 = h/(1.0e0+h); y0 = 1.0e0/(1.0e0+h); lambda = *a-(*a+*b)**x; goto S150; S140: h = *b/ *a; x0 = 1.0e0/(1.0e0+h); y0 = h/(1.0e0+h); lambda = (*a+*b)**y-*b; S150: e = -(lambda/ *a); if(fabs(e) > 0.6e0) goto S160; u = rlog1(&e); goto S170; S160: u = e-log(*x/x0); S170: e = lambda/ *b; if(fabs(e) > 0.6e0) goto S180; v = rlog1(&e); goto S190; S180: v = e-log(*y/y0); S190: T4 = -(*a*u+*b*v); z = esum(mu,&T4); brcmp1 = Const*sqrt(*b*x0)*z*exp(-bcorr(a,b)); return brcmp1; } double brcomp(double *a,double *b,double *x,double *y) /* ----------------------------------------------------------------------- EVALUATION OF X**A*Y**B/BETA(A,B) ----------------------------------------------------------------------- */ { static double Const = .398942280401433e0; static double brcomp,a0,apb,b0,c,e,h,lambda,lnx,lny,t,u,v,x0,y0,z; static int i,n; /* ----------------- CONST = 1/SQRT(2*PI) ----------------- */ static double T1,T2; /* .. .. Executable Statements .. */ brcomp = 0.0e0; if(*x == 0.0e0 || *y == 0.0e0) return brcomp; a0 = fifdmin1(*a,*b); if(a0 >= 8.0e0) goto S130; if(*x > 0.375e0) goto S10; lnx = log(*x); T1 = -*x; lny = alnrel(&T1); goto S30; S10: if(*y > 0.375e0) goto S20; T2 = -*y; lnx = alnrel(&T2); lny = log(*y); goto S30; S20: lnx = log(*x); lny = log(*y); S30: z = *a*lnx+*b*lny; if(a0 < 1.0e0) goto S40; z -= betaln(a,b); brcomp = exp(z); return brcomp; S40: /* ----------------------------------------------------------------------- PROCEDURE FOR A .LT. 1 OR B .LT. 1 ----------------------------------------------------------------------- */ b0 = fifdmax1(*a,*b); if(b0 >= 8.0e0) goto S120; if(b0 > 1.0e0) goto S70; /* ALGORITHM FOR B0 .LE. 1 */ brcomp = exp(z); if(brcomp == 0.0e0) return brcomp; apb = *a+*b; if(apb > 1.0e0) goto S50; z = 1.0e0+gam1(&apb); goto S60; S50: u = *a+*b-1.e0; z = (1.0e0+gam1(&u))/apb; S60: c = (1.0e0+gam1(a))*(1.0e0+gam1(b))/z; brcomp = brcomp*(a0*c)/(1.0e0+a0/b0); return brcomp; S70: /* ALGORITHM FOR 1 .LT. B0 .LT. 8 */ u = gamln1(&a0); n = int(b0-1.0e0); if(n < 1) goto S90; c = 1.0e0; for(i=1; i<=n; i++) { b0 -= 1.0e0; c *= (b0/(a0+b0)); } u = log(c)+u; S90: z -= u; b0 -= 1.0e0; apb = a0+b0; if(apb > 1.0e0) goto S100; t = 1.0e0+gam1(&apb); goto S110; S100: u = a0+b0-1.e0; t = (1.0e0+gam1(&u))/apb; S110: brcomp = a0*exp(z)*(1.0e0+gam1(&b0))/t; return brcomp; S120: /* ALGORITHM FOR B0 .GE. 8 */ u = gamln1(&a0)+algdiv(&a0,&b0); brcomp = a0*exp(z-u); return brcomp; S130: /* ----------------------------------------------------------------------- PROCEDURE FOR A .GE. 8 AND B .GE. 
8 ----------------------------------------------------------------------- */ if(*a > *b) goto S140; h = *a/ *b; x0 = h/(1.0e0+h); y0 = 1.0e0/(1.0e0+h); lambda = *a-(*a+*b)**x; goto S150; S140: h = *b/ *a; x0 = 1.0e0/(1.0e0+h); y0 = h/(1.0e0+h); lambda = (*a+*b)**y-*b; S150: e = -(lambda/ *a); if(fabs(e) > 0.6e0) goto S160; u = rlog1(&e); goto S170; S160: u = e-log(*x/x0); S170: e = lambda/ *b; if(fabs(e) > 0.6e0) goto S180; v = rlog1(&e); goto S190; S180: v = e-log(*y/y0); S190: z = exp(-(*a*u+*b*v)); brcomp = Const*sqrt(*b*x0)*z*exp(-bcorr(a,b)); return brcomp; } double bup(double *a,double *b,double *x,double *y,int *n,double *eps) /* ----------------------------------------------------------------------- EVALUATION OF IX(A,B) - IX(A+N,B) WHERE N IS A POSITIVE INTEGER. EPS IS THE TOLERANCE USED. ----------------------------------------------------------------------- */ { static int K1 = 1; static int K2 = 0; static double bup,ap1,apb,d,l,r,t,w; static int i,k,kp1,mu,nm1; /* .. .. Executable Statements .. */ /* OBTAIN THE SCALING FACTOR EXP(-MU) AND EXP(MU)*(X**A*Y**B/BETA(A,B))/A */ apb = *a+*b; ap1 = *a+1.0e0; mu = 0; d = 1.0e0; if(*n == 1 || *a < 1.0e0) goto S10; if(apb < 1.1e0*ap1) goto S10; mu = int(fabs(exparg(&K1))); k = int(exparg(&K2)); if(k < mu) mu = k; t = mu; d = exp(-t); S10: bup = brcmp1(&mu,a,b,x,y)/ *a; if(*n == 1 || bup == 0.0e0) return bup; nm1 = *n-1; w = d; /* LET K BE THE INDEX OF THE MAXIMUM TERM */ k = 0; if(*b <= 1.0e0) goto S50; if(*y > 1.e-4) goto S20; k = nm1; goto S30; S20: r = (*b-1.0e0)**x/ *y-*a; if(r < 1.0e0) goto S50; t = nm1; k = nm1; if(r < t) k = (int)r; S30: /* ADD THE INCREASING TERMS OF THE SERIES */ for(i=1; i<=k; i++) { l = i-1; d = (apb+l)/(ap1+l)**x*d; w += d; } if(k == nm1) goto S70; S50: /* ADD THE REMAINING TERMS OF THE SERIES */ kp1 = k+1; for(i=kp1; i<=nm1; i++) { l = i-1; d = (apb+l)/(ap1+l)**x*d; w += d; if(d <= *eps*w) goto S70; } S70: /* TERMINATE THE PROCEDURE */ bup *= w; return bup; } void cdfbet(int *which,double *p,double *q,double *x,double *y, double *a,double *b,int *status,double *bound) /********************************************************************** void cdfbet(int *which,double *p,double *q,double *x,double *y, double *a,double *b,int *status,double *bound) Cumulative Distribution Function BETa Distribution Function Calculates any one parameter of the beta distribution given values for the others. Arguments WHICH --> Integer indicating which of the next four argument values is to be calculated from the others. Legal range: 1..4 iwhich = 1 : Calculate P and Q from X,Y,A and B iwhich = 2 : Calculate X and Y from P,Q,A and B iwhich = 3 : Calculate A from P,Q,X,Y and B iwhich = 4 : Calculate B from P,Q,X,Y and A P <--> The integral from 0 to X of the chi-square distribution. Input range: [0, 1]. Q <--> 1-P. Input range: [0, 1]. P + Q = 1.0. X <--> Upper limit of integration of beta density. Input range: [0,1]. Search range: [0,1] Y <--> 1-X. Input range: [0,1]. Search range: [0,1] X + Y = 1.0. A <--> The first parameter of the beta density. Input range: (0, +infinity). Search range: [1D-300,1D300] B <--> The second parameter of the beta density. Input range: (0, +infinity). Search range: [1D-300,1D300] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 4 if X + Y .ne. 
1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Cumulative distribution function (P) is calculated directly by code associated with the following reference. DiDinato, A. R. and Morris, A. H. Algorithm 708: Significant Digit Computation of the Incomplete Beta Function Ratios. ACM Trans. Math. Softw. 18 (1993), 360-373. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. Note The beta density is proportional to t^(A-1) * (1-t)^(B-1) **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define inf 1.0e300 #define one 1.0e0 static int K1 = 1; static double K2 = 0.0e0; static double K3 = 1.0e0; static double K8 = 0.5e0; static double K9 = 5.0e0; static double fx,xhi,xlo,cum,ccum,xy,pq; static unsigned long qhi,qleft,qporq; static double T4,T5,T6,T7,T10,T11,T12,T13,T14,T15; /* .. .. Executable Statements .. */ /* Check arguments */ if(!(*which < 1 || *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q < 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q < 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 2) goto S150; /* X */ if(!(*x < 0.0e0 || *x > 1.0e0)) goto S140; if(!(*x < 0.0e0)) goto S120; *bound = 0.0e0; goto S130; S120: *bound = 1.0e0; S130: *status = -4; return; S150: S140: if(*which == 2) goto S190; /* Y */ if(!(*y < 0.0e0 || *y > 1.0e0)) goto S180; if(!(*y < 0.0e0)) goto S160; *bound = 0.0e0; goto S170; S160: *bound = 1.0e0; S170: *status = -5; return; S190: S180: if(*which == 3) goto S210; /* A */ if(!(*a <= 0.0e0)) goto S200; *bound = 0.0e0; *status = -6; return; S210: S200: if(*which == 4) goto S230; /* B */ if(!(*b <= 0.0e0)) goto S220; *bound = 0.0e0; *status = -7; return; S230: S220: if(*which == 1) goto S270; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S260; if(!(pq < 0.0e0)) goto S240; *bound = 0.0e0; goto S250; S240: *bound = 1.0e0; S250: *status = 3; return; S270: S260: if(*which == 2) goto S310; /* X + Y */ xy = *x+*y; if(!(fabs(xy-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S300; if(!(xy < 0.0e0)) goto S280; *bound = 0.0e0; goto S290; S280: *bound = 1.0e0; S290: *status = 4; return; S310: S300: if(!(*which == 1)) qporq = *p <= *q; /* Select the minimum of P or Q Calculate ANSWERS */ if(1 == *which) { /* Calculating P and Q */ cumbet(x,y,a,b,p,q); *status = 0; } else if(2 == *which) { /* Calculating X and Y */ T4 = atol; T5 = tol; dstzr(&K2,&K3,&T4,&T5); if(!qporq) goto S340; *status = 0; dzror(status,x,&fx,&xlo,&xhi,&qleft,&qhi); *y = one-*x; S320: if(!(*status == 1)) goto S330; cumbet(x,y,a,b,&cum,&ccum); fx = cum-*p; dzror(status,x,&fx,&xlo,&xhi,&qleft,&qhi); *y = one-*x; goto S320; S330: goto S370; S340: *status = 0; dzror(status,y,&fx,&xlo,&xhi,&qleft,&qhi); *x = one-*y; S350: if(!(*status == 1)) goto S360; cumbet(x,y,a,b,&cum,&ccum); fx = ccum-*q; dzror(status,y,&fx,&xlo,&xhi,&qleft,&qhi); *x = one-*y; goto S350; S370: S360: if(!(*status == -1)) goto S400; 
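/* The zero-finder stopped at a search bound for X: STATUS = 1 means the answer appears to lie below the lower search bound (0), STATUS = 2 above the upper search bound (1) */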
if(!qleft) goto S380; *status = 1; *bound = 0.0e0; goto S390; S380: *status = 2; *bound = 1.0e0; S400: S390: ; } else if(3 == *which) { /* Computing A */ *a = 5.0e0; T6 = zero; T7 = inf; T10 = atol; T11 = tol; dstinv(&T6,&T7,&K8,&K8,&K9,&T10,&T11); *status = 0; dinvr(status,a,&fx,&qleft,&qhi); S410: if(!(*status == 1)) goto S440; cumbet(x,y,a,b,&cum,&ccum); if(!qporq) goto S420; fx = cum-*p; goto S430; S420: fx = ccum-*q; S430: dinvr(status,a,&fx,&qleft,&qhi); goto S410; S440: if(!(*status == -1)) goto S470; if(!qleft) goto S450; *status = 1; *bound = zero; goto S460; S450: *status = 2; *bound = inf; S470: S460: ; } else if(4 == *which) { /* Computing B */ *b = 5.0e0; T12 = zero; T13 = inf; T14 = atol; T15 = tol; dstinv(&T12,&T13,&K8,&K8,&K9,&T14,&T15); *status = 0; dinvr(status,b,&fx,&qleft,&qhi); S480: if(!(*status == 1)) goto S510; cumbet(x,y,a,b,&cum,&ccum); if(!qporq) goto S490; fx = cum-*p; goto S500; S490: fx = ccum-*q; S500: dinvr(status,b,&fx,&qleft,&qhi); goto S480; S510: if(!(*status == -1)) goto S540; if(!qleft) goto S520; *status = 1; *bound = zero; goto S530; S520: *status = 2; *bound = inf; S530: ; } S540: return; #undef tol #undef atol #undef zero #undef inf #undef one } void cdfbin(int *which,double *p,double *q,double *s,double *xn, double *pr,double *ompr,int *status,double *bound) /********************************************************************** void cdfbin(int *which,double *p,double *q,double *s,double *xn, double *pr,double *ompr,int *status,double *bound) Cumulative Distribution Function BINomial distribution Function Calculates any one parameter of the binomial distribution given values for the others. Arguments WHICH --> Integer indicating which of the next four argument values is to be calculated from the others. Legal range: 1..4 iwhich = 1 : Calculate P and Q from S,XN,PR and OMPR iwhich = 2 : Calculate S from P,Q,XN,PR and OMPR iwhich = 3 : Calculate XN from P,Q,S,PR and OMPR iwhich = 4 : Calculate PR and OMPR from P,Q,S and XN P <--> The cumulation from 0 to S of the binomial distribution. (Probablility of S or fewer successes in XN trials each with probability of success PR.) Input range: [0,1]. Q <--> 1-P. Input range: [0, 1]. P + Q = 1.0. S <--> The number of successes observed. Input range: [0, XN] Search range: [0, XN] XN <--> The number of binomial trials. Input range: (0, +infinity). Search range: [1E-300, 1E300] PR <--> The probability of success in each binomial trial. Input range: [0,1]. Search range: [0,1] OMPR <--> 1-PR Input range: [0,1]. Search range: [0,1] PR + OMPR = 1.0 STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 4 if PR + OMPR .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.5.24 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the binomial distribution to the cumulative incomplete beta distribution. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. 
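     Example

     An illustrative sketch (the local variable names simply mirror the
     argument list above; the numeric values are arbitrary): to obtain
     the probability of 3 or fewer successes in 10 trials, each with
     success probability 0.5, set WHICH = 1 and call

         int which = 1, status = 0;
         double p, q, bound;
         double s = 3.0, xn = 10.0, pr = 0.5, ompr = 0.5;
         cdfbin(&which, &p, &q, &s, &xn, &pr, &ompr, &status, &bound);

     On return, STATUS should be 0, P holds the cumulative probability
     and Q its complement.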
**********************************************************************/ { #define atol (1.0e-50) #define tol (1.0e-8) #define zero (1.0e-300) #define inf 1.0e300 #define one 1.0e0 static int K1 = 1; static double K2 = 0.0e0; static double K3 = 0.5e0; static double K4 = 5.0e0; static double K11 = 1.0e0; static double fx,xhi,xlo,cum,ccum,pq,prompr; static unsigned long qhi,qleft,qporq; static double T5,T6,T7,T8,T9,T10,T12,T13; /* .. .. Executable Statements .. */ /* Check arguments */ if(!(*which < 1 && *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q < 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q < 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 3) goto S130; /* XN */ if(!(*xn <= 0.0e0)) goto S120; *bound = 0.0e0; *status = -5; return; S130: S120: if(*which == 2) goto S170; /* S */ if(!(*s < 0.0e0 || *which != 3 && *s > *xn)) goto S160; if(!(*s < 0.0e0)) goto S140; *bound = 0.0e0; goto S150; S140: *bound = *xn; S150: *status = -4; return; S170: S160: if(*which == 4) goto S210; /* PR */ if(!(*pr < 0.0e0 || *pr > 1.0e0)) goto S200; if(!(*pr < 0.0e0)) goto S180; *bound = 0.0e0; goto S190; S180: *bound = 1.0e0; S190: *status = -6; return; S210: S200: if(*which == 4) goto S250; /* OMPR */ if(!(*ompr < 0.0e0 || *ompr > 1.0e0)) goto S240; if(!(*ompr < 0.0e0)) goto S220; *bound = 0.0e0; goto S230; S220: *bound = 1.0e0; S230: *status = -7; return; S250: S240: if(*which == 1) goto S290; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S280; if(!(pq < 0.0e0)) goto S260; *bound = 0.0e0; goto S270; S260: *bound = 1.0e0; S270: *status = 3; return; S290: S280: if(*which == 4) goto S330; /* PR + OMPR */ prompr = *pr+*ompr; if(!(fabs(prompr-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S320; if(!(prompr < 0.0e0)) goto S300; *bound = 0.0e0; goto S310; S300: *bound = 1.0e0; S310: *status = 4; return; S330: S320: if(!(*which == 1)) qporq = *p <= *q; /* Select the minimum of P or Q Calculate ANSWERS */ if(1 == *which) { /* Calculating P */ cumbin(s,xn,pr,ompr,p,q); *status = 0; } else if(2 == *which) { /* Calculating S */ *s = 5.0e0; T5 = atol; T6 = tol; dstinv(&K2,xn,&K3,&K3,&K4,&T5,&T6); *status = 0; dinvr(status,s,&fx,&qleft,&qhi); S340: if(!(*status == 1)) goto S370; cumbin(s,xn,pr,ompr,&cum,&ccum); if(!qporq) goto S350; fx = cum-*p; goto S360; S350: fx = ccum-*q; S360: dinvr(status,s,&fx,&qleft,&qhi); goto S340; S370: if(!(*status == -1)) goto S400; if(!qleft) goto S380; *status = 1; *bound = 0.0e0; goto S390; S380: *status = 2; *bound = *xn; S400: S390: ; } else if(3 == *which) { /* Calculating XN */ *xn = 5.0e0; T7 = zero; T8 = inf; T9 = atol; T10 = tol; dstinv(&T7,&T8,&K3,&K3,&K4,&T9,&T10); *status = 0; dinvr(status,xn,&fx,&qleft,&qhi); S410: if(!(*status == 1)) goto S440; cumbin(s,xn,pr,ompr,&cum,&ccum); if(!qporq) goto S420; fx = cum-*p; goto S430; S420: fx = ccum-*q; S430: dinvr(status,xn,&fx,&qleft,&qhi); goto S410; S440: if(!(*status == -1)) goto S470; if(!qleft) goto S450; *status = 1; *bound = zero; goto S460; S450: *status = 2; *bound = inf; S470: S460: ; } else if(4 == *which) { /* Calculating PR and OMPR */ T12 = atol; T13 = tol; dstzr(&K2,&K11,&T12,&T13); if(!qporq) goto S500; *status = 0; 
dzror(status,pr,&fx,&xlo,&xhi,&qleft,&qhi); *ompr = one-*pr; S480: if(!(*status == 1)) goto S490; cumbin(s,xn,pr,ompr,&cum,&ccum); fx = cum-*p; dzror(status,pr,&fx,&xlo,&xhi,&qleft,&qhi); *ompr = one-*pr; goto S480; S490: goto S530; S500: *status = 0; dzror(status,ompr,&fx,&xlo,&xhi,&qleft,&qhi); *pr = one-*ompr; S510: if(!(*status == 1)) goto S520; cumbin(s,xn,pr,ompr,&cum,&ccum); fx = ccum-*q; dzror(status,ompr,&fx,&xlo,&xhi,&qleft,&qhi); *pr = one-*ompr; goto S510; S530: S520: if(!(*status == -1)) goto S560; if(!qleft) goto S540; *status = 1; *bound = 0.0e0; goto S550; S540: *status = 2; *bound = 1.0e0; S550: ; } S560: return; #undef atol #undef tol #undef zero #undef inf #undef one } void cdfchi(int *which,double *p,double *q,double *x,double *df, int *status,double *bound) /********************************************************************** void cdfchi(int *which,double *p,double *q,double *x,double *df, int *status,double *bound) Cumulative Distribution Function CHI-Square distribution Function Calculates any one parameter of the chi-square distribution given values for the others. Arguments WHICH --> Integer indicating which of the next three argument values is to be calculated from the others. Legal range: 1..3 iwhich = 1 : Calculate P and Q from X and DF iwhich = 2 : Calculate X from P,Q and DF iwhich = 3 : Calculate DF from P,Q and X P <--> The integral from 0 to X of the chi-square distribution. Input range: [0, 1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. X <--> Upper limit of integration of the non-central chi-square distribution. Input range: [0, +infinity). Search range: [0,1E300] DF <--> Degrees of freedom of the chi-square distribution. Input range: (0, +infinity). Search range: [ 1E-300, 1E300] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 10 indicates error returned from cumgam. See references in cdfgam BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.4.19 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the chisqure distribution to the incomplete distribution. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define inf 1.0e300 static int K1 = 1; static double K2 = 0.0e0; static double K4 = 0.5e0; static double K5 = 5.0e0; static double fx,cum,ccum,pq,porq; static unsigned long qhi,qleft,qporq; static double T3,T6,T7,T8,T9,T10,T11; /* .. .. Executable Statements .. 
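     Example (an illustrative sketch; the numeric values are arbitrary):
     to turn a chi-square statistic X = 3.84 with DF = 1 into a
     p-value, set WHICH = 1 and read the upper tail from Q:

         int which = 1, status = 0;
         double p, q, bound;
         double x = 3.84, df = 1.0;
         cdfchi(&which, &p, &q, &x, &df, &status, &bound);

     When STATUS is 0, Q is the upper-tail probability (about 0.05 here).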
*/ /* Check arguments */ if(!(*which < 1 || *which > 3)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 3.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 2) goto S130; /* X */ if(!(*x < 0.0e0)) goto S120; *bound = 0.0e0; *status = -4; return; S130: S120: if(*which == 3) goto S150; /* DF */ if(!(*df <= 0.0e0)) goto S140; *bound = 0.0e0; *status = -5; return; S150: S140: if(*which == 1) goto S190; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S180; if(!(pq < 0.0e0)) goto S160; *bound = 0.0e0; goto S170; S160: *bound = 1.0e0; S170: *status = 3; return; S190: S180: if(*which == 1) goto S220; /* Select the minimum of P or Q */ qporq = *p <= *q; if(!qporq) goto S200; porq = *p; goto S210; S200: porq = *q; S220: S210: /* Calculate ANSWERS */ if(1 == *which) { /* Calculating P and Q */ *status = 0; cumchi(x,df,p,q); if(porq > 1.5e0) { *status = 10; return; } } else if(2 == *which) { /* Calculating X */ *x = 5.0e0; T3 = inf; T6 = atol; T7 = tol; dstinv(&K2,&T3,&K4,&K4,&K5,&T6,&T7); *status = 0; dinvr(status,x,&fx,&qleft,&qhi); S230: if(!(*status == 1)) goto S270; cumchi(x,df,&cum,&ccum); if(!qporq) goto S240; fx = cum-*p; goto S250; S240: fx = ccum-*q; S250: if(!(fx+porq > 1.5e0)) goto S260; *status = 10; return; S260: dinvr(status,x,&fx,&qleft,&qhi); goto S230; S270: if(!(*status == -1)) goto S300; if(!qleft) goto S280; *status = 1; *bound = 0.0e0; goto S290; S280: *status = 2; *bound = inf; S300: S290: ; } else if(3 == *which) { /* Calculating DF */ *df = 5.0e0; T8 = zero; T9 = inf; T10 = atol; T11 = tol; dstinv(&T8,&T9,&K4,&K4,&K5,&T10,&T11); *status = 0; dinvr(status,df,&fx,&qleft,&qhi); S310: if(!(*status == 1)) goto S350; cumchi(x,df,&cum,&ccum); if(!qporq) goto S320; fx = cum-*p; goto S330; S320: fx = ccum-*q; S330: if(!(fx+porq > 1.5e0)) goto S340; *status = 10; return; S340: dinvr(status,df,&fx,&qleft,&qhi); goto S310; S350: if(!(*status == -1)) goto S380; if(!qleft) goto S360; *status = 1; *bound = zero; goto S370; S360: *status = 2; *bound = inf; S370: ; } S380: return; #undef tol #undef atol #undef zero #undef inf } void cdfchn(int *which,double *p,double *q,double *x,double *df, double *pnonc,int *status,double *bound) /********************************************************************** void cdfchn(int *which,double *p,double *q,double *x,double *df, double *pnonc,int *status,double *bound) Cumulative Distribution Function Non-central Chi-Square Function Calculates any one parameter of the non-central chi-square distribution given values for the others. Arguments WHICH --> Integer indicating which of the next three argument values is to be calculated from the others. Input range: 1..4 iwhich = 1 : Calculate P and Q from X and DF iwhich = 2 : Calculate X from P,DF and PNONC iwhich = 3 : Calculate DF from P,X and PNONC iwhich = 3 : Calculate PNONC from P,X and DF P <--> The integral from 0 to X of the non-central chi-square distribution. Input range: [0, 1-1E-16). Q <--> 1-P. Q is not used by this subroutine and is only included for similarity with other cdf* routines. X <--> Upper limit of integration of the non-central chi-square distribution. 
Input range: [0, +infinity). Search range: [0,1E300] DF <--> Degrees of freedom of the non-central chi-square distribution. Input range: (0, +infinity). Search range: [ 1E-300, 1E300] PNONC <--> Non-centrality parameter of the non-central chi-square distribution. Input range: [0, +infinity). Search range: [0,1E4] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.4.25 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to compute the cumulative distribution function. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. WARNING The computation time required for this routine is proportional to the noncentrality parameter (PNONC). Very large values of this parameter can consume immense computer resources. This is why the search range is bounded by 10,000. **********************************************************************/ { #define tent4 1.0e4 #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define one (1.0e0-1.0e-16) #define inf 1.0e300 static double K1 = 0.0e0; static double K3 = 0.5e0; static double K4 = 5.0e0; static double fx,cum,ccum; static unsigned long qhi,qleft; static double T2,T5,T6,T7,T8,T9,T10,T11,T12,T13; /* .. .. Executable Statements .. */ /* Check arguments */ if(!(*which < 1 || *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > one)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = one; S50: *status = -2; return; S70: S60: if(*which == 2) goto S90; /* X */ if(!(*x < 0.0e0)) goto S80; *bound = 0.0e0; *status = -4; return; S90: S80: if(*which == 3) goto S110; /* DF */ if(!(*df <= 0.0e0)) goto S100; *bound = 0.0e0; *status = -5; return; S110: S100: if(*which == 4) goto S130; /* PNONC */ if(!(*pnonc < 0.0e0)) goto S120; *bound = 0.0e0; *status = -6; return; S130: S120: /* Calculate ANSWERS */ if(1 == *which) { /* Calculating P and Q */ cumchn(x,df,pnonc,p,q); *status = 0; } else if(2 == *which) { /* Calculating X */ *x = 5.0e0; T2 = inf; T5 = atol; T6 = tol; dstinv(&K1,&T2,&K3,&K3,&K4,&T5,&T6); *status = 0; dinvr(status,x,&fx,&qleft,&qhi); S140: if(!(*status == 1)) goto S150; cumchn(x,df,pnonc,&cum,&ccum); fx = cum-*p; dinvr(status,x,&fx,&qleft,&qhi); goto S140; S150: if(!(*status == -1)) goto S180; if(!qleft) goto S160; *status = 1; *bound = 0.0e0; goto S170; S160: *status = 2; *bound = inf; S180: S170: ; } else if(3 == *which) { /* Calculating DF */ *df = 5.0e0; T7 = zero; T8 = inf; T9 = atol; T10 = tol; dstinv(&T7,&T8,&K3,&K3,&K4,&T9,&T10); *status = 0; dinvr(status,df,&fx,&qleft,&qhi); S190: if(!(*status == 1)) goto S200; cumchn(x,df,pnonc,&cum,&ccum); fx = cum-*p; dinvr(status,df,&fx,&qleft,&qhi); goto S190; S200: if(!(*status == -1)) goto S230; if(!qleft) goto S210; *status = 1; *bound = zero; goto S220; S210: *status = 2; *bound = inf; S230: S220: ; } else if(4 == *which) { /* Calculating PNONC */ *pnonc = 5.0e0; T11 = tent4; T12 = atol; T13 = tol; dstinv(&K1,&T11,&K3,&K3,&K4,&T12,&T13); *status = 0; 
dinvr(status,pnonc,&fx,&qleft,&qhi); S240: if(!(*status == 1)) goto S250; cumchn(x,df,pnonc,&cum,&ccum); fx = cum-*p; dinvr(status,pnonc,&fx,&qleft,&qhi); goto S240; S250: if(!(*status == -1)) goto S280; if(!qleft) goto S260; *status = 1; *bound = zero; goto S270; S260: *status = 2; *bound = tent4; S270: ; } S280: return; #undef tent4 #undef tol #undef atol #undef zero #undef one #undef inf } void cdff(int *which,double *p,double *q,double *f,double *dfn, double *dfd,int *status,double *bound) /********************************************************************** void cdff(int *which,double *p,double *q,double *f,double *dfn, double *dfd,int *status,double *bound) Cumulative Distribution Function F distribution Function Calculates any one parameter of the F distribution given values for the others. Arguments WHICH --> Integer indicating which of the next four argument values is to be calculated from the others. Legal range: 1..4 iwhich = 1 : Calculate P and Q from F,DFN and DFD iwhich = 2 : Calculate F from P,Q,DFN and DFD iwhich = 3 : Calculate DFN from P,Q,F and DFD iwhich = 4 : Calculate DFD from P,Q,F and DFN P <--> The integral from 0 to F of the f-density. Input range: [0,1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. F <--> Upper limit of integration of the f-density. Input range: [0, +infinity). Search range: [0,1E300] DFN < --> Degrees of freedom of the numerator sum of squares. Input range: (0, +infinity). Search range: [ 1E-300, 1E300] DFD < --> Degrees of freedom of the denominator sum of squares. Input range: (0, +infinity). Search range: [ 1E-300, 1E300] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.6.2 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the computation of the cumulative distribution function for the F variate to that of an incomplete beta. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. WARNING The value of the cumulative F distribution is not necessarily monotone in either degrees of freedom. There thus may be two values that provide a given CDF value. This routine assumes monotonicity and will find an arbitrary one of the two values. **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define inf 1.0e300 static int K1 = 1; static double K2 = 0.0e0; static double K4 = 0.5e0; static double K5 = 5.0e0; static double pq,fx,cum,ccum; static unsigned long qhi,qleft,qporq; static double T3,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15; /* .. .. Executable Statements .. 
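     Example (an illustrative sketch; the numeric values are arbitrary):
     WHICH = 1 computes P and Q from an F statistic and its two degrees
     of freedom, so the upper-tail p-value of F = 4.0 on (2, 20) df is
     obtained as

         int which = 1, status = 0;
         double p, q, bound;
         double f = 4.0, dfn = 2.0, dfd = 20.0;
         cdff(&which, &p, &q, &f, &dfn, &dfd, &status, &bound);

     Q holds the upper-tail probability when STATUS is 0.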
*/ /* Check arguments */ if(!(*which < 1 || *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 2) goto S130; /* F */ if(!(*f < 0.0e0)) goto S120; *bound = 0.0e0; *status = -4; return; S130: S120: if(*which == 3) goto S150; /* DFN */ if(!(*dfn <= 0.0e0)) goto S140; *bound = 0.0e0; *status = -5; return; S150: S140: if(*which == 4) goto S170; /* DFD */ if(!(*dfd <= 0.0e0)) goto S160; *bound = 0.0e0; *status = -6; return; S170: S160: if(*which == 1) goto S210; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S200; if(!(pq < 0.0e0)) goto S180; *bound = 0.0e0; goto S190; S180: *bound = 1.0e0; S190: *status = 3; return; S210: S200: if(!(*which == 1)) qporq = *p <= *q; /* Select the minimum of P or Q Calculate ANSWERS */ if(1 == *which) { /* Calculating P */ cumf(f,dfn,dfd,p,q); *status = 0; } else if(2 == *which) { /* Calculating F */ *f = 5.0e0; T3 = inf; T6 = atol; T7 = tol; dstinv(&K2,&T3,&K4,&K4,&K5,&T6,&T7); *status = 0; dinvr(status,f,&fx,&qleft,&qhi); S220: if(!(*status == 1)) goto S250; cumf(f,dfn,dfd,&cum,&ccum); if(!qporq) goto S230; fx = cum-*p; goto S240; S230: fx = ccum-*q; S240: dinvr(status,f,&fx,&qleft,&qhi); goto S220; S250: if(!(*status == -1)) goto S280; if(!qleft) goto S260; *status = 1; *bound = 0.0e0; goto S270; S260: *status = 2; *bound = inf; S280: S270: ; } else if(3 == *which) { /* Calculating DFN */ *dfn = 5.0e0; T8 = zero; T9 = inf; T10 = atol; T11 = tol; dstinv(&T8,&T9,&K4,&K4,&K5,&T10,&T11); *status = 0; dinvr(status,dfn,&fx,&qleft,&qhi); S290: if(!(*status == 1)) goto S320; cumf(f,dfn,dfd,&cum,&ccum); if(!qporq) goto S300; fx = cum-*p; goto S310; S300: fx = ccum-*q; S310: dinvr(status,dfn,&fx,&qleft,&qhi); goto S290; S320: if(!(*status == -1)) goto S350; if(!qleft) goto S330; *status = 1; *bound = zero; goto S340; S330: *status = 2; *bound = inf; S350: S340: ; } else if(4 == *which) { /* Calculating DFD */ *dfd = 5.0e0; T12 = zero; T13 = inf; T14 = atol; T15 = tol; dstinv(&T12,&T13,&K4,&K4,&K5,&T14,&T15); *status = 0; dinvr(status,dfd,&fx,&qleft,&qhi); S360: if(!(*status == 1)) goto S390; cumf(f,dfn,dfd,&cum,&ccum); if(!qporq) goto S370; fx = cum-*p; goto S380; S370: fx = ccum-*q; S380: dinvr(status,dfd,&fx,&qleft,&qhi); goto S360; S390: if(!(*status == -1)) goto S420; if(!qleft) goto S400; *status = 1; *bound = zero; goto S410; S400: *status = 2; *bound = inf; S410: ; } S420: return; #undef tol #undef atol #undef zero #undef inf } void cdffnc(int *which,double *p,double *q,double *f,double *dfn, double *dfd,double *phonc,int *status,double *bound) /********************************************************************** void cdffnc(int *which,double *p,double *q,double *f,double *dfn, double *dfd,double *phonc,int *status,double *bound) Cumulative Distribution Function Non-central F distribution Function Calculates any one parameter of the Non-central F distribution given values for the others. Arguments WHICH --> Integer indicating which of the next five argument values is to be calculated from the others. 
Legal range: 1..5 iwhich = 1 : Calculate P and Q from F,DFN,DFD and PNONC iwhich = 2 : Calculate F from P,Q,DFN,DFD and PNONC iwhich = 3 : Calculate DFN from P,Q,F,DFD and PNONC iwhich = 4 : Calculate DFD from P,Q,F,DFN and PNONC iwhich = 5 : Calculate PNONC from P,Q,F,DFN and DFD P <--> The integral from 0 to F of the non-central f-density. Input range: [0,1-1E-16). Q <--> 1-P. Q is not used by this subroutine and is only included for similarity with other cdf* routines. F <--> Upper limit of integration of the non-central f-density. Input range: [0, +infinity). Search range: [0,1E300] DFN < --> Degrees of freedom of the numerator sum of squares. Input range: (0, +infinity). Search range: [ 1E-300, 1E300] DFD < --> Degrees of freedom of the denominator sum of squares. Must be in range: (0, +infinity). Input range: (0, +infinity). Search range: [ 1E-300, 1E300] PNONC <-> The non-centrality parameter Input range: [0,infinity) Search range: [0,1E4] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.6.20 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to compute the cumulative distribution function. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. WARNING The computation time required for this routine is proportional to the noncentrality parameter (PNONC). Very large values of this parameter can consume immense computer resources. This is why the search range is bounded by 10,000. WARNING The value of the cumulative noncentral F distribution is not necessarily monotone in either degrees of freedom. There thus may be two values that provide a given CDF value. This routine assumes monotonicity and will find an arbitrary one of the two values. **********************************************************************/ { #define tent4 1.0e4 #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define one (1.0e0-1.0e-16) #define inf 1.0e300 static double K1 = 0.0e0; static double K3 = 0.5e0; static double K4 = 5.0e0; static double fx,cum,ccum; static unsigned long qhi,qleft; static double T2,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17; /* .. .. Executable Statements .. 
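     Example (an illustrative sketch; the numeric values are arbitrary):
     WHICH = 2 finds the F value whose non-central CDF equals a given P,
     for instance with noncentrality parameter 5:

         int which = 2, status = 0;
         double p = 0.95, q = 0.05, f, bound;
         double dfn = 2.0, dfd = 20.0, phonc = 5.0;
         cdffnc(&which, &p, &q, &f, &dfn, &dfd, &phonc, &status, &bound);

     Q is not used in this direction; F receives the answer when STATUS is 0.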
*/ /* Check arguments */ if(!(*which < 1 || *which > 5)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 5.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > one)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = one; S50: *status = -2; return; S70: S60: if(*which == 2) goto S90; /* F */ if(!(*f < 0.0e0)) goto S80; *bound = 0.0e0; *status = -4; return; S90: S80: if(*which == 3) goto S110; /* DFN */ if(!(*dfn <= 0.0e0)) goto S100; *bound = 0.0e0; *status = -5; return; S110: S100: if(*which == 4) goto S130; /* DFD */ if(!(*dfd <= 0.0e0)) goto S120; *bound = 0.0e0; *status = -6; return; S130: S120: if(*which == 5) goto S150; /* PHONC */ if(!(*phonc < 0.0e0)) goto S140; *bound = 0.0e0; *status = -7; return; S150: S140: /* Calculate ANSWERS */ if(1 == *which) { /* Calculating P */ cumfnc(f,dfn,dfd,phonc,p,q); *status = 0; } else if(2 == *which) { /* Calculating F */ *f = 5.0e0; T2 = inf; T5 = atol; T6 = tol; dstinv(&K1,&T2,&K3,&K3,&K4,&T5,&T6); *status = 0; dinvr(status,f,&fx,&qleft,&qhi); S160: if(!(*status == 1)) goto S170; cumfnc(f,dfn,dfd,phonc,&cum,&ccum); fx = cum-*p; dinvr(status,f,&fx,&qleft,&qhi); goto S160; S170: if(!(*status == -1)) goto S200; if(!qleft) goto S180; *status = 1; *bound = 0.0e0; goto S190; S180: *status = 2; *bound = inf; S200: S190: ; } else if(3 == *which) { /* Calculating DFN */ *dfn = 5.0e0; T7 = zero; T8 = inf; T9 = atol; T10 = tol; dstinv(&T7,&T8,&K3,&K3,&K4,&T9,&T10); *status = 0; dinvr(status,dfn,&fx,&qleft,&qhi); S210: if(!(*status == 1)) goto S220; cumfnc(f,dfn,dfd,phonc,&cum,&ccum); fx = cum-*p; dinvr(status,dfn,&fx,&qleft,&qhi); goto S210; S220: if(!(*status == -1)) goto S250; if(!qleft) goto S230; *status = 1; *bound = zero; goto S240; S230: *status = 2; *bound = inf; S250: S240: ; } else if(4 == *which) { /* Calculating DFD */ *dfd = 5.0e0; T11 = zero; T12 = inf; T13 = atol; T14 = tol; dstinv(&T11,&T12,&K3,&K3,&K4,&T13,&T14); *status = 0; dinvr(status,dfd,&fx,&qleft,&qhi); S260: if(!(*status == 1)) goto S270; cumfnc(f,dfn,dfd,phonc,&cum,&ccum); fx = cum-*p; dinvr(status,dfd,&fx,&qleft,&qhi); goto S260; S270: if(!(*status == -1)) goto S300; if(!qleft) goto S280; *status = 1; *bound = zero; goto S290; S280: *status = 2; *bound = inf; S300: S290: ; } else if(5 == *which) { /* Calculating PHONC */ *phonc = 5.0e0; T15 = tent4; T16 = atol; T17 = tol; dstinv(&K1,&T15,&K3,&K3,&K4,&T16,&T17); *status = 0; dinvr(status,phonc,&fx,&qleft,&qhi); S310: if(!(*status == 1)) goto S320; cumfnc(f,dfn,dfd,phonc,&cum,&ccum); fx = cum-*p; dinvr(status,phonc,&fx,&qleft,&qhi); goto S310; S320: if(!(*status == -1)) goto S350; if(!qleft) goto S330; *status = 1; *bound = 0.0e0; goto S340; S330: *status = 2; *bound = tent4; S340: ; } S350: return; #undef tent4 #undef tol #undef atol #undef zero #undef one #undef inf } void cdfgam(int *which,double *p,double *q,double *x,double *shape, double *scale,int *status,double *bound) /********************************************************************** void cdfgam(int *which,double *p,double *q,double *x,double *shape, double *scale,int *status,double *bound) Cumulative Distribution Function GAMma Distribution Function Calculates any one parameter of the gamma distribution given values for the others. Arguments WHICH --> Integer indicating which of the next four argument values is to be calculated from the others. 
Legal range: 1..4 iwhich = 1 : Calculate P and Q from X,SHAPE and SCALE iwhich = 2 : Calculate X from P,Q,SHAPE and SCALE iwhich = 3 : Calculate SHAPE from P,Q,X and SCALE iwhich = 4 : Calculate SCALE from P,Q,X and SHAPE P <--> The integral from 0 to X of the gamma density. Input range: [0,1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. X <--> The upper limit of integration of the gamma density. Input range: [0, +infinity). Search range: [0,1E300] SHAPE <--> The shape parameter of the gamma density. Input range: (0, +infinity). Search range: [1E-300,1E300] SCALE <--> The scale parameter of the gamma density. Input range: (0, +infinity). Search range: (1E-300,1E300] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 10 if the gamma or inverse gamma routine cannot compute the answer. Usually happens only for X and SHAPE very large (gt 1E10 or more) BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Cumulative distribution function (P) is calculated directly by the code associated with: DiDinato, A. R. and Morris, A. H. Computation of the incomplete gamma function ratios and their inverse. ACM Trans. Math. Softw. 12 (1986), 377-393. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. Note The gamma density is proportional to T**(SHAPE - 1) * EXP(- SCALE * T) **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define inf 1.0e300 static int K1 = 1; static double K5 = 0.5e0; static double K6 = 5.0e0; static double xx,fx,xscale,cum,ccum,pq,porq; static int ierr; static unsigned long qhi,qleft,qporq; static double T2,T3,T4,T7,T8,T9; /* .. .. Executable Statements .. 
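     Example (an illustrative sketch; the numeric values are arbitrary):
     with WHICH = 1 the routine returns P and Q for a given X, SHAPE and
     SCALE; per the Note above, SCALE enters the density as a rate, so
     the density is proportional to T**(SHAPE-1) * EXP(-SCALE * T):

         int which = 1, status = 0;
         double p, q, bound;
         double x = 2.0, shape = 3.0, scale = 1.5;
         cdfgam(&which, &p, &q, &x, &shape, &scale, &status, &bound);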
*/ /* Check arguments */ if(!(*which < 1 || *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 2) goto S130; /* X */ if(!(*x < 0.0e0)) goto S120; *bound = 0.0e0; *status = -4; return; S130: S120: if(*which == 3) goto S150; /* SHAPE */ if(!(*shape <= 0.0e0)) goto S140; *bound = 0.0e0; *status = -5; return; S150: S140: if(*which == 4) goto S170; /* SCALE */ if(!(*scale <= 0.0e0)) goto S160; *bound = 0.0e0; *status = -6; return; S170: S160: if(*which == 1) goto S210; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S200; if(!(pq < 0.0e0)) goto S180; *bound = 0.0e0; goto S190; S180: *bound = 1.0e0; S190: *status = 3; return; S210: S200: if(*which == 1) goto S240; /* Select the minimum of P or Q */ qporq = *p <= *q; if(!qporq) goto S220; porq = *p; goto S230; S220: porq = *q; S240: S230: /* Calculate ANSWERS */ if(1 == *which) { /* Calculating P */ *status = 0; xscale = *x**scale; cumgam(&xscale,shape,p,q); if(porq > 1.5e0) *status = 10; } else if(2 == *which) { /* Computing X */ T2 = -1.0e0; gaminv(shape,&xx,&T2,p,q,&ierr); if(ierr < 0.0e0) { *status = 10; return; } else { *x = xx/ *scale; *status = 0; } } else if(3 == *which) { /* Computing SHAPE */ *shape = 5.0e0; xscale = *x**scale; T3 = zero; T4 = inf; T7 = atol; T8 = tol; dstinv(&T3,&T4,&K5,&K5,&K6,&T7,&T8); *status = 0; dinvr(status,shape,&fx,&qleft,&qhi); S250: if(!(*status == 1)) goto S290; cumgam(&xscale,shape,&cum,&ccum); if(!qporq) goto S260; fx = cum-*p; goto S270; S260: fx = ccum-*q; S270: if(!(qporq && cum > 1.5e0 || !qporq && ccum > 1.5e0)) goto S280; *status = 10; return; S280: dinvr(status,shape,&fx,&qleft,&qhi); goto S250; S290: if(!(*status == -1)) goto S320; if(!qleft) goto S300; *status = 1; *bound = zero; goto S310; S300: *status = 2; *bound = inf; S320: S310: ; } else if(4 == *which) { /* Computing SCALE */ T9 = -1.0e0; gaminv(shape,&xx,&T9,p,q,&ierr); if(ierr < 0.0e0) { *status = 10; return; } else { *scale = xx/ *x; *status = 0; } } return; #undef tol #undef atol #undef zero #undef inf } void cdfnbn(int *which,double *p,double *q,double *s,double *xn, double *pr,double *ompr,int *status,double *bound) /********************************************************************** void cdfnbn(int *which,double *p,double *q,double *s,double *xn, double *pr,double *ompr,int *status,double *bound) Cumulative Distribution Function Negative BiNomial distribution Function Calculates any one parameter of the negative binomial distribution given values for the others. The cumulative negative binomial distribution returns the probability that there will be F or fewer failures before the XNth success in binomial trials each of which has probability of success PR. The individual term of the negative binomial is the probability of S failures before XN successes and is Choose( S, XN+S-1 ) * PR^(XN) * (1-PR)^S Arguments WHICH --> Integer indicating which of the next four argument values is to be calculated from the others. 
Legal range: 1..4 iwhich = 1 : Calculate P and Q from S,XN,PR and OMPR iwhich = 2 : Calculate S from P,Q,XN,PR and OMPR iwhich = 3 : Calculate XN from P,Q,S,PR and OMPR iwhich = 4 : Calculate PR and OMPR from P,Q,S and XN P <--> The cumulation from 0 to S of the negative binomial distribution. Input range: [0,1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. S <--> The upper limit of cumulation of the binomial distribution. There are F or fewer failures before the XNth success. Input range: [0, +infinity). Search range: [0, 1E300] XN <--> The number of successes. Input range: [0, +infinity). Search range: [0, 1E300] PR <--> The probability of success in each binomial trial. Input range: [0,1]. Search range: [0,1]. OMPR <--> 1-PR Input range: [0,1]. Search range: [0,1] PR + OMPR = 1.0 STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 4 if PR + OMPR .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.5.26 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce calculation of the cumulative distribution function to that of an incomplete beta. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define inf 1.0e300 #define one 1.0e0 static int K1 = 1; static double K2 = 0.0e0; static double K4 = 0.5e0; static double K5 = 5.0e0; static double K11 = 1.0e0; static double fx,xhi,xlo,pq,prompr,cum,ccum; static unsigned long qhi,qleft,qporq; static double T3,T6,T7,T8,T9,T10,T12,T13; /* .. .. Executable Statements .. 
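     Example (an illustrative sketch; the numeric values are arbitrary):
     WHICH = 1 gives the probability of S or fewer failures before the
     XNth success:

         int which = 1, status = 0;
         double p, q, bound;
         double s = 4.0, xn = 3.0, pr = 0.5, ompr = 0.5;
         cdfnbn(&which, &p, &q, &s, &xn, &pr, &ompr, &status, &bound);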
*/ /* Check arguments */ if(!(*which < 1 || *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 2) goto S130; /* S */ if(!(*s < 0.0e0)) goto S120; *bound = 0.0e0; *status = -4; return; S130: S120: if(*which == 3) goto S150; /* XN */ if(!(*xn < 0.0e0)) goto S140; *bound = 0.0e0; *status = -5; return; S150: S140: if(*which == 4) goto S190; /* PR */ if(!(*pr < 0.0e0 || *pr > 1.0e0)) goto S180; if(!(*pr < 0.0e0)) goto S160; *bound = 0.0e0; goto S170; S160: *bound = 1.0e0; S170: *status = -6; return; S190: S180: if(*which == 4) goto S230; /* OMPR */ if(!(*ompr < 0.0e0 || *ompr > 1.0e0)) goto S220; if(!(*ompr < 0.0e0)) goto S200; *bound = 0.0e0; goto S210; S200: *bound = 1.0e0; S210: *status = -7; return; S230: S220: if(*which == 1) goto S270; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S260; if(!(pq < 0.0e0)) goto S240; *bound = 0.0e0; goto S250; S240: *bound = 1.0e0; S250: *status = 3; return; S270: S260: if(*which == 4) goto S310; /* PR + OMPR */ prompr = *pr+*ompr; if(!(fabs(prompr-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S300; if(!(prompr < 0.0e0)) goto S280; *bound = 0.0e0; goto S290; S280: *bound = 1.0e0; S290: *status = 4; return; S310: S300: if(!(*which == 1)) qporq = *p <= *q; /* Select the minimum of P or Q Calculate ANSWERS */ if(1 == *which) { /* Calculating P */ cumnbn(s,xn,pr,ompr,p,q); *status = 0; } else if(2 == *which) { /* Calculating S */ *s = 5.0e0; T3 = inf; T6 = atol; T7 = tol; dstinv(&K2,&T3,&K4,&K4,&K5,&T6,&T7); *status = 0; dinvr(status,s,&fx,&qleft,&qhi); S320: if(!(*status == 1)) goto S350; cumnbn(s,xn,pr,ompr,&cum,&ccum); if(!qporq) goto S330; fx = cum-*p; goto S340; S330: fx = ccum-*q; S340: dinvr(status,s,&fx,&qleft,&qhi); goto S320; S350: if(!(*status == -1)) goto S380; if(!qleft) goto S360; *status = 1; *bound = 0.0e0; goto S370; S360: *status = 2; *bound = inf; S380: S370: ; } else if(3 == *which) { /* Calculating XN */ *xn = 5.0e0; T8 = inf; T9 = atol; T10 = tol; dstinv(&K2,&T8,&K4,&K4,&K5,&T9,&T10); *status = 0; dinvr(status,xn,&fx,&qleft,&qhi); S390: if(!(*status == 1)) goto S420; cumnbn(s,xn,pr,ompr,&cum,&ccum); if(!qporq) goto S400; fx = cum-*p; goto S410; S400: fx = ccum-*q; S410: dinvr(status,xn,&fx,&qleft,&qhi); goto S390; S420: if(!(*status == -1)) goto S450; if(!qleft) goto S430; *status = 1; *bound = 0.0e0; goto S440; S430: *status = 2; *bound = inf; S450: S440: ; } else if(4 == *which) { /* Calculating PR and OMPR */ T12 = atol; T13 = tol; dstzr(&K2,&K11,&T12,&T13); if(!qporq) goto S480; *status = 0; dzror(status,pr,&fx,&xlo,&xhi,&qleft,&qhi); *ompr = one-*pr; S460: if(!(*status == 1)) goto S470; cumnbn(s,xn,pr,ompr,&cum,&ccum); fx = cum-*p; dzror(status,pr,&fx,&xlo,&xhi,&qleft,&qhi); *ompr = one-*pr; goto S460; S470: goto S510; S480: *status = 0; dzror(status,ompr,&fx,&xlo,&xhi,&qleft,&qhi); *pr = one-*ompr; S490: if(!(*status == 1)) goto S500; cumnbn(s,xn,pr,ompr,&cum,&ccum); fx = ccum-*q; dzror(status,ompr,&fx,&xlo,&xhi,&qleft,&qhi); *pr = one-*ompr; goto S490; S510: S500: if(!(*status == -1)) goto S540; if(!qleft) goto S520; *status = 1; *bound = 
0.0e0; goto S530; S520: *status = 2; *bound = 1.0e0; S530: ; } S540: return; #undef tol #undef atol #undef inf #undef one } void cdfnor(int *which,double *p,double *q,double *x,double *mean, double *sd,int *status,double *bound) /********************************************************************** void cdfnor(int *which,double *p,double *q,double *x,double *mean, double *sd,int *status,double *bound) Cumulative Distribution Function NORmal distribution Function Calculates any one parameter of the normal distribution given values for the others. Arguments WHICH --> Integer indicating which of the next parameter values is to be calculated using values of the others. Legal range: 1..4 iwhich = 1 : Calculate P and Q from X,MEAN and SD iwhich = 2 : Calculate X from P,Q,MEAN and SD iwhich = 3 : Calculate MEAN from P,Q,X and SD iwhich = 4 : Calculate SD from P,Q,X and MEAN P <--> The integral from -infinity to X of the normal density. Input range: (0,1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. X < --> Upper limit of integration of the normal-density. Input range: ( -infinity, +infinity) MEAN <--> The mean of the normal density. Input range: (-infinity, +infinity) SD <--> Standard Deviation of the normal density. Input range: (0, +infinity). STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method A slightly modified version of ANORM from Cody, W.D. (1993). "ALGORITHM 715: SPECFUN - A Portabel FORTRAN Package of Special Function Routines and Test Drivers" acm Transactions on Mathematical Software. 19, 22-32. is used to calulate the cumulative standard normal distribution. The rational functions from pages 90-95 of Kennedy and Gentle, Statistical Computing, Marcel Dekker, NY, 1980 are used as starting values to Newton's Iterations which compute the inverse standard normal. Therefore no searches are necessary for any parameter. For X < -15, the asymptotic expansion for the normal is used as the starting value in finding the inverse standard normal. This is formula 26.2.12 of Abramowitz and Stegun. Note The normal density is proportional to exp( - 0.5 * (( X - MEAN)/SD)**2) **********************************************************************/ { static int K1 = 1; static double z,pq; /* .. .. Executable Statements .. 
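     Example (an illustrative sketch; the numeric values are arbitrary):
     WHICH = 2 inverts the normal CDF, so the 97.5th percentile of a
     standard normal is obtained with

         int which = 2, status = 0;
         double p = 0.975, q = 0.025, x, bound;
         double mean = 0.0, sd = 1.0;
         cdfnor(&which, &p, &q, &x, &mean, &sd, &status, &bound);

     X is approximately 1.96 when STATUS is 0.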
*/ /* Check arguments */ *status = 0; if(!(*which < 1 || *which > 4)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 4.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p <= 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p <= 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 1) goto S150; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S140; if(!(pq < 0.0e0)) goto S120; *bound = 0.0e0; goto S130; S120: *bound = 1.0e0; S130: *status = 3; return; S150: S140: if(*which == 4) goto S170; /* SD */ if(!(*sd <= 0.0e0)) goto S160; *bound = 0.0e0; *status = -6; return; S170: S160: /* Calculate ANSWERS */ if(1 == *which) { /* Computing P */ z = (*x-*mean)/ *sd; cumnor(&z,p,q); } else if(2 == *which) { /* Computing X */ z = dinvnr(p,q); *x = *sd*z+*mean; } else if(3 == *which) { /* Computing the MEAN */ z = dinvnr(p,q); *mean = *x-*sd*z; } else if(4 == *which) { /* Computing SD */ z = dinvnr(p,q); *sd = (*x-*mean)/z; } return; } void cdfpoi(int *which,double *p,double *q,double *s,double *xlam, int *status,double *bound) /********************************************************************** void cdfpoi(int *which,double *p,double *q,double *s,double *xlam, int *status,double *bound) Cumulative Distribution Function POIsson distribution Function Calculates any one parameter of the Poisson distribution given values for the others. Arguments WHICH --> Integer indicating which argument value is to be calculated from the others. Legal range: 1..3 iwhich = 1 : Calculate P and Q from S and XLAM iwhich = 2 : Calculate A from P,Q and XLAM iwhich = 3 : Calculate XLAM from P,Q and S P <--> The cumulation from 0 to S of the poisson density. Input range: [0,1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. S <--> Upper limit of cumulation of the Poisson. Input range: [0, +infinity). Search range: [0,1E300] XLAM <--> Mean of the Poisson distribution. Input range: [0, +infinity). Search range: [0,1E300] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.4.21 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the computation of the cumulative distribution function to that of computing a chi-square, hence an incomplete gamma function. Cumulative distribution function (P) is calculated directly. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define inf 1.0e300 static int K1 = 1; static double K2 = 0.0e0; static double K4 = 0.5e0; static double K5 = 5.0e0; static double fx,cum,ccum,pq; static unsigned long qhi,qleft,qporq; static double T3,T6,T7,T8,T9,T10; /* .. .. Executable Statements .. 
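     Example (an illustrative sketch; the numeric values are arbitrary):
     WHICH = 1 returns the probability of S or fewer events for a
     Poisson distribution with mean XLAM:

         int which = 1, status = 0;
         double p, q, bound;
         double s = 2.0, xlam = 4.0;
         cdfpoi(&which, &p, &q, &s, &xlam, &status, &bound);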
*/ /* Check arguments */ if(!(*which < 1 || *which > 3)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 3.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p < 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p < 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 2) goto S130; /* S */ if(!(*s < 0.0e0)) goto S120; *bound = 0.0e0; *status = -4; return; S130: S120: if(*which == 3) goto S150; /* XLAM */ if(!(*xlam < 0.0e0)) goto S140; *bound = 0.0e0; *status = -5; return; S150: S140: if(*which == 1) goto S190; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S180; if(!(pq < 0.0e0)) goto S160; *bound = 0.0e0; goto S170; S160: *bound = 1.0e0; S170: *status = 3; return; S190: S180: if(!(*which == 1)) qporq = *p <= *q; /* Select the minimum of P or Q Calculate ANSWERS */ if(1 == *which) { /* Calculating P */ cumpoi(s,xlam,p,q); *status = 0; } else if(2 == *which) { /* Calculating S */ *s = 5.0e0; T3 = inf; T6 = atol; T7 = tol; dstinv(&K2,&T3,&K4,&K4,&K5,&T6,&T7); *status = 0; dinvr(status,s,&fx,&qleft,&qhi); S200: if(!(*status == 1)) goto S230; cumpoi(s,xlam,&cum,&ccum); if(!qporq) goto S210; fx = cum-*p; goto S220; S210: fx = ccum-*q; S220: dinvr(status,s,&fx,&qleft,&qhi); goto S200; S230: if(!(*status == -1)) goto S260; if(!qleft) goto S240; *status = 1; *bound = 0.0e0; goto S250; S240: *status = 2; *bound = inf; S260: S250: ; } else if(3 == *which) { /* Calculating XLAM */ *xlam = 5.0e0; T8 = inf; T9 = atol; T10 = tol; dstinv(&K2,&T8,&K4,&K4,&K5,&T9,&T10); *status = 0; dinvr(status,xlam,&fx,&qleft,&qhi); S270: if(!(*status == 1)) goto S300; cumpoi(s,xlam,&cum,&ccum); if(!qporq) goto S280; fx = cum-*p; goto S290; S280: fx = ccum-*q; S290: dinvr(status,xlam,&fx,&qleft,&qhi); goto S270; S300: if(!(*status == -1)) goto S330; if(!qleft) goto S310; *status = 1; *bound = 0.0e0; goto S320; S310: *status = 2; *bound = inf; S320: ; } S330: return; #undef tol #undef atol #undef inf } void cdft(int *which,double *p,double *q,double *t,double *df, int *status,double *bound) /********************************************************************** void cdft(int *which,double *p,double *q,double *t,double *df, int *status,double *bound) Cumulative Distribution Function T distribution Function Calculates any one parameter of the t distribution given values for the others. Arguments WHICH --> Integer indicating which argument values is to be calculated from the others. Legal range: 1..3 iwhich = 1 : Calculate P and Q from T and DF iwhich = 2 : Calculate T from P,Q and DF iwhich = 3 : Calculate DF from P,Q and T P <--> The integral from -infinity to t of the t-density. Input range: (0,1]. Q <--> 1-P. Input range: (0, 1]. P + Q = 1.0. T <--> Upper limit of integration of the t-density. Input range: ( -infinity, +infinity). Search range: [ -1E300, 1E300 ] DF <--> Degrees of freedom of the t-distribution. Input range: (0 , +infinity). Search range: [1e-300, 1E10] STATUS <-- 0 if calculation completed correctly -I if input parameter number I is out of range 1 if answer appears to be lower than lowest search bound 2 if answer appears to be higher than greatest search bound 3 if P + Q .ne. 1 BOUND <-- Undefined if STATUS is 0 Bound exceeded by parameter number I if STATUS is negative. 
Lower search bound if STATUS is 1. Upper search bound if STATUS is 2. Method Formula 26.5.27 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the computation of the cumulative distribution function to that of an incomplete beta. Computation of other parameters involve a seach for a value that produces the desired value of P. The search relies on the monotinicity of P with the other parameter. **********************************************************************/ { #define tol (1.0e-8) #define atol (1.0e-50) #define zero (1.0e-300) #define inf 1.0e300 #define maxdf 1.0e10 static int K1 = 1; static double K4 = 0.5e0; static double K5 = 5.0e0; static double fx,cum,ccum,pq; static unsigned long qhi,qleft,qporq; static double T2,T3,T6,T7,T8,T9,T10,T11; /* .. .. Executable Statements .. */ /* Check arguments */ if(!(*which < 1 || *which > 3)) goto S30; if(!(*which < 1)) goto S10; *bound = 1.0e0; goto S20; S10: *bound = 3.0e0; S20: *status = -1; return; S30: if(*which == 1) goto S70; /* P */ if(!(*p <= 0.0e0 || *p > 1.0e0)) goto S60; if(!(*p <= 0.0e0)) goto S40; *bound = 0.0e0; goto S50; S40: *bound = 1.0e0; S50: *status = -2; return; S70: S60: if(*which == 1) goto S110; /* Q */ if(!(*q <= 0.0e0 || *q > 1.0e0)) goto S100; if(!(*q <= 0.0e0)) goto S80; *bound = 0.0e0; goto S90; S80: *bound = 1.0e0; S90: *status = -3; return; S110: S100: if(*which == 3) goto S130; /* DF */ if(!(*df <= 0.0e0)) goto S120; *bound = 0.0e0; *status = -5; return; S130: S120: if(*which == 1) goto S170; /* P + Q */ pq = *p+*q; if(!(fabs(pq-0.5e0-0.5e0) > 3.0e0*spmpar(&K1))) goto S160; if(!(pq < 0.0e0)) goto S140; *bound = 0.0e0; goto S150; S140: *bound = 1.0e0; S150: *status = 3; return; S170: S160: if(!(*which == 1)) qporq = *p <= *q; /* Select the minimum of P or Q Calculate ANSWERS */ if(1 == *which) { /* Computing P and Q */ cumt(t,df,p,q); *status = 0; } else if(2 == *which) { /* Computing T .. Get initial approximation for T */ *t = dt1(p,q,df); T2 = -inf; T3 = inf; T6 = atol; T7 = tol; dstinv(&T2,&T3,&K4,&K4,&K5,&T6,&T7); *status = 0; dinvr(status,t,&fx,&qleft,&qhi); S180: if(!(*status == 1)) goto S210; cumt(t,df,&cum,&ccum); if(!qporq) goto S190; fx = cum-*p; goto S200; S190: fx = ccum-*q; S200: dinvr(status,t,&fx,&qleft,&qhi); goto S180; S210: if(!(*status == -1)) goto S240; if(!qleft) goto S220; *status = 1; *bound = -inf; goto S230; S220: *status = 2; *bound = inf; S240: S230: ; } else if(3 == *which) { /* Computing DF */ *df = 5.0e0; T8 = zero; T9 = maxdf; T10 = atol; T11 = tol; dstinv(&T8,&T9,&K4,&K4,&K5,&T10,&T11); *status = 0; dinvr(status,df,&fx,&qleft,&qhi); S250: if(!(*status == 1)) goto S280; cumt(t,df,&cum,&ccum); if(!qporq) goto S260; fx = cum-*p; goto S270; S260: fx = ccum-*q; S270: dinvr(status,df,&fx,&qleft,&qhi); goto S250; S280: if(!(*status == -1)) goto S310; if(!qleft) goto S290; *status = 1; *bound = zero; goto S300; S290: *status = 2; *bound = maxdf; S300: ; } S310: return; #undef tol #undef atol #undef zero #undef inf #undef maxdf } void cumbet(double *x,double *y,double *a,double *b,double *cum, double *ccum) /* ********************************************************************** void cumbet(double *x,double *y,double *a,double *b,double *cum, double *ccum) Double precision cUMulative incomplete BETa distribution Function Calculates the cdf to X of the incomplete beta distribution with parameters a and b. This is the integral from 0 to x of (1/B(a,b))*f(t)) where f(t) = t**(a-1) * (1-t)**(b-1) Arguments X --> Upper limit of integration. 
X is DOUBLE PRECISION Y --> 1 - X. Y is DOUBLE PRECISION A --> First parameter of the beta distribution. A is DOUBLE PRECISION B --> Second parameter of the beta distribution. B is DOUBLE PRECISION CUM <-- Cumulative incomplete beta distribution. CUM is DOUBLE PRECISION CCUM <-- Compliment of Cumulative incomplete beta distribution. CCUM is DOUBLE PRECISION Method Calls the routine BRATIO. References Didonato, Armido R. and Morris, Alfred H. Jr. (1992) Algorithim 708 Significant Digit Computation of the Incomplete Beta Function Ratios. ACM ToMS, Vol.18, No. 3, Sept. 1992, 360-373. ********************************************************************** */ { static int ierr; /* .. .. Executable Statements .. */ if(!(*x <= 0.0e0)) goto S10; *cum = 0.0e0; *ccum = 1.0e0; return; S10: if(!(*y <= 0.0e0)) goto S20; *cum = 1.0e0; *ccum = 0.0e0; return; S20: bratio(a,b,x,y,cum,ccum,&ierr); /* Call bratio routine */ return; } void cumbin(double *s,double *xn,double *pr,double *ompr, double *cum,double *ccum) /* ********************************************************************** void cumbin(double *s,double *xn,double *pr,double *ompr, double *cum,double *ccum) CUmulative BINomial distribution Function Returns the probability of 0 to S successes in XN binomial trials, each of which has a probability of success, PBIN. Arguments S --> The upper limit of cumulation of the binomial distribution. S is DOUBLE PRECISION XN --> The number of binomial trials. XN is DOUBLE PRECISIO PBIN --> The probability of success in each binomial trial. PBIN is DOUBLE PRECIS OMPR --> 1 - PBIN OMPR is DOUBLE PRECIS CUM <-- Cumulative binomial distribution. CUM is DOUBLE PRECISI CCUM <-- Compliment of Cumulative binomial distribution. CCUM is DOUBLE PRECIS Method Formula 26.5.24 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the binomial distribution to the cumulative beta distribution. ********************************************************************** */ { static double T1,T2; /* .. .. Executable Statements .. */ if(!(*s < *xn)) goto S10; T1 = *s+1.0e0; T2 = *xn-*s; cumbet(pr,ompr,&T1,&T2,ccum,cum); goto S20; S10: *cum = 1.0e0; *ccum = 0.0e0; S20: return; } void cumchi(double *x,double *df,double *cum,double *ccum) /* ********************************************************************** void cumchi(double *x,double *df,double *cum,double *ccum) CUMulative of the CHi-square distribution Function Calculates the cumulative chi-square distribution. Arguments X --> Upper limit of integration of the chi-square distribution. X is DOUBLE PRECISION DF --> Degrees of freedom of the chi-square distribution. DF is DOUBLE PRECISION CUM <-- Cumulative chi-square distribution. CUM is DOUBLE PRECISIO CCUM <-- Compliment of Cumulative chi-square distribution. CCUM is DOUBLE PRECISI Method Calls incomplete gamma function (CUMGAM) ********************************************************************** */ { static double a,xx; /* .. .. Executable Statements .. 
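     Example (an illustrative sketch; the numeric values are arbitrary):
     as the body below shows, the central chi-square reduces to the
     incomplete gamma with argument X/2 and shape DF/2:

         double cum, ccum;
         double x = 3.84, df = 1.0;
         cumchi(&x, &df, &cum, &ccum);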
*/ a = *df*0.5e0; xx = *x*0.5e0; cumgam(&xx,&a,cum,ccum); return; } void cumchn(double *x,double *df,double *pnonc,double *cum, double *ccum) /* ********************************************************************** void cumchn(double *x,double *df,double *pnonc,double *cum, double *ccum) CUMulative of the Non-central CHi-square distribution Function Calculates the cumulative non-central chi-square distribution, i.e., the probability that a random variable which follows the non-central chi-square distribution, with non-centrality parameter PNONC and continuous degrees of freedom DF, is less than or equal to X. Arguments X --> Upper limit of integration of the non-central chi-square distribution. X is DOUBLE PRECISION DF --> Degrees of freedom of the non-central chi-square distribution. DF is DOUBLE PRECISION PNONC --> Non-centrality parameter of the non-central chi-square distribution. PNONC is DOUBLE PRECIS CUM <-- Cumulative non-central chi-square distribution. CUM is DOUBLE PRECISIO CCUM <-- Compliment of Cumulative non-central chi-square distribut CCUM is DOUBLE PRECISI Method Uses formula 26.4.25 of Abramowitz and Stegun, Handbook of Mathematical Functions, US NBS (1966) to calculate the non-central chi-square. Variables EPS --- Convergence criterion. The sum stops when a term is less than EPS*SUM. EPS is DOUBLE PRECISIO NTIRED --- Maximum number of terms to be evaluated in each sum. NTIRED is INTEGER QCONV --- .TRUE. if convergence achieved - i.e., program did not stop on NTIRED criterion. QCONV is LOGICAL CCUM <-- Compliment of Cumulative non-central chi-square distribution. CCUM is DOUBLE PRECISI ********************************************************************** */ { #define dg(i) (*df+2.0e0*(double)(i)) #define qsmall(xx) (int)(sum < 1.0e-20 || (xx) < eps*sum) #define qtired(i) (int)((i) > ntired) static double eps = 1.0e-5; static int ntired = 1000; static double adj,centaj,centwt,chid2,dfd2,lcntaj,lcntwt,lfact,pcent,pterm,sum, sumadj,term,wt,xnonc; static int i,icent,iterb,iterf; static double T1,T2,T3; /* .. .. Executable Statements .. */ if(!(*x <= 0.0e0)) goto S10; *cum = 0.0e0; *ccum = 1.0e0; return; S10: if(!(*pnonc <= 1.0e-10)) goto S20; /* When non-centrality parameter is (essentially) zero, use cumulative chi-square distribution */ cumchi(x,df,cum,ccum); return; S20: xnonc = *pnonc/2.0e0; /* ********************************************************************** The following code calcualtes the weight, chi-square, and adjustment term for the central term in the infinite series. The central term is the one in which the poisson weight is greatest. The adjustment term is the amount that must be subtracted from the chi-square to move up two degrees of freedom. ********************************************************************** */ icent = fifidint(xnonc); if(icent == 0) icent = 1; chid2 = *x/2.0e0; /* Calculate central weight term */ T1 = (double)(icent+1); lfact = alngam(&T1); lcntwt = -xnonc+(double)icent*log(xnonc)-lfact; centwt = exp(lcntwt); /* Calculate central chi-square */ T2 = dg(icent); cumchi(x,&T2,&pcent,ccum); /* Calculate central adjustment term */ dfd2 = dg(icent)/2.0e0; T3 = 1.0e0+dfd2; lfact = alngam(&T3); lcntaj = dfd2*log(chid2)-chid2-lfact; centaj = exp(lcntaj); sum = centwt*pcent; /* ********************************************************************** Sum backwards from the central term towards zero. Quit whenever either (1) the zero term is reached, or (2) the term gets small relative to the sum, or (3) More than NTIRED terms are totaled. 
********************************************************************** */ iterb = 0; sumadj = 0.0e0; adj = centaj; wt = centwt; i = icent; goto S40; S30: if(qtired(iterb) || qsmall(term) || i == 0) goto S50; S40: dfd2 = dg(i)/2.0e0; /* Adjust chi-square for two fewer degrees of freedom. The adjusted value ends up in PTERM. */ adj = adj*dfd2/chid2; sumadj += adj; pterm = pcent+sumadj; /* Adjust poisson weight for J decreased by one */ wt *= ((double)i/xnonc); term = wt*pterm; sum += term; i -= 1; iterb += 1; goto S30; S50: iterf = 0; /* ********************************************************************** Now sum forward from the central term towards infinity. Quit when either (1) the term gets small relative to the sum, or (2) More than NTIRED terms are totaled. ********************************************************************** */ sumadj = adj = centaj; wt = centwt; i = icent; goto S70; S60: if(qtired(iterf) || qsmall(term)) goto S80; S70: /* Update weights for next higher J */ wt *= (xnonc/(double)(i+1)); /* Calculate PTERM and add term to sum */ pterm = pcent-sumadj; term = wt*pterm; sum += term; /* Update adjustment term for DF for next iteration */ i += 1; dfd2 = dg(i)/2.0e0; adj = adj*chid2/dfd2; sumadj += adj; iterf += 1; goto S60; S80: *cum = sum; *ccum = 0.5e0+(0.5e0-*cum); return; #undef dg #undef qsmall #undef qtired } void cumf(double *f,double *dfn,double *dfd,double *cum,double *ccum) /* ********************************************************************** void cumf(double *f,double *dfn,double *dfd,double *cum,double *ccum) CUMulative F distribution Function Computes the integral from 0 to F of the f-density with DFN and DFD degrees of freedom. Arguments F --> Upper limit of integration of the f-density. F is DOUBLE PRECISION DFN --> Degrees of freedom of the numerator sum of squares. DFN is DOUBLE PRECISI DFD --> Degrees of freedom of the denominator sum of squares. DFD is DOUBLE PRECISI CUM <-- Cumulative f distribution. CUM is DOUBLE PRECISI CCUM <-- Compliment of Cumulative f distribution. CCUM is DOUBLE PRECIS Method Formula 26.5.28 of Abramowitz and Stegun is used to reduce the cumulative F to a cumulative beta distribution. Note If F is less than or equal to 0, 0 is returned. ********************************************************************** */ { #define half 0.5e0 #define done 1.0e0 static double dsum,prod,xx,yy; static int ierr; static double T1,T2; /* .. .. Executable Statements .. */ if(!(*f <= 0.0e0)) goto S10; *cum = 0.0e0; *ccum = 1.0e0; return; S10: prod = *dfn**f; /* XX is such that the incomplete beta with parameters DFD/2 and DFN/2 evaluated at XX is 1 - CUM or CCUM YY is 1 - XX Calculate the smaller of XX and YY accurately */ dsum = *dfd+prod; xx = *dfd/dsum; if(xx > half) { yy = prod/dsum; xx = done-yy; } else yy = done-xx; T1 = *dfd*half; T2 = *dfn*half; bratio(&T1,&T2,&xx,&yy,ccum,cum,&ierr); return; #undef half #undef done } void cumfnc(double *f,double *dfn,double *dfd,double *pnonc, double *cum,double *ccum) /* ********************************************************************** F -NON- -C-ENTRAL F DISTRIBUTION Function COMPUTES NONCENTRAL F DISTRIBUTION WITH DFN AND DFD DEGREES OF FREEDOM AND NONCENTRALITY PARAMETER PNONC Arguments X --> UPPER LIMIT OF INTEGRATION OF NONCENTRAL F IN EQUATION DFN --> DEGREES OF FREEDOM OF NUMERATOR DFD --> DEGREES OF FREEDOM OF DENOMINATOR PNONC --> NONCENTRALITY PARAMETER. 
CUM <-- CUMULATIVE NONCENTRAL F DISTRIBUTION CCUM <-- COMPLIMENT OF CUMMULATIVE Method USES FORMULA 26.6.20 OF REFERENCE FOR INFINITE SERIES. SERIES IS CALCULATED BACKWARD AND FORWARD FROM J = LAMBDA/2 (THIS IS THE TERM WITH THE LARGEST POISSON WEIGHT) UNTIL THE CONVERGENCE CRITERION IS MET. FOR SPEED, THE INCOMPLETE BETA FUNCTIONS ARE EVALUATED BY FORMULA 26.5.16. REFERENCE HANDBOOD OF MATHEMATICAL FUNCTIONS EDITED BY MILTON ABRAMOWITZ AND IRENE A. STEGUN NATIONAL BUREAU OF STANDARDS APPLIED MATEMATICS SERIES - 55 MARCH 1965 P 947, EQUATIONS 26.6.17, 26.6.18 Note THE SUM CONTINUES UNTIL A SUCCEEDING TERM IS LESS THAN EPS TIMES THE SUM (OR THE SUM IS LESS THAN 1.0E-20). EPS IS SET TO 1.0E-4 IN A DATA STATEMENT WHICH CAN BE CHANGED. ********************************************************************** */ { #define qsmall(x) (int)(sum < 1.0e-20 || (x) < eps*sum) #define half 0.5e0 #define done 1.0e0 static double eps = 1.0e-4; static double dsum,dummy,prod,xx,yy,adn,aup,b,betdn,betup,centwt,dnterm,sum, upterm,xmult,xnonc; static int i,icent,ierr; static double T1,T2,T3,T4,T5,T6; /* .. .. Executable Statements .. */ if(!(*f <= 0.0e0)) goto S10; *cum = 0.0e0; *ccum = 1.0e0; return; S10: if(!(*pnonc < 1.0e-10)) goto S20; /* Handle case in which the non-centrality parameter is (essentially) zero. */ cumf(f,dfn,dfd,cum,ccum); return; S20: xnonc = *pnonc/2.0e0; /* Calculate the central term of the poisson weighting factor. */ icent = (int)xnonc; if(icent == 0) icent = 1; /* Compute central weight term */ T1 = (double)(icent+1); centwt = exp(-xnonc+(double)icent*log(xnonc)-alngam(&T1)); /* Compute central incomplete beta term Assure that minimum of arg to beta and 1 - arg is computed accurately. */ prod = *dfn**f; dsum = *dfd+prod; yy = *dfd/dsum; if(yy > half) { xx = prod/dsum; yy = done-xx; } else xx = done-yy; T2 = *dfn*half+(double)icent; T3 = *dfd*half; bratio(&T2,&T3,&xx,&yy,&betdn,&dummy,&ierr); adn = *dfn/2.0e0+(double)icent; aup = adn; b = *dfd/2.0e0; betup = betdn; sum = centwt*betdn; /* Now sum terms backward from icent until convergence or all done */ xmult = centwt; i = icent; T4 = adn+b; T5 = adn+1.0e0; dnterm = exp(alngam(&T4)-alngam(&T5)-alngam(&b)+adn*log(xx)+b*log(yy)); S30: if(qsmall(xmult*betdn) || i <= 0) goto S40; xmult *= ((double)i/xnonc); i -= 1; adn -= 1.0; dnterm = (adn+1.0)/((adn+b)*xx)*dnterm; betdn += dnterm; sum += (xmult*betdn); goto S30; S40: i = icent+1; /* Now sum forwards until convergence */ xmult = centwt; if(aup-1.0+b == 0) upterm = exp(-alngam(&aup)-alngam(&b)+(aup-1.0)*log(xx)+ b*log(yy)); else { T6 = aup-1.0+b; upterm = exp(alngam(&T6)-alngam(&aup)-alngam(&b)+(aup-1.0)*log(xx)+b* log(yy)); } goto S60; S50: if(qsmall(xmult*betup)) goto S70; S60: xmult *= (xnonc/(double)i); i += 1; aup += 1.0; upterm = (aup+b-2.0e0)*xx/(aup-1.0)*upterm; betup -= upterm; sum += (xmult*betup); goto S50; S70: *cum = sum; *ccum = 0.5e0+(0.5e0-*cum); return; #undef qsmall #undef half #undef done } void cumgam(double *x,double *a,double *cum,double *ccum) /* ********************************************************************** void cumgam(double *x,double *a,double *cum,double *ccum) Double precision cUMulative incomplete GAMma distribution Function Computes the cumulative of the incomplete gamma distribution, i.e., the integral from 0 to X of (1/GAM(A))*EXP(-T)*T**(A-1) DT where GAM(A) is the complete gamma function of A, i.e., GAM(A) = integral from 0 to infinity of EXP(-T)*T**(A-1) DT Arguments X --> The upper limit of integration of the incomplete gamma. 
X is DOUBLE PRECISION A --> The shape parameter of the incomplete gamma. A is DOUBLE PRECISION CUM <-- Cumulative incomplete gamma distribution. CUM is DOUBLE PRECISION CCUM <-- Compliment of Cumulative incomplete gamma distribution. CCUM is DOUBLE PRECISIO Method Calls the routine GRATIO. ********************************************************************** */ { static int K1 = 0; /* .. .. Executable Statements .. */ if(!(*x <= 0.0e0)) goto S10; *cum = 0.0e0; *ccum = 1.0e0; return; S10: gratio(a,x,cum,ccum,&K1); /* Call gratio routine */ return; } void cumnbn(double *s,double *xn,double *pr,double *ompr, double *cum,double *ccum) /* ********************************************************************** void cumnbn(double *s,double *xn,double *pr,double *ompr, double *cum,double *ccum) CUmulative Negative BINomial distribution Function Returns the probability that it there will be S or fewer failures before there are XN successes, with each binomial trial having a probability of success PR. Prob(# failures = S | XN successes, PR) = ( XN + S - 1 ) ( ) * PR^XN * (1-PR)^S ( S ) Arguments S --> The number of failures S is DOUBLE PRECISION XN --> The number of successes XN is DOUBLE PRECISIO PR --> The probability of success in each binomial trial. PR is DOUBLE PRECISIO OMPR --> 1 - PR OMPR is DOUBLE PRECIS CUM <-- Cumulative negative binomial distribution. CUM is DOUBLE PRECISI CCUM <-- Compliment of Cumulative negative binomial distribution. CCUM is DOUBLE PRECIS Method Formula 26.5.26 of Abramowitz and Stegun, Handbook of Mathematical Functions (1966) is used to reduce the negative binomial distribution to the cumulative beta distribution. ********************************************************************** */ { static double T1; /* .. .. Executable Statements .. */ T1 = *s+1.e0; cumbet(pr,ompr,xn,&T1,cum,ccum); return; } void cumnor(double *arg,double *result,double *ccum) /* ********************************************************************** void cumnor(double *arg,double *result,double *ccum) Function Computes the cumulative of the normal distribution, i.e., the integral from -infinity to x of (1/sqrt(2*pi)) exp(-u*u/2) du X --> Upper limit of integration. X is DOUBLE PRECISION RESULT <-- Cumulative normal distribution. RESULT is DOUBLE PRECISION CCUM <-- Compliment of Cumulative normal distribution. CCUM is DOUBLE PRECISION Renaming of function ANORM from: Cody, W.D. (1993). "ALGORITHM 715: SPECFUN - A Portabel FORTRAN Package of Special Function Routines and Test Drivers" acm Transactions on Mathematical Software. 19, 22-32. with slight modifications to return ccum and to deal with machine constants. ********************************************************************** Original Comments: ------------------------------------------------------------------ This function evaluates the normal distribution function: / x 1 | -t*t/2 P(x) = ----------- | e dt sqrt(2 pi) | /-oo The main computation evaluates near-minimax approximations derived from those in "Rational Chebyshev approximations for the error function" by W. J. Cody, Math. Comp., 1969, 631-637. This transportable program uses rational functions that theoretically approximate the normal distribution function to at least 18 significant decimal digits. The accuracy achieved depends on the arithmetic system, the compiler, the intrinsic functions, and proper selection of the machine-dependent constants. 
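     Editorial note (not part of the original ANORM comments): both the lower
     tail (RESULT) and the upper tail (CCUM) are returned so that whichever
     tail is small keeps full relative precision.  A hedged usage sketch
     (values are approximate):

         double x = 1.959964, cum, ccum;
         cumnor(&x, &cum, &ccum);     // cum is approximately 0.975 and
                                      // ccum approximately 0.025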
******************************************************************* ******************************************************************* Explanation of machine-dependent constants. MIN = smallest machine representable number. EPS = argument below which anorm(x) may be represented by 0.5 and above which x*x will not underflow. A conservative value is the largest machine number X such that 1.0 + X = 1.0 to machine precision. ******************************************************************* ******************************************************************* Error returns The program returns ANORM = 0 for ARG .LE. XLOW. Intrinsic functions required are: ABS, AINT, EXP Author: W. J. Cody Mathematics and Computer Science Division Argonne National Laboratory Argonne, IL 60439 Latest modification: March 15, 1992 ------------------------------------------------------------------ */ { static double a[5] = { 2.2352520354606839287e00,1.6102823106855587881e02,1.0676894854603709582e03, 1.8154981253343561249e04,6.5682337918207449113e-2 }; static double b[4] = { 4.7202581904688241870e01,9.7609855173777669322e02,1.0260932208618978205e04, 4.5507789335026729956e04 }; static double c[9] = { 3.9894151208813466764e-1,8.8831497943883759412e00,9.3506656132177855979e01, 5.9727027639480026226e02,2.4945375852903726711e03,6.8481904505362823326e03, 1.1602651437647350124e04,9.8427148383839780218e03,1.0765576773720192317e-8 }; static double d[8] = { 2.2266688044328115691e01,2.3538790178262499861e02,1.5193775994075548050e03, 6.4855582982667607550e03,1.8615571640885098091e04,3.4900952721145977266e04, 3.8912003286093271411e04,1.9685429676859990727e04 }; static double half = 0.5e0; static double p[6] = { 2.1589853405795699e-1,1.274011611602473639e-1,2.2235277870649807e-2, 1.421619193227893466e-3,2.9112874951168792e-5,2.307344176494017303e-2 }; static double one = 1.0e0; static double q[5] = { 1.28426009614491121e00,4.68238212480865118e-1,6.59881378689285515e-2, 3.78239633202758244e-3,7.29751555083966205e-5 }; static double sixten = 1.60e0; static double sqrpi = 3.9894228040143267794e-1; static double thrsh = 0.66291e0; static double root32 = 5.656854248e0; static double zero = 0.0e0; static int K1 = 1; static int K2 = 2; static int i; static double del,eps,temp,x,xden,xnum,y,xsq,min; /* ------------------------------------------------------------------ Machine dependent constants ------------------------------------------------------------------ */ eps = spmpar(&K1)*0.5e0; min = spmpar(&K2); x = *arg; y = fabs(x); if(y <= thrsh) { /* ------------------------------------------------------------------ Evaluate anorm for |X| <= 0.66291 ------------------------------------------------------------------ */ xsq = zero; if(y > eps) xsq = x*x; xnum = a[4]*xsq; xden = xsq; for(i=0; i<3; i++) { xnum = (xnum+a[i])*xsq; xden = (xden+b[i])*xsq; } *result = x*(xnum+a[3])/(xden+b[3]); temp = *result; *result = half+temp; *ccum = half-temp; } /* ------------------------------------------------------------------ Evaluate anorm for 0.66291 <= |X| <= sqrt(32) ------------------------------------------------------------------ */ else if(y <= root32) { xnum = c[8]*y; xden = y; for(i=0; i<7; i++) { xnum = (xnum+c[i])*y; xden = (xden+d[i])*y; } *result = (xnum+c[7])/(xden+d[7]); xsq = fifdint(y*sixten)/sixten; del = (y-xsq)*(y+xsq); *result = exp(-(xsq*xsq*half))*exp(-(del*half))**result; *ccum = one-*result; if(x > zero) { temp = *result; *result = *ccum; *ccum = temp; } } /* ------------------------------------------------------------------ 
Evaluate anorm for |X| > sqrt(32) ------------------------------------------------------------------ */ else { *result = zero; xsq = one/(x*x); xnum = p[5]*xsq; xden = xsq; for(i=0; i<4; i++) { xnum = (xnum+p[i])*xsq; xden = (xden+q[i])*xsq; } *result = xsq*(xnum+p[4])/(xden+q[4]); *result = (sqrpi-*result)/y; xsq = fifdint(x*sixten)/sixten; del = (x-xsq)*(x+xsq); *result = exp(-(xsq*xsq*half))*exp(-(del*half))**result; *ccum = one-*result; if(x > zero) { temp = *result; *result = *ccum; *ccum = temp; } } if(*result < min) *result = 0.0e0; /* ------------------------------------------------------------------ Fix up for negative argument, erf, etc. ------------------------------------------------------------------ ----------Last card of ANORM ---------- */ if(*ccum < min) *ccum = 0.0e0; } void cumpoi(double *s,double *xlam,double *cum,double *ccum) /* ********************************************************************** void cumpoi(double *s,double *xlam,double *cum,double *ccum) CUMulative POIsson distribution Function Returns the probability of S or fewer events in a Poisson distribution with mean XLAM. Arguments S --> Upper limit of cumulation of the Poisson. S is DOUBLE PRECISION XLAM --> Mean of the Poisson distribution. XLAM is DOUBLE PRECIS CUM <-- Cumulative poisson distribution. CUM is DOUBLE PRECISION CCUM <-- Compliment of Cumulative poisson distribution. CCUM is DOUBLE PRECIS Method Uses formula 26.4.21 of Abramowitz and Stegun, Handbook of Mathematical Functions to reduce the cumulative Poisson to the cumulative chi-square distribution. ********************************************************************** */ { static double chi,df; /* .. .. Executable Statements .. */ df = 2.0e0*(*s+1.0e0); chi = 2.0e0**xlam; cumchi(&chi,&df,ccum,cum); return; } void cumt(double *t,double *df,double *cum,double *ccum) /* ********************************************************************** void cumt(double *t,double *df,double *cum,double *ccum) CUMulative T-distribution Function Computes the integral from -infinity to T of the t-density. Arguments T --> Upper limit of integration of the t-density. T is DOUBLE PRECISION DF --> Degrees of freedom of the t-distribution. DF is DOUBLE PRECISIO CUM <-- Cumulative t-distribution. CCUM is DOUBLE PRECIS CCUM <-- Compliment of Cumulative t-distribution. CCUM is DOUBLE PRECIS Method Formula 26.5.27 of Abramowitz and Stegun, Handbook of Mathematical Functions is used to reduce the t-distribution to an incomplete beta. ********************************************************************** */ { static double K2 = 0.5e0; static double xx,a,oma,tt,yy,dfptt,T1; /* .. .. Executable Statements .. 
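     Editorial note (not part of the original library source): with
     XX = DF/(DF + T*T), the incomplete beta I_XX(DF/2, 1/2) computed below
     equals the two-tailed probability P( |T| > t ), which is then split
     evenly between the two tails to assemble CUM and CCUM.  A hedged usage
     sketch (values are approximate):

         double t = 2.0, df = 10.0, cum, ccum;
         cumt(&t, &df, &cum, &ccum);   // cum ~ 0.963, ccum ~ 0.037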
*/ tt = *t**t; dfptt = *df+tt; xx = *df/dfptt; yy = tt/dfptt; T1 = 0.5e0**df; cumbet(&xx,&yy,&T1,&K2,&a,&oma); if(!(*t <= 0.0e0)) goto S10; *cum = 0.5e0*a; *ccum = oma+*cum; goto S20; S10: *ccum = 0.5e0*a; *cum = oma+*ccum; S20: return; } double dbetrm(double *a,double *b) /* ********************************************************************** double dbetrm(double *a,double *b) Double Precision Sterling Remainder for Complete Beta Function Function Log(Beta(A,B)) = Lgamma(A) + Lgamma(B) - Lgamma(A+B) where Lgamma is the log of the (complete) gamma function Let ZZ be approximation obtained if each log gamma is approximated by Sterling's formula, i.e., Sterling(Z) = LOG( SQRT( 2*PI ) ) + ( Z-0.5 ) * LOG( Z ) - Z Returns Log(Beta(A,B)) - ZZ Arguments A --> One argument of the Beta DOUBLE PRECISION A B --> The other argument of the Beta DOUBLE PRECISION B ********************************************************************** */ { static double dbetrm,T1,T2,T3; /* .. .. Executable Statements .. */ /* Try to sum from smallest to largest */ T1 = *a+*b; dbetrm = -dstrem(&T1); T2 = fifdmax1(*a,*b); dbetrm += dstrem(&T2); T3 = fifdmin1(*a,*b); dbetrm += dstrem(&T3); return dbetrm; } double devlpl(double a[],int *n,double *x) /* ********************************************************************** double devlpl(double a[],int *n,double *x) Double precision EVALuate a PoLynomial at X Function returns A(1) + A(2)*X + ... + A(N)*X**(N-1) Arguments A --> Array of coefficients of the polynomial. A is DOUBLE PRECISION(N) N --> Length of A, also degree of polynomial - 1. N is INTEGER X --> Point at which the polynomial is to be evaluated. X is DOUBLE PRECISION ********************************************************************** */ { static double devlpl,term; static int i; /* .. .. Executable Statements .. */ term = a[*n-1]; for(i= *n-1-1; i>=0; i--) term = a[i]+term**x; devlpl = term; return devlpl; } double dexpm1(double *x) /* ********************************************************************** double dexpm1(double *x) Evaluation of the function EXP(X) - 1 Arguments X --> Argument at which exp(x)-1 desired DOUBLE PRECISION X Method Renaming of function rexp from code of: DiDinato, A. R. and Morris, A. H. Algorithm 708: Significant Digit Computation of the Incomplete Beta Function Ratios. ACM Trans. Math. Softw. 18 (1993), 360-373. ********************************************************************** */ { static double p1 = .914041914819518e-09; static double p2 = .238082361044469e-01; static double q1 = -.499999999085958e+00; static double q2 = .107141568980644e+00; static double q3 = -.119041179760821e-01; static double q4 = .595130811860248e-03; static double dexpm1,w; /* .. .. Executable Statements .. */ if(fabs(*x) > 0.15e0) goto S10; dexpm1 = *x*(((p2**x+p1)**x+1.0e0)/((((q4**x+q3)**x+q2)**x+q1)**x+1.0e0)); return dexpm1; S10: w = exp(*x); if(*x > 0.0e0) goto S20; dexpm1 = w-0.5e0-0.5e0; return dexpm1; S20: dexpm1 = w*(0.5e0+(0.5e0-1.0e0/w)); return dexpm1; } double dinvnr(double *p,double *q) /* ********************************************************************** double dinvnr(double *p,double *q) Double precision NoRmal distribution INVerse Function Returns X such that CUMNOR(X) = P, i.e., the integral from - infinity to X of (1/SQRT(2*PI)) EXP(-U*U/2) dU is P Arguments P --> The probability whose normal deviate is sought. 
P is DOUBLE PRECISION Q --> 1-P P is DOUBLE PRECISION Method The rational function on page 95 of Kennedy and Gentle, Statistical Computing, Marcel Dekker, NY , 1980 is used as a start value for the Newton method of finding roots. Note If P or Q .lt. machine EPS returns +/- DINVNR(EPS) ********************************************************************** */ { #define maxit 100 #define eps (1.0e-13) #define r2pi 0.3989422804014326e0 #define nhalf (-0.5e0) #define dennor(x) (r2pi*exp(nhalf*(x)*(x))) static double dinvnr,strtx,xcur,cum,ccum,pp,dx; static int i; static unsigned long qporq; /* .. .. Executable Statements .. */ /* FIND MINIMUM OF P AND Q */ qporq = *p <= *q; if(!qporq) goto S10; pp = *p; goto S20; S10: pp = *q; S20: /* INITIALIZATION STEP */ strtx = stvaln(&pp); xcur = strtx; /* NEWTON INTERATIONS */ for(i=1; i<=maxit; i++) { cumnor(&xcur,&cum,&ccum); dx = (cum-pp)/dennor(xcur); xcur -= dx; if(fabs(dx/xcur) < eps) goto S40; } dinvnr = strtx; /* IF WE GET HERE, NEWTON HAS FAILED */ if(!qporq) dinvnr = -dinvnr; return dinvnr; S40: /* IF WE GET HERE, NEWTON HAS SUCCEDED */ dinvnr = xcur; if(!qporq) dinvnr = -dinvnr; return dinvnr; #undef maxit #undef eps #undef r2pi #undef nhalf #undef dennor } /* DEFINE DINVR */ static void E0000(int IENTRY,int *status,double *x,double *fx, unsigned long *qleft,unsigned long *qhi,double *zabsst, double *zabsto,double *zbig,double *zrelst, double *zrelto,double *zsmall,double *zstpmu) { #define qxmon(zx,zy,zz) (int)((zx) <= (zy) && (zy) <= (zz)) static double absstp,abstol,big,fbig,fsmall,relstp,reltol,small,step,stpmul,xhi, xlb,xlo,xsave,xub,yy; static int i99999; static unsigned long qbdd,qcond,qdum1,qdum2,qincr,qlim,qok,qup; switch(IENTRY){case 0: goto DINVR; case 1: goto DSTINV;} DINVR: if(*status > 0) goto S310; qcond = !qxmon(small,*x,big); if(qcond) ftnstop(" SMALL, X, BIG not monotone in INVR"); xsave = *x; /* See that SMALL and BIG bound the zero and set QINCR */ *x = small; /* GET-FUNCTION-VALUE */ i99999 = 1; goto S300; S10: fsmall = *fx; *x = big; /* GET-FUNCTION-VALUE */ i99999 = 2; goto S300; S20: fbig = *fx; qincr = fbig > fsmall; if(!qincr) goto S50; if(fsmall <= 0.0e0) goto S30; *status = -1; *qleft = *qhi = 1; return; S30: if(fbig >= 0.0e0) goto S40; *status = -1; *qleft = *qhi = 0; return; S40: goto S80; S50: if(fsmall >= 0.0e0) goto S60; *status = -1; *qleft = 1; *qhi = 0; return; S60: if(fbig <= 0.0e0) goto S70; *status = -1; *qleft = 0; *qhi = 1; return; S80: S70: *x = xsave; step = fifdmax1(absstp,relstp*fabs(*x)); /* YY = F(X) - Y GET-FUNCTION-VALUE */ i99999 = 3; goto S300; S90: yy = *fx; if(!(yy == 0.0e0)) goto S100; *status = 0; qok = 1; return; S100: qup = qincr && yy < 0.0e0 || !qincr && yy > 0.0e0; /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ HANDLE CASE IN WHICH WE MUST STEP HIGHER ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ if(!qup) goto S170; xlb = xsave; xub = fifdmin1(xlb+step,big); goto S120; S110: if(qcond) goto S150; S120: /* YY = F(XUB) - Y */ *x = xub; /* GET-FUNCTION-VALUE */ i99999 = 4; goto S300; S130: yy = *fx; qbdd = qincr && yy >= 0.0e0 || !qincr && yy <= 0.0e0; qlim = xub >= big; qcond = qbdd || qlim; if(qcond) goto S140; step = stpmul*step; xlb = xub; xub = fifdmin1(xlb+step,big); S140: goto S110; S150: if(!(qlim && !qbdd)) goto S160; *status = -1; *qleft = 0; *qhi = !qincr; *x = big; return; S160: goto S240; S170: /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ HANDLE CASE IN WHICH WE MUST STEP LOWER 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ xub = xsave; xlb = fifdmax1(xub-step,small); goto S190; S180: if(qcond) goto S220; S190: /* YY = F(XLB) - Y */ *x = xlb; /* GET-FUNCTION-VALUE */ i99999 = 5; goto S300; S200: yy = *fx; qbdd = qincr && yy <= 0.0e0 || !qincr && yy >= 0.0e0; qlim = xlb <= small; qcond = qbdd || qlim; if(qcond) goto S210; step = stpmul*step; xub = xlb; xlb = fifdmax1(xub-step,small); S210: goto S180; S220: if(!(qlim && !qbdd)) goto S230; *status = -1; *qleft = 1; *qhi = qincr; *x = small; return; S240: S230: dstzr(&xlb,&xub,&abstol,&reltol); /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ IF WE REACH HERE, XLB AND XUB BOUND THE ZERO OF F. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ *status = 0; goto S260; S250: if(!(*status == 1)) goto S290; S260: dzror(status,x,fx,&xlo,&xhi,&qdum1,&qdum2); if(!(*status == 1)) goto S280; /* GET-FUNCTION-VALUE */ i99999 = 6; goto S300; S280: S270: goto S250; S290: *x = xlo; *status = 0; return; DSTINV: small = *zsmall; big = *zbig; absstp = *zabsst; relstp = *zrelst; stpmul = *zstpmu; abstol = *zabsto; reltol = *zrelto; return; S300: /* TO GET-FUNCTION-VALUE */ *status = 1; return; S310: switch((int)i99999){case 1: goto S10;case 2: goto S20;case 3: goto S90;case 4: goto S130;case 5: goto S200;case 6: goto S270;default: break;} #undef qxmon } void dinvr(int *status,double *x,double *fx, unsigned long *qleft,unsigned long *qhi) /* ********************************************************************** void dinvr(int *status,double *x,double *fx, unsigned long *qleft,unsigned long *qhi) Double precision bounds the zero of the function and invokes zror Reverse Communication Function Bounds the function and invokes ZROR to perform the zero finding. STINVR must have been called before this routine in order to set its parameters. Arguments STATUS <--> At the beginning of a zero finding problem, STATUS should be set to 0 and INVR invoked. (The value of parameters other than X will be ignored on this cal When INVR needs the function evaluated, it will set STATUS to 1 and return. The value of the function should be set in FX and INVR again called without changing any of its other parameters. When INVR has finished without error, it will return with STATUS 0. In that case X is approximately a root of F(X). If INVR cannot bound the function, it returns status -1 and sets QLEFT and QHI. INTEGER STATUS X <-- The value of X at which F(X) is to be evaluated. DOUBLE PRECISION X FX --> The value of F(X) calculated when INVR returns with STATUS = 1. DOUBLE PRECISION FX QLEFT <-- Defined only if QMFINV returns .FALSE. In that case it is .TRUE. If the stepping search terminated unsucessfully at SMALL. If it is .FALSE. the search terminated unsucessfully at BIG. QLEFT is LOGICAL QHI <-- Defined only if QMFINV returns .FALSE. In that case it is .TRUE. if F(X) .GT. Y at the termination of the search and .FALSE. if F(X) .LT. Y at the termination of the search. 
QHI is LOGICAL ********************************************************************** */ { E0000(0,status,x,fx,qleft,qhi,NULL,NULL,NULL,NULL,NULL,NULL,NULL); } void dstinv(double *zsmall,double *zbig,double *zabsst, double *zrelst,double *zstpmu,double *zabsto, double *zrelto) /* ********************************************************************** void dstinv(double *zsmall,double *zbig,double *zabsst, double *zrelst,double *zstpmu,double *zabsto, double *zrelto) Double Precision - SeT INverse finder - Reverse Communication Function Concise Description - Given a monotone function F finds X such that F(X) = Y. Uses Reverse communication -- see invr. This routine sets quantities needed by INVR. More Precise Description of INVR - F must be a monotone function, the results of QMFINV are otherwise undefined. QINCR must be .TRUE. if F is non- decreasing and .FALSE. if F is non-increasing. QMFINV will return .TRUE. if and only if F(SMALL) and F(BIG) bracket Y, i. e., QINCR is .TRUE. and F(SMALL).LE.Y.LE.F(BIG) or QINCR is .FALSE. and F(BIG).LE.Y.LE.F(SMALL) if QMFINV returns .TRUE., then the X returned satisfies the following condition. let TOL(X) = MAX(ABSTOL,RELTOL*ABS(X)) then if QINCR is .TRUE., F(X-TOL(X)) .LE. Y .LE. F(X+TOL(X)) and if QINCR is .FALSE. F(X-TOL(X)) .GE. Y .GE. F(X+TOL(X)) Arguments SMALL --> The left endpoint of the interval to be searched for a solution. SMALL is DOUBLE PRECISION BIG --> The right endpoint of the interval to be searched for a solution. BIG is DOUBLE PRECISION ABSSTP, RELSTP --> The initial step size in the search is MAX(ABSSTP,RELSTP*ABS(X)). See algorithm. ABSSTP is DOUBLE PRECISION RELSTP is DOUBLE PRECISION STPMUL --> When a step doesn't bound the zero, the step size is multiplied by STPMUL and another step taken. A popular value is 2.0 DOUBLE PRECISION STPMUL ABSTOL, RELTOL --> Two numbers that determine the accuracy of the solution. See function for a precise definition. ABSTOL is DOUBLE PRECISION RELTOL is DOUBLE PRECISION Method Compares F(X) with Y for the input value of X then uses QINCR to determine whether to step left or right to bound the desired x. the initial step size is MAX(ABSSTP,RELSTP*ABS(S)) for the input value of X. Iteratively steps right or left until it bounds X. At each step which doesn't bound X, the step size is doubled. The routine is careful never to step beyond SMALL or BIG. If it hasn't bounded X at SMALL or BIG, QMFINV returns .FALSE. after setting QLEFT and QHI. If X is successfully bounded then Algorithm R of the paper 'Two Efficient Algorithms with Guaranteed Convergence for Finding a Zero of a Function' by J. C. P. Bus and T. J. Dekker in ACM Transactions on Mathematical Software, Volume 1, No. 4 page 330 (DEC. '75) is employed to find the zero of the function F(X)-Y. This is routine QRZERO. ********************************************************************** */ { E0000(1,NULL,NULL,NULL,NULL,NULL,zabsst,zabsto,zbig,zrelst,zrelto,zsmall, zstpmu); } double dlanor(double *x) /* ********************************************************************** double dlanor(double *x) Double precision Logarith of the Asymptotic Normal Function Computes the logarithm of the cumulative normal distribution from abs( x ) to infinity for abs( x ) >= 5. Arguments X --> Value at which cumulative normal to be evaluated DOUBLE PRECISION X Method 23 term expansion of formula 26.2.12 of Abramowitz and Stegun. The relative error at X = 5 is about 0.5E-5. Note ABS(X) must be >= 5 else there is an error stop. 
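     Editorial note (not part of the original library source): the expansion
     used below is the standard asymptotic series for the normal tail,

         log P( Z > xx )  ~  -0.5*xx*xx - log( xx*sqrt(2*pi) )
                             + log( 1 - 1/xx^2 + 3/xx^4 - 15/xx^6 + ... ),

     whose coefficients (-1, 3, -15, 105, ...) appear in COEF below.  A
     hedged usage sketch (value approximate):

         double x = 6.0;
         double logtail = dlanor(&x);   // log of P( Z > 6 ), roughly -20.7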
********************************************************************** */ { #define dlsqpi 0.91893853320467274177e0 static double coef[12] = { -1.0e0,3.0e0,-15.0e0,105.0e0,-945.0e0,10395.0e0,-135135.0e0,2027025.0e0, -34459425.0e0,654729075.0e0,-13749310575.e0,316234143225.0e0 }; static int K1 = 12; static double dlanor,approx,correc,xx,xx2,T2; /* .. .. Executable Statements .. */ xx = fabs(*x); if(xx < 5.0e0) ftnstop(" Argument too small in DLANOR"); approx = -dlsqpi-0.5e0*xx*xx-log(xx); xx2 = xx*xx; T2 = 1.0e0/xx2; correc = devlpl(coef,&K1,&T2)/xx2; correc = dln1px(&correc); dlanor = approx+correc; return dlanor; #undef dlsqpi } double dln1mx(double *x) /* ********************************************************************** double dln1mx(double *x) Double precision LN(1-X) Function Returns ln(1-x) for small x (good accuracy if x .le. 0.1). Note that the obvious code of LOG(1.0-X) won't work for small X because 1.0-X loses accuracy Arguments X --> Value for which ln(1-x) is desired. X is DOUBLE PRECISION Method If X > 0.1, the obvious code above is used ELSE The Taylor series for 1-x is expanded to 20 terms. ********************************************************************** */ { static double dln1mx,T1; /* .. .. Executable Statements .. */ T1 = -*x; dln1mx = dln1px(&T1); return dln1mx; } double dln1px(double *a) /* ********************************************************************** double dln1px(double *a) Double precision LN(1+X) Function Returns ln(1+x) Note that the obvious code of LOG(1.0+X) won't work for small X because 1.0+X loses accuracy Arguments X --> Value for which ln(1-x) is desired. X is DOUBLE PRECISION Method Renames ALNREL from: DiDinato, A. R. and Morris, A. H. Algorithm 708: Significant Digit Computation of the Incomplete Beta Function Ratios. ACM Trans. Math. Softw. 18 (1993), 360-373. ********************************************************************** ----------------------------------------------------------------------- EVALUATION OF THE FUNCTION LN(1 + A) ----------------------------------------------------------------------- */ { static double p1 = -.129418923021993e+01; static double p2 = .405303492862024e+00; static double p3 = -.178874546012214e-01; static double q1 = -.162752256355323e+01; static double q2 = .747811014037616e+00; static double q3 = -.845104217945565e-01; static double dln1px,t,t2,w,x; /* .. .. Executable Statements .. */ if(fabs(*a) > 0.375e0) goto S10; t = *a/(*a+2.0e0); t2 = t*t; w = (((p3*t2+p2)*t2+p1)*t2+1.0e0)/(((q3*t2+q2)*t2+q1)*t2+1.0e0); dln1px = 2.0e0*t*w; return dln1px; S10: x = 1.e0+*a; dln1px = log(x); return dln1px; } double dlnbet(double *a0,double *b0) /* ********************************************************************** double dlnbet(a0,b0) Double precision LN of the complete BETa Function Returns the natural log of the complete beta function, i.e., ln( Gamma(a)*Gamma(b) / Gamma(a+b) Arguments A,B --> The (symmetric) arguments to the complete beta DOUBLE PRECISION A, B Method Renames BETALN from: DiDinato, A. R. and Morris, A. H. Algorithm 708: Significant Digit Computation of the Incomplete Beta Function Ratios. ACM Trans. Math. Softw. 18 (1993), 360-373. 
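     Editorial note (not part of the original library source): working on the
     log scale avoids the overflow that evaluating the three gamma factors
     directly would cause for large arguments.  A hedged usage sketch
     (value approximate):

         double a = 2.0, b = 3.0;
         double lb = dlnbet(&a, &b);   // B(2,3) = 1/12, so lb ~ -2.4849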
********************************************************************** ----------------------------------------------------------------------- EVALUATION OF THE LOGARITHM OF THE BETA FUNCTION ----------------------------------------------------------------------- E = 0.5*LN(2*PI) -------------------------- */ { static double e = .918938533204673e0; static double dlnbet,a,b,c,h,u,v,w,z; static int i,n; static double T1; /* .. .. Executable Statements .. */ a = fifdmin1(*a0,*b0); b = fifdmax1(*a0,*b0); if(a >= 8.0e0) goto S100; if(a >= 1.0e0) goto S20; /* ----------------------------------------------------------------------- PROCEDURE WHEN A .LT. 1 ----------------------------------------------------------------------- */ if(b >= 8.0e0) goto S10; T1 = a+b; dlnbet = gamln(&a)+(gamln(&b)-gamln(&T1)); return dlnbet; S10: dlnbet = gamln(&a)+algdiv(&a,&b); return dlnbet; S20: /* ----------------------------------------------------------------------- PROCEDURE WHEN 1 .LE. A .LT. 8 ----------------------------------------------------------------------- */ if(a > 2.0e0) goto S40; if(b > 2.0e0) goto S30; dlnbet = gamln(&a)+gamln(&b)-gsumln(&a,&b); return dlnbet; S30: w = 0.0e0; if(b < 8.0e0) goto S60; dlnbet = gamln(&a)+algdiv(&a,&b); return dlnbet; S40: /* REDUCTION OF A WHEN B .LE. 1000 */ if(b > 1000.0e0) goto S80; n = int(a-1.0e0); w = 1.0e0; for(i=1; i<=n; i++) { a -= 1.0e0; h = a/b; w *= (h/(1.0e0+h)); } w = log(w); if(b < 8.0e0) goto S60; dlnbet = w+gamln(&a)+algdiv(&a,&b); return dlnbet; S60: /* REDUCTION OF B WHEN B .LT. 8 */ n = int(b-1.0e0); z = 1.0e0; for(i=1; i<=n; i++) { b -= 1.0e0; z *= (b/(a+b)); } dlnbet = w+log(z)+(gamln(&a)+(gamln(&b)-gsumln(&a,&b))); return dlnbet; S80: /* REDUCTION OF A WHEN B .GT. 1000 */ n = int(a-1.0e0); w = 1.0e0; for(i=1; i<=n; i++) { a -= 1.0e0; w *= (a/(1.0e0+a/b)); } dlnbet = log(w)-(double)n*log(b)+(gamln(&a)+algdiv(&a,&b)); return dlnbet; S100: /* ----------------------------------------------------------------------- PROCEDURE WHEN A .GE. 8 ----------------------------------------------------------------------- */ w = bcorr(&a,&b); h = a/b; c = h/(1.0e0+h); u = -((a-0.5e0)*log(c)); v = b*alnrel(&h); if(u <= v) goto S110; dlnbet = -(0.5e0*log(b))+e+w-v-u; return dlnbet; S110: dlnbet = -(0.5e0*log(b))+e+w-u-v; return dlnbet; } double dlngam(double *a) /* ********************************************************************** double dlngam(double *a) Double precision LN of the GAMma function Function Returns the natural logarithm of GAMMA(X). Arguments X --> value at which scaled log gamma is to be returned X is DOUBLE PRECISION Method Renames GAMLN from: DiDinato, A. R. and Morris, A. H. Algorithm 708: Significant Digit Computation of the Incomplete Beta Function Ratios. ACM Trans. Math. Softw. 18 (1993), 360-373. ********************************************************************** ----------------------------------------------------------------------- EVALUATION OF LN(GAMMA(A)) FOR POSITIVE A ----------------------------------------------------------------------- WRITTEN BY ALFRED H. 
MORRIS NAVAL SURFACE WARFARE CENTER DAHLGREN, VIRGINIA -------------------------- D = 0.5*(LN(2*PI) - 1) -------------------------- */ { static double c0 = .833333333333333e-01; static double c1 = -.277777777760991e-02; static double c2 = .793650666825390e-03; static double c3 = -.595202931351870e-03; static double c4 = .837308034031215e-03; static double c5 = -.165322962780713e-02; static double d = .418938533204673e0; static double dlngam,t,w; static int i,n; static double T1; /* .. .. Executable Statements .. */ if(*a > 0.8e0) goto S10; dlngam = gamln1(a)-log(*a); return dlngam; S10: if(*a > 2.25e0) goto S20; t = *a-0.5e0-0.5e0; dlngam = gamln1(&t); return dlngam; S20: if(*a >= 10.0e0) goto S40; n = int(*a-1.25e0); t = *a; w = 1.0e0; for(i=1; i<=n; i++) { t -= 1.0e0; w = t*w; } T1 = t-1.0e0; dlngam = gamln1(&T1)+log(w); return dlngam; S40: t = pow(1.0e0/ *a,2.0); w = (((((c5*t+c4)*t+c3)*t+c2)*t+c1)*t+c0)/ *a; dlngam = d+w+(*a-0.5e0)*(log(*a)-1.0e0); return dlngam; } double dstrem(double *z) { /* ********************************************************************** double dstrem(double *z) Double precision Sterling Remainder Function Returns Log(Gamma(Z)) - Sterling(Z) where Sterling(Z) is Sterling's Approximation to Log(Gamma(Z)) Sterling(Z) = LOG( SQRT( 2*PI ) ) + ( Z-0.5 ) * LOG( Z ) - Z Arguments Z --> Value at which Sterling remainder calculated Must be positive. DOUBLE PRECISION Z Method If Z >= 6 uses 9 terms of series in Bernoulli numbers (Values calculated using Maple) Otherwise computes difference explicitly ********************************************************************** */ #define hln2pi 0.91893853320467274178e0 #define ncoef 10 static double coef[ncoef] = { 0.0e0,0.0833333333333333333333333333333e0, -0.00277777777777777777777777777778e0,0.000793650793650793650793650793651e0, -0.000595238095238095238095238095238e0, 0.000841750841750841750841750841751e0,-0.00191752691752691752691752691753e0, 0.00641025641025641025641025641026e0,-0.0295506535947712418300653594771e0, 0.179644372368830573164938490016e0 }; static int K1 = 10; static double dstrem,sterl,T2; /* .. .. Executable Statements .. */ /* For information, here are the next 11 coefficients of the remainder term in Sterling's formula -1.39243221690590111642743221691 13.4028640441683919944789510007 -156.848284626002017306365132452 2193.10333333333333333333333333 -36108.7712537249893571732652192 691472.268851313067108395250776 -0.152382215394074161922833649589D8 0.382900751391414141414141414141D9 -0.108822660357843910890151491655D11 0.347320283765002252252252252252D12 -0.123696021422692744542517103493D14 */ if(*z <= 0.0e0) ftnstop("Zero or negative argument in DSTREM"); if(!(*z > 6.0e0)) goto S10; T2 = 1.0e0/pow(*z,2.0); dstrem = devlpl(coef,&K1,&T2)**z; goto S20; S10: sterl = hln2pi+(*z-0.5e0)*log(*z)-*z; dstrem = dlngam(z)-sterl; S20: return dstrem; #undef hln2pi #undef ncoef } double dt1(double *p,double *q,double *df) /* ********************************************************************** double dt1(double *p,double *q,double *df) Double precision Initalize Approximation to INVerse of the cumulative T distribution Function Returns the inverse of the T distribution function, i.e., the integral from 0 to INVT of the T density is P. This is an initial approximation Arguments P --> The p-value whose inverse from the T distribution is desired. P is DOUBLE PRECISION Q --> 1-P. Q is DOUBLE PRECISION DF --> Degrees of freedom of the T distribution. 
DF is DOUBLE PRECISION ********************************************************************** */ { static double coef[4][5] = { 1.0e0,1.0e0,0.0e0,0.0e0,0.0e0,3.0e0,16.0e0,5.0e0,0.0e0,0.0e0,-15.0e0,17.0e0, 19.0e0,3.0e0,0.0e0,-945.0e0,-1920.0e0,1482.0e0,776.0e0,79.0e0 }; static double denom[4] = { 4.0e0,96.0e0,384.0e0,92160.0e0 }; static int ideg[4] = { 2,3,4,5 }; static double dt1,denpow,sum,term,x,xp,xx; static int i; /* .. .. Executable Statements .. */ x = fabs(dinvnr(p,q)); xx = x*x; sum = x; denpow = 1.0e0; for(i=0; i<4; i++) { term = devlpl(&coef[i][0],&ideg[i],&xx)*x; denpow *= *df; sum += (term/(denpow*denom[i])); } if(!(*p >= 0.5e0)) goto S20; xp = sum; goto S30; S20: xp = -sum; S30: dt1 = xp; return dt1; } /* DEFINE DZROR */ static void E0001(int IENTRY,int *status,double *x,double *fx, double *xlo,double *xhi,unsigned long *qleft, unsigned long *qhi,double *zabstl,double *zreltl, double *zxhi,double *zxlo) { #define ftol(zx) (0.5e0*fifdmax1(abstol,reltol*fabs((zx)))) static double a,abstol,b,c,d,fa,fb,fc,fd,fda,fdb,m,mb,p,q,reltol,tol,w,xxhi,xxlo; static int ext,i99999; static unsigned long first,qrzero; switch(IENTRY){case 0: goto DZROR; case 1: goto DSTZR;} DZROR: if(*status > 0) goto S280; *xlo = xxlo; *xhi = xxhi; b = *x = *xlo; /* GET-FUNCTION-VALUE */ i99999 = 1; goto S270; S10: fb = *fx; *xlo = *xhi; a = *x = *xlo; /* GET-FUNCTION-VALUE */ i99999 = 2; goto S270; S20: /* Check that F(ZXLO) < 0 < F(ZXHI) or F(ZXLO) > 0 > F(ZXHI) */ if(!(fb < 0.0e0)) goto S40; if(!(*fx < 0.0e0)) goto S30; *status = -1; *qleft = *fx < fb; *qhi = 0; return; S40: S30: if(!(fb > 0.0e0)) goto S60; if(!(*fx > 0.0e0)) goto S50; *status = -1; *qleft = *fx > fb; *qhi = 1; return; S60: S50: fa = *fx; first = 1; S70: c = a; fc = fa; ext = 0; S80: if(!(fabs(fc) < fabs(fb))) goto S100; if(!(c != a)) goto S90; d = a; fd = fa; S90: a = b; fa = fb; *xlo = c; b = *xlo; fb = fc; c = a; fc = fa; S100: tol = ftol(*xlo); m = (c+b)*.5e0; mb = m-b; if(!(fabs(mb) > tol)) goto S240; if(!(ext > 3)) goto S110; w = mb; goto S190; S110: tol = fifdsign(tol,mb); p = (b-a)*fb; if(!first) goto S120; q = fa-fb; first = 0; goto S130; S120: fdb = (fd-fb)/(d-b); fda = (fd-fa)/(d-a); p = fda*p; q = fdb*fa-fda*fb; S130: if(!(p < 0.0e0)) goto S140; p = -p; q = -q; S140: if(ext == 3) p *= 2.0e0; if(!(p*1.0e0 == 0.0e0 || p <= q*tol)) goto S150; w = tol; goto S180; S150: if(!(p < mb*q)) goto S160; w = p/q; goto S170; S160: w = mb; S190: S180: S170: d = a; fd = fa; a = b; fa = fb; b += w; *xlo = b; *x = *xlo; /* GET-FUNCTION-VALUE */ i99999 = 3; goto S270; S200: fb = *fx; if(!(fc*fb >= 0.0e0)) goto S210; goto S70; S210: if(!(w == mb)) goto S220; ext = 0; goto S230; S220: ext += 1; S230: goto S80; S240: *xhi = c; qrzero = fc >= 0.0e0 && fb <= 0.0e0 || fc < 0.0e0 && fb >= 0.0e0; if(!qrzero) goto S250; *status = 0; goto S260; S250: *status = -1; S260: return; DSTZR: xxlo = *zxlo; xxhi = *zxhi; abstol = *zabstl; reltol = *zreltl; return; S270: /* TO GET-FUNCTION-VALUE */ *status = 1; return; S280: switch((int)i99999){case 1: goto S10;case 2: goto S20;case 3: goto S200; default: break;} #undef ftol } void dzror(int *status,double *x,double *fx,double *xlo, double *xhi,unsigned long *qleft,unsigned long *qhi) /* ********************************************************************** void dzror(int *status,double *x,double *fx,double *xlo, double *xhi,unsigned long *qleft,unsigned long *qhi) Double precision ZeRo of a function -- Reverse Communication Function Performs the zero finding. 
STZROR must have been called before this routine in order to set its parameters. Arguments STATUS <--> At the beginning of a zero finding problem, STATUS should be set to 0 and ZROR invoked. (The value of other parameters will be ignored on this call.) When ZROR needs the function evaluated, it will set STATUS to 1 and return. The value of the function should be set in FX and ZROR again called without changing any of its other parameters. When ZROR has finished without error, it will return with STATUS 0. In that case (XLO,XHI) bound the answe If ZROR finds an error (which implies that F(XLO)-Y an F(XHI)-Y have the same sign, it returns STATUS -1. In this case, XLO and XHI are undefined. INTEGER STATUS X <-- The value of X at which F(X) is to be evaluated. DOUBLE PRECISION X FX --> The value of F(X) calculated when ZROR returns with STATUS = 1. DOUBLE PRECISION FX XLO <-- When ZROR returns with STATUS = 0, XLO bounds the inverval in X containing the solution below. DOUBLE PRECISION XLO XHI <-- When ZROR returns with STATUS = 0, XHI bounds the inverval in X containing the solution above. DOUBLE PRECISION XHI QLEFT <-- .TRUE. if the stepping search terminated unsucessfully at XLO. If it is .FALSE. the search terminated unsucessfully at XHI. QLEFT is LOGICAL QHI <-- .TRUE. if F(X) .GT. Y at the termination of the search and .FALSE. if F(X) .LT. Y at the termination of the search. QHI is LOGICAL ********************************************************************** */ { E0001(0,status,x,fx,xlo,xhi,qleft,qhi,NULL,NULL,NULL,NULL); } void dstzr(double *zxlo,double *zxhi,double *zabstl,double *zreltl) /* ********************************************************************** void dstzr(double *zxlo,double *zxhi,double *zabstl,double *zreltl) Double precision SeT ZeRo finder - Reverse communication version Function Sets quantities needed by ZROR. The function of ZROR and the quantities set is given here. Concise Description - Given a function F find XLO such that F(XLO) = 0. More Precise Description - Input condition. F is a double precision function of a single double precision argument and XLO and XHI are such that F(XLO)*F(XHI) .LE. 0.0 If the input condition is met, QRZERO returns .TRUE. and output values of XLO and XHI satisfy the following F(XLO)*F(XHI) .LE. 0. ABS(F(XLO) .LE. ABS(F(XHI) ABS(XLO-XHI) .LE. TOL(X) where TOL(X) = MAX(ABSTOL,RELTOL*ABS(X)) If this algorithm does not find XLO and XHI satisfying these conditions then QRZERO returns .FALSE. This implies that the input condition was not met. Arguments XLO --> The left endpoint of the interval to be searched for a solution. XLO is DOUBLE PRECISION XHI --> The right endpoint of the interval to be for a solution. XHI is DOUBLE PRECISION ABSTOL, RELTOL --> Two numbers that determine the accuracy of the solution. See function for a precise definition. ABSTOL is DOUBLE PRECISION RELTOL is DOUBLE PRECISION Method Algorithm R of the paper 'Two Efficient Algorithms with Guaranteed Convergence for Finding a Zero of a Function' by J. C. P. Bus and T. J. Dekker in ACM Transactions on Mathematical Software, Volume 1, no. 4 page 330 (Dec. '75) is employed to find the zero of F(X)-Y. 
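     Editorial note (hypothetical caller, not part of the original library
     source): the reverse-communication pair is driven as sketched below,
     where F() stands for any caller-supplied function whose zero on
     [XLO, XHI] is wanted.

         double xlo0 = 0.0, xhi0 = 10.0, abstol = 1.0e-10, reltol = 1.0e-10;
         double x, fx = 0.0, xlo, xhi;
         unsigned long qleft, qhi;
         int status = 0;
         dstzr(&xlo0, &xhi0, &abstol, &reltol);   // set interval and tolerances
         dzror(&status, &x, &fx, &xlo, &xhi, &qleft, &qhi);
         while (status == 1) {
             fx = F(x);                           // supply F at the requested x
             dzror(&status, &x, &fx, &xlo, &xhi, &qleft, &qhi);
         }
         // status == 0: (xlo, xhi) brackets the zero
         // status == -1: F(XLO) and F(XHI) had the same sign, so no zero
         //               was bracketed; QLEFT and QHI describe the failure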
********************************************************************** */ { E0001(1,NULL,NULL,NULL,NULL,NULL,NULL,NULL,zabstl,zreltl,zxhi,zxlo); } double erf1(double *x) /* ----------------------------------------------------------------------- EVALUATION OF THE REAL ERROR FUNCTION ----------------------------------------------------------------------- */ { static double c = .564189583547756e0; static double a[5] = { .771058495001320e-04,-.133733772997339e-02,.323076579225834e-01, .479137145607681e-01,.128379167095513e+00 }; static double b[3] = { .301048631703895e-02,.538971687740286e-01,.375795757275549e+00 }; static double p[8] = { -1.36864857382717e-07,5.64195517478974e-01,7.21175825088309e+00, 4.31622272220567e+01,1.52989285046940e+02,3.39320816734344e+02, 4.51918953711873e+02,3.00459261020162e+02 }; static double q[8] = { 1.00000000000000e+00,1.27827273196294e+01,7.70001529352295e+01, 2.77585444743988e+02,6.38980264465631e+02,9.31354094850610e+02, 7.90950925327898e+02,3.00459260956983e+02 }; static double r[5] = { 2.10144126479064e+00,2.62370141675169e+01,2.13688200555087e+01, 4.65807828718470e+00,2.82094791773523e-01 }; static double s[4] = { 9.41537750555460e+01,1.87114811799590e+02,9.90191814623914e+01, 1.80124575948747e+01 }; static double erf1,ax,bot,t,top,x2; /* .. .. Executable Statements .. */ ax = fabs(*x); if(ax > 0.5e0) goto S10; t = *x**x; top = (((a[0]*t+a[1])*t+a[2])*t+a[3])*t+a[4]+1.0e0; bot = ((b[0]*t+b[1])*t+b[2])*t+1.0e0; erf1 = *x*(top/bot); return erf1; S10: if(ax > 4.0e0) goto S20; top = ((((((p[0]*ax+p[1])*ax+p[2])*ax+p[3])*ax+p[4])*ax+p[5])*ax+p[6])*ax+p[ 7]; bot = ((((((q[0]*ax+q[1])*ax+q[2])*ax+q[3])*ax+q[4])*ax+q[5])*ax+q[6])*ax+q[ 7]; erf1 = 0.5e0+(0.5e0-exp(-(*x**x))*top/bot); if(*x < 0.0e0) erf1 = -erf1; return erf1; S20: if(ax >= 5.8e0) goto S30; x2 = *x**x; t = 1.0e0/x2; top = (((r[0]*t+r[1])*t+r[2])*t+r[3])*t+r[4]; bot = (((s[0]*t+s[1])*t+s[2])*t+s[3])*t+1.0e0; erf1 = (c-top/(x2*bot))/ax; erf1 = 0.5e0+(0.5e0-exp(-x2)*erf1); if(*x < 0.0e0) erf1 = -erf1; return erf1; S30: erf1 = fifdsign(1.0e0,*x); return erf1; } double erfc1(int *ind,double *x) /* ----------------------------------------------------------------------- EVALUATION OF THE COMPLEMENTARY ERROR FUNCTION ERFC1(IND,X) = ERFC(X) IF IND = 0 ERFC1(IND,X) = EXP(X*X)*ERFC(X) OTHERWISE ----------------------------------------------------------------------- */ { static double c = .564189583547756e0; static double a[5] = { .771058495001320e-04,-.133733772997339e-02,.323076579225834e-01, .479137145607681e-01,.128379167095513e+00 }; static double b[3] = { .301048631703895e-02,.538971687740286e-01,.375795757275549e+00 }; static double p[8] = { -1.36864857382717e-07,5.64195517478974e-01,7.21175825088309e+00, 4.31622272220567e+01,1.52989285046940e+02,3.39320816734344e+02, 4.51918953711873e+02,3.00459261020162e+02 }; static double q[8] = { 1.00000000000000e+00,1.27827273196294e+01,7.70001529352295e+01, 2.77585444743988e+02,6.38980264465631e+02,9.31354094850610e+02, 7.90950925327898e+02,3.00459260956983e+02 }; static double r[5] = { 2.10144126479064e+00,2.62370141675169e+01,2.13688200555087e+01, 4.65807828718470e+00,2.82094791773523e-01 }; static double s[4] = { 9.41537750555460e+01,1.87114811799590e+02,9.90191814623914e+01, 1.80124575948747e+01 }; static int K1 = 1; static double erfc1,ax,bot,e,t,top,w; /* .. .. Executable Statements .. */ /* ABS(X) .LE. 
0.5 */ ax = fabs(*x); if(ax > 0.5e0) goto S10; t = *x**x; top = (((a[0]*t+a[1])*t+a[2])*t+a[3])*t+a[4]+1.0e0; bot = ((b[0]*t+b[1])*t+b[2])*t+1.0e0; erfc1 = 0.5e0+(0.5e0-*x*(top/bot)); if(*ind != 0) erfc1 = exp(t)*erfc1; return erfc1; S10: /* 0.5 .LT. ABS(X) .LE. 4 */ if(ax > 4.0e0) goto S20; top = ((((((p[0]*ax+p[1])*ax+p[2])*ax+p[3])*ax+p[4])*ax+p[5])*ax+p[6])*ax+p[ 7]; bot = ((((((q[0]*ax+q[1])*ax+q[2])*ax+q[3])*ax+q[4])*ax+q[5])*ax+q[6])*ax+q[ 7]; erfc1 = top/bot; goto S40; S20: /* ABS(X) .GT. 4 */ if(*x <= -5.6e0) goto S60; if(*ind != 0) goto S30; if(*x > 100.0e0) goto S70; if(*x**x > -exparg(&K1)) goto S70; S30: t = pow(1.0e0/ *x,2.0); top = (((r[0]*t+r[1])*t+r[2])*t+r[3])*t+r[4]; bot = (((s[0]*t+s[1])*t+s[2])*t+s[3])*t+1.0e0; erfc1 = (c-t*top/bot)/ax; S40: /* FINAL ASSEMBLY */ if(*ind == 0) goto S50; if(*x < 0.0e0) erfc1 = 2.0e0*exp(*x**x)-erfc1; return erfc1; S50: w = *x**x; t = w; e = w-t; erfc1 = (0.5e0+(0.5e0-e))*exp(-t)*erfc1; if(*x < 0.0e0) erfc1 = 2.0e0-erfc1; return erfc1; S60: /* LIMIT VALUE FOR LARGE NEGATIVE X */ erfc1 = 2.0e0; if(*ind != 0) erfc1 = 2.0e0*exp(*x**x); return erfc1; S70: /* LIMIT VALUE FOR LARGE POSITIVE X WHEN IND = 0 */ erfc1 = 0.0e0; return erfc1; } double esum(int *mu,double *x) /* ----------------------------------------------------------------------- EVALUATION OF EXP(MU + X) ----------------------------------------------------------------------- */ { static double esum,w; /* .. .. Executable Statements .. */ if(*x > 0.0e0) goto S10; if(*mu < 0) goto S20; w = (double)*mu+*x; if(w > 0.0e0) goto S20; esum = exp(w); return esum; S10: if(*mu > 0) goto S20; w = (double)*mu+*x; if(w < 0.0e0) goto S20; esum = exp(w); return esum; S20: w = *mu; esum = exp(w)*exp(*x); return esum; } double exparg(int *l) /* -------------------------------------------------------------------- IF L = 0 THEN EXPARG(L) = THE LARGEST POSITIVE W FOR WHICH EXP(W) CAN BE COMPUTED. IF L IS NONZERO THEN EXPARG(L) = THE LARGEST NEGATIVE W FOR WHICH THE COMPUTED VALUE OF EXP(W) IS NONZERO. NOTE... ONLY AN APPROXIMATE VALUE FOR EXPARG(L) IS NEEDED. -------------------------------------------------------------------- */ { static int K1 = 4; static int K2 = 9; static int K3 = 10; static double exparg,lnb; static int b,m; /* .. .. Executable Statements .. */ b = ipmpar(&K1); if(b != 2) goto S10; lnb = .69314718055995e0; goto S40; S10: if(b != 8) goto S20; lnb = 2.0794415416798e0; goto S40; S20: if(b != 16) goto S30; lnb = 2.7725887222398e0; goto S40; S30: lnb = log((double)b); S40: if(*l == 0) goto S50; m = ipmpar(&K2)-1; exparg = 0.99999e0*((double)m*lnb); return exparg; S50: m = ipmpar(&K3); exparg = 0.99999e0*((double)m*lnb); return exparg; } double fpser(double *a,double *b,double *x,double *eps) /* ----------------------------------------------------------------------- EVALUATION OF I (A,B) X FOR B .LT. MIN(EPS,EPS*A) AND X .LE. 0.5. ----------------------------------------------------------------------- SET FPSER = X**A */ { static int K1 = 1; static double fpser,an,c,s,t,tol; /* .. .. Executable Statements .. */ fpser = 1.0e0; if(*a <= 1.e-3**eps) goto S10; fpser = 0.0e0; t = *a*log(*x); if(t < exparg(&K1)) return fpser; fpser = exp(t); S10: /* NOTE THAT 1/B(A,B) = B */ fpser = *b/ *a*fpser; tol = *eps/ *a; an = *a+1.0e0; t = *x; s = t/an; S20: an += 1.0e0; t = *x*t; c = t/an; s += c; if(fabs(c) > tol) goto S20; fpser *= (1.0e0+*a*s); return fpser; } double gam1(double *a) /* ------------------------------------------------------------------ COMPUTATION OF 1/GAMMA(A+1) - 1 FOR -0.5 .LE. 
A .LE. 1.5 ------------------------------------------------------------------ */ { static double s1 = .273076135303957e+00; static double s2 = .559398236957378e-01; static double p[7] = { .577215664901533e+00,-.409078193005776e+00,-.230975380857675e+00, .597275330452234e-01,.766968181649490e-02,-.514889771323592e-02, .589597428611429e-03 }; static double q[5] = { .100000000000000e+01,.427569613095214e+00,.158451672430138e+00, .261132021441447e-01,.423244297896961e-02 }; static double r[9] = { -.422784335098468e+00,-.771330383816272e+00,-.244757765222226e+00, .118378989872749e+00,.930357293360349e-03,-.118290993445146e-01, .223047661158249e-02,.266505979058923e-03,-.132674909766242e-03 }; static double gam1,bot,d,t,top,w,T1; /* .. .. Executable Statements .. */ t = *a; d = *a-0.5e0; if(d > 0.0e0) t = d-0.5e0; T1 = t; if(T1 < 0) goto S40; else if(T1 == 0) goto S10; else goto S20; S10: gam1 = 0.0e0; return gam1; S20: top = (((((p[6]*t+p[5])*t+p[4])*t+p[3])*t+p[2])*t+p[1])*t+p[0]; bot = (((q[4]*t+q[3])*t+q[2])*t+q[1])*t+1.0e0; w = top/bot; if(d > 0.0e0) goto S30; gam1 = *a*w; return gam1; S30: gam1 = t/ *a*(w-0.5e0-0.5e0); return gam1; S40: top = (((((((r[8]*t+r[7])*t+r[6])*t+r[5])*t+r[4])*t+r[3])*t+r[2])*t+r[1])*t+ r[0]; bot = (s2*t+s1)*t+1.0e0; w = top/bot; if(d > 0.0e0) goto S50; gam1 = *a*(w+0.5e0+0.5e0); return gam1; S50: gam1 = t*w/ *a; return gam1; } void gaminv(double *a,double *x,double *x0,double *p,double *q, int *ierr) /* ---------------------------------------------------------------------- INVERSE INCOMPLETE GAMMA RATIO FUNCTION GIVEN POSITIVE A, AND NONEGATIVE P AND Q WHERE P + Q = 1. THEN X IS COMPUTED WHERE P(A,X) = P AND Q(A,X) = Q. SCHRODER ITERATION IS EMPLOYED. THE ROUTINE ATTEMPTS TO COMPUTE X TO 10 SIGNIFICANT DIGITS IF THIS IS POSSIBLE FOR THE PARTICULAR COMPUTER ARITHMETIC BEING USED. ------------ X IS A VARIABLE. IF P = 0 THEN X IS ASSIGNED THE VALUE 0, AND IF Q = 0 THEN X IS SET TO THE LARGEST FLOATING POINT NUMBER AVAILABLE. OTHERWISE, GAMINV ATTEMPTS TO OBTAIN A SOLUTION FOR P(A,X) = P AND Q(A,X) = Q. IF THE ROUTINE IS SUCCESSFUL THEN THE SOLUTION IS STORED IN X. X0 IS AN OPTIONAL INITIAL APPROXIMATION FOR X. IF THE USER DOES NOT WISH TO SUPPLY AN INITIAL APPROXIMATION, THEN SET X0 .LE. 0. IERR IS A VARIABLE THAT REPORTS THE STATUS OF THE RESULTS. WHEN THE ROUTINE TERMINATES, IERR HAS ONE OF THE FOLLOWING VALUES ... IERR = 0 THE SOLUTION WAS OBTAINED. ITERATION WAS NOT USED. IERR.GT.0 THE SOLUTION WAS OBTAINED. IERR ITERATIONS WERE PERFORMED. IERR = -2 (INPUT ERROR) A .LE. 0 IERR = -3 NO SOLUTION WAS OBTAINED. THE RATIO Q/A IS TOO LARGE. IERR = -4 (INPUT ERROR) P + Q .NE. 1 IERR = -6 20 ITERATIONS WERE PERFORMED. THE MOST RECENT VALUE OBTAINED FOR X IS GIVEN. THIS CANNOT OCCUR IF X0 .LE. 0. IERR = -7 ITERATION FAILED. NO VALUE IS GIVEN FOR X. THIS MAY OCCUR WHEN X IS APPROXIMATELY 0. IERR = -8 A VALUE FOR X HAS BEEN OBTAINED, BUT THE ROUTINE IS NOT CERTAIN OF ITS ACCURACY. ITERATION CANNOT BE PERFORMED IN THIS CASE. IF X0 .LE. 0, THIS CAN OCCUR ONLY WHEN P OR Q IS APPROXIMATELY 0. IF X0 IS POSITIVE THEN THIS CAN OCCUR WHEN A IS EXCEEDINGLY CLOSE TO X AND A IS EXTREMELY LARGE (SAY A .GE. 1.E20). ---------------------------------------------------------------------- WRITTEN BY ALFRED H. MORRIS, JR. 
NAVAL SURFACE WEAPONS CENTER DAHLGREN, VIRGINIA ------------------- */ { static double a0 = 3.31125922108741e0; static double a1 = 11.6616720288968e0; static double a2 = 4.28342155967104e0; static double a3 = .213623493715853e0; static double b1 = 6.61053765625462e0; static double b2 = 6.40691597760039e0; static double b3 = 1.27364489782223e0; static double b4 = .036117081018842e0; static double c = .577215664901533e0; static double ln10 = 2.302585e0; static double tol = 1.e-5; static double amin[2] = { 500.0e0,100.0e0 }; static double bmin[2] = { 1.e-28,1.e-13 }; static double dmin[2] = { 1.e-06,1.e-04 }; static double emin[2] = { 2.e-03,6.e-03 }; static double eps0[2] = { 1.e-10,1.e-08 }; static int K1 = 1; static int K2 = 2; static int K3 = 3; static int K8 = 0; static double am1,amax,ap1,ap2,ap3,apn,b,c1,c2,c3,c4,c5,d,e,e2,eps,g,h,pn,qg,qn, r,rta,s,s2,sum,t,u,w,xmax,xmin,xn,y,z; static int iop; static double T4,T5,T6,T7,T9; /* .. .. Executable Statements .. */ /* ****** E, XMIN, AND XMAX ARE MACHINE DEPENDENT CONSTANTS. E IS THE SMALLEST NUMBER FOR WHICH 1.0 + E .GT. 1.0. XMIN IS THE SMALLEST POSITIVE NUMBER AND XMAX IS THE LARGEST POSITIVE NUMBER. */ e = spmpar(&K1); xmin = spmpar(&K2); xmax = spmpar(&K3); *x = 0.0e0; if(*a <= 0.0e0) goto S300; t = *p+*q-1.e0; if(fabs(t) > e) goto S320; *ierr = 0; if(*p == 0.0e0) return; if(*q == 0.0e0) goto S270; if(*a == 1.0e0) goto S280; e2 = 2.0e0*e; amax = 0.4e-10/(e*e); iop = 1; if(e > 1.e-10) iop = 2; eps = eps0[iop-1]; xn = *x0; if(*x0 > 0.0e0) goto S160; /* SELECTION OF THE INITIAL APPROXIMATION XN OF X WHEN A .LT. 1 */ if(*a > 1.0e0) goto S80; T4 = *a+1.0e0; g = Xgamm(&T4); qg = *q*g; if(qg == 0.0e0) goto S360; b = qg/ *a; if(qg > 0.6e0**a) goto S40; if(*a >= 0.30e0 || b < 0.35e0) goto S10; t = exp(-(b+c)); u = t*exp(t); xn = t*exp(u); goto S160; S10: if(b >= 0.45e0) goto S40; if(b == 0.0e0) goto S360; y = -log(b); s = 0.5e0+(0.5e0-*a); z = log(y); t = y-s*z; if(b < 0.15e0) goto S20; xn = y-s*log(t)-log(1.0e0+s/(t+1.0e0)); goto S220; S20: if(b <= 0.01e0) goto S30; u = ((t+2.0e0*(3.0e0-*a))*t+(2.0e0-*a)*(3.0e0-*a))/((t+(5.0e0-*a))*t+2.0e0); xn = y-s*log(t)-log(u); goto S220; S30: c1 = -(s*z); c2 = -(s*(1.0e0+c1)); c3 = s*((0.5e0*c1+(2.0e0-*a))*c1+(2.5e0-1.5e0**a)); c4 = -(s*(((c1/3.0e0+(2.5e0-1.5e0**a))*c1+((*a-6.0e0)**a+7.0e0))*c1+( (11.0e0**a-46.0)**a+47.0e0)/6.0e0)); c5 = -(s*((((-(c1/4.0e0)+(11.0e0**a-17.0e0)/6.0e0)*c1+((-(3.0e0**a)+13.0e0)* *a-13.0e0))*c1+0.5e0*(((2.0e0**a-25.0e0)**a+72.0e0)**a-61.0e0))*c1+(( (25.0e0**a-195.0e0)**a+477.0e0)**a-379.0e0)/12.0e0)); xn = (((c5/y+c4)/y+c3)/y+c2)/y+c1+y; if(*a > 1.0e0) goto S220; if(b > bmin[iop-1]) goto S220; *x = xn; return; S40: if(b**q > 1.e-8) goto S50; xn = exp(-(*q/ *a+c)); goto S70; S50: if(*p <= 0.9e0) goto S60; T5 = -*q; xn = exp((alnrel(&T5)+gamln1(a))/ *a); goto S70; S60: xn = exp(log(*p*g)/ *a); S70: if(xn == 0.0e0) goto S310; t = 0.5e0+(0.5e0-xn/(*a+1.0e0)); xn /= t; goto S160; S80: /* SELECTION OF THE INITIAL APPROXIMATION XN OF X WHEN A .GT. 
1 */ if(*q <= 0.5e0) goto S90; w = log(*p); goto S100; S90: w = log(*q); S100: t = sqrt(-(2.0e0*w)); s = t-(((a3*t+a2)*t+a1)*t+a0)/((((b4*t+b3)*t+b2)*t+b1)*t+1.0e0); if(*q > 0.5e0) s = -s; rta = sqrt(*a); s2 = s*s; xn = *a+s*rta+(s2-1.0e0)/3.0e0+s*(s2-7.0e0)/(36.0e0*rta)-((3.0e0*s2+7.0e0)* s2-16.0e0)/(810.0e0**a)+s*((9.0e0*s2+256.0e0)*s2-433.0e0)/(38880.0e0**a* rta); xn = fifdmax1(xn,0.0e0); if(*a < amin[iop-1]) goto S110; *x = xn; d = 0.5e0+(0.5e0-*x/ *a); if(fabs(d) <= dmin[iop-1]) return; S110: if(*p <= 0.5e0) goto S130; if(xn < 3.0e0**a) goto S220; y = -(w+gamln(a)); d = fifdmax1(2.0e0,*a*(*a-1.0e0)); if(y < ln10*d) goto S120; s = 1.0e0-*a; z = log(y); goto S30; S120: t = *a-1.0e0; T6 = -(t/(xn+1.0e0)); xn = y+t*log(xn)-alnrel(&T6); T7 = -(t/(xn+1.0e0)); xn = y+t*log(xn)-alnrel(&T7); goto S220; S130: ap1 = *a+1.0e0; if(xn > 0.70e0*ap1) goto S170; w += gamln(&ap1); if(xn > 0.15e0*ap1) goto S140; ap2 = *a+2.0e0; ap3 = *a+3.0e0; *x = exp((w+*x)/ *a); *x = exp((w+*x-log(1.0e0+*x/ap1*(1.0e0+*x/ap2)))/ *a); *x = exp((w+*x-log(1.0e0+*x/ap1*(1.0e0+*x/ap2)))/ *a); *x = exp((w+*x-log(1.0e0+*x/ap1*(1.0e0+*x/ap2*(1.0e0+*x/ap3))))/ *a); xn = *x; if(xn > 1.e-2*ap1) goto S140; if(xn <= emin[iop-1]*ap1) return; goto S170; S140: apn = ap1; t = xn/apn; sum = 1.0e0+t; S150: apn += 1.0e0; t *= (xn/apn); sum += t; if(t > 1.e-4) goto S150; t = w-log(sum); xn = exp((xn+t)/ *a); xn *= (1.0e0-(*a*log(xn)-xn-t)/(*a-xn)); goto S170; S160: /* SCHRODER ITERATION USING P */ if(*p > 0.5e0) goto S220; S170: if(*p <= 1.e10*xmin) goto S350; am1 = *a-0.5e0-0.5e0; S180: if(*a <= amax) goto S190; d = 0.5e0+(0.5e0-xn/ *a); if(fabs(d) <= e2) goto S350; S190: if(*ierr >= 20) goto S330; *ierr += 1; gratio(a,&xn,&pn,&qn,&K8); if(pn == 0.0e0 || qn == 0.0e0) goto S350; r = rcomp(a,&xn); if(r == 0.0e0) goto S350; t = (pn-*p)/r; w = 0.5e0*(am1-xn); if(fabs(t) <= 0.1e0 && fabs(w*t) <= 0.1e0) goto S200; *x = xn*(1.0e0-t); if(*x <= 0.0e0) goto S340; d = fabs(t); goto S210; S200: h = t*(1.0e0+w*t); *x = xn*(1.0e0-h); if(*x <= 0.0e0) goto S340; if(fabs(w) >= 1.0e0 && fabs(w)*t*t <= eps) return; d = fabs(h); S210: xn = *x; if(d > tol) goto S180; if(d <= eps) return; if(fabs(*p-pn) <= tol**p) return; goto S180; S220: /* SCHRODER ITERATION USING Q */ if(*q <= 1.e10*xmin) goto S350; am1 = *a-0.5e0-0.5e0; S230: if(*a <= amax) goto S240; d = 0.5e0+(0.5e0-xn/ *a); if(fabs(d) <= e2) goto S350; S240: if(*ierr >= 20) goto S330; *ierr += 1; gratio(a,&xn,&pn,&qn,&K8); if(pn == 0.0e0 || qn == 0.0e0) goto S350; r = rcomp(a,&xn); if(r == 0.0e0) goto S350; t = (*q-qn)/r; w = 0.5e0*(am1-xn); if(fabs(t) <= 0.1e0 && fabs(w*t) <= 0.1e0) goto S250; *x = xn*(1.0e0-t); if(*x <= 0.0e0) goto S340; d = fabs(t); goto S260; S250: h = t*(1.0e0+w*t); *x = xn*(1.0e0-h); if(*x <= 0.0e0) goto S340; if(fabs(w) >= 1.0e0 && fabs(w)*t*t <= eps) return; d = fabs(h); S260: xn = *x; if(d > tol) goto S230; if(d <= eps) return; if(fabs(*q-qn) <= tol**q) return; goto S230; S270: /* SPECIAL CASES */ *x = xmax; return; S280: if(*q < 0.9e0) goto S290; T9 = -*p; *x = -alnrel(&T9); return; S290: *x = -log(*q); return; S300: /* ERROR RETURN */ *ierr = -2; return; S310: *ierr = -3; return; S320: *ierr = -4; return; S330: *ierr = -6; return; S340: *ierr = -7; return; S350: *x = xn; *ierr = -8; return; S360: *x = xmax; *ierr = -8; return; } double gamln(double *a) /* ----------------------------------------------------------------------- EVALUATION OF LN(GAMMA(A)) FOR POSITIVE A ----------------------------------------------------------------------- WRITTEN BY ALFRED H. 
MORRIS NAVAL SURFACE WARFARE CENTER DAHLGREN, VIRGINIA -------------------------- D = 0.5*(LN(2*PI) - 1) -------------------------- */ { static double c0 = .833333333333333e-01; static double c1 = -.277777777760991e-02; static double c2 = .793650666825390e-03; static double c3 = -.595202931351870e-03; static double c4 = .837308034031215e-03; static double c5 = -.165322962780713e-02; static double d = .418938533204673e0; static double gamln,t,w; static int i,n; static double T1; /* .. .. Executable Statements .. */ if(*a > 0.8e0) goto S10; gamln = gamln1(a)-log(*a); return gamln; S10: if(*a > 2.25e0) goto S20; t = *a-0.5e0-0.5e0; gamln = gamln1(&t); return gamln; S20: if(*a >= 10.0e0) goto S40; n = int(*a-1.25e0); t = *a; w = 1.0e0; for(i=1; i<=n; i++) { t -= 1.0e0; w = t*w; } T1 = t-1.0e0; gamln = gamln1(&T1)+log(w); return gamln; S40: t = pow(1.0e0/ *a,2.0); w = (((((c5*t+c4)*t+c3)*t+c2)*t+c1)*t+c0)/ *a; gamln = d+w+(*a-0.5e0)*(log(*a)-1.0e0); return gamln; } double gamln1(double *a) /* ----------------------------------------------------------------------- EVALUATION OF LN(GAMMA(1 + A)) FOR -0.2 .LE. A .LE. 1.25 ----------------------------------------------------------------------- */ { static double p0 = .577215664901533e+00; static double p1 = .844203922187225e+00; static double p2 = -.168860593646662e+00; static double p3 = -.780427615533591e+00; static double p4 = -.402055799310489e+00; static double p5 = -.673562214325671e-01; static double p6 = -.271935708322958e-02; static double q1 = .288743195473681e+01; static double q2 = .312755088914843e+01; static double q3 = .156875193295039e+01; static double q4 = .361951990101499e+00; static double q5 = .325038868253937e-01; static double q6 = .667465618796164e-03; static double r0 = .422784335098467e+00; static double r1 = .848044614534529e+00; static double r2 = .565221050691933e+00; static double r3 = .156513060486551e+00; static double r4 = .170502484022650e-01; static double r5 = .497958207639485e-03; static double s1 = .124313399877507e+01; static double s2 = .548042109832463e+00; static double s3 = .101552187439830e+00; static double s4 = .713309612391000e-02; static double s5 = .116165475989616e-03; static double gamln1,w,x; /* .. .. Executable Statements .. */ if(*a >= 0.6e0) goto S10; w = ((((((p6**a+p5)**a+p4)**a+p3)**a+p2)**a+p1)**a+p0)/((((((q6**a+q5)**a+ q4)**a+q3)**a+q2)**a+q1)**a+1.0e0); gamln1 = -(*a*w); return gamln1; S10: x = *a-0.5e0-0.5e0; w = (((((r5*x+r4)*x+r3)*x+r2)*x+r1)*x+r0)/(((((s5*x+s4)*x+s3)*x+s2)*x+s1)*x +1.0e0); gamln1 = x*w; return gamln1; } double Xgamm(double *a) /* ----------------------------------------------------------------------- EVALUATION OF THE GAMMA FUNCTION FOR REAL ARGUMENTS ----------- GAMMA(A) IS ASSIGNED THE VALUE 0 WHEN THE GAMMA FUNCTION CANNOT BE COMPUTED. ----------------------------------------------------------------------- WRITTEN BY ALFRED H. MORRIS, JR. 
NAVAL SURFACE WEAPONS CENTER DAHLGREN, VIRGINIA ----------------------------------------------------------------------- */ { static double d = .41893853320467274178e0; static double pi = 3.1415926535898e0; static double r1 = .820756370353826e-03; static double r2 = -.595156336428591e-03; static double r3 = .793650663183693e-03; static double r4 = -.277777777770481e-02; static double r5 = .833333333333333e-01; static double p[7] = { .539637273585445e-03,.261939260042690e-02,.204493667594920e-01, .730981088720487e-01,.279648642639792e+00,.553413866010467e+00,1.0e0 }; static double q[7] = { -.832979206704073e-03,.470059485860584e-02,.225211131035340e-01, -.170458969313360e+00,-.567902761974940e-01,.113062953091122e+01,1.0e0 }; static int K2 = 3; static int K3 = 0; static double Xgamm,bot,g,lnx,s,t,top,w,x,z; static int i,j,m,n,T1; /* .. .. Executable Statements .. */ Xgamm = 0.0e0; x = *a; if(fabs(*a) >= 15.0e0) goto S110; /* ----------------------------------------------------------------------- EVALUATION OF GAMMA(A) FOR ABS(A) .LT. 15 ----------------------------------------------------------------------- */ t = 1.0e0; m = fifidint(*a)-1; /* LET T BE THE PRODUCT OF A-J WHEN A .GE. 2 */ T1 = m; if(T1 < 0) goto S40; else if(T1 == 0) goto S30; else goto S10; S10: for(j=1; j<=m; j++) { x -= 1.0e0; t = x*t; } S30: x -= 1.0e0; goto S80; S40: /* LET T BE THE PRODUCT OF A+J WHEN A .LT. 1 */ t = *a; if(*a > 0.0e0) goto S70; m = -m-1; if(m == 0) goto S60; for(j=1; j<=m; j++) { x += 1.0e0; t = x*t; } S60: x += (0.5e0+0.5e0); t = x*t; if(t == 0.0e0) return Xgamm; S70: /* THE FOLLOWING CODE CHECKS IF 1/T CAN OVERFLOW. THIS CODE MAY BE OMITTED IF DESIRED. */ if(fabs(t) >= 1.e-30) goto S80; if(fabs(t)*spmpar(&K2) <= 1.0001e0) return Xgamm; Xgamm = 1.0e0/t; return Xgamm; S80: /* COMPUTE GAMMA(1 + X) FOR 0 .LE. X .LT. 1 */ top = p[0]; bot = q[0]; for(i=1; i<7; i++) { top = p[i]+x*top; bot = q[i]+x*bot; } Xgamm = top/bot; /* TERMINATION */ if(*a < 1.0e0) goto S100; Xgamm *= t; return Xgamm; S100: Xgamm /= t; return Xgamm; S110: /* ----------------------------------------------------------------------- EVALUATION OF GAMMA(A) FOR ABS(A) .GE. 15 ----------------------------------------------------------------------- */ if(fabs(*a) >= 1.e3) return Xgamm; if(*a > 0.0e0) goto S120; x = -*a; n = (int)x; t = x-(double)n; if(t > 0.9e0) t = 1.0e0-t; s = sin(pi*t)/pi; if(fifmod(n,2) == 0) s = -s; if(s == 0.0e0) return Xgamm; S120: /* COMPUTE THE MODIFIED ASYMPTOTIC SUM */ t = 1.0e0/(x*x); g = ((((r1*t+r2)*t+r3)*t+r4)*t+r5)/x; /* ONE MAY REPLACE THE NEXT STATEMENT WITH LNX = ALOG(X) BUT LESS ACCURACY WILL NORMALLY BE OBTAINED. */ lnx = log(x); /* FINAL ASSEMBLY */ z = x; g = d+g+(z-0.5e0)*(lnx-1.e0); w = g; t = g-w; if(w > 0.99999e0*exparg(&K3)) return Xgamm; Xgamm = exp(w)*(1.0e0+t); if(*a < 0.0e0) Xgamm = 1.0e0/(Xgamm*s)/x; return Xgamm; } void grat1(double *a,double *x,double *r,double *p,double *q, double *eps) { static int K2 = 0; static double a2n,a2nm1,am0,an,an0,b2n,b2nm1,c,cma,g,h,j,l,sum,t,tol,w,z,T1,T3; /* .. .. Executable Statements .. */ /* ----------------------------------------------------------------------- EVALUATION OF THE INCOMPLETE GAMMA RATIO FUNCTIONS P(A,X) AND Q(A,X) IT IS ASSUMED THAT A .LE. 1. EPS IS THE TOLERANCE TO BE USED. THE INPUT ARGUMENT R HAS THE VALUE E**(-X)*X**A/GAMMA(A). 
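     EDITORIAL NOTE (NOT PART OF THE ORIGINAL NSWC TEXT): A MINIMAL
     ILLUSTRATIVE CALL, USING ARBITRARY EXAMPLE VALUES AND RCOMP
     (DEFINED LATER IN THIS FILE) TO SUPPLY R, WOULD BE

        double a = 0.5, x = 0.3, eps = 1.e-8, p, q;
        double r = rcomp(&a, &x);
        grat1(&a, &x, &r, &p, &q, &eps);

     ON RETURN P APPROXIMATES P(A,X) AND Q APPROXIMATES Q(A,X).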
----------------------------------------------------------------------- */ if(*a**x == 0.0e0) goto S120; if(*a == 0.5e0) goto S100; if(*x < 1.1e0) goto S10; goto S60; S10: /* TAYLOR SERIES FOR P(A,X)/X**A */ an = 3.0e0; c = *x; sum = *x/(*a+3.0e0); tol = 0.1e0**eps/(*a+1.0e0); S20: an += 1.0e0; c = -(c*(*x/an)); t = c/(*a+an); sum += t; if(fabs(t) > tol) goto S20; j = *a**x*((sum/6.0e0-0.5e0/(*a+2.0e0))**x+1.0e0/(*a+1.0e0)); z = *a*log(*x); h = gam1(a); g = 1.0e0+h; if(*x < 0.25e0) goto S30; if(*a < *x/2.59e0) goto S50; goto S40; S30: if(z > -.13394e0) goto S50; S40: w = exp(z); *p = w*g*(0.5e0+(0.5e0-j)); *q = 0.5e0+(0.5e0-*p); return; S50: l = rexp(&z); w = 0.5e0+(0.5e0+l); *q = (w*j-l)*g-h; if(*q < 0.0e0) goto S90; *p = 0.5e0+(0.5e0-*q); return; S60: /* CONTINUED FRACTION EXPANSION */ a2nm1 = a2n = 1.0e0; b2nm1 = *x; b2n = *x+(1.0e0-*a); c = 1.0e0; S70: a2nm1 = *x*a2n+c*a2nm1; b2nm1 = *x*b2n+c*b2nm1; am0 = a2nm1/b2nm1; c += 1.0e0; cma = c-*a; a2n = a2nm1+cma*a2n; b2n = b2nm1+cma*b2n; an0 = a2n/b2n; if(fabs(an0-am0) >= *eps*an0) goto S70; *q = *r*an0; *p = 0.5e0+(0.5e0-*q); return; S80: /* SPECIAL CASES */ *p = 0.0e0; *q = 1.0e0; return; S90: *p = 1.0e0; *q = 0.0e0; return; S100: if(*x >= 0.25e0) goto S110; T1 = sqrt(*x); *p = erf1(&T1); *q = 0.5e0+(0.5e0-*p); return; S110: T3 = sqrt(*x); *q = erfc1(&K2,&T3); *p = 0.5e0+(0.5e0-*q); return; S120: if(*x <= *a) goto S80; goto S90; } void gratio(double *a,double *x,double *ans,double *qans,int *ind) /* ---------------------------------------------------------------------- EVALUATION OF THE INCOMPLETE GAMMA RATIO FUNCTIONS P(A,X) AND Q(A,X) ---------- IT IS ASSUMED THAT A AND X ARE NONNEGATIVE, WHERE A AND X ARE NOT BOTH 0. ANS AND QANS ARE VARIABLES. GRATIO ASSIGNS ANS THE VALUE P(A,X) AND QANS THE VALUE Q(A,X). IND MAY BE ANY INTEGER. IF IND = 0 THEN THE USER IS REQUESTING AS MUCH ACCURACY AS POSSIBLE (UP TO 14 SIGNIFICANT DIGITS). OTHERWISE, IF IND = 1 THEN ACCURACY IS REQUESTED TO WITHIN 1 UNIT OF THE 6-TH SIGNIFICANT DIGIT, AND IF IND .NE. 0,1 THEN ACCURACY IS REQUESTED TO WITHIN 1 UNIT OF THE 3RD SIGNIFICANT DIGIT. ERROR RETURN ... ANS IS ASSIGNED THE VALUE 2 WHEN A OR X IS NEGATIVE, WHEN A*X = 0, OR WHEN P(A,X) AND Q(A,X) ARE INDETERMINANT. P(A,X) AND Q(A,X) ARE COMPUTATIONALLY INDETERMINANT WHEN X IS EXCEEDINGLY CLOSE TO A AND A IS EXTREMELY LARGE. ---------------------------------------------------------------------- WRITTEN BY ALFRED H. MORRIS, JR. 
NAVAL SURFACE WEAPONS CENTER DAHLGREN, VIRGINIA -------------------- */ { static double alog10 = 2.30258509299405e0; static double d10 = -.185185185185185e-02; static double d20 = .413359788359788e-02; static double d30 = .649434156378601e-03; static double d40 = -.861888290916712e-03; static double d50 = -.336798553366358e-03; static double d60 = .531307936463992e-03; static double d70 = .344367606892378e-03; static double rt2pin = .398942280401433e0; static double rtpi = 1.77245385090552e0; static double third = .333333333333333e0; static double acc0[3] = { 5.e-15,5.e-7,5.e-4 }; static double big[3] = { 20.0e0,14.0e0,10.0e0 }; static double d0[13] = { .833333333333333e-01,-.148148148148148e-01,.115740740740741e-02, .352733686067019e-03,-.178755144032922e-03,.391926317852244e-04, -.218544851067999e-05,-.185406221071516e-05,.829671134095309e-06, -.176659527368261e-06,.670785354340150e-08,.102618097842403e-07, -.438203601845335e-08 }; static double d1[12] = { -.347222222222222e-02,.264550264550265e-02,-.990226337448560e-03, .205761316872428e-03,-.401877572016461e-06,-.180985503344900e-04, .764916091608111e-05,-.161209008945634e-05,.464712780280743e-08, .137863344691572e-06,-.575254560351770e-07,.119516285997781e-07 }; static double d2[10] = { -.268132716049383e-02,.771604938271605e-03,.200938786008230e-05, -.107366532263652e-03,.529234488291201e-04,-.127606351886187e-04, .342357873409614e-07,.137219573090629e-05,-.629899213838006e-06, .142806142060642e-06 }; static double d3[8] = { .229472093621399e-03,-.469189494395256e-03,.267720632062839e-03, -.756180167188398e-04,-.239650511386730e-06,.110826541153473e-04, -.567495282699160e-05,.142309007324359e-05 }; static double d4[6] = { .784039221720067e-03,-.299072480303190e-03,-.146384525788434e-05, .664149821546512e-04,-.396836504717943e-04,.113757269706784e-04 }; static double d5[4] = { -.697281375836586e-04,.277275324495939e-03,-.199325705161888e-03, .679778047793721e-04 }; static double d6[2] = { -.592166437353694e-03,.270878209671804e-03 }; static double e00[3] = { .25e-3,.25e-1,.14e0 }; static double x00[3] = { 31.0e0,17.0e0,9.7e0 }; static int K1 = 1; static int K2 = 0; static double a2n,a2nm1,acc,am0,amn,an,an0,apn,b2n,b2nm1,c,c0,c1,c2,c3,c4,c5,c6, cma,e,e0,g,h,j,l,r,rta,rtx,s,sum,t,t1,tol,twoa,u,w,x0,y,z; static int i,iop,m,max,n; static double wk[20],T3; static int T4,T5; static double T6,T7; /* .. .. Executable Statements .. */ /* -------------------- ****** E IS A MACHINE DEPENDENT CONSTANT. E IS THE SMALLEST FLOATING POINT NUMBER FOR WHICH 1.0 + E .GT. 1.0 . 
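     EDITORIAL NOTE (NOT PART OF THE ORIGINAL NSWC TEXT): A MINIMAL
     ILLUSTRATIVE CALL WITH ARBITRARY EXAMPLE VALUES WOULD BE

        double a = 2.0, x = 3.0, p, q;
        int ind = 0;                     REQUEST MAXIMUM ACCURACY
        gratio(&a, &x, &p, &q, &ind);    P(A,X) RETURNED IN p, Q(A,X) IN q

     WITH P + Q = 1 ON A SUCCESSFUL RETURN; A RETURNED VALUE OF 2 IN p
     SIGNALS THE ERROR RETURN DESCRIBED ABOVE.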
*/ e = spmpar(&K1); if(*a < 0.0e0 || *x < 0.0e0) goto S430; if(*a == 0.0e0 && *x == 0.0e0) goto S430; if(*a**x == 0.0e0) goto S420; iop = *ind+1; if(iop != 1 && iop != 2) iop = 3; acc = fifdmax1(acc0[iop-1],e); e0 = e00[iop-1]; x0 = x00[iop-1]; /* SELECT THE APPROPRIATE ALGORITHM */ if(*a >= 1.0e0) goto S10; if(*a == 0.5e0) goto S390; if(*x < 1.1e0) goto S160; t1 = *a*log(*x)-*x; u = *a*exp(t1); if(u == 0.0e0) goto S380; r = u*(1.0e0+gam1(a)); goto S250; S10: if(*a >= big[iop-1]) goto S30; if(*a > *x || *x >= x0) goto S20; twoa = *a+*a; m = fifidint(twoa); if(twoa != (double)m) goto S20; i = m/2; if(*a == (double)i) goto S210; goto S220; S20: t1 = *a*log(*x)-*x; r = exp(t1)/Xgamm(a); goto S40; S30: l = *x/ *a; if(l == 0.0e0) goto S370; s = 0.5e0+(0.5e0-l); z = rlog(&l); if(z >= 700.0e0/ *a) goto S410; y = *a*z; rta = sqrt(*a); if(fabs(s) <= e0/rta) goto S330; if(fabs(s) <= 0.4e0) goto S270; t = pow(1.0e0/ *a,2.0); t1 = (((0.75e0*t-1.0e0)*t+3.5e0)*t-105.0e0)/(*a*1260.0e0); t1 -= y; r = rt2pin*rta*exp(t1); S40: if(r == 0.0e0) goto S420; if(*x <= fifdmax1(*a,alog10)) goto S50; if(*x < x0) goto S250; goto S100; S50: /* TAYLOR SERIES FOR P/R */ apn = *a+1.0e0; t = *x/apn; wk[0] = t; for(n=2; n<=20; n++) { apn += 1.0e0; t *= (*x/apn); if(t <= 1.e-3) goto S70; wk[n-1] = t; } n = 20; S70: sum = t; tol = 0.5e0*acc; S80: apn += 1.0e0; t *= (*x/apn); sum += t; if(t > tol) goto S80; max = n-1; for(m=1; m<=max; m++) { n -= 1; sum += wk[n-1]; } *ans = r/ *a*(1.0e0+sum); *qans = 0.5e0+(0.5e0-*ans); return; S100: /* ASYMPTOTIC EXPANSION */ amn = *a-1.0e0; t = amn/ *x; wk[0] = t; for(n=2; n<=20; n++) { amn -= 1.0e0; t *= (amn/ *x); if(fabs(t) <= 1.e-3) goto S120; wk[n-1] = t; } n = 20; S120: sum = t; S130: if(fabs(t) <= acc) goto S140; amn -= 1.0e0; t *= (amn/ *x); sum += t; goto S130; S140: max = n-1; for(m=1; m<=max; m++) { n -= 1; sum += wk[n-1]; } *qans = r/ *x*(1.0e0+sum); *ans = 0.5e0+(0.5e0-*qans); return; S160: /* TAYLOR SERIES FOR P(A,X)/X**A */ an = 3.0e0; c = *x; sum = *x/(*a+3.0e0); tol = 3.0e0*acc/(*a+1.0e0); S170: an += 1.0e0; c = -(c*(*x/an)); t = c/(*a+an); sum += t; if(fabs(t) > tol) goto S170; j = *a**x*((sum/6.0e0-0.5e0/(*a+2.0e0))**x+1.0e0/(*a+1.0e0)); z = *a*log(*x); h = gam1(a); g = 1.0e0+h; if(*x < 0.25e0) goto S180; if(*a < *x/2.59e0) goto S200; goto S190; S180: if(z > -.13394e0) goto S200; S190: w = exp(z); *ans = w*g*(0.5e0+(0.5e0-j)); *qans = 0.5e0+(0.5e0-*ans); return; S200: l = rexp(&z); w = 0.5e0+(0.5e0+l); *qans = (w*j-l)*g-h; if(*qans < 0.0e0) goto S380; *ans = 0.5e0+(0.5e0-*qans); return; S210: /* FINITE SUMS FOR Q WHEN A .GE. 
1 AND 2*A IS AN INTEGER */ sum = exp(-*x); t = sum; n = 1; c = 0.0e0; goto S230; S220: rtx = sqrt(*x); sum = erfc1(&K2,&rtx); t = exp(-*x)/(rtpi*rtx); n = 0; c = -0.5e0; S230: if(n == i) goto S240; n += 1; c += 1.0e0; t = *x*t/c; sum += t; goto S230; S240: *qans = sum; *ans = 0.5e0+(0.5e0-*qans); return; S250: /* CONTINUED FRACTION EXPANSION */ tol = fifdmax1(5.0e0*e,acc); a2nm1 = a2n = 1.0e0; b2nm1 = *x; b2n = *x+(1.0e0-*a); c = 1.0e0; S260: a2nm1 = *x*a2n+c*a2nm1; b2nm1 = *x*b2n+c*b2nm1; am0 = a2nm1/b2nm1; c += 1.0e0; cma = c-*a; a2n = a2nm1+cma*a2n; b2n = b2nm1+cma*b2n; an0 = a2n/b2n; if(fabs(an0-am0) >= tol*an0) goto S260; *qans = r*an0; *ans = 0.5e0+(0.5e0-*qans); return; S270: /* GENERAL TEMME EXPANSION */ if(fabs(s) <= 2.0e0*e && *a*e*e > 3.28e-3) goto S430; c = exp(-y); T3 = sqrt(y); w = 0.5e0*erfc1(&K1,&T3); u = 1.0e0/ *a; z = sqrt(z+z); if(l < 1.0e0) z = -z; T4 = iop-2; if(T4 < 0) goto S280; else if(T4 == 0) goto S290; else goto S300; S280: if(fabs(s) <= 1.e-3) goto S340; c0 = ((((((((((((d0[12]*z+d0[11])*z+d0[10])*z+d0[9])*z+d0[8])*z+d0[7])*z+d0[ 6])*z+d0[5])*z+d0[4])*z+d0[3])*z+d0[2])*z+d0[1])*z+d0[0])*z-third; c1 = (((((((((((d1[11]*z+d1[10])*z+d1[9])*z+d1[8])*z+d1[7])*z+d1[6])*z+d1[5] )*z+d1[4])*z+d1[3])*z+d1[2])*z+d1[1])*z+d1[0])*z+d10; c2 = (((((((((d2[9]*z+d2[8])*z+d2[7])*z+d2[6])*z+d2[5])*z+d2[4])*z+d2[3])*z+ d2[2])*z+d2[1])*z+d2[0])*z+d20; c3 = (((((((d3[7]*z+d3[6])*z+d3[5])*z+d3[4])*z+d3[3])*z+d3[2])*z+d3[1])*z+ d3[0])*z+d30; c4 = (((((d4[5]*z+d4[4])*z+d4[3])*z+d4[2])*z+d4[1])*z+d4[0])*z+d40; c5 = (((d5[3]*z+d5[2])*z+d5[1])*z+d5[0])*z+d50; c6 = (d6[1]*z+d6[0])*z+d60; t = ((((((d70*u+c6)*u+c5)*u+c4)*u+c3)*u+c2)*u+c1)*u+c0; goto S310; S290: c0 = (((((d0[5]*z+d0[4])*z+d0[3])*z+d0[2])*z+d0[1])*z+d0[0])*z-third; c1 = (((d1[3]*z+d1[2])*z+d1[1])*z+d1[0])*z+d10; c2 = d2[0]*z+d20; t = (c2*u+c1)*u+c0; goto S310; S300: t = ((d0[2]*z+d0[1])*z+d0[0])*z-third; S310: if(l < 1.0e0) goto S320; *qans = c*(w+rt2pin*t/rta); *ans = 0.5e0+(0.5e0-*qans); return; S320: *ans = c*(w-rt2pin*t/rta); *qans = 0.5e0+(0.5e0-*ans); return; S330: /* TEMME EXPANSION FOR L = 1 */ if(*a*e*e > 3.28e-3) goto S430; c = 0.5e0+(0.5e0-y); w = (0.5e0-sqrt(y)*(0.5e0+(0.5e0-y/3.0e0))/rtpi)/c; u = 1.0e0/ *a; z = sqrt(z+z); if(l < 1.0e0) z = -z; T5 = iop-2; if(T5 < 0) goto S340; else if(T5 == 0) goto S350; else goto S360; S340: c0 = ((((((d0[6]*z+d0[5])*z+d0[4])*z+d0[3])*z+d0[2])*z+d0[1])*z+d0[0])*z- third; c1 = (((((d1[5]*z+d1[4])*z+d1[3])*z+d1[2])*z+d1[1])*z+d1[0])*z+d10; c2 = ((((d2[4]*z+d2[3])*z+d2[2])*z+d2[1])*z+d2[0])*z+d20; c3 = (((d3[3]*z+d3[2])*z+d3[1])*z+d3[0])*z+d30; c4 = (d4[1]*z+d4[0])*z+d40; c5 = (d5[1]*z+d5[0])*z+d50; c6 = d6[0]*z+d60; t = ((((((d70*u+c6)*u+c5)*u+c4)*u+c3)*u+c2)*u+c1)*u+c0; goto S310; S350: c0 = (d0[1]*z+d0[0])*z-third; c1 = d1[0]*z+d10; t = (d20*u+c1)*u+c0; goto S310; S360: t = d0[0]*z-third; goto S310; S370: /* SPECIAL CASES */ *ans = 0.0e0; *qans = 1.0e0; return; S380: *ans = 1.0e0; *qans = 0.0e0; return; S390: if(*x >= 0.25e0) goto S400; T6 = sqrt(*x); *ans = erf1(&T6); *qans = 0.5e0+(0.5e0-*ans); return; S400: T7 = sqrt(*x); *qans = erfc1(&K2,&T7); *ans = 0.5e0+(0.5e0-*qans); return; S410: if(fabs(s) <= 2.0e0*e) goto S430; S420: if(*x <= *a) goto S370; goto S380; S430: /* ERROR RETURN */ *ans = 2.0e0; return; } double gsumln(double *a,double *b) /* ----------------------------------------------------------------------- EVALUATION OF THE FUNCTION LN(GAMMA(A + B)) FOR 1 .LE. A .LE. 2 AND 1 .LE. B .LE. 
2 ----------------------------------------------------------------------- */ { static double gsumln,x,T1,T2; /* .. .. Executable Statements .. */ x = *a+*b-2.e0; if(x > 0.25e0) goto S10; T1 = 1.0e0+x; gsumln = gamln1(&T1); return gsumln; S10: if(x > 1.25e0) goto S20; gsumln = gamln1(&x)+alnrel(&x); return gsumln; S20: T2 = x-1.0e0; gsumln = gamln1(&T2)+log(x*(1.0e0+x)); return gsumln; } double psi(double *xx) /* --------------------------------------------------------------------- EVALUATION OF THE DIGAMMA FUNCTION ----------- PSI(XX) IS ASSIGNED THE VALUE 0 WHEN THE DIGAMMA FUNCTION CANNOT BE COMPUTED. THE MAIN COMPUTATION INVOLVES EVALUATION OF RATIONAL CHEBYSHEV APPROXIMATIONS PUBLISHED IN MATH. COMP. 27, 123-127(1973) BY CODY, STRECOK AND THACHER. --------------------------------------------------------------------- PSI WAS WRITTEN AT ARGONNE NATIONAL LABORATORY FOR THE FUNPACK PACKAGE OF SPECIAL FUNCTION SUBROUTINES. PSI WAS MODIFIED BY A.H. MORRIS (NSWC). --------------------------------------------------------------------- */ { static double dx0 = 1.461632144968362341262659542325721325e0; static double piov4 = .785398163397448e0; static double p1[7] = { .895385022981970e-02,.477762828042627e+01,.142441585084029e+03, .118645200713425e+04,.363351846806499e+04,.413810161269013e+04, .130560269827897e+04 }; static double p2[4] = { -.212940445131011e+01,-.701677227766759e+01,-.448616543918019e+01, -.648157123766197e+00 }; static double q1[6] = { .448452573429826e+02,.520752771467162e+03,.221000799247830e+04, .364127349079381e+04,.190831076596300e+04,.691091682714533e-05 }; static double q2[4] = { .322703493791143e+02,.892920700481861e+02,.546117738103215e+02, .777788548522962e+01 }; static int K1 = 3; static int K2 = 1; static double psi,aug,den,sgn,upper,w,x,xmax1,xmx0,xsmall,z; static int i,m,n,nq; /* .. .. Executable Statements .. */ /* --------------------------------------------------------------------- MACHINE DEPENDENT CONSTANTS ... XMAX1 = THE SMALLEST POSITIVE FLOATING POINT CONSTANT WITH ENTIRELY INTEGER REPRESENTATION. ALSO USED AS NEGATIVE OF LOWER BOUND ON ACCEPTABLE NEGATIVE ARGUMENTS AND AS THE POSITIVE ARGUMENT BEYOND WHICH PSI MAY BE REPRESENTED AS ALOG(X). XSMALL = ABSOLUTE ARGUMENT BELOW WHICH PI*COTAN(PI*X) MAY BE REPRESENTED BY 1/X. --------------------------------------------------------------------- */ xmax1 = ipmpar(&K1); xmax1 = fifdmin1(xmax1,1.0e0/spmpar(&K2)); xsmall = 1.e-9; x = *xx; aug = 0.0e0; if(x >= 0.5e0) goto S50; /* --------------------------------------------------------------------- X .LT. 0.5, USE REFLECTION FORMULA PSI(1-X) = PSI(X) + PI * COTAN(PI*X) --------------------------------------------------------------------- */ if(fabs(x) > xsmall) goto S10; if(x == 0.0e0) goto S100; /* --------------------------------------------------------------------- 0 .LT. ABS(X) .LE. XSMALL. USE 1/X AS A SUBSTITUTE FOR PI*COTAN(PI*X) --------------------------------------------------------------------- */ aug = -(1.0e0/x); goto S40; S10: /* --------------------------------------------------------------------- REDUCTION OF ARGUMENT FOR COTAN --------------------------------------------------------------------- */ w = -x; sgn = piov4; if(w > 0.0e0) goto S20; w = -w; sgn = -sgn; S20: /* --------------------------------------------------------------------- MAKE AN ERROR EXIT IF X .LE. 
-XMAX1 --------------------------------------------------------------------- */ if(w >= xmax1) goto S100; nq = fifidint(w); w -= (double)nq; nq = fifidint(w*4.0e0); w = 4.0e0*(w-(double)nq*.25e0); /* --------------------------------------------------------------------- W IS NOW RELATED TO THE FRACTIONAL PART OF 4.0 * X. ADJUST ARGUMENT TO CORRESPOND TO VALUES IN FIRST QUADRANT AND DETERMINE SIGN --------------------------------------------------------------------- */ n = nq/2; if(n+n != nq) w = 1.0e0-w; z = piov4*w; m = n/2; if(m+m != n) sgn = -sgn; /* --------------------------------------------------------------------- DETERMINE FINAL VALUE FOR -PI*COTAN(PI*X) --------------------------------------------------------------------- */ n = (nq+1)/2; m = n/2; m += m; if(m != n) goto S30; /* --------------------------------------------------------------------- CHECK FOR SINGULARITY --------------------------------------------------------------------- */ if(z == 0.0e0) goto S100; /* --------------------------------------------------------------------- USE COS/SIN AS A SUBSTITUTE FOR COTAN, AND SIN/COS AS A SUBSTITUTE FOR TAN --------------------------------------------------------------------- */ aug = sgn*(cos(z)/sin(z)*4.0e0); goto S40; S30: aug = sgn*(sin(z)/cos(z)*4.0e0); S40: x = 1.0e0-x; S50: if(x > 3.0e0) goto S70; /* --------------------------------------------------------------------- 0.5 .LE. X .LE. 3.0 --------------------------------------------------------------------- */ den = x; upper = p1[0]*x; for(i=1; i<=5; i++) { den = (den+q1[i-1])*x; upper = (upper+p1[i+1-1])*x; } den = (upper+p1[6])/(den+q1[5]); xmx0 = x-dx0; psi = den*xmx0+aug; return psi; S70: /* --------------------------------------------------------------------- IF X .GE. XMAX1, PSI = LN(X) --------------------------------------------------------------------- */ if(x >= xmax1) goto S90; /* --------------------------------------------------------------------- 3.0 .LT. X .LT. XMAX1 --------------------------------------------------------------------- */ w = 1.0e0/(x*x); den = w; upper = p2[0]*w; for(i=1; i<=3; i++) { den = (den+q2[i-1])*w; upper = (upper+p2[i+1-1])*w; } aug = upper/(den+q2[3])-0.5e0/x+aug; S90: psi = aug+log(x); return psi; S100: /* --------------------------------------------------------------------- ERROR RETURN --------------------------------------------------------------------- */ psi = 0.0e0; return psi; } double rcomp(double *a,double *x) /* ------------------- EVALUATION OF EXP(-X)*X**A/GAMMA(A) ------------------- RT2PIN = 1/SQRT(2*PI) ------------------- */ { static double rt2pin = .398942280401433e0; static double rcomp,t,t1,u; /* .. .. Executable Statements .. 
*/ rcomp = 0.0e0; if(*a >= 20.0e0) goto S20; t = *a*log(*x)-*x; if(*a >= 1.0e0) goto S10; rcomp = *a*exp(t)*(1.0e0+gam1(a)); return rcomp; S10: rcomp = exp(t)/Xgamm(a); return rcomp; S20: u = *x/ *a; if(u == 0.0e0) return rcomp; t = pow(1.0e0/ *a,2.0); t1 = (((0.75e0*t-1.0e0)*t+3.5e0)*t-105.0e0)/(*a*1260.0e0); t1 -= (*a*rlog(&u)); rcomp = rt2pin*sqrt(*a)*exp(t1); return rcomp; } double rexp(double *x) /* ----------------------------------------------------------------------- EVALUATION OF THE FUNCTION EXP(X) - 1 ----------------------------------------------------------------------- */ { static double p1 = .914041914819518e-09; static double p2 = .238082361044469e-01; static double q1 = -.499999999085958e+00; static double q2 = .107141568980644e+00; static double q3 = -.119041179760821e-01; static double q4 = .595130811860248e-03; static double rexp,w; /* .. .. Executable Statements .. */ if(fabs(*x) > 0.15e0) goto S10; rexp = *x*(((p2**x+p1)**x+1.0e0)/((((q4**x+q3)**x+q2)**x+q1)**x+1.0e0)); return rexp; S10: w = exp(*x); if(*x > 0.0e0) goto S20; rexp = w-0.5e0-0.5e0; return rexp; S20: rexp = w*(0.5e0+(0.5e0-1.0e0/w)); return rexp; } double rlog(double *x) /* ------------------- COMPUTATION OF X - 1 - LN(X) ------------------- */ { static double a = .566749439387324e-01; static double b = .456512608815524e-01; static double p0 = .333333333333333e+00; static double p1 = -.224696413112536e+00; static double p2 = .620886815375787e-02; static double q1 = -.127408923933623e+01; static double q2 = .354508718369557e+00; static double rlog,r,t,u,w,w1; /* .. .. Executable Statements .. */ if(*x < 0.61e0 || *x > 1.57e0) goto S40; if(*x < 0.82e0) goto S10; if(*x > 1.18e0) goto S20; /* ARGUMENT REDUCTION */ u = *x-0.5e0-0.5e0; w1 = 0.0e0; goto S30; S10: u = *x-0.7e0; u /= 0.7e0; w1 = a-u*0.3e0; goto S30; S20: u = 0.75e0**x-1.e0; w1 = b+u/3.0e0; S30: /* SERIES EXPANSION */ r = u/(u+2.0e0); t = r*r; w = ((p2*t+p1)*t+p0)/((q2*t+q1)*t+1.0e0); rlog = 2.0e0*t*(1.0e0/(1.0e0-r)-r*w)+w1; return rlog; S40: r = *x-0.5e0-0.5e0; rlog = r-log(*x); return rlog; } double rlog1(double *x) /* ----------------------------------------------------------------------- EVALUATION OF THE FUNCTION X - LN(1 + X) ----------------------------------------------------------------------- */ { static double a = .566749439387324e-01; static double b = .456512608815524e-01; static double p0 = .333333333333333e+00; static double p1 = -.224696413112536e+00; static double p2 = .620886815375787e-02; static double q1 = -.127408923933623e+01; static double q2 = .354508718369557e+00; static double rlog1,h,r,t,w,w1; /* .. .. Executable Statements .. */ if(*x < -0.39e0 || *x > 0.57e0) goto S40; if(*x < -0.18e0) goto S10; if(*x > 0.18e0) goto S20; /* ARGUMENT REDUCTION */ h = *x; w1 = 0.0e0; goto S30; S10: h = *x+0.3e0; h /= 0.7e0; w1 = a-h*0.3e0; goto S30; S20: h = 0.75e0**x-0.25e0; w1 = b+h/3.0e0; S30: /* SERIES EXPANSION */ r = h/(h+2.0e0); t = r*r; w = ((p2*t+p1)*t+p0)/((q2*t+q1)*t+1.0e0); rlog1 = 2.0e0*t*(1.0e0/(1.0e0-r)-r*w)+w1; return rlog1; S40: w = *x+0.5e0+0.5e0; rlog1 = *x-log(w); return rlog1; } double spmpar(int *i) /* ----------------------------------------------------------------------- SPMPAR PROVIDES THE SINGLE PRECISION MACHINE CONSTANTS FOR THE COMPUTER BEING USED. IT IS ASSUMED THAT THE ARGUMENT I IS AN INTEGER HAVING ONE OF THE VALUES 1, 2, OR 3. 
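     EDITORIAL NOTE (NOT PART OF THE ORIGINAL NSWC TEXT): WITH THE DOUBLE
     PRECISION MODIFICATION DESCRIBED BELOW, THESE CONSTANTS CORRESPOND
     (APPROXIMATELY) TO THE STANDARD <cfloat> VALUES, I.E.

        SPMPAR(1) ~ DBL_EPSILON   (MACHINE PRECISION)
        SPMPAR(2) ~ DBL_MIN       (SMALLEST POSITIVE MAGNITUDE)
        SPMPAR(3) ~ DBL_MAX       (LARGEST MAGNITUDE)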
IF THE SINGLE PRECISION ARITHMETIC BEING USED HAS M BASE B DIGITS AND ITS SMALLEST AND LARGEST EXPONENTS ARE EMIN AND EMAX, THEN SPMPAR(1) = B**(1 - M), THE MACHINE PRECISION, SPMPAR(2) = B**(EMIN - 1), THE SMALLEST MAGNITUDE, SPMPAR(3) = B**EMAX*(1 - B**(-M)), THE LARGEST MAGNITUDE. ----------------------------------------------------------------------- WRITTEN BY ALFRED H. MORRIS, JR. NAVAL SURFACE WARFARE CENTER DAHLGREN, VIRGINIA ----------------------------------------------------------------------- ----------------------------------------------------------------------- MODIFIED BY BARRY W. BROWN TO RETURN DOUBLE PRECISION MACHINE CONSTANTS FOR THE COMPUTER BEING USED. THIS MODIFICATION WAS MADE AS PART OF CONVERTING BRATIO TO DOUBLE PRECISION ----------------------------------------------------------------------- */ { static int K1 = 4; static int K2 = 8; static int K3 = 9; static int K4 = 10; static double spmpar,b,binv,bm1,one,w,z; static int emax,emin,ibeta,m; /* .. .. Executable Statements .. */ if(*i > 1) goto S10; b = ipmpar(&K1); m = ipmpar(&K2); spmpar = pow(b,(double)(1-m)); return spmpar; S10: if(*i > 2) goto S20; b = ipmpar(&K1); emin = ipmpar(&K3); one = 1.0; binv = one/b; w = pow(b,(double)(emin+2)); spmpar = w*binv*binv*binv; return spmpar; S20: ibeta = ipmpar(&K1); m = ipmpar(&K2); emax = ipmpar(&K4); b = ibeta; bm1 = ibeta-1; one = 1.0; z = pow(b,(double)(m-1)); w = ((z-one)*b+bm1)/(b*z); z = pow(b,(double)(emax-2)); spmpar = w*z*b*b; return spmpar; } double stvaln(double *p) /* ********************************************************************** double stvaln(double *p) STarting VALue for Newton-Raphson calculation of Normal distribution Inverse Function Returns X such that CUMNOR(X) = P, i.e., the integral from - infinity to X of (1/SQRT(2*PI)) EXP(-U*U/2) dU is P Arguments P --> The probability whose normal deviate is sought. P is DOUBLE PRECISION Method The rational function on page 95 of Kennedy and Gentle, Statistical Computing, Marcel Dekker, NY, 1980. ********************************************************************** */ { static double xden[5] = { 0.993484626060e-1,0.588581570495e0,0.531103462366e0,0.103537752850e0, 0.38560700634e-2 }; static double xnum[5] = { -0.322232431088e0,-1.000000000000e0,-0.342242088547e0,-0.204231210245e-1, -0.453642210148e-4 }; static int K1 = 5; static double stvaln,sign,y,z; /* .. .. Executable Statements .. */ if(!(*p <= 0.5e0)) goto S10; sign = -1.0e0; z = *p; goto S20; S10: sign = 1.0e0; z = 1.0e0-*p; S20: y = sqrt(-(2.0e0*log(z))); stvaln = y+devlpl(xnum,&K1,&y)/devlpl(xden,&K1,&y); stvaln = sign*stvaln; return stvaln; } /************************************************************************ FIFDINT: Truncates a double precision number to an integer and returns the value in a double.
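   Editorial note (not part of the original comment): the truncation is
   toward zero, matching Fortran's DINT/IDINT, e.g.

      fifdint( 2.9) ==  2.0
      fifdint(-2.9) == -2.0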
************************************************************************/ double fifdint(double a) /* a - number to be truncated */ { return (double) ((int) a); } /************************************************************************ FIFDMAX1: returns the maximum of two numbers a and b ************************************************************************/ double fifdmax1(double a,double b) /* a - first number */ /* b - second number */ { if (a < b) return b; else return a; } /************************************************************************ FIFDMIN1: returns the minimum of two numbers a and b ************************************************************************/ double fifdmin1(double a,double b) /* a - first number */ /* b - second number */ { if (a < b) return a; else return b; } /************************************************************************ FIFDSIGN: transfers the sign of the variable "sign" to the variable "mag" ************************************************************************/ double fifdsign(double mag,double sign) /* mag - magnitude */ /* sign - sign to be transfered */ { if (mag < 0) mag = -mag; if (sign < 0) mag = -mag; return mag; } /************************************************************************ FIFIDINT: Truncates a double precision number to a long integer ************************************************************************/ long fifidint(double a) /* a - number to be truncated */ { if (a < 1.0) return (long) 0; else return (long) a; } /************************************************************************ FIFMOD: returns the modulo of a and b ************************************************************************/ long fifmod(long a,long b) /* a - numerator */ /* b - denominator */ { return a % b; } /************************************************************************ FTNSTOP: Prints msg to standard error and then exits ************************************************************************/ void ftnstop(char* msg) /* msg - error message */ { if (msg != NULL) fprintf(stderr,"%s\n",msg); exit(0); } plink-1.07-src/blox.cpp~0000644000265600020320000003317611174437035014354 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "options.h" #include "helper.h" #include "plink.h" #include "phase.h" #include "stats.h" extern Plink * PP; /////////////////////////////////////////////////////////////////////// // // // Haplotype block code, adapted from code courtesy of Jeff Barrett, // // and HAPLOVIEW, following some very quick-and-dirty Java->C++... 
// // // /////////////////////////////////////////////////////////////////////// class LDPair { public: int s1; int s2; int dist; LDPair(int s1_, int s2_, int dist_) { s1 = s1_; s2 = s2_; dist = dist_; } bool operator< (const LDPair & b) const { return (dist < b.dist); } }; class DPrime { public: double dp; double dpl; double dpu; double lod; }; class PairwiseLinkage { public: PairwiseLinkage(int a_, int b_) { a=a_; b=b_; knownAA = knownAB = knownBA = knownBB = unknownDH = 0; } int a; int b; double dp, rsq; double dp_upper, dp_lower; double lod; void calculateCI(); void calculateLD(); int knownAA, knownAB, knownBA, knownBB, unknownDH; }; map > Plink::mkBlks(int null1, int null2 ) { // First SNP, vector of SNPs (inc. first) map< int, vector > blocks; // Some constants const double cutHighCI = 0.98; const double cutLowCI = 0.70; const double cutLowCIVar [5] = {0,0,0.80,0.50,0.50}; const double maxDist [5] = {0,0,20000,30000,1000000}; const double recHighCI = 0.90; const double informFrac = 0.95; const double fourGameteCutoff = 0.01; const double mafThresh = 0.05; // Set to skip SNPs with low MAFs // Uses genome-wide reference number: need to allocate for all SNPs here vector skipMarker(nl_all,false); for (int x = 0; x < nl_all; x++) skipMarker[x] = locus[x]->freq < mafThresh; // Consider each chromosome one at a time; skip X for now int startChromosome = locus[ 0 ]->chr; int finalChromosome = locus[ nl_all - 1]->chr; for (int chr = startChromosome ; chr <= finalChromosome; chr++) { if ( scaffold.find(chr) == scaffold.end() ) continue; int fromPosition = scaffold[chr].lstart; int toPosition = scaffold[chr].lstop; // Sanity check positions given (select to be on same chr) // (note can remove this now) // if ( locus[toPosition]->chr != locus[fromPosition]->chr ) // { // while (1) // { // if ( locus[--toPosition]->chr == locus[fromPosition]->chr ) // break; // } // } int nsnps = toPosition - fromPosition + 1; ///////////////////////////////////////////////////////////////////////// // Make a list of marker pairs in "strong LD", sorted by distance apart multiset strongPairs; map dpStore; int numStrong = 0; int numRec = 0; int numInGroup = 0; // Each pair of markers for (int x = fromPosition; x < toPosition; x++) { if ( ! par::silent ) cerr << "Chromosome " << locus[x]->chr << ", position " << locus[x]->bp/(double)(1000000) << "Mb \r"; for (int y = x+1; y <= toPosition; y++) { if ( locus[x]->chr != locus[y]->chr ) continue; if ( locus[y]->bp - locus[x]->bp > par::disp_r_window_kb ) continue; if ( locus[x]->freq == 0 || locus[y]->freq == 0 ) continue; PairwiseLinkage thisPair(x,y); thisPair.calculateLD(); thisPair.calculateCI(); double lod = thisPair.lod; double lowCI = thisPair.dp_lower; double highCI = thisPair.dp_upper; int2 t(x,y); DPrime d; d.dp = thisPair.dp; d.dpl = lowCI; d.dpu = highCI; d.lod = lod; dpStore.insert( make_pair( t,d ) ); // Is this pair in strong LD? 
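	  // A pair is kept as "strong LD" only if its LOD does not flag
	  // missing data (lod >= -90) and its D' confidence interval
	  // passes the Haploview-style test set up above: upper bound
	  // >= cutHighCI and lower bound >= cutLowCI.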
if (lod < -90) continue; //missing data if (highCI < cutHighCI || lowCI < cutLowCI) continue; //must pass "strong LD" test // Store this pair LDPair p(x,y,abs( locus[x]->bp - locus[y]->bp ) ); strongPairs.insert( p ); } } // Now we have a list of SNPs in strong LD within this region // Now construct blocks based on this set used; // #blocks: vector > blockArray; multiset::iterator i = strongPairs.end(); --i; while ( i != strongPairs.begin() ) { int numStrong = 0; int numRec = 0; int numInGroup = 0; vector thisBlock; int first = i->s1; int last = i->s2; long sep = i->dist; // See if this block overlaps with another: if ( used.find(first) != used.end() || used.find(last) != used.end() ) { --i; continue; } // Next, count the number of markers in the block. // (nb. assume all SNPs belong) for (int x = first; x <=last ; x++) { if( !skipMarker[x] ) numInGroup++; } // Skip it if it is too long in bases for it's size in markers if (numInGroup < 4 && sep > maxDist[numInGroup]) { --i; continue; } // Add first SNP thisBlock.push_back( first ); // Test block: requires 95% of informative markers to be "strong" for (int y = first+1; y <= last; y++) { if (skipMarker[y]) continue; thisBlock.push_back(y); //loop over columns in row y for (int x = first; x < y; x++) { if (skipMarker[x]) continue; double lod; double lowCI; double highCI; map::iterator l = dpStore.find( int2(x,y) ); if ( l == dpStore.end() ) { // Recalculate PairwiseLinkage thisPair(x,y); thisPair.calculateLD(); thisPair.calculateCI(); lod = thisPair.lod; lowCI = thisPair.dp_lower; highCI = thisPair.dp_upper; } else { // Get the right bits lod = l->second.lod; lowCI = l->second.dpl; highCI = l->second.dpu; } // Monomorphic marker error if ( lod < -90) continue; // Skip bad markers if ( lod == 0 && lowCI == 0 && highCI == 0) continue; // For small blocks use different CI cutoffs if (numInGroup < 5) { if (lowCI > cutLowCIVar[numInGroup] && highCI >= cutHighCI) numStrong++; } else { if (lowCI > cutLowCI && highCI >= cutHighCI) numStrong++; //strong LD } if (highCI < recHighCI) numRec++; //recombination } } // Change the definition somewhat for small blocks if (numInGroup > 3) { if (numStrong + numRec < 6) { --i; continue; } } else if (numInGroup > 2) { if (numStrong + numRec < 3) { --i; continue; } } else { if (numStrong + numRec < 1) { --i; continue; } } // If this qualifies as a block, add to the block list, but in // order by first marker number: if ( (double)numStrong/(double)(numStrong + numRec) > informFrac) { blocks.insert( make_pair( first , thisBlock )); // Track that these SNPs belong to a block for (int u = first; u <= last; u++) used.insert(u); } --i; } // Next chromosome } if ( ! 
par::silent ) cerr << "\n"; map >::iterator j = blocks.begin(); printLOG(int2str( blocks.size() ) + " blocks called, writing list to [ " + par::output_file_name + ".blocks ]\n"); ofstream O1( (par::output_file_name+".blocks").c_str() , ios::out ); printLOG("Writing extra block details to [ " + par::output_file_name + ".blocks.det ]\n"); ofstream O2( (par::output_file_name+".blocks.det").c_str() , ios::out ); O2 << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(12) << "KB" << " " << setw(6) << "NSNPS" << " " << setw(4) << "SNPS" << "\n"; while ( j != blocks.end() ) { O1 << "*"; vector & b = j->second; for (int k=0; klocus[b[k]]->name; O1 << "\n"; O2 << setw(4) << PP->locus[b[0]]->chr << " " << setw(12) << PP->locus[b[0]]->bp << " " << setw(12) << PP->locus[b[b.size()-1]]->bp << " " << setw(12) << (double)(PP->locus[b[b.size()-1]]->bp - PP->locus[b[0]]->bp + 1)/1000.0 << " " << setw(6) << b.size() << " "; for (int k=0; k0 ) O2 << "|" << PP->locus[b[k]]->name; else O2 << PP->locus[b[k]]->name; } O2 << "\n"; ++j; } O1.close(); O2.close(); // List of blocks created here // (dummy; not used) map > blocks0; return blocks0; } void PairwiseLinkage::calculateCI() { // Get counts of observed, unambiguous haplotypes vector > t = two_locus_table(a,b); // Assume autosome knownAA = 2 * t[0][0] + t[0][1] + t[1][0]; knownAB = 2 * t[0][2] + t[0][1] + t[1][2]; knownBA = 2 * t[2][0] + t[1][0] + t[2][1]; knownBB = 2 * t[2][2] + t[2][1] + t[1][2]; unknownDH = t[1][1]; int total_chroms = knownAA + knownAB + knownBA + knownBB + 2*unknownDH; // From Haploview code: // Likelihood surface vector_t lsurface(101); // // Assumed // // denom = of D' // // 4 haplotype frequencies pA1, pA2, pB1, pB2 const double LN10 = log(10.0); string sA1 = PP->locus[a]->allele1 + PP->locus[b]->allele1; string sA2 = PP->locus[a]->allele1 + PP->locus[b]->allele2; string sB1 = PP->locus[a]->allele2 + PP->locus[b]->allele1; string sB2 = PP->locus[a]->allele2 + PP->locus[b]->allele2; double pA1,pA2,pB1,pB2; for ( int i = 0 ; i < 4 ; i++ ) { if ( PP->haplo->haplotypeName(i) == sA1 ) pA1 = PP->haplo->f[i]; else if ( PP->haplo->haplotypeName(i) == sA2 ) pA2 = PP->haplo->f[i]; else if ( PP->haplo->haplotypeName(i) == sB1 ) pB1 = PP->haplo->f[i]; else if ( PP->haplo->haplotypeName(i) == sB2 ) pB2 = PP->haplo->f[i]; } double pA = pA1 + pA2; double pB = 1 - pA; double p1 = pA1 + pB1; double p2 = 1 - p1; // Estimated haplotype counts double D = pA1 - (pA*p1); if (D < 0) { double tmp; /* flip matrix so we get the positive D' */ /* flip AA with AB and BA with BB */ tmp=pA1; pA1=pA2; pA2=tmp; tmp=pB2; pB2=pB1; pB1=tmp; /* flip frequency of second allele */ tmp=p1; p1=p2; p2=tmp; /* flip known array for likelihood computation */ int tmpi; tmpi=knownAA; knownAA=knownAB; knownAB=tmpi; tmpi=knownBB; knownBB=knownBA; knownBA=tmpi; } double dmax1 = pA * p2 ; double dmax2 = pB * p1 ; double denom = dmax1 < dmax2 ? 
dmax1 : dmax2; for (int i=0; i<=100; i++) { double dpr = (double)i*0.01; double tmpAA = dpr*denom + pA*p1; double tmpAB = pA-tmpAA; double tmpBA = p1-tmpAA; double tmpBB = pB-tmpBA; if (i==100) { /* one value will be 0 */ if (tmpAA < 1e-10) tmpAA=1e-10; if (tmpAB < 1e-10) tmpAB=1e-10; if (tmpBA < 1e-10) tmpBA=1e-10; if (tmpBB < 1e-10) tmpBB=1e-10; } lsurface[i] = ( knownAA * log( tmpAA ) + knownAB * log( tmpAB ) + knownBA * log( tmpBA ) + knownBB * log( tmpBB ) + unknownDH * log( tmpAA*tmpBB + tmpAB*tmpBA)) / LN10; } double loglike1 = unknownDH * log( pA1*pB2 + pB1*pA2 ); if ( pA1>0 ) loglike1 += knownAA * log( pA1 ); if ( pA2>0 ) loglike1 += knownAB * log( pA2 ); if ( pB1>0 ) loglike1 += knownBA * log( pB1 ); if ( pB2>0 ) loglike1 += knownBB * log( pB2 ); loglike1 /= LN10; double loglike0= (knownAA * log(pA*p1) + knownAB * log(pA*p2) + knownBA * log(pB*p1) + knownBB * log(pB*p2) + unknownDH * log(2*pA*pB*p1*p2))/LN10; lod = loglike1-loglike0; if ( lod < 0 ) lod = 0; double total_prob=0; double sum_prob=0; int high_i = 0; int low_i = 0; for (int i=0; i<=100; i++) { lsurface[i] -= loglike1; lsurface[i] = pow(10.0,lsurface[i]); total_prob += lsurface[i]; } for (int i=0; i<=100; i++) { sum_prob += lsurface[i]; if (sum_prob > 0.05*total_prob && sum_prob-lsurface[i] < 0.05*total_prob) { low_i = i-1; break; } } sum_prob=0.0; for (int i=100; i>=0; i--) { sum_prob += lsurface[i]; if (sum_prob > 0.05*total_prob && sum_prob-lsurface[i] < 0.05*total_prob) { high_i = i+1; break; } } if (high_i > 100){ high_i = 100; } dp_lower = (double)low_i/100.0; dp_upper = (double)high_i/100.0; if ( par::verbose ) { cout << PP->locus[ a ]->name << " " << PP->locus[ b ]->name << " : "; cout << "Rsq= " << PP->haplo->rsq(a,b) << " : "; cout << "D' = " << dp << " CI = " << dp_lower << " to " << dp_upper << "; lod = " << lod << " " << loglike1 << " vs. " << loglike0 << "\n"; } } void PairwiseLinkage::calculateLD() { dp = PP->haplo->dprime( a, b ); } plink-1.07-src/dcdflib.h0000644000265600020320000000167011264127626014242 0ustar tilleaadmin#ifndef __DCDFLIB_H__ #define __DCDFLIB_H__ extern void cdfbet(int*,double*,double*,double*,double*,double*,double*, int*,double*); extern void cdfbin(int*,double*,double*,double*,double*,double*,double*, int*,double*); extern void cdfchi(int*,double*,double*,double*,double*,int*,double*); extern void cdfchn(int*,double*,double*,double*,double*,double*,int*,double*); extern void cdff(int*,double*,double*,double*,double*,double*,int*,double*); extern void cdffnc(int*,double*,double*,double*,double*,double*,double*, int*s,double*); extern void cdfgam(int*,double*,double*,double*,double*,double*,int*,double*); extern void cdfnbn(int*,double*,double*,double*,double*,double*,double*, int*,double*); extern void cdfnor(int*,double*,double*,double*,double*,double*,int*,double*); extern void cdfpoi(int*,double*,double*,double*,double*,int*,double*); extern void cdft(int*,double*,double*,double*,double*,int*,double*); #endif plink-1.07-src/hapglm.cpp0000644000265600020320000001476011264127626014462 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "whap.h" #include "helper.h" #include "plink.h" #include "options.h" #include "perm.h" #include "nlist.h" #include "phase.h" #include "model.h" #include "linear.h" #include "logistic.h" #include "stats.h" ////////////////////////////////////////////////////////////// // Implements --hap-logistic and --hap-linear functions // Use framework provided by --chap/whap.cpp // can perform either omnibus or haplotype specific tests vector_t Plink::glmHaplotypeTest(bool print, Perm & perm) { /////////////////////////////////////////////// // // // Some basic setup first // // // /////////////////////////////////////////////// // Use basic GLM function to fit linear and logistic // models: although, let it know that there will not // be a 'main' SNP par::assoc_glm_without_main_snp = true; // Return a single result vector_t results; // Haplotypes at this position have already been phased // Record the number of common haplotypes int nch = 0; set commonHaplotypes; for (int h=0; h < haplo->nh; h++) if ( haplo->f[h] >= par::min_hf ) { ++nch; commonHaplotypes.insert(h); } // if ( ! par::test_hap_GLM_omnibus ) // haplo->HTEST << setw( haplo->ns + 1 ) << haplo->haplotypeName(0) << " "; if ( nch < 2 ) { haplo->HTEST << setw(4) << haplo->ns << " " << setw(4) << nch << " " << setw(4) << locus[haplo->S[0]]->chr << " " << setw(12) << locus[haplo->S[0]]->bp << " " << setw(12) << locus[haplo->S[haplo->ns-1]]->bp << " " << setw(par::pp_maxsnp) << locus[haplo->S[0]]->name << " " << setw(par::pp_maxsnp) << locus[haplo->S[haplo->ns-1]]->name << " "; if ( ! par::test_hap_GLM_omnibus ) { if ( nch==1 ) haplo->HTEST << setw(12) << haplo->haplotypeName(0) << " "; else haplo->HTEST << setw(12) << "NA" << " "; haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << "\n"; } else haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << "\n"; results.push_back( 0 ); return results; } // Single SNP association if ( par::test_hap_GLM_omnibus ) { haplo->HTEST << setw(4) << haplo->ns << " " << setw(4) << nch << " " << setw(4) << locus[haplo->S[0]]->chr << " " << setw(12) << locus[haplo->S[0]]->bp << " " << setw(12) << locus[haplo->S[haplo->ns-1]]->bp << " " << setw(par::pp_maxsnp) << locus[haplo->S[0]]->name << " " << setw(par::pp_maxsnp) << locus[haplo->S[haplo->ns-1]]->name << " "; // H-1 omnibus (H0 is ref.) haplo->sets.clear(); set::iterator i = commonHaplotypes.begin(); // Skip first haplotype (this is reference) // All rare haplotypes will therefore be // lumped in with the reference ++i; while ( i != commonHaplotypes.end() ) { haplo->sets.insert(*i); ++i; } // Fit model glmAssoc(false,*pperm); // Report results haplo->result = model->isValid() ? model->getStatistic() : 0; haplo->pvalue = par::bt ? chiprobP(haplo->result,1) : ((LinearModel*)model)->getPValue(); // Calculate omnibus tests of H-1 terms // Assumes the terms are: e.g. for 4 haplotypes // 0 intercept // 1 haplotype 2 of H // 2 haplotype 3 of H // 3 haplotype 4 of H int df = nch-1; vector_t h; h.resize(df,0); matrix_t H; sizeMatrix(H,df,model->getNP()); for (int j=0; jisValid() ? 
model->linearHypothesis(H,h) : 0; double pvalue = chiprobP(chisq,df); if ( model->isValid() ) { haplo->HTEST << setw(8) << chisq << " " << setw(8) << pvalue << "\n"; } else { haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << "\n"; } // Clean up delete model; // Return 1-p, as will be different DF for different windows results.push_back( 1 - pvalue ); return results; } // Otherwise, we are performing H haplotype specific tests set::iterator i = commonHaplotypes.begin(); while ( i != commonHaplotypes.end() ) { haplo->sets.clear(); haplo->sets.insert(*i); // Fit model glmAssoc(false,*pperm); // Report results vector_t coef = model->getCoefs(); // Note: the different direction of OR haplo->odds = par::bt ? exp(coef[1]) : coef[1]; haplo->result = model->isValid() ? model->getStatistic() : 0; haplo->pvalue = par::bt ? chiprobP(haplo->result,1) : ((LinearModel*)model)->getPValue(); // Calculate omnibus tests of H-1 terms haplo->HTEST << setw(4) << haplo->ns << " " << setw(4) << nch << " " << setw(4) << locus[haplo->S[0]]->chr << " " << setw(12) << locus[haplo->S[0]]->bp << " " << setw(12) << locus[haplo->S[haplo->ns-1]]->bp << " " << setw(par::pp_maxsnp) << locus[haplo->S[0]]->name << " " << setw(par::pp_maxsnp) << locus[haplo->S[haplo->ns-1]]->name << " "; haplo->HTEST << setw(12) << haplo->haplotypeName(*i) << " " << setw(8) << haplo->f[*i] << " "; if ( model->isValid() ) { haplo->HTEST << setw(8) << haplo->odds << " " << setw(8) << haplo->result << " " << setw(8) << haplo->pvalue << "\n"; } else { haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << "\n"; } // Clean up delete model; // Return chi-sq (always 1df) results.push_back( haplo->result ); // Next common haplotype ++i; } return results; } plink-1.07-src/lapackf.h0000644000265600020320000000143511264127626014253 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __LAPACK_FUNC_H__ #define __LAPACK_FUNC_H__ bool svd_lapack(int,vector_t & A, vector_t & S, matrix_t & V); bool eigen_lapack(int,vector_t & A, vector_t & S, matrix_t & V); #endif plink-1.07-src/haplohelper.cpp0000644000265600020320000010566411264127625015520 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "phase.h" #include "helper.h" #include "genogroup.h" #include "haplowindow.h" extern ofstream LOG; using namespace std; string HaploPhase::haplotypeName(int h) { string str; for (int s=0; sallele1; string a2 = P.locus[S[s]]->allele2; if ( a1 == "" ) a1 = "X"; if ( a2 == "" ) a2 = "X"; if (h == -1) str += "-"; // haploid gap else if (hap[h][s]) str += a1; else str += a2; } return str; } void HaploPhase::imputeAllHaplotypes() { /////////////////////////////////////////////// // Impute missing SNPs -- create a new datafile // Imputation rules: // Missing predictor allele -> missing haplotype // P(H|G) < 0.8 (default) -> missing haplotype // Make space to new, imputed haplotype calls new_one.resize(P.n); new_two.resize(P.n); ///////////////////////////////// // Phase all specified haplotypes phaseAllHaplotypes(true,*P.pperm); /////////////////////////// // Write new PED file P.printLOG("Imputing genotypes with P(H|G) threshold of " + dbl2str( par::hap_post_prob ) + "\n\n"); string filename = par::output_file_name + ".impute.ped"; P.printLOG("Writing imputed ped file to [ " + filename + " ] \n"); ofstream PED(filename.c_str(), ios::out); PED.clear(); for (int i=0; ifid<< " "<< person->iid<< " "<< person->pat<< " " << person->mat<< " "<< person->sexcode<< " "; if (par::bt) PED << (int)person->phenotype; else PED << person->phenotype; for (int l=0; lallele1<< " " << actual_map[l]->allele1; else if ( (!new_one[i][l]) && new_two[i][l]) PED << par::recode_delimit<< actual_map[l]->allele1<< " " << actual_map[l]->allele2; else if (new_one[i][l] && new_two[i][l]) PED << par::recode_delimit<< actual_map[l]->allele2<< " " << actual_map[l]->allele2; else PED << par::recode_delimit<< par::missing_genotype << " " << par::missing_genotype; } PED << "\n"; } PED.close(); ////////////////// // Write new map filename = par::output_file_name + ".impute.map"; P.printLOG("Writing imputed map file to [ " + filename + " ] \n"); ofstream MAP(filename.c_str(), ios::out); MAP.clear(); for (int l=0; l < actual_map.size(); l++) { MAP << chromosomeName(actual_map[l]->chr) << "\t" << actual_map[l]->name<< "\t" << actual_map[l]->pos<< "\t" << actual_map[l]->bp<< "\n"; } MAP.close(); } void HaploPhase::calculateHaplotypeFrequencies() { string f = par::output_file_name + ".frq.hap"; if (par::display_hap_freqs) { P.printLOG("Writing haplotype frequencies to [ " + f + " ]\n"); HFRQ.open(f.c_str(), ios::out); HFRQ.precision(4); HFRQ << setw(10) << "LOCUS"<< " "<< setw(12) << "HAPLOTYPE"<< " " << setw(10) << "F"<< "\n"; } // Phase all SNPs (with frequency flag set, this routine // will write haplotype frequencies to HFRQ P.haplo->phaseAllHaplotypes(true,*P.pperm); if (par::display_hap_freqs) HFRQ.close(); // So we do not re-write them par::display_hap_freqs = false; } void HaploPhase::imputeThisHaplotype(int l) { //////////////////////////////////////////////////// // Impute for all individiuals, if common haplotype double w = 0; double c = 0; if (testHaplotypeFreq() >= par::min_hf && testHaplotypeFreq() <= par::max_hf ) { for (int i=0; i=0) { w += t; c++; }; // And set new imputed genotypes new_one[i].push_back(b1); new_two[i].push_back(b2); } ///////////////////////////////////////////// // Add to map of actually imputed haplotypes Locus * loc = new Locus; loc->chr = new_map[l]->chr; 
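  // The imputed marker keeps the window's position and allele codes,
  // but is renamed as the window name plus the imputed haplotype's
  // allele string (hname + "_" + haplotypeName(test_hap) + "_")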
loc->name = hname+"_"+haplotypeName(test_hap)+"_"; loc->bp = new_map[l]->bp; loc->pos = new_map[l]->pos; loc->allele1 = new_map[l]->allele1; loc->allele2 = new_map[l]->allele2; actual_map.push_back( loc ); } } ///////////////////////////////////////////// // Legacy function: now redundant void HaploPhase::enumerateAllPhases() { // Note: this function is no longer called // For individuals w/out parents: make a list of all possible // phases. Note: currently, we do not use this (i.e. we always // require father and mother to be 'observed' (i.e. genotyped and // adequately phased). // Also note: issue with representing heterozygote haplotypes twice // in list: previously we did not, but we now change this (seeing as // it is never used in any case...) // Also: now we build separate lists for diploid and haploid // chromosomes, not that we use either. // Diploid possible phases if ( !haploid ) { for (int h1=0; h1= par::hap_min_phase_prob ) { ph_freq.push_back(freq ); ph_hap1.push_back(h1 ); ph_hap2.push_back(h2 ); } } } // Haploid possible phases if (haploid || X ) { for (int h1=0; h1= par::hap_min_phase_prob ) { haploid_ph_freq.push_back(freq ); haploid_ph_hap1.push_back(h1 ); } } } // Original total number of phases np = ph_hap1.size(); haploid_np = haploid_ph_hap1.size(); } // Legacy function: now redundant ///////////////////////////////////////////// ////////////////////////////////////////////////// // Return a list of all possible haplotype names vector HaploPhase::returnHaplotypes(vector & slist) { vector str; enumerateHaplotypes(slist ); for (int h=0; hallele1; else if (P.locus[S[s]]->allele2=="") hstr += "0"; else hstr += P.locus[S[s]]->allele2; str.push_back(hstr); } return str; } ////////////////////////////////////////////////// // For multi-marker imputation and certain tasks, // we require a 'test' haplotype void HaploPhase::setTestHaplotype(string t) { // No specified test haplotype? if ( t == "" ) { test_hap = -1; return; } // Create match template vector tmp(ns, false); for (int s=0; sallele1 == t.substr(s, 1) ) tmp[s] = true; // Consider each haplotype test_hap = -1; for (int h=0; h= par::min_hf) { HFRQ << setw(10) << hname << " "<< setw(12) << haplotypeName(h) << " "<< setw(10) << f[h]<< "\n"; } } } void HaploPhase::reportPhase() { string fn = par::output_file_name+".phase-"+hname; ofstream PHASE(fn.c_str(), ios::out); P.printLOG("Writing phased haplotypes for "+ hname + " to [ "+ fn + " ]\n"); PHASE << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID"<< " " << setw(4) << "PH"<< " " << setw(10) << "HAP1"<< " " << setw(10) << "HAP2"<< " " << setw(12) << "POSTPROB"<< " " // << setw(12) << "WEIGHT" << " " << setw(6) << "BEST"<< " "<< "\n"; PHASE.precision(4); for (int i = 0; i < P.n; i++) { if (include[i]) { for (int z = 0; z < hap1[i].size(); z++) { PHASE << setw(par::pp_maxfid) << P.sample[i]->fid<< " " << setw(par::pp_maxiid) << P.sample[i]->iid<< " " << setw(4) << z << " " << setw(10) << haplotypeName(hap1[i][z]) << " "; if (haploid || (X && P.sample[i]->sex)) PHASE << setw(10) << haplotypeName( -1 ) << " "; else PHASE << setw(10) << haplotypeName(hap2[i][z]) << " "; if (ambig[i]) { PHASE << setw(12) << pp[i][z]<< " "; int max_z = 0; for (int z2=0; z2 pp[i][max_z] ? 
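// (select the phase with the largest posterior probability pp[i][z];
//  that phase is the one flagged 1 in the BEST column)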
z2 : max_z ; if (max_z == z) PHASE << setw(6) << 1<< " "<< " "; else PHASE << setw(6) << 0<< " "<< " "; } else PHASE << setw(12) << 1<< " "<< setw(6) << 1<< " "<< " "; // Genotypes //for (int s=0; sfid << " " << setw(par::pp_maxiid) << P.sample[i]->iid << " " << setw(4) << "NA" << " " << setw(10) << "NA" << " " << setw(10) << "NA" << " " << setw(12) << "NA" << " " << setw(6) << "NA"<< " "; // genotypes // for (int s=0; s= par::min_hf) PHASE << setw(8) << "H_"+haplotypeName(h) << " "; PHASE << "\n"; PHASE.precision(4); for (int i = 0; i < P.n; i++) { if (include[i]) { PHASE << setw(par::pp_maxfid) << P.sample[i]->fid<< " " << setw(par::pp_maxiid) << P.sample[i]->iid<< " "; vector_t hcnt(nh, 0); for (int z = 0; z < hap1[i].size(); z++) { if (ambig[i]) { hcnt[hap1[i][z]] += pp[i][z]; if ( ! (haploid || (X && P.sample[i]->sex))) hcnt[hap2[i][z]] += pp[i][z]; } else { hcnt[hap1[i][z]] ++; if ( ! (haploid || (X && P.sample[i]->sex))) hcnt[hap2[i][z]] ++; } } for (int h=0; h= par::min_hf) PHASE << setw(8) << hcnt[h]<< " "; PHASE << "\n"; } // Report also on excluded individuals // (Should be 0-size phase-set) else { PHASE << setw(par::pp_maxfid) << P.sample[i]->fid<< " " << setw(par::pp_maxiid) << P.sample[i]->iid<< " "; for (int h=0; h= par::min_hf) PHASE << setw(8) << "NA"<< " "; PHASE << "\n"; } } PHASE.close(); } map HaploPhase::makeSubHaplotypeSet(boolvec_t & mask) { map t; map shap; int cnt=0; for (int h=0; h < nh; h++) { boolvec_t sh; for (int s = 0; s < ns ; s++) { if (mask[s]) sh.push_back(hap[h][s]); } map::iterator si = shap.find(sh); if ( si == shap.end() ) { shap.insert(make_pair(sh,cnt)); t.insert(make_pair(h,cnt)); ++cnt; } else t.insert(make_pair(h, si->second)); } return t; } map HaploPhase::makeTestSet(boolvec_t & mask, boolvec_t & allele) { map tests; for (int h2=0; h2 < nh; h2++) { bool is_A = true; for (int s = 0; s < ns ; s++) { if (mask[s] && hap[h2][s] != allele[s]) is_A = false; } if (is_A ) tests.insert(make_pair(h2, 0)); else tests.insert(make_pair(h2, 1)); } return tests; } string HaploPhase::getSubHaplotypeName(boolvec_t & mask, boolvec_t & allele, int blank) { string str = ""; for (int s=0; s < ns; s++) { if (s == blank ) str += " "; else if (mask[s]) { if (allele[s]) str += P.locus[ S[s] ]->allele1; else str += P.locus[ S[s] ]->allele2; } else str += "."; } return str; } vector_t HaploPhase::imputeGenotype(int i, int l) { // Probability of AA, AB and BB for position 'l' // (of ns SNPs) for individual 'i' vector_t g(3); if (X || haploid ) { g[0] = g[1] = g[2] = 0; return g; error("HaploPhase::imputeGenotypess() not yet set up for X \n"); } // Not able to be imputed? if (!include[i]) { g[0] = g[1] = g[2] = 0; return g; } // Unambiguous imputation? if (!ambig[i]) { int h1 = hap1[i][0]; int h2 = hap2[i][0]; bool s1 = hap[h1][l]; bool s2 = hap[h2][l]; if (s1 != s2 ) g[1] = 1; else if (s1 ) g[0] = 1; else g[2] = 1; return g; } // Weighted, ambiguous imputation? for (int z=0; z= par::hap_post_prob) int h1 = hap1[i][z]; int h2 = hap2[i][z]; bool s1 = hap[h1][l]; bool s2 = hap[h2][l]; if (s1 != s2 ) g[1] += pp[i][z]; else if (s1 ) g[0] += pp[i][z]; else g[2] += pp[i][z]; } // next possible phase return g; } double HaploPhase::imputeHaplotypes(int i, bool & n1, bool & n2) { // if ( X || haploid ) // error("HaploPhase::imputeHaplotypes() not yet set up for X \n"); bool actualX = X && P.sample[i]->sex; ////////////////////////////////////////////// // Based on P(H|G) impute inferred haplotypes // for above-threshold individuals // Not imputed double w = -1; if ( ! 
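// (the value w returned by imputeHaplotypes() is the number of copies
//  of test_hap implied by the imputed phase, or -1 if no imputation was
//  made; n1/n2 are the new genotype bits, with 1/0 denoting a missing call)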
include[i] ) { n1 = true; n2 = false; return w; } // First for individuals of unambiguous phase if (!ambig[i]) { if (hap1[i][0] == test_hap) n1 = false; else n1 = true; if ( actualX || haploid ) { n2 = n1; } else { if (hap2[i][0] == test_hap) n2 = false; else n2 = true; } // Resolve potential het/missing coding confusion if (n1 && (!n2)) { n1 = false; n2 = true; } // Unambiguous weighting (0, 1 or 2 copie of test_hap) if (!n1) { if ( actualX || haploid ) w=1; else { if (!n2) w = 2; else w = 1; } } else w = 0; } else { // Second, for ambiguous individuals impute and assign weight int max_z = 0; for (int z=0; z pp[i][max_z] ? z : max_z ; // Set missing by default n1 = true; n2 = false; // Consider each phase z // Above threshold? if (pp[i][max_z] >= par::hap_post_prob) { // Do we match 'test_hap' ( '1' allele ) // or not? ( '2' allele ) if (hap1[i][max_z] == test_hap) n1 = false; else n1 = true; if ( actualX || haploid ) { n2 = n1; } else { if (hap2[i][max_z] == test_hap) n2 = false; else n2 = true; } // Resolve potential het/missing coding confusion if (n1 && (!n2)) { n1 = false; n2 = true; } // Unambiguous weighting (1 or 2 copies of test_hap) // We are saying either 0, 1 or 2 copies // Consider each haplotype // Number imputed / Actual number for (int z=0; z ns || s2 > ns ) error("Problem in rsq_internal(int,int)"); boolvec_t m1(ns, false); boolvec_t m2(ns, false); m1[s1] = true; m2[s2] = true; return rsq_internal(m1, m1, m2, m2); } double HaploPhase::rsq_internal(boolvec_t & mask1, boolvec_t & alleles1, boolvec_t & mask2, boolvec_t & alleles2) { // Assume f[] has been populated with sensible values // and hap[][] contains alleles if (mask1.size() != ns ||mask2.size() != ns ||alleles1.size() != ns ||alleles2.size() != ns ) { cout << ns << " " << mask1.size() << " " << mask2.size() << " " << alleles1.size() << " " << alleles2.size() << "\n"; error("Internal error in Phase::rsq"); } // ---X-X- mask1 // 0-0 alleles1 // ----X-- mask2 // 1 alleles2 // i.e. find r^2 between 00 haplotype made of SNPs 4 & 6 // from 7 SNP haplotype with allele 1 of SNP 5 // Calculate frequency of first haplotype (fA) double fA = 0; double fB = 0; double fAB = 0, fAb = 0, faB = 0, fab = 0; for (int h = 0; h < nh; h++) { bool is_A = true; bool is_B = true; bool is_AB = true; bool is_Ab = true; bool is_aB = true; for (int s = 0; s < ns ; s++) { if (mask1[s] && hap[h][s] != alleles1[s]) is_A = false; if (mask2[s] && hap[h][s] != alleles2[s]) is_B = false; if ( (mask1[s] && hap[h][s] != alleles1[s]) || (mask2[s] && hap[h][s] != alleles2[s])) is_AB = false; if ( (mask1[s] && hap[h][s] != alleles1[s]) || (mask2[s] && hap[h][s] == alleles2[s])) is_Ab = false; if ( (mask1[s] && hap[h][s] == alleles1[s]) || (mask2[s] && hap[h][s] != alleles2[s])) is_aB = false; } if (is_A ) fA += f[h]; if (is_B ) fB += f[h]; if (is_AB ) fAB += f[h]; else if (is_aB ) faB += f[h]; else if (is_Ab ) fAb += f[h]; else fab += f[h]; // Next haplotype } double fa = 1 - fA; double fb = 1 - fB; // Calculate either r-sq or D' double D = fAB - fA * fB; if ( calculateDp ) { double dmax1 = D > 0 ? fA * fb : fA * fB; double dmax2 = D > 0 ? fa * fB : fa * fb; double dmax = dmax1 < dmax2 ? 
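// (Lewontin's D' scales D by its maximum attainable value given the
//  allele frequencies: Dmax = min(fA*fb, fa*fB) when D > 0 and
//  min(fA*fB, fa*fb) when D < 0, so D' = D / Dmax; the alternative
//  branch below returns the usual r^2 = D^2 / (fA*fa*fB*fb))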
dmax1 : dmax2; if ( dmax == 0 ) return -1; return D / dmax; } else { double denom = fA * fa * fB * fb; if (denom == 0) return -1; return (D*D) / denom; } } double HaploPhase::freq(boolvec_t & mask1, boolvec_t & alleles1) { // Assume f[] has been populated with sensible values // and hap[][] contains alleles if (mask1.size() != ns ||alleles1.size() != ns ) { cout << ns << " " << mask1.size() << " " << alleles1.size() << "\n"; error("Internal error in Phase::freq"); } // ---X-X- mask1 // 0-0 alleles1 double fA = 0; for (int h = 0; h < nh; h++) { bool is_A = true; for (int s = 0; s < ns ; s++) { if (mask1[s] && hap[h][s] != alleles1[s]) is_A = false; } if (is_A ) fA += f[h]; // Next haplotype } return fA; } double HaploPhase::rsq(int l1, int l2) { reset(); new_pred_locus.resize(1); new_map.resize(1); vector twoSNPs(2); twoSNPs[0] = l1; twoSNPs[1] = l2; new_pred_locus[0] = twoSNPs; new_map[0] = P.locus[l1]; bool old_silent = par::silent; par::silent = true; new_pred_allele = listPossibleHaplotypes(P, new_pred_locus[0]); phaseAllHaplotypes(true,*P.pperm); // hname = locus[l]->name; par::silent = old_silent; return rsq_internal(0, 1); } double HaploPhase::dprime(int l1, int l2) { calculateDp = true; double dp = rsq(l1,l2); calculateDp = false; return fabs(dp); } void Plink::calcPairwiseLD() { int l1 = getMarkerNumber(*this, par::ld_SNP1); int l2 = getMarkerNumber(*this, par::ld_SNP2); if (l1 == l2 ) error("Cannot compute LD with self"); if (l1 == -1) error("--ld {marker} {marker}: first marker not found"); if (l2 == -1) error("--ld {marker} {marker}: second marker not found"); printLOG("\nLD information for SNP pair [ "+ par::ld_SNP1 + " " + par::ld_SNP2 + " ]\n\n"); printLOG(" R-sq = " + dbl2str_fixed(haplo->rsq(l1, l2) , 3 ) + " "); printLOG("D' = " + dbl2str_fixed(haplo->dprime(l1, l2) , 3 ) + "\n\n"); printLOG(" Haplotype Frequency Expectation under LE\n"); printLOG(" --------- --------- --------------------\n"); for (int h=0; h < haplo->nh; h++) { printLOG(" " + haplo->haplotypeName(h) + " " ); printLOG(dbl2str_fixed( haplo->f[h] ,3) + " "); double e = 0; if ( haplo->haplotypeName(h) == locus[l1]->allele2 + locus[l2]->allele2 ) e = (1 - locus[l1]->freq)*(1 - locus[l2]->freq); else if ( haplo->haplotypeName(h) == locus[l1]->allele1 + locus[l2]->allele2 ) e = ( locus[l1]->freq)*(1 - locus[l2]->freq); else if ( haplo->haplotypeName(h) == locus[l1]->allele2 + locus[l2]->allele1 ) e = (1 - locus[l1]->freq)*( locus[l2]->freq); else if ( haplo->haplotypeName(h) == locus[l1]->allele1 + locus[l2]->allele1 ) e = ( locus[l1]->freq)*( locus[l2]->freq); printLOG(dbl2str_fixed( e ,3 ) + "\n"); } printLOG("\n"); int ch = 0; for (int h=0; h < haplo->nh; h++) if ( haplo->haplotypeName(h) == locus[l1]->allele2 + locus[l2]->allele2 ) ch = h; // Is D positive or negative? 
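// (D = f(A2B2) - f(A2)f(B2): if the observed frequency of the
//  allele2/allele2 haplotype exceeds its expectation under linkage
//  equilibrium then D > 0 and the in-phase combinations are
//  A1B1 / A2B2; otherwise they are A1B2 / A2B1)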
string s; if ( haplo->f[ch] > (1 - locus[l1]->freq)*(1 - locus[l2]->freq) ) s = locus[l1]->allele1 + locus[l2]->allele1 + "/" + locus[l1]->allele2 + locus[l2]->allele2; else s = locus[l1]->allele1 + locus[l2]->allele2 + "/" + locus[l1]->allele2 + locus[l2]->allele1; printLOG(" In phase alleles are " + s + "\n"); return; } /////////////////////////////////////////////////////////////// // // // For a particular pair of individuals, track the status // // of haplotype sharing across the chromosome/region; this // // is the driver function // // // /////////////////////////////////////////////////////////////// void HaploPhase::trackSharedHaplotypes() { // Find individual(s) to track p1 = -1; p2 = -1; for (int i=0; ifid == par::segment_haplotrack_fid1 && P.sample[i]->iid == par::segment_haplotrack_iid1 ) { p1 = i; } if ( P.sample[i]->fid == par::segment_haplotrack_fid2 && P.sample[i]->iid == par::segment_haplotrack_iid2 ) { p2 = i; } if ( p1 != -1 && p2 != -1 ) break; } if ( p1 == -1 || p2 == -1 ) { error("Problem finding individual(s) indicated in haplo-track option\n"); return; } // Set whether looking at homozygosity of shared segments homozyg = p1 == p2; Individual * person1 = P.sample[p1]; Individual * person2 = P.sample[p2]; if ( homozyg ) P.printLOG("\nReport for individual [ " + person1->fid + " " + person1->iid + " ]\n"); else P.printLOG("\nReport for pair [ " + person1->fid + " " + person1->iid + ", "+ person2->fid + " " + person2->iid + " ]\n"); string f = par::output_file_name + ".shared"; P.printLOG("Tracking shared haplotypes, writing output to [ " + f + " ]\n"); HFRQ.open(f.c_str(), ios::out); HFRQ.precision(4); HFRQ << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << "\n"; trackedIBS.resize(P.nl_all); trackedN.resize(P.nl_all); //////////////////// // Do all the work P.haplo->phaseAllHaplotypes(true,*P.pperm); // Display for (int l=0; lname << "\t" << P.locus[l]->bp << "\t" << trackedIBS[l] << "\t" << trackedN[l] << "\t" << (double)trackedIBS[l]/(double)trackedN[l] << "\n"; } if (par::display_hap_freqs) HFRQ.close(); } /////////////////////////////////////////////////////////////// // // // For a particular pair of individuals, track the status // // of haplotype sharing across the chromosome/region; this // // function does the actual work // // /////////////////////////////////////////////////////////////// void HaploPhase::trackThisSegment() { // Are the chromosomes consistent with a shared segment at this position? // No information? Then exit if ( ! ( include[p1] && include[p2] ) ) return; if ( haploid || (X && P.sample[p1]->sex) || (X && P.sample[p2]->sex) ) error("Cannot use haplo-track options on non-autosomal chromosomes yet"); // Looking within an individual for homozygous segments? if ( homozyg ) { double probHomozyg = 0; for (int z = 0; z < hap1[p1].size(); z++) { // Is this region shared... if ( hap1[p1][z] == hap2[p1][z] ) { // ...and rare? if ( f[ hap1[p1][z] ] < 0.02 ) { if ( ambig[p1] ) probHomozyg += pp[p1][z]; else probHomozyg = 1; } } } } else // ... or looking between individuals for shared segments? 
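// For a pair of individuals, every combination of their possible
// phase reconstructions is considered, weighted by the product of
// the posterior probabilities; a matching rare (frequency < 0.2)
// haplotype on either chromosome contributes 0.5 of that weight to
// the sharing probability, which is accumulated over the SNPs
// spanned by this window (trackedIBS / trackedN).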
{ double probShared = 0; int j=0; for (int z1 = 0; z1 < hap1[p1].size(); z1++) for (int z2 = 0; z2 < hap1[p2].size(); z2++) { // Figure IBS 0, 1 or 2 int a1 = hap1[p1][z1]; int a2 = hap2[p1][z1]; int b1 = hap1[p2][z2]; int b2 = hap2[p2][z2]; double prob = 1; if ( ambig[p1] ) prob *= pp[p1][z1]; if ( ambig[p2] ) prob *= pp[p2][z2]; if ( a1 > a2 ) { int tmp = a1; a1 = a2; a2 = tmp; } if ( b1 > b2 ) { int tmp = b1; b1 = b2; b2 = tmp; } // Count up similar, rare haplotypes int cnt =0 ; if ( a1 == b1 && f[a1] < .2 ) { probShared += 0.5 * prob; cnt++; } if ( a2 == b2 && f[a2] < .2 ) { probShared += 0.5 * prob; cnt++; } // Keep track for ( int s = 0; s < ns ; s++ ) { trackedIBS[S[s]] += probShared; trackedN[S[s]]++; } // next pair of haplotypes } } } /////////////////////////////////////////////////////////////// // // // Return a set of haplotype number codes given the type of // // mask + allele template used in the proxy association // // procedures // // // /////////////////////////////////////////////////////////////// set HaploPhase::returnHaplotypeSet(boolvec_t & mask, boolvec_t & alleles) { set hs; for (int h = 0; h < nh; h++) { bool is_A = true; for (int s = 0; s < ns ; s++) { if (mask[s] && hap[h][s] != alleles[s]) is_A = false; } if (is_A) hs.insert(h); } return hs; } void HaploPhase::calculateEmpiricalVariance(int h) { set hs; hs.insert(h); calculateEmpiricalVariance(hs); } /////////////////////////////////////////////////////////////// // // // Post-phasing, for a group of haplotypes, return the // // empirical variance and ratio of this to asymptotic // // variance for all individuals // // // /////////////////////////////////////////////////////////////// void HaploPhase::calculateEmpiricalVariance(set & hs) { double frequency = 0; set::iterator h = hs.begin(); while ( h != hs.end() ) { frequency += f[*h]; ++h; } // Do we need to consider this haplotype/set of // haplotypes? if( frequency < 0.0000001 ) { ratio = 0; empiricalVariance = 0; return; } // Calculate theoretical variance of frequency given binomial double theoreticalVariance = frequency * ( 1 - frequency ); double weightedVariance = 0; double dosageSSQ = 0; double haplotypeCount = 0; // Calculate empirical variance given imputed haplotype counts: int ncnt = 0; // for allele count int dosageCount = 0; // for dosage (individual) count for( int i = 0; i < P.n; i++ ) { if ( include[i] ) { if ( (!X) || !P.sample[i]->sex ) { if (!ambig[i]) { if( hs.find( hap1[i][0] ) != hs.end() ) haplotypeCount += 1; if( hs.find( hap2[i][0] ) != hs.end() ) haplotypeCount += 1; } else for( int z = 0; z < pp[i].size(); z++ ) { if( hs.find( hap1[i][z] ) != hs.end() ) haplotypeCount += pp[i][z]; if( hs.find( hap2[i][z] ) != hs.end() ) haplotypeCount += pp[i][z]; } ncnt+=2; dosageCount++; } } } double mean = haplotypeCount/(double)ncnt; double dmean = haplotypeCount/(double)dosageCount; // Ratio of variance of weighted versus variance of averages // (i.e. 
dosage -- this measures information loss, as the deflation // is only for the dosage)) // Calculate variance: S(x-mean)^2/(n-1) for( int i = 0; i < P.n; i++ ) { if ( include[i] ) { if ( (!X) || !P.sample[i]->sex ) { double dosage = 0; if (!ambig[i]) { if( hs.find( hap1[i][0] ) != hs.end() ) { weightedVariance += (1-mean) * (1-mean); dosage++; } else weightedVariance += mean*mean; // (0-mean)^2 if( hs.find( hap2[i][0] ) != hs.end() ) { weightedVariance += (1-mean) * (1-mean); dosage++; } else weightedVariance += mean*mean; dosageSSQ += (dosage-dmean)*(dosage-dmean); } else { // Variance based on weights for( int z = 0; z < pp[i].size(); z++ ) { if( hs.find( hap1[i][z] ) != hs.end() ) { weightedVariance += pp[i][z] * (1-mean) * (1-mean); dosage += pp[i][z]; } else weightedVariance += pp[i][z] * mean *mean; if( hs.find( hap2[i][z] ) != hs.end() ) { weightedVariance += pp[i][z] * (1-mean) * (1-mean); dosage += pp[i][z]; } else weightedVariance += pp[i][z] * mean * mean; } dosageSSQ += (dosage-dmean)*(dosage-dmean); } } } } // Use N, not N-1 denominator, as we are comparing to the expected // variance above (i.e. so ratio == 1 in case of complete // information) // Update variables in HaploPhase weightedVariance /= (double)ncnt; empiricalVariance = dosageSSQ / ((double)dosageCount*2); ratio = theoreticalVariance > 0 ? empiricalVariance / theoreticalVariance : 0; } /////////////////////////////////////////////////////////////// // // // Verbose display function for phasing routine // // // /////////////////////////////////////////////////////////////// void HaploPhase::verboseDisplayWindows(int i, bool use_ref ) { if ( ! include[i] ) return; for (int w = startWindow; w <= finishWindow ; w++) { int r = windows[w]->genoGroup[i]->reference; if ( ! use_ref ) r = i; HaploWindow * thisWindow = windows[w]; VPHASE << "WINDOW " << w << ": " << windows[w]->start << " to " << windows[w]->stop << " ( " << windows[w]->ns << " SNPs )\n"; for ( int s = 0 ; s < windows[w]->ns ; s++ ) VPHASE << P.locus[ thisWindow->S[s]]->name << " "; VPHASE << "\n"; // Display real genotypes VPHASE << setw(w) << " "; for (int s=0; s< thisWindow->ns; s++) { bool s1 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->one[i] : P.sample[i]->one[ thisWindow->S[s] ]; bool s2 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->two[i] : P.sample[i]->two[ thisWindow->S[s] ]; if ( s1 ) { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele2 ; else VPHASE << "-"; } else { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele1 ; else VPHASE << P.locus[ thisWindow->S[s] ]->allele1 ; } } VPHASE << " "; for (int s=0; s< thisWindow->ns; s++) { bool s1 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->one[i] : P.sample[i]->one[ thisWindow->S[s] ]; bool s2 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->two[i] : P.sample[i]->two[ thisWindow->S[s] ]; if ( s1 ) { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele2 ; else VPHASE << "-"; } else { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele2 ; else VPHASE << P.locus[ thisWindow->S[s] ]->allele1 ; } } VPHASE << "\n"; VPHASE << setw(w) << " "; for (int s=0; s< thisWindow->ns; s++) { bool s1 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->one[i] : P.sample[i]->one[ thisWindow->S[s] ]; bool s2 = par::SNP_major ? 
P.SNP[ thisWindow->S[s] ]->two[i] : P.sample[i]->two[ thisWindow->S[s] ]; if ( s1 ) { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele2 ; else VPHASE << "-"; } else { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele1 ; else VPHASE << P.locus[ thisWindow->S[s] ]->allele1 ; } } VPHASE << " "; for (int s=0; s< thisWindow->ns; s++) { bool s1 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->one[i] : P.sample[i]->one[ thisWindow->S[s] ]; bool s2 = par::SNP_major ? P.SNP[ thisWindow->S[s] ]->two[i] : P.sample[i]->two[ thisWindow->S[s] ]; if ( s1 ) { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele2 ; else VPHASE << "-"; } else { if ( s2 ) VPHASE << P.locus[ thisWindow->S[s] ]->allele2 ; else VPHASE << P.locus[ thisWindow->S[s] ]->allele1 ; } } VPHASE << "\n"; for (int z = 0; z < windows[w]->hap1[r].size(); z++) { VPHASE << setw(w) << " " << thisWindow->haplotypeName(thisWindow->hap1[r][z]) << "/" << thisWindow->haplotypeName(thisWindow->hap2[r][z]) << " "; VPHASE << "( " << thisWindow->f[ thisWindow->hap1[r][z] ] << " / " << thisWindow->f[ thisWindow->hap2[r][z] ] << " ) "; if ( thisWindow->hap1[r].size() == 1) VPHASE << "[1]\n"; else VPHASE << thisWindow->pp[r][z]<< "\n"; } } } plink-1.07-src/haplowindow.h0000644000265600020320000000473211264127626015210 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __HAPWINDOW_H_ #define __HAPWINDOW_H__ class HaploPhase; class Plink; class MultiLocusGenotype; class HaploWindow { public: int ns; // Number of SNPs in haplotype int nh; // Number of possible haplotypes int np; // Number of phases, diploid // Parent 'region', PLINK HaploPhase * haplo; Plink * P; // Start and stop positions (relative to region) int start, stop; // Haplotype frequencies vector_t f; // Window haplotype codes vector > hap; // Stub codes for each haplotype (for quick lookup) vector leftStub; vector rightStub; // Lookup table for haplotype number given SNPs map,int> hapmap; // List of SNP numbers intvec_t S; // Posterior probabilities, per individual matrix_t pp; // Haplotype phases, per individual table_t hap1; table_t hap2; // Ambiguous for this window? boolvec_t ambig; // Unamiguous haplotype counts vector_t uc; // Store count, reference individual set genotypes; // Store which genoGroup a person belongs to vector genoGroup; // Finished with this window? 
bool converged; bool left_passed; bool right_passed; // Convergence vector zero; double sampleLogLikelihood; int iter; /////////////////////////////// // Functions HaploWindow(HaploPhase *, Plink *); ~HaploWindow(); void expandGenogroups(); void enumerateGenogroups(); void pruneGenogroups(double t=par::haplo_plem_window_prune_phase); void enumerateHaplotypes(intvec_t &); void setStubCodes(); void performEM(); void enumeratePhase(int); void prunePhase(int,double t=par::haplo_plem_window_prune_phase); void reportPhase(); string haplotypeName(int i); // Get overlap frequency from a window vector_t leftStubFrequency(); vector_t rightStubFrequency(); void tallyUnambiguousCounts(); }; #endif plink-1.07-src/helper.cpp0000644000265600020320000023444411264127624014472 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include "helper.h" #include "crandom.h" #include "options.h" #include "plink.h" #include "perm.h" #include "stats.h" #include "nlist.h" #include "model.h" #include "logistic.h" #include "linear.h" #define FPMIN 1.0e-30 extern ofstream LOG; extern Plink * PP; vector nvec_bool() { vector t(0); return t; } inline bool is_rare(Locus * loc) { if (loc->freq < 0 || loc->freq < par::min_af || (1-loc->freq) < par::min_af) return true; else return false; }; string display(vector & s) { string t = ""; for (int i=0; i & s) { string t = ""; for (int i=0; i & m) { if (par::silent) return; cout << "\n"; for (int i=0; i< m.size(); i++) cout << i << ")\t" << m[i] << "\n"; cout << "\n"; cout << "\n"; } CArgs::CArgs(int _n , char *argv[] ) { n = _n; a.push_back(argv[0]); parsed.resize(n,false); option.resize(n,false); for (int i=1 ; i < n ; i++ ) a.push_back(argv[i]); original = a; // Valid option labels optionLabel.insert(make_pair("--lookup-gene2","LOOKUP")); optionLabel.insert(make_pair("--lookup-gene","LOOKUP")); optionLabel.insert(make_pair("--id-replace","IDHELP")); optionLabel.insert(make_pair("--id-match","IDHELP")); optionLabel.insert(make_pair("--meta-analysis","META")); optionLabel.insert(make_pair("--annotate","ANNOT")); optionLabel.insert(make_pair("--dosage","DOSAGE")); } void CArgs::fromScript(string f) { checkFileExists(f); ifstream INP(f.c_str()); string buff; while( INP >> buff ) { a.push_back(buff); parsed.push_back(false); option.push_back(false); n++; if (INP.eof()) continue; } INP.close(); original = a; } void CArgs::fromPriorLog(string f) { checkFileExists(f); ifstream INP(f.c_str()); string buff; // Points to note: // 1) check whether any flag has already been given: // if it has, then do not add the flag (i.e. allow // the possibility of over-writing values). This // will automatically take care of the multiple rerun // issue, and will allow for a different naming, via // the --out command which would be the most common use // Read log file up to where commands start while (1) { vector tokens = tokenizeLine(INP); if ( tokens.size() == 3 && tokens[0] == "Options" && tokens[2] == "effect:" ) break; if (INP.eof()) break; } while( 1 ) { vector tokens = tokenizeLine(INP); if ( tokens.size()==0 ) break; // Are we over-riding this command / do we // want to skip it? 
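// (when re-running from a prior log, e.g. with --rerun, any flag already
//  present on the current command line takes precedence over the value
//  recorded in the old log, so giving a new --out renames the re-run output)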
if ( find(tokens[0]) ) continue; for (int t=0; t CArgs::value(string s, int c) { vector r(0); for (int i=0 ; i < n ; i++ ) if (a[i] == s && (i+1 < n) ) { for (int j=1;j<=c;j++) { if ( (i+j) < a.size() ) { parsed[i+j]=true; r.push_back(a[i+j]); } else error("Not enough arguments given for option: "+s+" "); } } if (r.size() != c) error("Not enough arguments given for option: "+s+" "); return r; } vector CArgs::varValue(string s) { vector r(0); for (int i=0 ; i < n ; i++ ) if (a[i] == s && (i+1 < n) ) { for (int j=i+1;j parse2str(string s) { vector y; string t=""; for (int i=0 ; i < s.length() ; i++) if (s[i] == ',' || i == s.length()-1 ) { if (i == s.length()-1) t += s[i]; y.push_back(t); t = ""; } else t += s[i]; return y; } vector parse2int(string s) { vector v = parse2str(s); vector y; for (int i=0; i::iterator i = optionLabel.find(cmd); if ( i == optionLabel.end() ) return false; // Get handle (creating it if needed) OptionSet * o = par::opt.addOption( i->second ); NList n2(0); n2.setRangeChar(" "); n2.setDelimiter("="); vector tok2 = n2.deparseStringList( opt ); // Can only have 1 or fewer '=' signs if ( tok2.size() > 2 ) return false; // error("Problem with option " + opt + "\n"); // Single flag? if ( tok2.size() == 1 || ( tok2.size()==2 && tok2[1]=="" ) ) { vector dummy; o->val.insert( make_pair( opt , dummy ) ); } // Key-value(s) pair? if ( tok2.size() == 2 ) { NList n2(0); n2.setRangeChar(" "); n2.setDelimiter(","); vector r = n2.deparseStringList( tok2[1] ); o->val.insert( make_pair( tok2[0] , r ) ); } return true; } int getInt(string s, string a) { int x; if(from_string(x,s,std::dec)) { return x; } else { error("Not valid integer argument for : "+a+" [ "+s+" ]"); } } long unsigned int getLongUnsignedInt(string s, string a) { long unsigned int x; if(from_string(x,s,std::dec)) { return x; } else { error("Not valid integer argument for : "+a+" [ "+s+" ]"); } } double getDouble(string s, string a) { double x; if(from_string(x,s,std::dec)) { return x; } else { error("Not valid numeric argument for : "+a+" [ "+s+" ]"); } } void CArgs::check_unused_options(Plink & P) { // Any unused options get added as options string cmd = ""; bool okay=true; for (int i=1; i0 && original[i-1] == "+" ) P.printLOG(" " + original[i]); else P.printLOG("\n\t " + original[i]); } else P.printLOG(" "+original[i]); } P.printLOG("\n\n"); } void checkDupes(Plink & P) { set people; vector::iterator person = P.sample.begin(); string errmsg; while (person != P.sample.end() ) { string str = (*person)->fid + "_" + (*person)->iid; if ( people.find(str) != people.end() ) errmsg += "Duplicate individual found: [ " + (*person)->fid + " " + (*person)->iid + " ]\n"; else people.insert(str); person++; } if (errmsg.size()>0) { P.printLOG(" *** WARNING *** DUPLICATE INDIVIDUAL IDS FOUND *** \n"); P.printLOG(errmsg+"\n"); } people.clear(); set markers; vector::iterator loc = P.locus.begin(); errmsg=""; while (loc != P.locus.end() ) { if ( markers.find( (*loc)->name ) != markers.end() ) errmsg += "Duplicate marker name found: [ " + (*loc)->name + " ]\n"; else markers.insert( (*loc)->name ); loc++; } if (errmsg.size()>0) { P.printLOG(" *** WARNING *** DUPLICATE MARKERS FOUND *** \n"); P.printLOG(errmsg+"\n"); } markers.clear(); } void error(string msg) { cerr << "\nERROR: " << msg << "\n"; LOG << "\nERROR: " << msg << "\n"; LOG.close(); if (par::gplink) { ofstream GP((par::output_file_name+".gplink").c_str(),ios::out); GP << "1\n"; GP.close(); } PP->cleanUp(); exit(1); } void shutdown() { time_t curr=time(0); string tdstamp = 
ctime(&curr); if (!par::silent) cout << "\nAnalysis finished: " + tdstamp +"\n"; LOG << "\nAnalysis finished: " + tdstamp +"\n"; if (PP->warnings) { if (!par::silent) cout << "*** One or more WARNINGS were issued (see this LOG file) ***\n"; LOG << "*** One or more WARNINGS were issued (see this LOG file) ***\n"; } LOG.close(); if (par::gplink) { ofstream GP((par::output_file_name+".gplink").c_str(),ios::out); GP << "0\n"; GP.close(); } PP->cleanUp(); exit(0); } void affCoding(Plink & P) { // Create affection coding for (int i=0; iphenotype == 2 && ! person->missing ) person->aff = true; else person->aff = false; } } void summaryBasics(Plink & P) { if (par::bt) { int ncase = 0; int ncontrol = 0; int nmissing = 0; for (int i=0; imissing ) nmissing++; else if ( P.sample[i]->phenotype == 1 ) ncontrol++; else if ( P.sample[i]->phenotype == 2 ) ncase++; P.printLOG("After filtering, " + int2str(ncase)+" cases, " +int2str(ncontrol)+" controls and " +int2str(nmissing)+" missing\n"); } else { int nmissing = 0; for (int i=0; imissing ) nmissing++; P.printLOG("After filtering, " + int2str(P.sample.size()-nmissing)+" individuals with non-missing status\n"); } // Display sex counts int nmale = 0; int nfemale = 0; int nambig = 0; for (int i=0; isexcode=="1") nmale++; else if (person->sexcode=="2") nfemale++; else nambig++; } P.printLOG("After filtering, "+int2str(nmale)+" males, "+int2str(nfemale) +" females, and "+int2str(nambig)+" of unspecified sex\n"); } #define MISSING1(i,l) ( P.SNP[l]->one[i] && ( ! P.SNP[l]->two[i] ) ) #define MISSING2(i,l) ( P.sample[i]->one[l] && ( ! P.sample[i]->two[l] ) ) double genotypingRate(Plink & P, int l) { // Because we do not store genotyping rate, provide this // convenience function; // Do *not* distinguish between obligatory and non-obligatory // missingness -- for the purpose of proxy-assoc, it is all the // same. int m = 0; if ( par::SNP_major ) { for (int i=0;in; i++) { if ( P->SNP[l1]->one[i] != P->SNP[l2]->one[i] ) return false; if ( P->SNP[l1]->two[i] != P->SNP[l2]->two[i] ) return false; } return true; } else { for (int i=0; in; i++) { if ( P->sample[i]->one[l1] != P->sample[i]->one[l2] ) return false; if ( P->sample[i]->two[l1] != P->sample[i]->two[l2] ) return false; } return true; } } vector listPossibleHaplotypes(Plink & P, vector S) { vector str; unsigned int h=0; int ns = S.size(); int nh = (int)pow((double)2,ns); vector > hap; while(h m1; unsigned int p=1; for (int s=0;sallele1; else if (P.locus[S[s]]->allele2=="") hstr += "0"; else hstr += P.locus[S[s]]->allele2; str.push_back(hstr); } return str; } bool readString(FILE * fp, string & s) { bool done = false; s=""; while (1) { char ch = fgetc(fp); if ( ch==' ' || ch == '\t' ) { if (done) return true; } else if ( ch=='\n' || ch=='\r' || feof(fp) ) { if (done) return true; else return false; } else { s += ch; done = true; } } } void removeMissingPhenotypes(Plink & P) { vector del(P.sample.size()); for (int i=0; imissing; int n_removed = P.deleteIndividuals(del); if ( n_removed > 0 ) P.printLOG(int2str(n_removed)+ " individuals removed because of missing phenotypes\n"); } void geno2matrix(vector & snps, matrix_t & g, boolmatrix_t & m, bool dom) { // return a S x N matrix coded 0,1,2, where m is missing (1=yes,0=no) m.clear(); sizeMatrix(g,PP->n,snps.size()); m.resize(PP->n); for (int p = 0 ; p < PP->n ; p++) m[p].resize(snps.size()); for (int s = 0 ; s < snps.size() ; s++) { for (int p = 0 ; p < PP->n ; p++) { bool s1 = par::SNP_major ? 
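// (each genotype is stored as a bit pair (one,two):
//   00 = homozygote allele1, 01 = heterozygote,
//   11 = homozygote allele2, 10 = missing;
//  the pair is read from SNP-major or individual-major storage
//  depending on par::SNP_major)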
PP->SNP[snps[s]]->one[p] : PP->sample[p]->one[snps[s]]; bool s2 = par::SNP_major ? PP->SNP[snps[s]]->two[p] : PP->sample[p]->two[snps[s]]; if ( (!s1) && (!s2) ) { ++g[p][s] = dom ? 1 : 2; } else if ( (!s1) && s2 ) { ++g[p][s] = 1; } else if ( s1 && s2 ) { ++g[p][s] = 0; } else { m[p][s] = true; } } } } string genotypeToFile(Plink & P, int i, int l) { // Return a genotype in suitable text format to be // written to most output files string a1 = par::recode_12 ? "1" : P.locus[l]->allele1; string a2 = par::recode_12 ? "2" : P.locus[l]->allele2; bool s1 = par::SNP_major ? P.SNP[l]->one[i] : P.sample[i]->one[l]; bool s2 = par::SNP_major ? P.SNP[l]->two[i] : P.sample[i]->two[l]; if ( (!s1) && (!s2) ) return par::recode_delimit+a1+par::recode_indelimit+a1; else if ( (!s1) && s2 ) return par::recode_delimit+a1+par::recode_indelimit+a2; else if ( s1 && s2 ) return par::recode_delimit+a2+par::recode_indelimit+a2; else return par::recode_delimit + par::out_missing_genotype + par::recode_indelimit+par::out_missing_genotype; return "?"; } string genotype(Plink & P, int i, int l) { string delimit = "/"; string g; Locus * loc = P.locus[l]; if (par::SNP_major) { CSNP * s = P.SNP[l]; if ( (!s->one[i]) && (!s->two[i]) ) g = loc->allele1 + delimit + loc->allele1; else if ( (!s->one[i]) && s->two[i]) g = loc->allele1 + delimit + loc->allele2; else if ( s->one[i] && s->two[i]) g = loc->allele2 + delimit + loc->allele2; else g = par::missing_genotype + delimit + par::missing_genotype; } else { Individual * person = P.sample[i]; if ( (!person->one[l]) && (!person->two[l]) ) g = loc->allele1 + delimit + loc->allele1; else if ( (!person->one[l]) && person->two[l]) g = loc->allele1 + delimit + loc->allele2; else if ( person->one[l] && person->two[l]) g = loc->allele2 + delimit + loc->allele2; else g = par::missing_genotype + delimit + par::missing_genotype; } return g; } string genotype(Plink & P, Individual * person, int l) { string delimit = "/"; string g; Locus * loc = P.locus[l]; if ( (!person->one[l]) && (!person->two[l]) ) g = loc->allele1 + delimit + loc->allele1; else if ( (!person->one[l]) && person->two[l]) g = loc->allele1 + delimit + loc->allele2; else if ( person->one[l] && person->two[l]) g = loc->allele2 + delimit + loc->allele2; else g = par::missing_genotype + delimit + par::missing_genotype; return g; } void permute(vector &a) { // generate random permutation of 0..n-1 // where n is a.size(); const long int n = a.size( ); for( long int i = 0; i < n; i++ ) a[ i ] = i; for( long int j = 1; j < n; j++ ) { long int pos = CRandom::rand(j+1); long int tmp = a[ j ]; a[ j ] = a[ pos ]; a[ pos ] = tmp; } } void permute(vector &a) { // Generate a random permutation of 0 // to n-1 where n is a.size(); const int n = a.size( ); for( int i = 0; i < n; i++ ) a[ i ] = i; for( int j = 1; j < n; j++ ) { int pos = CRandom::rand(j+1); int tmp = a[ j ]; a[ j ] = a[ pos ]; a[ pos ] = tmp; } } int getChromosomeCode(string chr) { map::iterator cc = par::chr_map.find(chr); return cc == par::chr_map.end() ? 
0 : cc->second; } string chromosomeName(int c) { if ( c < 0 || c >= par::chr_code.size() ) return "0"; return par::chr_code[c]; } int getMarkerChromosome(Plink & P, string m) { for (int i=0;iname==m) return P.locus[i]->chr; return -1; } int getMarkerNumber(Plink & P, string m) { for (int i=0;iname==m) return i; return -1; } bool seeChromosome(Plink & P, int c) { for (int l=0;lchr==c) return true; else if (P.locus[l]->chr>c) return false; } return false; } vector getChromosomeMarkerRange(Plink & P, int c) { vector m(2); m[0] = -1; // first m[1] = -1; // last for (int i=0;ichr==c) { if (i<=m[0] || m[0]==-1) m[0]=i; if (i>=m[1] || m[1]==-1) m[1]=i; } } return m; } vector getWindowRange(Plink &P, int s) { vector m(2); m[0] = s; // first SNP m[1] = s; // last SNP // move backwards int x=s; int chr=P.locus[s]->chr; int bp=P.locus[s]->bp; int win = (int)(par::window * 1000); // half window size in bases int nl = P.locus.size() - 1; while ( 1 ) { if ( x==0 ) break; // Move one position, until on different chromosome, or outside window x--; if ( P.locus[x]->chr != chr ) { x++; break; } if ( bp - P.locus[x]->bp > win ) { x++; break; } } m[0]=x; x=s; while ( 1 ) { if ( x== nl ) break; x++; if ( P.locus[x]->chr != chr ) { x--; break; } if ( P.locus[x]->bp - bp > win ) { x--; break; } } m[1]=x; return m; } vector getChromosomeRange(Plink & P) { vector m(2); m[0] = -1; // first chromosome m[1] = -1; // last chromosome for (int i=0;ichr<=m[0] || m[0]==-1) m[0]=P.locus[i]->chr; if (P.locus[i]->chr>=m[1] || m[1]==-1) m[1]=P.locus[i]->chr; } return m; } std::string int2str(int n) { std::ostringstream s2( std::stringstream::out ); s2 << n; return s2.str(); } std::string longint2str(long int n) { std::ostringstream s2( std::stringstream::out ); s2 << n; return s2.str(); } std::string dbl2str(double n, int prc) { std::ostringstream s2; if ( prc > 0 ) s2.precision(prc); s2 << n; return s2.str(); } std::string dbl2str_fixed(double n, int prc) { std::ostringstream s2; s2 << setiosflags( ios::fixed ); if ( prc > 0 ) s2.precision(prc); s2 << n; return s2.str(); } std::string sw(std::string s , int n) { int l = n - s.size(); if ( l < 1 ) return " " + s; s.insert(s.begin(), l , ' ' ); return s; } std::string sw(double d , int n) { std::string s = realnum(d) ? dbl2str(d) : "NA"; int l = n - s.size(); if ( l < 1 ) return " " + s; s.insert(s.begin(), l , ' ' ); return s; } std::string sw(double d , int f, int n) { std::string s = realnum(d) ? ( f < 0 ? dbl2str(d,-f) : dbl2str_fixed(d,f) ) : "NA"; int l = n - s.size(); if ( l < 1 ) return " " + s; s.insert(s.begin(), l , ' ' ); return s; } std::string sw(int i , int n) { std::string s = realnum(i) ? int2str(i) : "NA"; int l = n - s.size(); if ( l < 1 ) return " " + s; s.insert(s.begin(), l , ' ' ); return s; } void NoMem() { cerr << "*****************************************************\n" << "* FATAL ERROR Exhausted system memory *\n" << "* *\n" << "* You need a smaller dataset or a bigger computer...*\n" << "* *\n" << "* Forced exit now... *\n" << "*****************************************************\n\n"; exit(1); } std::string itoa(int value, int base) { enum { kMaxDigits = 35 }; std::string buf; buf.reserve( kMaxDigits ); // Pre-allocate enough space. 
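// (digits are generated least-significant first from abs(value) by
//  repeated division by the base, a '-' is appended for negative
//  base-10 values, and the string is reversed before returning)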
// check that the base if valid if (base < 2 || base > 16) return buf; int quotient = value; // Translating number to string with base: do { buf += "0123456789abcdef"[ std::abs( quotient % base ) ]; quotient /= base; } while ( quotient ); // Append the negative sign for base 10 if ( value < 0 && base == 10) buf += '-'; std::reverse( buf.begin(), buf.end() ); return buf; } void checkFileExists(string f) { ifstream inp; inp.open(f.c_str(), ifstream::in); if(inp.fail()) { inp.clear(ios::failbit); inp.close(); string msg = "No file [ " + f + " ] exists."; error(msg); } inp.close(); return; } void checkFileExists(vector f) { for (int k=0; k 3 && s.substr(l-3,3) == ".gz" ) return true; if ( l > 2 && s.substr(l-2,2) == ".Z" ) return true; return false; } vector tokenizeLine(ifstream & F1) { char cline[par::MAX_LINE_LENGTH]; F1.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); return tokens; } vector tokenizeLine(string sline) { string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); return tokens; } vector tokenizeLine(ifstream & F1,string d) { char cline[par::MAX_LINE_LENGTH]; F1.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; NList nl(0); nl.setDelimiter(d); nl.setRangeChar(" "); return nl.deparseStringList( sline ); } bool Plink::obligMissing(int i, int l) { int2 p; p.p1 = l; p.p2 = sample[i]->sol; return ( oblig_missing.find(p) != oblig_missing.end() ); } bool Plink::missingGenotype(int i, int l) { bool s1 = par::SNP_major ? SNP[l]->one[i] : sample[i]->one[l]; bool s2 = par::SNP_major ? SNP[l]->two[i] : sample[i]->two[l]; return ( s1 && ! s2 ); } void Plink::prettyPrintLengths() { par::pp_maxfid = 4; par::pp_maxiid = 4; par::pp_maxsnp = 4; for (int i=0;ifid.length() > par::pp_maxfid) par::pp_maxfid = sample[i]->fid.length() + 2; if (sample[i]->iid.length() > par::pp_maxiid) par::pp_maxiid = sample[i]->iid.length() + 2; } for (int l=0;lname.length() > par::pp_maxsnp) par::pp_maxsnp = locus[l]->name.length() + 2; } vector vif_prune(vector > m , double threshold , vector & varcode ) { // Number of variables int p = m.size(); vector cur(p,true); // This only is needed if we have 2+ SNPs if (p<2) { return cur; } vector > r = m; // Make 'm' a correlation matrix for (int i=0; i par::prune_ld_r2 ) { if ( par::prune_ld_pairwise_maf ) { // Remove SNP with lower MAF if ( PP->locus[ varcode[i] ]->freq < PP->locus[ varcode[j] ]->freq ) cur[i] = false; else cur[j] = false; it++; done = false; break; } else { // Just remove first cur[i] = false; it++; done = false; break; } } } } } } if (done) break; } // Skip VIF calculation? if (par::prune_ld_pairwise) return cur; // Calculate r^2 for each element versus all others // considering only the current non-pruned elements while (1) { // Build correlation matrix all included items vector > u; for (int i=0;i mt; for (int j=0;j threshold ) { // exclude this item cur[maxI] = false; } else { break; } // Increase count of removed items it++; // Down to a single item or worse? 
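// (nothing left to test: VIF needs at least two remaining variables.
//  Each round drops the variable with the largest variance inflation
//  factor, essentially 1/(1-R^2) from regressing it on the other
//  retained variables, until all remaining VIFs fall at or below the
//  supplied threshold)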
if (it==p-1) break; } return cur; } vector > calcSetCovarianceMatrix(vector & nSNP) { int nss = nSNP.size(); vector > var( nss ); if ( nss == 0 ) return var; for ( int i = 0; i < nss; i++ ) { var[i].resize( nss ); } // Use helper function to calculate correlation coefficient // between two SNPs (that allows for haploid,diploid nature) // second flag 'true' indicates to return covariance term, not // correlation for (int i=0; icorrelation2SNP( nSNP[i], nSNP[j] , false, true ); } return var; } string leftWindowEdge(Plink & P, int chr, int bp) { // Get nearest SNP Locus * marker = NULL; int distance = -1; vector::iterator loc = P.locus.begin(); while ( loc != P.locus.end() ) { if ( (*loc)->chr == chr) if ( (*loc)->bp >= bp ) if ( (*loc)->bp - bp < distance || ! marker ) { distance = (*loc)->bp - bp; marker = *loc; } loc++; } if (!marker) error("Could not place marker for left window edge"); return marker->name; } string rightWindowEdge(Plink & P, int chr, int bp) { // Get nearest SNP Locus * marker = NULL; int distance = -1; vector::iterator loc = P.locus.begin(); while ( loc != P.locus.end() ) { if ( (*loc)->chr == chr) if ( (*loc)->bp <= bp ) if ( bp - (*loc)->bp < distance || ! marker ) { distance = bp - (*loc)->bp; marker = *loc; } loc++; } if (!marker) error("Could not place marker for right window edge"); return marker->name; } void Plink::setMarkerRange() { // If chromosome code >0, implies a specific chromosome if (par::run_chr>0 && !par::position_window) { // Get first and last markers on this chromosome vector m = getChromosomeMarkerRange((*this),par::run_chr); if(m[0]==-1 || m[1]==-1) error("--chr {chromosome} not found:"+int2str(par::run_chr)); par::run_start = m[0]; par::run_end = m[1]; } else if (par::position_window) { // Physical position specified (chromosome and range) par::m1 = leftWindowEdge(*this, par::run_chr, par::from_window); par::m2 = rightWindowEdge(*this, par::run_chr, par::to_window); par::run_start = getMarkerNumber( (*this), par::m1 ); par::run_end = getMarkerNumber( (*this), par::m2 ); } else { // Two SNPs specified (or a SNP and a range) // If a specific range on one chromosome is specified par::run_start = getMarkerNumber((*this),par::m1); par::run_end = getMarkerNumber((*this),par::m2); if (par::run_start==-1) error("--from {marker} not found"); if (par::run_end==-1) error("--to {marker} not found"); // Do we require a window around a specific SNP? 
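// (i.e. a single index SNP plus a non-zero --window: par::window kb is
//  used as the half-window in getWindowRange(), which walks outwards
//  from the SNP until the chromosome changes or the distance exceeds
//  that limit)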
if ( par::run_start == par::run_end && par::window > 0 ) { vector m = getWindowRange( *this , par::run_start ); par::run_start = m[0]; par::run_end = m[1]; } if (getMarkerChromosome((*this),par::m1) != getMarkerChromosome((*this),par::m2)) { string msg = "--from {marker} and --to {marker} must lie on same chromosome"; msg += "\nwhereas these lie on chromosomes "+int2str(getMarkerChromosome((*this),par::m1)); msg += " and "+int2str(getMarkerChromosome((*this),par::m2)); error(msg); } } // Get order right if (par::run_start > par::run_end) { int tmp = par::run_start; par::run_start = par::run_end; par::run_end = tmp; } int ccode = locus[par::run_start]->chr; printLOG("Scan region on chromosome " + int2str(ccode) + " from [ " + locus[par::run_start]->name + " ] to [ " + locus[par::run_end]->name + " ]\n"); } void defineHorseChromosomes() { // 31 autosomes + X + Y + etc par::chr_haploid.resize(31 + 2 + 1 ); par::chr_sex.resize(31 + 2 + 1 ); par::chr_Y.resize(31 + 2 + 1 ); par::chr_code.resize(31 + 2 + 1 ); for (int i=0; i<=31; i++) { par::chr_haploid[i] = par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } par::chr_sex[32] = true; par::chr_haploid[32] = false; par::chr_Y[32] = false; par::chr_code[32] = "X"; par::chr_map.insert( make_pair("X",32) ); par::chr_map.insert( make_pair("x",32) ); par::chr_map.insert( make_pair("32",32) ); par::chr_sex[33] = false; par::chr_haploid[33] = true; par::chr_Y[33] = true; par::chr_code[33] = "Y"; par::chr_map.insert( make_pair("Y",33) ); par::chr_map.insert( make_pair("y",33) ); par::chr_map.insert( make_pair("33",33) ); } void defineSheepChromosomes() { // 2n = 54 // 26 autosomes + X + Y + etc par::chr_haploid.resize(26 + 2 + 1 ); par::chr_sex.resize(26 + 2 + 1 ); par::chr_Y.resize(26 + 2 + 1 ); par::chr_code.resize(26 + 2 + 1 ); for (int i=0; i<=26; i++) { par::chr_haploid[i] = par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } par::chr_sex[27] = true; par::chr_haploid[27] = false; par::chr_Y[27] = false; par::chr_code[27] = "X"; par::chr_map.insert( make_pair("X",27) ); par::chr_map.insert( make_pair("x",27) ); par::chr_map.insert( make_pair("27",27) ); par::chr_sex[28] = false; par::chr_haploid[28] = true; par::chr_Y[28] = true; par::chr_code[28] = "Y"; par::chr_map.insert( make_pair("Y",28) ); par::chr_map.insert( make_pair("y",28) ); par::chr_map.insert( make_pair("28",28) ); } void defineRiceChromosomes() { // 12 haploid chromosomes (+ 0 dummy code) par::chr_haploid.resize(12 + 1 ); par::chr_sex.resize(12 + 1); par::chr_Y.resize(12 + 1); par::chr_code.resize(12 + 1); for (int i=0; i<=12; i++) { par::chr_haploid[i] = true; par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } } void defineDogChromosomes() { // 38 autosomes + X + Y + XY + 0 missing code par::chr_haploid.resize(38 + 3 + 1 ); par::chr_sex.resize(38 + 3 + 1 ); par::chr_Y.resize(38 + 3 + 1 ); par::chr_code.resize(38 + 3 + 1 ); for (int i=0; i<=38; i++) { par::chr_haploid[i] = par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } par::chr_sex[39] = true; par::chr_haploid[39] = false; par::chr_Y[39] = false; par::chr_code[39] = "X"; par::chr_map.insert( make_pair("X",39) ); par::chr_map.insert( make_pair("x",39) ); par::chr_map.insert( make_pair("39",39) ); par::chr_sex[40] = false; par::chr_haploid[40] = 
true; par::chr_Y[40] = true; par::chr_code[40] = "Y"; par::chr_map.insert( make_pair("Y",40) ); par::chr_map.insert( make_pair("y",40) ); par::chr_map.insert( make_pair("40",40) ); par::chr_sex[41] = false; par::chr_haploid[41] = false; par::chr_Y[41] = false; par::chr_code[41] = "XY"; par::chr_map.insert( make_pair("XY",41) ); par::chr_map.insert( make_pair("xy",41) ); par::chr_map.insert( make_pair("41",41) ); } void defineMouseChromosomes() { // 19 autosomes + X + Y + 0 missing code par::chr_haploid.resize(19 + 2 + 1 ); par::chr_sex.resize(19 + 2 + 1 ); par::chr_Y.resize(19 + 2 + 1 ); par::chr_code.resize(19 + 2 + 1 ); for (int i=0; i<=19; i++) { par::chr_haploid[i] = par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } par::chr_sex[20] = true; par::chr_haploid[20] = false; par::chr_Y[20] = false; par::chr_code[20] = "X"; par::chr_map.insert( make_pair("X",20) ); par::chr_map.insert( make_pair("x",20) ); par::chr_map.insert( make_pair("20",20) ); par::chr_sex[21] = false; par::chr_haploid[21] = true; par::chr_Y[21] = true; par::chr_code[21] = "Y"; par::chr_map.insert( make_pair("Y",21) ); par::chr_map.insert( make_pair("y",21) ); par::chr_map.insert( make_pair("21",21) ); } void defineCowChromosomes() { // 29 autosomes + X + Y + 0 missing code par::chr_haploid.resize(29 + 2 + 1 ); par::chr_sex.resize(29 + 2 + 1 ); par::chr_Y.resize(29 + 2 + 1 ); par::chr_code.resize(29 + 2 + 1 ); for (int i=0; i<=29; i++) { par::chr_haploid[i] = par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } par::chr_sex[30] = true; par::chr_haploid[30] = false; par::chr_Y[30] = false; par::chr_code[30] = "X"; par::chr_map.insert( make_pair("X",30) ); par::chr_map.insert( make_pair("x",30) ); par::chr_map.insert( make_pair("30",30) ); par::chr_sex[31] = false; par::chr_haploid[31] = true; par::chr_Y[31] = true; par::chr_code[31] = "Y"; par::chr_map.insert( make_pair("Y",31) ); par::chr_map.insert( make_pair("y",31) ); par::chr_map.insert( make_pair("31",31) ); } void defineHumanChromosomes() { // 22 autosomes + X + Y + XY + M + 0 missing code par::chr_haploid.resize( 22 + 2 + 2 + 1 ); par::chr_sex.resize(22 + 2 + 2 + 1 ); par::chr_Y.resize(22 + 2 + 2 + 1 ); par::chr_code.resize(22 + 2 + 2 + 1 ); for (int i=0; i<=22; i++) { par::chr_haploid[i] = par::chr_sex[i] = par::chr_Y[i] = false; par::chr_code[i] = int2str(i); par::chr_map.insert( make_pair( int2str(i), i ) ); } // X chromosome par::chr_sex[23] = true; par::chr_haploid[23] = false; par::chr_Y[23] = false; par::chr_code[23] = "X"; par::chr_map.insert( make_pair("X",23) ); par::chr_map.insert( make_pair("x",23) ); par::chr_map.insert( make_pair("23",23) ); // Y chromosome par::chr_sex[24] = false; par::chr_haploid[24] = true; par::chr_Y[24] = true; par::chr_code[24] = "Y"; par::chr_map.insert( make_pair("Y",24) ); par::chr_map.insert( make_pair("y",24) ); par::chr_map.insert( make_pair("24",24) ); // XY chromosome par::chr_sex[25] = false; par::chr_haploid[25] = false; par::chr_Y[25] = false; par::chr_code[25] = "XY"; par::chr_map.insert( make_pair("XY",25) ); par::chr_map.insert( make_pair("xy",25) ); par::chr_map.insert( make_pair("25",25) ); // MT chromosome par::chr_sex[26] = false; par::chr_haploid[26] = true; par::chr_Y[26] = false; par::chr_code[26] = "MT"; par::chr_map.insert( make_pair("MT",26) ); par::chr_map.insert( make_pair("mt",26) ); par::chr_map.insert( make_pair("M",26) ); par::chr_map.insert( 
make_pair("m",26) ); par::chr_map.insert( make_pair("26",26) ); } void sizeMatrix(matrix_t & m , int r, int c) { m.clear(); m.resize(r); for (int i=0; i 1; curr_hets -= 2) { het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); sum += het_probs[curr_hets - 2]; /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ curr_homr++; curr_homc++; } curr_hets = mid; curr_homr = (rare_copies - mid) / 2; curr_homc = genotypes - curr_hets - curr_homr; for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) { het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); sum += het_probs[curr_hets + 2]; /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ curr_homr--; curr_homc--; } for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum; /* alternate p-value calculation for p_hi/p_lo double p_hi = het_probs[obs_hets]; for (i = obs_hets + 1; i <= rare_copies; i++) p_hi += het_probs[i]; double p_lo = het_probs[obs_hets]; for (i = obs_hets - 1; i >= 0; i--) p_lo += het_probs[i]; double p_hi_lo = p_hi < p_lo ? 2.0 * p_hi : 2.0 * p_lo; */ double p_hwe = 0.0; /* p-value calculation for p_hwe */ for (i = 0; i <= rare_copies; i++) { if (het_probs[i] > het_probs[obs_hets]) continue; p_hwe += het_probs[i]; } p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; free(het_probs); return p_hwe; } // Convert dataset from Individual-major format to SNP-major void Plink::Ind2SNP() { printLOG("Converting data to SNP-major format\n"); SNP.clear(); vector::iterator person = sample.begin(); // Initialise SNP positions per person while ( person != sample.end() ) { (*person)->i1 = (*person)->one.end()-1; (*person)->i2 = (*person)->two.end()-1; person++; } // Copy, per SNP int l = 0; while ( l < nl_all ) { CSNP * newlocus = new CSNP; person = sample.begin(); while ( person != sample.end() ) { // Add genotype to SNP-major storage newlocus->one.push_back( *((*person)->i1) ); newlocus->two.push_back( *((*person)->i2) ); // Shift one SNP back (*person)->i1--; (*person)->i2--; // Remove individual-major storage (*person)->one.pop_back(); (*person)->two.pop_back(); // Advance to next person person++; } // And add this new SNP to the main list SNP.push_back(newlocus); // Next SNP l++; } // We finally need to reverse the order of these reverse(SNP.begin(), SNP.end()); par::SNP_major = true; } // Convert dataset from SNP-major format to Individual-major void Plink::SNP2Ind() { printLOG("Converting data to Individual-major format\n"); vector::iterator person = sample.begin(); // Make sure these containers are empty while ( person != sample.end() ) { (*person)->one.clear(); (*person)->two.clear(); person++; } /////////////////////////////// // Iterate over SNPs vector::iterator s = SNP.begin(); while ( s != SNP.end() ) { ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator person = sample.begin(); while ( person != sample.end() ) { // Add SNP alleles (*person)->one.push_back(*i1); (*person)->two.push_back(*i2); // Shift one SNP back i1++; i2++; // Advance to next person person++; } // For this SNP, remove SNP-major storage completely delete (*s); // Next SNP s++; } SNP.clear(); par::SNP_major = false; } int Plink::deleteSNPs(set & mset) { vector b(nl_all); for (int l=0; lname ) != mset.end() ) b[l] = true; else b[l] = false; return 
deleteSNPs(b); } int Plink::deleteSNPs(set & mset) { vector b(nl_all); for (int l=0; l & pset) { vector b(n); for (int i=0; i & mset) { vector b(nl_all); for (int l=0; lname ) != mset.end() ) b[l] = false; else b[l] = true; return deleteSNPs(b); } int Plink::keepSNPs(set & mset) { vector b(nl_all); for (int l=0; l & pset) { vector b(n); for (int i=0; i & del) { // Remove SNPs that have T in vector 'del' // We expect this vector to be same length // current number of SNPs int original = SNP.size(); ////////////////////////////////////////////// // SNP-major mode if (par::SNP_major) { vector::iterator s1 = SNP.begin(); vector::iterator s2 = SNP.begin(); vector::iterator d = del.begin(); int i = 0; while ( s1 != SNP.end() ) { //cout << "i = " << i++ << "\n"; // Keep this SNP? if ( ! *d ) { *s2 = *s1; s2++; } else { delete (*s1); } s1++; d++; } // Then erase the remaining SNPs SNP.erase(s2,SNP.end()); } else { // Individual major-mode // copy remaining SNPs to a new list vector::iterator person = sample.begin(); while ( person != sample.end() ) { // Set individual iterator at start of SNP list vector::iterator one1 = (*person)->one.begin(); vector::iterator two1 = (*person)->two.begin(); vector::iterator one2 = (*person)->one.begin(); vector::iterator two2 = (*person)->two.begin(); vector::iterator d = del.begin(); while ( one1 != (*person)->one.end() ) { // Keep this marker if ( ! * d ) { *one2 = *one1; *two2 = *two1; // Advance next saved SNP one2++; two2++; } // Advance to next to-be-checked SNP one1++; two1++; d++; } // Then erase the remaining SNPs (*person)->one.erase(one2,(*person)->one.end()); (*person)->two.erase(two2,(*person)->two.end()); // Next individual person++; } } /////////////////////////////////////////////////// // Second, remove deleted SNPs from the locus list vector::iterator loc1 = locus.begin(); vector::iterator loc2 = locus.begin(); vector::iterator d = del.begin(); while ( loc1 != locus.end() ) { // Should we keep this SNP? if ( ! *d ) { *loc2 = *loc1; loc2++; } else delete (*loc1); loc1++; d++; } // Then erase the remaining SNPs // and the storage locus.erase(loc2,locus.end()); // Keep track of the number of SNPs remaining nl_all = locus.size(); // Return the number of SNPs we chucked return original - nl_all; } int Plink::deleteIndividuals(vector & del) { // Remove SNPs that have T in vector 'del' // We expect this vector to be same length // current number of SNPs int original = sample.size(); ////////////////////////////////////////////// // SNP-major mode if (par::SNP_major) { // Erase genotype data (SNP-major order) // Consider each SNP in outer loop vector::iterator s = SNP.begin(); while ( s != SNP.end() ) { vector::iterator one1 = (*s)->one.begin(); vector::iterator two1 = (*s)->two.begin(); vector::iterator one2 = (*s)->one.begin(); vector::iterator two2 = (*s)->two.begin(); vector::iterator d = del.begin(); // Consider each person while ( one1 != (*s)->one.end() ) { // Keep this individual? if ( ! *d ) { (*one2) = (*one1); (*two2) = (*two1); one2++; two2++; } one1++; two1++; d++; } // Erase old storage (*s)->one.erase(one2,(*s)->one.end()); (*s)->two.erase(two2,(*s)->two.end()); // Next SNP s++; } } ///////////////////////////////////////////// // Whether SNP-major or individual-major // we still need to take care of the sample[] vector::iterator person1 = sample.begin(); vector::iterator person2 = sample.begin(); vector::iterator d = del.begin(); while ( person1 != sample.end() ) { // Keep this person if ( ! 
* d ) { *person2 = *person1; // Advance next saved SNP person2++; } else { // Free storage delete (*person1); } // Advance to next to-be-checked SNP person1++; d++; } // Erase pointers at end of vector sample.erase(person2,sample.end()); // Adjust sample statistics n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); return original - n; } void Plink::filterOnCovariate() { printLOG("Filtering individuals based on [ "+par::filter_filename+" ]\n"); printLOG("Filtering criterion is [ " + par::filter_value + " ] for cluster " + int2str(par::mult_filter) +"\n"); // Expand filter criteria to allow a list string tmp = par::range_delimiter; par::range_delimiter = " "; NList tlist(0); vector filters = tlist.deparseStringList( par::filter_value ); par::range_delimiter = tmp; // Swap q-match filename as covariate file string tmp_covar_file = par::include_cluster_filename; int tmp_mult_covar = par::mult_clst; par::include_cluster_filename = par::filter_filename; par::mult_clst = par::mult_filter; if (!readClusterFile()) error("Problem reading filter file [ " + par::filter_filename + " ]\n"); // Put back the original covariate specificiation par::include_cluster_filename = tmp_covar_file; par::mult_clst = tmp_mult_covar; // Screen-based on covariate // vector to record which individuals to be deleted vector indel(sample.size(),false); int n_removed1=0; for (int i=0; isol; for (int j=0; j::iterator k = kmap.find( filters[j] ); if ( k != kmap.end() && k->second == thisK ) { removeThisSample = false; break; } } if ( removeThisSample ) { indel[i] = true; n_removed1++; } } // And now remove these individuals, so that // SNP-based statistics are calculated with // these samples already excluded int n_removed = deleteIndividuals(indel); if (n_removed != n_removed1) error("Internal problem in filterOnCovariate, please contact SMP\n"); printLOG(int2str(n_removed)+" individuals removed based on filter\n"); // Remove these as clusters now for (int i=0; isol = 0; } nk=1; kname.resize(0); kmap.clear(); } void Plink::filterOnCase() { printLOG("Filtering cases only ... "); // Remove controls, missing vector indel(sample.size(),false); for (int i=0; iaff) || sample[i]->missing ) indel[i] = true; int n_original = sample.size(); int n_removed = deleteIndividuals(indel); printLOG(int2str(n_original-n_removed)+" individuals remaining\n"); // Reset number of individuals n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); } void Plink::filterOnControl() { printLOG("Filtering controls only ... "); // Remove cases, missing vector indel(sample.size(),false); for (int i=0; iaff || sample[i]->missing ) indel[i] = true; int n_original = sample.size(); int n_removed = deleteIndividuals(indel); printLOG(int2str(n_original-n_removed)+" individuals remaining\n"); // Reset number of individuals n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); } void Plink::filterOnMale() { printLOG("Filtering males only ... "); vector indel(sample.size(),false); for (int i=0; isexcode != "1" ) indel[i] = true; int n_original = sample.size(); int n_removed = deleteIndividuals(indel); printLOG(int2str(n_original-n_removed)+" individuals remaining\n"); // Reset number of individuals n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); } void Plink::filterOnFemale() { printLOG("Filtering females only ... 
"); vector indel(sample.size(),false); for (int i=0; isexcode != "2" ) indel[i] = true; int n_original = sample.size(); int n_removed = deleteIndividuals(indel); printLOG(int2str(n_original-n_removed)+" individuals remaining\n"); // Reset number of individuals n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); } void Plink::filterOnFounder() { printLOG("Filtering founders only ... "); vector indel(sample.size(),false); for (int i=0; ifounder ) indel[i] = true; int n_original = sample.size(); int n_removed = deleteIndividuals(indel); printLOG(int2str(n_original-n_removed)+" individuals remaining\n"); // Reset number of individuals n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); } void Plink::filterOnNonFounder() { printLOG("Filtering nonfounders only ... "); vector indel(sample.size(),false); for (int i=0; ifounder ) indel[i] = true; int n_original = sample.size(); int n_removed = deleteIndividuals(indel); printLOG(int2str(n_original-n_removed)+" individuals remaining\n"); // Reset number of individuals n = sample.size(); np = (int)((double)(n*(n-1))/(double)2); } void Plink::attribFilterSNP() { string tmp = par::range_delimiter; par::range_delimiter = " "; printLOG("Filtering markers, from [ " + par::snp_attrib_file + " ] " + "criterion: " + par::snp_attrib_value + "\n"); checkFileExists( par::snp_attrib_file ); ifstream IN1( par::snp_attrib_file.c_str() , ios::in ); NList nl(0); vector vlist = nl.deparseStringList( par::snp_attrib_value ); map vset; bool posMatch = false; bool negMatch = false; for (int i=0; i mset; map mlocus; for (int l=0; lname,locus[l])); while ( ! IN1.eof() ) { vector tok = tokenizeLine( IN1 ); if ( tok.size() == 0 ) continue; map::iterator i = mlocus.find( tok[0] ); if ( i == mlocus.end() ) continue; bool match = false; // T if at least 1 positive match bool exclude = false; // T if at least 1 negative match // Logical OR for matching for (int j=1; j::iterator k = vset.find( tok[j] ); // No if ( k == vset.end() ) continue; // Yes if ( k->second ) match = true; else exclude = true; } // Keep this SNP? if ( ( match || (!posMatch) ) && ( (!exclude) || (!negMatch) ) ) mset.insert( i->second ); } IN1.close(); int rem = keepSNPs( mset ); printLOG("Removed " + int2str(rem) + " SNPs based on this\n"); par::range_delimiter = tmp; return; } void Plink::attribFilterInd() { string tmp = par::range_delimiter; par::range_delimiter = " "; printLOG("Filtering individuals, from [ " + par::ind_attrib_file + " ] " + "criterion: " + par::ind_attrib_value + "\n"); checkFileExists( par::ind_attrib_file ); ifstream IN1( par::ind_attrib_file.c_str() , ios::in ); NList nl(0); vector vlist = nl.deparseStringList( par::ind_attrib_value ); map vset; bool posMatch = false; bool negMatch = false; for (int i=0; i mset; map mpeople; for (int i=0; ifid + "_" + sample[i]->iid,sample[i])); while ( ! 
IN1.eof() ) { vector tok = tokenizeLine( IN1 ); if ( tok.size() < 2 ) continue; map::iterator i = mpeople.find( tok[0] + "_" + tok[1] ); if ( i == mpeople.end() ) continue; bool match = false; // T if at least 1 positive match bool exclude = false; // T if at least 1 negative match for (int j=1; j::iterator k = vset.find( tok[j] ); if ( k == vset.end() ) continue; if ( k->second ) match = true; else exclude = true; } // Keep these people if ( ( match || (!posMatch) ) && ( (!exclude) || (!negMatch) ) ) mset.insert( i->second ); } IN1.close(); int rem = keepIndividuals( mset ); printLOG("Removed " + int2str(rem) + " individuals based on this\n"); par::range_delimiter = tmp; return; } void Plink::dummyLoader() { // Create dummy dataset full of heterozygotes int L = par::dummy_nsnp; int N = par::dummy_nind; for (int l=0;lname = "snp"+int2str(l); loc->chr = 1; loc->allele1 = "A"; loc->allele2 = "B"; loc->bp = l ; loc->pos = 0; locus.push_back(loc); CSNP * newset = new CSNP; newset->one.resize(N); newset->two.resize(N); for ( int i = 0 ; i < N ; i++ ) { int g = 0; if (CRandom::rand() > 0.5) g++; if (CRandom::rand() > 0.5) g++; if ( g == 0 ) { newset->one[i] = false; newset->two[i] = false; } else if ( g == 1 ) { newset->one[i] = false; newset->two[i] = true; } else { newset->one[i] = true; newset->two[i] = true; } } SNP.push_back(newset); } for (int i=0;ifid = person->iid = "per"+int2str(i); person->missing = false; person->pat = "0"; person->mat = "0"; if ( CRandom::rand() > 0.5 ) person->phenotype = 1; else person->phenotype = 2; person->sex = false; person->sexcode = "2"; sample.push_back(person); } } void Plink::alleleRecoding() { vector::iterator loc = locus.begin(); while( loc != locus.end() ) { if ( par::recode_1234 ) { if ( (*loc)->allele1 == "A" || (*loc)->allele1 == "a" ) (*loc)->allele1 = "1"; if ( (*loc)->allele1 == "C" || (*loc)->allele1 == "c" ) (*loc)->allele1 = "2"; if ( (*loc)->allele1 == "G" || (*loc)->allele1 == "g" ) (*loc)->allele1 = "3"; if ( (*loc)->allele1 == "T" || (*loc)->allele1 == "t" ) (*loc)->allele1 = "4"; if ( (*loc)->allele2 == "A" || (*loc)->allele1 == "a" ) (*loc)->allele2 = "1"; if ( (*loc)->allele2 == "C" || (*loc)->allele1 == "c" ) (*loc)->allele2 = "2"; if ( (*loc)->allele2 == "G" || (*loc)->allele1 == "g" ) (*loc)->allele2 = "3"; if ( (*loc)->allele2 == "T" || (*loc)->allele1 == "t" ) (*loc)->allele2 = "4"; } else if ( par::recode_ACGT ) { if ( (*loc)->allele1 == "1" ) (*loc)->allele1 = "A"; if ( (*loc)->allele1 == "2" ) (*loc)->allele1 = "C"; if ( (*loc)->allele1 == "3" ) (*loc)->allele1 = "G"; if ( (*loc)->allele1 == "4" ) (*loc)->allele1 = "T"; if ( (*loc)->allele2 == "1" ) (*loc)->allele2 = "A"; if ( (*loc)->allele2 == "2" ) (*loc)->allele2 = "C"; if ( (*loc)->allele2 == "3" ) (*loc)->allele2 = "G"; if ( (*loc)->allele2 == "4" ) (*loc)->allele2 = "T"; } loc++; } } vector commaParse(string s) { NList tlist(0); tlist.setRangeChar(" "); return tlist.deparseStringList( s ); } string searchAndReplace(string str, string searchString, string replaceString) { string::size_type pos = 0; while ( (pos = str.find(searchString, pos)) != string::npos ) { str.replace( pos, searchString.size(), replaceString ); pos++; } return str; } void makePersonMap(Plink &P, map & uid) { for (int i=0; ifid+"_"+P.sample[i]->iid,P.sample[i])); } void makeLocusMap(Plink &P, map & mlocus) { for (int l=0; lname,l)); } void smoother(Plink & P, vector_t & input, int n, vector_t & output1, vector_t & output2, vector & count) { // Take a vector a numbers, 0..nl_all, and // smooth, 
respecting chromosome boundaries, // based one par::seg_window_kb and par::seg_window_step if ( input.size() != P.nl_all ) error("Problem in smoother()\n"); output1.resize( P.nl_all ); output2.resize( P.nl_all ); count.resize( P.nl_all ); for (int l=0; lchr != loc1->chr ) break; if ( loc1->bp - loc2->bp > par::seg_test_window_bp ) break; x1 += input[l2]; c1 += n - input[l2]; involved++; } l2 = l; while ( true ) { l2++; if ( l2 == P.nl_all ) break; Locus * loc2 = P.locus[l2]; if ( loc2->chr != loc1->chr ) break; if ( loc2->bp - loc1->bp > par::seg_test_window_bp ) break; x1 += input[l2]; c1 += n - input[l2]; involved++; } output1[l] = x1; output2[l] = c1; count[l] = involved; } return; } map > readRange(string filename) { // Format: CHR BP1 BP2 (NAME) (GROUP) // If same named range read twice, take largest range checkFileExists(filename); ifstream IN(filename.c_str(),ios::in); IN.clear(); PP->printLOG("Reading list of ranges from [ " + filename + " ]\n"); PP->printLOG("Allowing a " + int2str( par::make_set_border/1000 ) + " kb window around each range\n"); map > ranges; int rcount = 0; // Track number of ranges while (!IN.eof()) { Range r; char cline[par::MAX_LINE_LENGTH]; IN.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if ( tokens.size() < 4 ) error("Problem with line:\n" + sline ); string chr = tokens[0]; if ( ! from_string( r.start , tokens[1], std::dec ) ) error("Problem with position : " + tokens[1] ); if ( ! from_string( r.stop , tokens[2], std::dec ) ) error("Problem with position : " + tokens[2] ); // Add any specified border region r.start -= par::make_set_border; r.stop += par::make_set_border; if (r.start < 0 ) r.start = 0; if ( chr == "" ) continue; if ( r.start > r.stop ) continue; r.chr = getChromosomeCode(chr); // Assign a name for this range/set r.name = tokens[3]; bool hasGroupLabel = tokens.size() >= 5; // Assign a group? if ( par::make_set_ignore_group || ! hasGroupLabel ) r.group = -1; else { map::iterator i = Range::groupNames.find( tokens[4] ); if ( i != Range::groupNames.end() ) r.group = i->second; else { int t = Range::groupNames.size(); Range::groupNames.insert( make_pair( tokens[4] , t ) ); r.group = t; } } // Have we already seen this range, or is it new? 
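  // Illustrative note (the example values below are made up, not taken from
  // the source): input lines for readRange() follow "CHR BP1 BP2 NAME [GROUP]",
  // e.g.
  //
  //    7  45000000  45200000  GENE1  setA
  //
  // After each range is widened by par::make_set_border on both sides, every
  // line that shares the same NAME (and GROUP label, when one is present) is
  // pooled into a single set<Range> under one key, which is what the code
  // immediately below does.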
set * s; string fullName = r.name; if ( hasGroupLabel ) fullName += "_" + int2str(r.group); if ( ranges.find( fullName ) == ranges.end() ) { // Never seen this set before set tmp; tmp.insert(r); ranges.insert(make_pair( fullName, tmp )); } else { map >::iterator ri = ranges.find( fullName ); // Add this new range to the existing set ri->second.insert( r ); } ++rcount; } IN.close(); PP->printLOG("Added " + int2str( rcount ) + " distinct ranges to " + int2str(ranges.size() ) + " sets\n"); return ranges; } double modelComparisonPValue(Model * alternate, Model * null) { if ( par::bt ) { LogisticModel * lalternate = (LogisticModel*)alternate; LogisticModel * lnull = (LogisticModel*)null; return chiprobP( lnull->getLnLk() - lalternate->getLnLk() , lalternate->getNP() - lnull->getNP() ) ; } else { LinearModel * lalternate = (LinearModel*)alternate; LinearModel * lnull = (LinearModel*)null; double F = lalternate->calculateFTest(lnull); return pF( F, alternate->getNP() - null->getNP(), alternate->Ysize() - alternate->getNP() - 1 ) ; } return -1; } set rangeIntersect(Range & r1, map > & ranges) { // Return all ranges from 'ranges' that intersect with 'r1' set intersected; int bp1 = r1.start; int bp2 = r1.stop; int chr = r1.chr; map >::iterator si = ranges.begin(); while ( si != ranges.end() ) { // string rname = si->first; set::iterator ri = si->second.begin(); while ( ri != si->second.end() ) { if ( ri->chr != chr ) { ++ri; continue; } if ( ri->start <= bp2 && ri->stop >= bp1 ) { intersected.insert( (Range*)&(*ri) ); // Only enter this name once break; } ++ri; } ++si; } return intersected; } set mapRanges2SNP(int l, map > & ranges) { // For a single SNP, return a list of pointers to all // ranges that span it set r; // { TODO } return r; } int2 mapSNPs2Range(Plink & P, const Range * range) { // For a single range, return the start and stop SNPs that fall within // that range int chr = range->chr; // Cannot map chromosome if ( P.scaffold.find( chr ) == P.scaffold.end() ) return int2(-1,-1); int lstart = P.scaffold[chr].lstart; int lstop = P.scaffold[chr].lstop; int bpstart = P.scaffold[chr].bpstart; int bpstop = P.scaffold[chr].bpstop; // Assume a roughly uniform SNP spacing, to get // good gueses at where to start looking for this // range double prop_start = ( range->start - bpstart ) / (double)( bpstop - bpstart ); double prop_stop = ( range->stop - bpstart ) / (double)( bpstop - bpstart ); int guess_start = int ( lstart + prop_start * ( lstop-lstart ) ); int guess_stop = int ( lstart + prop_stop * ( lstop-lstart ) ); if ( guess_start < lstart ) guess_start = lstart; if ( guess_stop < lstart ) guess_stop = lstart; if ( guess_start > lstop ) guess_start = lstop; if ( guess_stop > lstop ) guess_stop = lstop; //////////////////// // Adjust start while (1) { if ( P.locus[guess_start]->bp == range->start ) break; if ( P.locus[guess_start]->bp > range->start ) { if ( guess_start == lstart || P.locus[guess_start-1]->bp < range->start ) break; --guess_start; } else { if ( guess_start == lstop ) break; // else next SNP right must be first in range if ( P.locus[guess_start+1]->bp >= range->start ) { ++guess_start; break; } ++guess_start; } } //////////////////// // Adjust stop while (1) { if ( P.locus[guess_stop]->bp == range->stop ) break; if ( P.locus[guess_stop]->bp > range->stop ) { if ( guess_stop == lstart ) break; if ( P.locus[guess_stop-1]->bp <= range->stop ) { --guess_stop; break; } --guess_stop; } else { if ( guess_stop == lstop || P.locus[guess_stop+1]->bp > range->stop ) break; ++guess_stop; } 
} if ( P.locus[guess_start]->bp > range->stop || P.locus[guess_stop]->bp < range->start ) return int2(-1,-1); return int2(guess_start,guess_stop); } void makeScaffold(Plink & P) { int last = -1; P.scaffold.clear(); int thisChromosome = P.locus[0]->chr; int nextChromosome; int lastSNP = P.nl_all-1; for (int l=0; l< P.nl_all; l++) { int chr = P.locus[l]->chr; // Have we seen this chromosome before? If not, // add this to the list. if ( P.scaffold.find( chr ) == P.scaffold.end() ) { CInfo ci; ci.lstart = l; ci.bpstart = P.locus[l]->bp; P.scaffold.insert( make_pair( chr, ci )); } // Is this the end of this chromosome? If so, // also record that. if ( l == lastSNP || chr != P.locus[l+1]->chr ) { map::iterator i = P.scaffold.find( chr ); (i->second).lstop = l; (i->second).bpstop = P.locus[l]->bp; } } } void mapRangesToSNPs(string filename, map > & ranges, map > & snp2range ) { // Read list of ranges ranges = readRange(filename); // Consider each range map >::iterator r = ranges.begin(); int rc = 0; while ( r != ranges.end() ) { set * theseRanges = &( r->second ); // Assume a single, unqiue named range (and so set-size=1) set::iterator thisRangeSet = theseRanges->begin(); const Range * thisRange = &(*thisRangeSet); // Assign SNPs int2 srange = mapSNPs2Range(*PP, thisRange); if ( srange.p1 != -1 ) { for ( int l= srange.p1; l<= srange.p2; l++) { if ( snp2range.find(l) == snp2range.end() ) { set t; t.insert( (Range*)thisRange ); snp2range.insert(make_pair( l, t ) ); } else { set * rp = &(snp2range.find(l)->second); rp->insert((Range*)thisRange); } } ++rc; } // Next range/gene ++r; } // Save the range labels in the correct order // r = ranges.begin(); // while ( r != ranges.end() ) // { // set * theseRanges = &( r->second ); // // Assume a single, unqiue named range (and so set-size=1) // set::iterator thisRangeSet = theseRanges->begin(); // const Range * thisRange = &(*thisRangeSet); // rangeLabels.push_back( thisRange->name ); // ++r; // } PP->printLOG(int2str(rc)+" ranges with at least 1 marker\n"); PP->printLOG("Assigned " + int2str( snp2range.size() ) + " SNPs to at least 1 range\n"); } void Plink::setFlagToCase() { for (int i=0;imissing ) person->flag = false; else if ( person->aff ) person->flag = true; else person->flag = false; } } void Plink::setFlagToControl() { for (int i=0;imissing ) person->flag = false; else if ( person->aff ) person->flag = false; else person->flag = true; } } string relType(Individual *a , Individual * b) { // UN unrelated // FS sibling // HS half-sibling // PO parent-offspring // OT other if ( a->fid != b->fid ) return "UN"; if ( ! 
( a->founder || b->founder ) ) { if ( a->pat == b->pat && a->mat == b->mat ) return "FS"; else if ( a->pat == b->pat || a->mat == b->mat ) return "HS"; } if ( a->pat == b->iid || a->mat == b->iid || b->pat == a->iid || b->mat == a->iid ) return "PO"; return "OT"; } void Plink::outputPermedPhenotypes(Perm & perm) { // Dummy test statistic perm.setTests(1); vector pr(1,0); vector original(1,0); perm.setPermClusters(*this); perm.originalOrder(); matrix_t permphe; sizeMatrix(permphe, n, par::replicates ); bool finished = false; int j = 0; while(!finished) { if (par::perm_genedrop) { if (par::perm_genedrop_and_swap) perm.permuteInCluster(); perm.geneDrop(); } else perm.permuteInCluster(); for (int i=0; ipperson->phenotype; finished = perm.update(pr,original); ++j; } // next permutation ofstream PPHE; printLOG("Writing permuted phenotype file to [ " + par::output_file_name + ".pphe ]\n"); PPHE.open( ( par::output_file_name + ".pphe").c_str() , ios::out ); for (int i = 0 ; i < n ; i++ ) { PPHE << sample[i]->fid << "\t" << sample[i]->iid << "\t"; for (int j=0; j > two_locus_table(int l1, int l2) { // 0 1 2 tot // 0 a b c d // 1 e f g h // 2 i g k l // M // tot // i.e. so t[4][4] contains # non-missing individuals vector< vector > t(5); for (int i=0; i<5; i++) t[i].resize(5,0); for (int i=0; in; i++) { Individual * person = PP->sample[i]; if ( person->missing || ! person->founder ) continue; bool a1 = par::SNP_major ? PP->SNP[l1]->one[i] : person->one[l1]; bool a2 = par::SNP_major ? PP->SNP[l1]->two[i] : person->two[l1]; bool b1 = par::SNP_major ? PP->SNP[l2]->one[i] : person->one[l2]; bool b2 = par::SNP_major ? PP->SNP[l2]->two[i] : person->two[l2]; if ( ! a1 ) { if ( ! a2 ) { if ( ! b1 ) { if ( ! b2 ) ++t[0][0]; else ++t[0][1]; } else { if ( ! b2 ) ++t[0][3]; else ++t[0][2]; } } else { if ( ! b1 ) { if ( ! b2 ) ++t[1][0]; else ++t[1][1]; } else { if ( ! b2 ) ++t[1][3]; else ++t[1][2]; } } } else { if ( ! a2 ) { if ( ! b1 ) { if ( ! b2 ) ++t[3][0]; else ++t[3][1]; } else { if ( ! b2 ) ++t[3][3]; else ++t[3][2]; } } else { if ( ! b1 ) { if ( ! b2 ) ++t[2][0]; else ++t[2][1]; } else { if ( ! b2 ) ++t[2][3]; else ++t[2][2]; } } } } // Row and col totals for (int i = 0; i<4; i++) for (int j = 0; j<4; j++) { t[i][4] += t[i][j]; t[4][j] += t[i][j]; if ( i<3 && j<3 ) t[4][4] += t[i][j]; } return t; } map > filterRanges(map > & ranges, string filename) { set isubset; set inotfound; checkFileExists( filename ); PP->printLOG("Reading gene subset list from [ " + filename + " ]\n"); ifstream IN(filename.c_str(), ios::in); while ( ! 
IN.eof() ) { string gname; IN >> gname; if ( gname=="" ) continue; isubset.insert(gname); } // Copy over extracted set to here map > newRanges; set::iterator i = isubset.begin(); while ( i != isubset.end() ) { map >::iterator rf = ranges.find( *i ); if ( rf == ranges.end() ) { inotfound.insert( *i ); ++i; continue; } newRanges.insert( make_pair( *i , rf->second ) ); ++i; } PP->printLOG("Extracted " + int2str( newRanges.size() ) + " ranges from this list\n"); if ( inotfound.size() > 0 ) { PP->printLOG("Was unable to find " + int2str( inotfound.size() ) + " ranges\n"); PP->printLOG("Writing this list of not-found genes to [ " + par::output_file_name + ".notfound ]\n"); ofstream O2; O2.open( (par::output_file_name+".notfound").c_str() , ios::out); set::iterator i1 = inotfound.begin(); while ( i1 != inotfound.end() ) { O2 << *i1 << "\n"; ++i1; } O2.close(); } return newRanges; } plink-1.07-src/ipmpar.h0000644000265600020320000002237111264127626014144 0ustar tilleaadmin #ifndef __IPMPAR_CPP__ #define __IPMPAR_CPP__ /* ----------------------------------------------------------------------- IPMPAR PROVIDES THE INTEGER MACHINE CONSTANTS FOR THE COMPUTER THAT IS USED. IT IS ASSUMED THAT THE ARGUMENT I IS AN INTEGER HAVING ONE OF THE VALUES 1-10. IPMPAR(I) HAS THE VALUE ... INTEGERS. ASSUME INTEGERS ARE REPRESENTED IN THE N-DIGIT, BASE-A FORM SIGN ( X(N-1)*A**(N-1) + ... + X(1)*A + X(0) ) WHERE 0 .LE. X(I) .LT. A FOR I=0,...,N-1. IPMPAR(1) = A, THE BASE. IPMPAR(2) = N, THE NUMBER OF BASE-A DIGITS. IPMPAR(3) = A**N - 1, THE LARGEST MAGNITUDE. FLOATING-POINT NUMBERS. IT IS ASSUMED THAT THE SINGLE AND DOUBLE PRECISION FLOATING POINT ARITHMETICS HAVE THE SAME BASE, SAY B, AND THAT THE NONZERO NUMBERS ARE REPRESENTED IN THE FORM SIGN (B**E) * (X(1)/B + ... + X(M)/B**M) WHERE X(I) = 0,1,...,B-1 FOR I=1,...,M, X(1) .GE. 1, AND EMIN .LE. E .LE. EMAX. IPMPAR(4) = B, THE BASE. SINGLE-PRECISION IPMPAR(5) = M, THE NUMBER OF BASE-B DIGITS. IPMPAR(6) = EMIN, THE SMALLEST EXPONENT E. IPMPAR(7) = EMAX, THE LARGEST EXPONENT E. DOUBLE-PRECISION IPMPAR(8) = M, THE NUMBER OF BASE-B DIGITS. IPMPAR(9) = EMIN, THE SMALLEST EXPONENT E. IPMPAR(10) = EMAX, THE LARGEST EXPONENT E. ----------------------------------------------------------------------- TO DEFINE THIS FUNCTION FOR THE COMPUTER BEING USED REMOVE THE COMMENT DELIMITORS FROM THE DEFINITIONS DIRECTLY BELOW THE NAME OF THE MACHINE ----------------------------------------------------------------------- IPMPAR IS AN ADAPTATION OF THE FUNCTION I1MACH, WRITTEN BY P.A. FOX, A.D. HALL, AND N.L. SCHRYER (BELL LABORATORIES). IPMPAR WAS FORMED BY A.H. MORRIS (NSWC). THE CONSTANTS ARE FROM BELL LABORATORIES, NSWC, AND OTHER SOURCES. ----------------------------------------------------------------------- .. Scalar Arguments .. */ int ipmpar(int *i) { static int imach[11]; static int ipmpar; /* MACHINE CONSTANTS FOR AMDAHL MACHINES. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 16; imach[5] = 6; imach[6] = -64; imach[7] = 63; imach[8] = 14; imach[9] = -64; imach[10] = 63; */ /* MACHINE CONSTANTS FOR THE AT&T 3B SERIES, AT&T PC 7300, AND AT&T 6300. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -125; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; */ /* MACHINE CONSTANTS FOR THE BURROUGHS 1700 SYSTEM. 
*/ /* imach[1] = 2; imach[2] = 33; imach[3] = 8589934591; imach[4] = 2; imach[5] = 24; imach[6] = -256; imach[7] = 255; imach[8] = 60; imach[9] = -256; imach[10] = 255; */ /* MACHINE CONSTANTS FOR THE BURROUGHS 5700 SYSTEM. */ /* imach[1] = 2; imach[2] = 39; imach[3] = 549755813887; imach[4] = 8; imach[5] = 13; imach[6] = -50; imach[7] = 76; imach[8] = 26; imach[9] = -50; imach[10] = 76; */ /* MACHINE CONSTANTS FOR THE BURROUGHS 6700/7700 SYSTEMS. */ /* imach[1] = 2; imach[2] = 39; imach[3] = 549755813887; imach[4] = 8; imach[5] = 13; imach[6] = -50; imach[7] = 76; imach[8] = 26; imach[9] = -32754; imach[10] = 32780; */ /* MACHINE CONSTANTS FOR THE CDC 6000/7000 SERIES 60 BIT ARITHMETIC, AND THE CDC CYBER 995 64 BIT ARITHMETIC (NOS OPERATING SYSTEM). */ /* imach[1] = 2; imach[2] = 48; imach[3] = 281474976710655; imach[4] = 2; imach[5] = 48; imach[6] = -974; imach[7] = 1070; imach[8] = 95; imach[9] = -926; imach[10] = 1070; */ /* MACHINE CONSTANTS FOR THE CDC CYBER 995 64 BIT ARITHMETIC (NOS/VE OPERATING SYSTEM). */ /* imach[1] = 2; imach[2] = 63; imach[3] = 9223372036854775807; imach[4] = 2; imach[5] = 48; imach[6] = -4096; imach[7] = 4095; imach[8] = 96; imach[9] = -4096; imach[10] = 4095; */ /* MACHINE CONSTANTS FOR THE CRAY 1, XMP, 2, AND 3. */ /* imach[1] = 2; imach[2] = 63; imach[3] = 9223372036854775807; imach[4] = 2; imach[5] = 47; imach[6] = -8189; imach[7] = 8190; imach[8] = 94; imach[9] = -8099; imach[10] = 8190; */ /* MACHINE CONSTANTS FOR THE DATA GENERAL ECLIPSE S/200. */ /* imach[1] = 2; imach[2] = 15; imach[3] = 32767; imach[4] = 16; imach[5] = 6; imach[6] = -64; imach[7] = 63; imach[8] = 14; imach[9] = -64; imach[10] = 63; */ /* MACHINE CONSTANTS FOR THE HARRIS 220. */ /* imach[1] = 2; imach[2] = 23; imach[3] = 8388607; imach[4] = 2; imach[5] = 23; imach[6] = -127; imach[7] = 127; imach[8] = 38; imach[9] = -127; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE HONEYWELL 600/6000 AND DPS 8/70 SERIES. */ /* imach[1] = 2; imach[2] = 35; imach[3] = 34359738367; imach[4] = 2; imach[5] = 27; imach[6] = -127; imach[7] = 127; imach[8] = 63; imach[9] = -127; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE HP 2100 3 WORD DOUBLE PRECISION OPTION WITH FTN4 */ /* imach[1] = 2; imach[2] = 15; imach[3] = 32767; imach[4] = 2; imach[5] = 23; imach[6] = -128; imach[7] = 127; imach[8] = 39; imach[9] = -128; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE HP 2100 4 WORD DOUBLE PRECISION OPTION WITH FTN4 */ /* imach[1] = 2; imach[2] = 15; imach[3] = 32767; imach[4] = 2; imach[5] = 23; imach[6] = -128; imach[7] = 127; imach[8] = 55; imach[9] = -128; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE HP 9000. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -126; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; */ /* MACHINE CONSTANTS FOR THE IBM 360/370 SERIES, THE ICL 2900, THE ITEL AS/6, THE XEROX SIGMA 5/7/9 AND THE SEL SYSTEMS 85/86. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 16; imach[5] = 6; imach[6] = -64; imach[7] = 63; imach[8] = 14; imach[9] = -64; imach[10] = 63; */ /* MACHINE CONSTANTS FOR THE IBM PC. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -125; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; */ /* MACHINE CONSTANTS FOR THE MACINTOSH II - ABSOFT MACFORTRAN II. 
*/ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -125; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; */ /* MACHINE CONSTANTS FOR THE MICROVAX - VMS FORTRAN. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -127; imach[7] = 127; imach[8] = 56; imach[9] = -127; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE PDP-10 (KA PROCESSOR). */ /* imach[1] = 2; imach[2] = 35; imach[3] = 34359738367; imach[4] = 2; imach[5] = 27; imach[6] = -128; imach[7] = 127; imach[8] = 54; imach[9] = -101; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE PDP-10 (KI PROCESSOR). */ /* imach[1] = 2; imach[2] = 35; imach[3] = 34359738367; imach[4] = 2; imach[5] = 27; imach[6] = -128; imach[7] = 127; imach[8] = 62; imach[9] = -128; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE PDP-11 FORTRAN SUPPORTING 32-BIT INTEGER ARITHMETIC. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -127; imach[7] = 127; imach[8] = 56; imach[9] = -127; imach[10] = 127; */ /* MACHINE CONSTANTS FOR THE SEQUENT BALANCE 8000. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -125; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; */ /* MACHINE CONSTANTS FOR THE SILICON GRAPHICS IRIS-4D SERIES (MIPS R3000 PROCESSOR). */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -125; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; */ /* MACHINE CONSTANTS FOR IEEE ARITHMETIC MACHINES, SUCH AS THE AT&T 3B SERIES, MOTOROLA 68000 BASED MACHINES (E.G. SUN 3 AND AT&T PC 7300), AND 8087 BASED MICROS (E.G. IBM PC AND AT&T 6300). */ imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -125; imach[7] = 128; imach[8] = 53; imach[9] = -1021; imach[10] = 1024; /* MACHINE CONSTANTS FOR THE UNIVAC 1100 SERIES. */ /* imach[1] = 2; imach[2] = 35; imach[3] = 34359738367; imach[4] = 2; imach[5] = 27; imach[6] = -128; imach[7] = 127; imach[8] = 60; imach[9] = -1024; imach[10] = 1023; */ /* MACHINE CONSTANTS FOR THE VAX 11/780. */ /* imach[1] = 2; imach[2] = 31; imach[3] = 2147483647; imach[4] = 2; imach[5] = 24; imach[6] = -127; imach[7] = 127; imach[8] = 56; imach[9] = -127; imach[10] = 127; */ ipmpar = imach[*i]; return ipmpar; } #endif plink-1.07-src/lapackf.cpp0000644000265600020320000001060311264127626014603 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "plink.h" #include "helper.h" #ifdef WITH_LAPACK #include "lapackf.h" extern "C" int dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info); extern "C" int dsyevx_( char * , char * , char * , int * , double * , int * , double * , double * , int * , int * , double * , int * , double * , double * , int * , double * , int * , int * , int * , int * ) ; #endif bool svd_lapack(int n, vector_t & A, vector_t & S, matrix_t & V) { int m=n; vector_t U(n*n); vector_t tV(n*n); cout << "Using LAPACK SVD library function...\n"; #ifdef WITH_LAPACK int info=0; vector iwork(8*m,0); double optim_lwork; int lwork; lwork = -1; // Determine workspace needed dgesdd_("A", &m, &n, &A[0] , &m, &S[0], &U[0], &m, &tV[0], &n, &optim_lwork, &lwork, &iwork[0], &info); lwork = (int) optim_lwork; vector_t work( lwork, 0 ); // Perform actual SVD dgesdd_("A", &m, &n, &A[0] , &m, &S[0], &U[0], &m, &tV[0], &n, &work[0], &lwork, &iwork[0], &info); // Copy and transpose V int k = 0; for( int i = 0; i < n; i++ ) for( int j = 0; j < n; j++ ) { V[j][i] = tV[k]; ++k; } return true; #else // LAPACK support not compiled return false; #endif } bool eigen_lapack(int n, vector_t & A, vector_t & S, matrix_t & V) { // Use eigenvalue decomposition instead of SVD // Get only the highest eigen-values, (par::cluster_mds_dim) int i1 = n - par::cluster_mds_dim + 1; int i2 = n; double z = -1; // Integer workspace size, 5N vector iwork(5*n,0); double optim_lwork; int lwork = -1; int out_m; vector_t out_w( par::cluster_mds_dim , 0 ); vector_t out_z( n * par::cluster_mds_dim ,0 ); int ldz = n; vector ifail(n,0); int info=0; double nz = 0; // Get workspace dsyevx_("V" , // get eigenvalues and eigenvectors "I" , // get interval of selected eigenvalues "L" , // data stored as upper triangular &n , // order of matrix &A[0] , // input matrix &n , // LDA &nz , // Vlower &nz , // Vupper &i1, // from 1st ... &i2, // ... to nth eigenvalue &z , // 0 for ABSTOL &out_m, // # of eigenvalues found &out_w[0], // first M entries contain sorted eigen-values &out_z[0], // array (can be mxm? nxn) &ldz, // make n at first &optim_lwork, // Get optimal workspace &lwork, // size of workspace &iwork[0], // int workspace &ifail[0], // output: failed to converge &info ); // Assign workspace lwork = (int) optim_lwork; vector_t work( lwork, 0 ); dsyevx_("V" , // get eigenvalues and eigenvectors "I" , // get interval of selected eigenvalues "L" , // data stored as upper triangular &n , // order of matrix &A[0] , // input matrix &n , // LDA &nz , // Vlower &nz , // Vupper &i1, // from 1st ... &i2, // ... to nth eigenvalue &z , // 0 for ABSTOL &out_m, // # of eigenvalues found &out_w[0], // first M entries contain sorted eigen-values &out_z[0], // array (can be mxm? 
nxn) &ldz, // make n at first &work[0], // Workspace &lwork, // size of workspace &iwork[0], // int workspace &ifail[0], // output: failed to converge &info ); // Get eigenvalues, vectors for (int i=0; i< par::cluster_mds_dim; i++) S[i] = out_w[i]; for (int i=0; i #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "perm.h" #include "sets.h" #include "helper.h" #include "nlist.h" extern ofstream LOG; void Plink::printLOG(string s) { LOG << s; LOG.flush(); if (!par::silent) { cout << s; cout.flush(); } } void Plink::display_indivReport() { Individual * p1 = NULL; int i1; for (int i=0; ifid == par::indiv_report_fid && sample[i]->iid == par::indiv_report_iid ) { p1 = sample[i]; i1 = i; break; } } if ( p1 == NULL ) error("Problem finding individual indicated in --report\n"); printLOG("\nReport for individual [ " + p1->fid + " " + p1->iid + " ]\n\n"); for ( int l = 0 ; l < nl_all ; l++) { stringstream s2; s2 << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << genotype(*this,i1,l) << "\n"; printLOG( s2.str() ); } } void Plink::displayGenomePV() { ofstream PLO; string f = par::output_file_name + ".plink2"; PLO.open(f.c_str(),ios::out); printLOG("Writing genome-wide corrected PLINK results to [ " +f +" ] \n"); for (int i=0; i= original[i][j]) pv++; PLO << "G " << double(pv+1)/double(par::replicates+1) << "\n"; } } PLO.close(); } void Plink::display_pairList() { if (par::SNP_major) SNP2Ind(); string f = par::output_file_name + ".plist"; printLOG("Writing pair-list file to [ " + f + " ] \n"); ofstream PED(f.c_str(), ios::out); PED.clear(); PED.precision(4); // Find individuals Individual * p1 = NULL; Individual * p2 = NULL; int i1, i2; for (int i=0; ifid == par::plist_fid1 && sample[i]->iid == par::plist_iid1 ) { p1 = sample[i]; i1 = i; } if ( sample[i]->fid == par::plist_fid2 && sample[i]->iid == par::plist_iid2 ) { p2 = sample[i]; i2 = i; } } if (p1 == NULL || p2 == NULL) error("Problem finding individuals indicated in --plist\n"); PED << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(14) << "BP" << " " << setw(par::pp_maxfid+par::pp_maxiid) << (p1->fid+"/"+p1->iid) << " " << setw(par::pp_maxfid+par::pp_maxiid) << (p2->fid+"/"+p2->iid) << " " << setw(4) << "A1" << " " << setw(8) << "MAF" << " " << setw(4) << "IBS" << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(14) << locus[l]->bp << " " << setw(par::pp_maxfid+par::pp_maxiid) << genotype(*this,i1,l) << " " << setw(par::pp_maxfid+par::pp_maxiid) << genotype(*this,i2,l) << " " << setw(4) << locus[l]->allele1 << " " << setw(8) << locus[l]->freq << " "; bool a1 = p1->one[l]; bool a2 = p1->two[l]; bool b1 = p2->one[l]; bool b2 = p2->two[l]; bool ibs0 = false; bool ibs1 = false; bool miss = false; if ( a1 == a2 && b1 == b2 && a1 != b1 ) ibs0 = true; else if ( a1 && !(a2) ) miss = true; else if ( b1 && !(b2) ) miss = true; else if ( a1 != b1 || a2 != b2 ) ibs1 = true; PED << setw(4); if (ibs0) PED << "0" << "\n"; else if (ibs1) PED << "1" << "\n"; else if (miss) PED << "NA" << "\n"; else PED << "2" << "\n"; } PED.close(); } void Plink::display_listByAllele() { if (!par::SNP_major) Ind2SNP(); // Create an output file that lists one allele per line // N = number of individuals FID1 IID2 FID2 IID2 ... 
FIDN IIDN // SNP ALLELE N { Individual list ( 2N entries) } string f = par::output_file_name + ".list"; printLOG("Writing recoded list file to [ " + f + " ] \n"); ofstream PED(f.c_str(), ios::out); PED.clear(); vector::iterator s = SNP.begin(); vector::iterator loc = locus.begin(); while ( s != SNP.end() ) { // Genotype 11 PED << (*loc)->chr << par::recode_delimit << (*loc)->name << par::recode_delimit << (*loc)->allele1 << (*loc)->allele1 ; vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { if ( (!*i1) && (!*i2) ) PED << par::recode_delimit << (*gperson)->fid << par::recode_delimit << (*gperson)->iid ; i1++; i2++; gperson++; } PED << "\n"; // Genotype 12 PED << (*loc)->chr << par::recode_delimit << (*loc)->name << par::recode_delimit << (*loc)->allele1 << (*loc)->allele2 ; i1 = (*s)->one.begin(); i2 = (*s)->two.begin(); gperson = sample.begin(); while ( gperson != sample.end() ) { if ( (!*i1) && *i2 ) PED << par::recode_delimit << (*gperson)->fid << par::recode_delimit << (*gperson)->iid ; i1++; i2++; gperson++; } PED << "\n"; // Genotype 22 PED << (*loc)->chr << par::recode_delimit << (*loc)->name << par::recode_delimit << (*loc)->allele2 << (*loc)->allele2 ; i1 = (*s)->one.begin(); i2 = (*s)->two.begin(); gperson = sample.begin(); while ( gperson != sample.end() ) { if ( *i1 && *i2 ) PED << par::recode_delimit << (*gperson)->fid << par::recode_delimit << (*gperson)->iid ; i1++; i2++; gperson++; } PED << "\n"; // Genotype 00 PED << (*loc)->chr << par::recode_delimit << (*loc)->name << par::recode_delimit << "00"; i1 = (*s)->one.begin(); i2 = (*s)->two.begin(); gperson = sample.begin(); while ( gperson != sample.end() ) { if ( *i1 && !*i2 ) PED << par::recode_delimit << (*gperson)->fid << par::recode_delimit << (*gperson)->iid ; i1++; i2++; gperson++; } PED << "\n"; // Next SNP s++; loc++; } PED.close(); } void make2LTable(ofstream & TWOL, Plink & P, int m1, int m2, vector_t count, bool percent) { vector_t margA(4); vector_t margB(4); double total = 0; for (int i=0; i<16; i++) total += count[i]; if (percent) { for (int i=0; i<16; i++) count[i] /= total; total = 1; } margA[0] = count[0]+count[1]+count[2]+count[3]; margA[1] = count[4]+count[5]+count[6]+count[7]; margA[2] = count[8]+count[9]+count[10]+count[11]; margA[3] = count[12]+count[13]+count[14]+count[15]; margB[0] = count[0]+count[4]+count[8]+count[12]; margB[1] = count[1]+count[5]+count[9]+count[13]; margB[2] = count[2]+count[6]+count[10]+count[14]; margB[3] = count[3]+count[7]+count[11]+count[15]; TWOL << setw(par::pp_maxsnp) << " " << " " << setw(4) << " " << " " << " " << par::twolocus_snp2 << "\n"; TWOL << setw(par::pp_maxsnp) << " " << " " << setw(4) << " " << " " << setw(6) << (P.locus[m2]->allele1+"/"+P.locus[m2]->allele1) << " " << setw(6) << (P.locus[m2]->allele1+"/"+P.locus[m2]->allele2) << " " << setw(6) << (P.locus[m2]->allele2+"/"+P.locus[m2]->allele2) << " " << setw(6) << "0/0" << " " << setw(6) << "*/*" << "\n"; TWOL << setw(par::pp_maxsnp) << par::twolocus_snp1 << " " << setw(4) << (P.locus[m1]->allele1+"/"+P.locus[m1]->allele1) << " "; TWOL << setw(6) << count[0] << " " << setw(6) << count[1] << " " << setw(6) << count[2] << " " << setw(6) << count[3] << " " << setw(6) << margA[0] << "\n"; TWOL << setw(par::pp_maxsnp) << " " << " " << setw(4) << (P.locus[m1]->allele1+"/"+P.locus[m1]->allele2) << " "; TWOL << setw(6) << count[4] << " " << setw(6) << count[5] << " " << setw(6) << count[6] << " " << setw(6) << 
count[7] << " " << setw(6) << margA[1] << "\n"; TWOL << setw(par::pp_maxsnp) << " " << " " << setw(4) << (P.locus[m1]->allele2+"/"+P.locus[m1]->allele2) << " "; TWOL << setw(6) << count[8] << " " << setw(6) << count[9] << " " << setw(6) << count[10] << " " << setw(6) << count[11] << " " << setw(6) << margA[2] << "\n"; TWOL << setw(par::pp_maxsnp) << " " << " " << setw(4) << "0/0" << " "; TWOL << setw(6) << count[12] << " " << setw(6) << count[13] << " " << setw(6) << count[14] << " " << setw(6) << count[15] << " " << setw(6) << margA[3] << "\n"; TWOL << setw(par::pp_maxsnp) << " " << " " << setw(4) << "*/*" << " "; TWOL << setw(6) << margB[0] << " " << setw(6) << margB[1] << " " << setw(6) << margB[2] << " " << setw(6) << margB[3] << " " << setw(6) << total << "\n"; TWOL << "\n"; } void Plink::display_twolocus() { if (!par::SNP_major) Ind2SNP(); // Find the two SNPs // Write to file a contingency table, possibly stratified // by phenotype string f = par::output_file_name + ".twolocus"; printLOG("Writing two-locus table for " + par::twolocus_snp1 + " by " + par::twolocus_snp2 + " to [ " + f + " ] \n"); ofstream TWOL(f.c_str(), ios::out); TWOL.clear(); TWOL.precision(3); int m1 = getMarkerNumber(*this,par::twolocus_snp1); int m2 = getMarkerNumber(*this,par::twolocus_snp2); if (m1<0) error("Marker "+par::twolocus_snp1+" not found\n"); if (m2<0) error("Marker "+par::twolocus_snp2+" not found\n"); vector::iterator sa = SNP.begin()+m1; vector::iterator sb = SNP.begin()+m2; vector::iterator loc1 = locus.begin()+m1; vector::iterator loc2 = locus.begin()+m2; vector::iterator ia1 = (*sa)->one.begin(); vector::iterator ia2 = (*sa)->two.begin(); vector::iterator ib1 = (*sb)->one.begin(); vector::iterator ib2 = (*sb)->two.begin(); vector::iterator person = sample.begin(); int gtype = 0; vector_t c_all(16); vector_t c_case(16); vector_t c_control(16); while ( person != sample.end() ) { if ( (!*ia1) && (!*ia2) ) { if ( (!*ib1) && (!*ib2) ) gtype=0; else if ( (!*ib1) && *ib2 ) gtype=1; else if ( *ib1 && *ib2 ) gtype=2; else if ( *ib1 && (!*ib2) ) gtype=3; } else if ( (!*ia1) && *ia2 ) { if ( (!*ib1) && (!*ib2) ) gtype=4; else if ( (!*ib1) && *ib2 ) gtype=5; else if ( *ib1 && *ib2 ) gtype=6; else if ( *ib1 && (!*ib2) ) gtype=7; } else if ( *ia1 && *ia2 ) { if ( (!*ib1) && (!*ib2) ) gtype=8; else if ( (!*ib1) && *ib2 ) gtype=9; else if ( *ib1 && *ib2 ) gtype=10; else if ( *ib1 && (!*ib2) ) gtype=11; } else if ( *ia1 && (!*ia2) ) { if ( (!*ib1) && (!*ib2) ) gtype=12; else if ( (!*ib1) && *ib2 ) gtype=13; else if ( *ib1 && *ib2 ) gtype=14; else if ( *ib1 && (!*ib2) ) gtype=15; } c_all[gtype]++; if ( ! 
(*person)->missing ) { if ( (*person)->aff ) c_case[gtype]++; else c_control[gtype]++; } ia1++; ia2++; ib1++; ib2++; person++; } TWOL << "\nAll individuals\n===============\n"; make2LTable(TWOL,*this,m1,m2,c_all,false); TWOL.setf(ios::fixed); make2LTable(TWOL,*this,m1,m2,c_all,true); TWOL.unsetf(ios::fixed); TWOL << "\nCases\n=====\n"; make2LTable(TWOL,*this,m1,m2,c_case,false); TWOL.setf(ios::fixed); make2LTable(TWOL,*this,m1,m2,c_case,true); TWOL.unsetf(ios::fixed); TWOL << "\nControls\n========\n"; make2LTable(TWOL,*this,m1,m2,c_control,false); TWOL.setf(ios::fixed); make2LTable(TWOL,*this,m1,m2,c_control,true); TWOL.unsetf(ios::fixed); TWOL << "\n"; TWOL.close(); } void Plink::extractExcludeSet(bool exclude) { // Make map of locus name with 'l' number map mlocus; vector del(nl_all); for (int l=0;lname,l)); if (exclude) del[l] = false; // start off all included else del[l] = true; // start off all excluded } map::iterator ilocus; ////////////////////////////////// // Either extract a certain "GENE" if (par::dump_gene) { // Temporarily read sets -- these will get updated later readSet(); bool found_gene = false; int c=0; for (int s=0; s lst = nl.deparseStringList( par::snp_include_range, & mlocus ); for (int l=0; l slist; // Read list of ranges map > ranges; map > snp2range; makeScaffold( *this ); mapRangesToSNPs( filename , ranges, snp2range ); map >::iterator i1 = snp2range.begin(); while ( i1 != snp2range.end() ) { del[ i1->first ] = exclude ? true : false; slist.insert( i1->first ); ++i1; } if ( exclude ) printLOG("Excluding " + int2str( slist.size() ) + " SNPs\n"); else printLOG("Extracting " + int2str( slist.size() ) + " SNPs\n"); } ///////////////////////////////////// // OR get a list of SNPs from a file else { string filename = par::extract_file; if (exclude) filename = par::exclude_file; checkFileExists(filename); ifstream INFILE(filename.c_str(),ios::in); INFILE.clear(); printLOG("Reading list of SNPs "); if (exclude) printLOG("to exclude [ " + par::exclude_file + " ] ... "); else printLOG("to extract [ " + par::extract_file + " ] ... "); int c=0; while (!INFILE.eof()) { string m; INFILE >> m; if (m=="") continue; ilocus = mlocus.find(m); if (ilocus != mlocus.end()) { if (exclude) del[ilocus->second] = true; else del[ilocus->second] = false; c++; } } INFILE.close(); printLOG(int2str(c)+" read\n"); } //////////////////////////////////////// // Remove selected loci from locus list, deleteSNPs(del); } void Plink::removeIndividuals(bool keep) { ////////////////////////////////// // Make map of individuals FID/IID map mperson; map::iterator iperson; vector del(n); for (int i=0;ifid+"_"+sample[i]->iid,i)); if (!keep) del[i] = false; // start off all included else del[i] = true; // start off all excluded } /////////////////////////// // Read list of individuals string filename = par::remove_indiv_list; if (keep) filename = par::keep_indiv_list; checkFileExists(filename); ifstream INFILE(filename.c_str(),ios::in); INFILE.clear(); printLOG("Reading individuals "); if (keep) printLOG("to keep [ "+par::keep_indiv_list + " ] ... "); else printLOG( "to remove [ "+par::remove_indiv_list + " ] ... 
"); int c=0; while (!INFILE.eof()) { vector s = tokenizeLine( INFILE ); if ( s.size() == 1 ) error("Problem with line:\n"+s[0]); else if (s.size() == 0 ) continue; string fid = s[0]; string iid = s[1]; iperson = mperson.find(fid+"_"+iid); if (iperson != mperson.end()) { if (!keep) del[iperson->second] = true; else del[iperson->second] = false; c++; } } INFILE.close(); printLOG(int2str(c)+" read\n"); //////////////////////////////////// // Remove individuals as appropriate int n_removed = deleteIndividuals(del); printLOG(int2str(n_removed)+" individuals removed with "); if (keep) printLOG("--keep option\n"); else printLOG("--remove option\n"); } void Plink::keep2SetsForGenome() { ////////////////////////////////// // Make map of individuals FID/IID map mperson; map::iterator iperson; vector del(n); for (int i=0;ifid+"_"+sample[i]->iid,i)); del[i] = true; // start off all excluded } /////////////////////////// // Read list of individuals checkFileExists(par::genome_setlist1); checkFileExists(par::genome_setlist2); ifstream INFILE1(par::genome_setlist1.c_str(),ios::in); INFILE1.clear(); ifstream INFILE2(par::genome_setlist2.c_str(),ios::in); INFILE2.clear(); int c=0; while (!INFILE1.eof()) { string fid,iid; INFILE1 >> fid >> iid; if (fid=="" || iid=="") continue; iperson = mperson.find(fid+"_"+iid); if (iperson != mperson.end()) { del[iperson->second] = false; gset1.insert(sample[iperson->second]); c++; } } INFILE1.close(); printLOG(int2str(c)+" read from [ "+par::genome_setlist1 +" ]\n"); c=0; while (!INFILE2.eof()) { string fid,iid; INFILE2 >> fid >> iid; if (fid=="" || iid=="") continue; iperson = mperson.find(fid+"_"+iid); if (iperson != mperson.end()) { del[iperson->second] = false; gset2.insert(sample[iperson->second]); c++; } } INFILE2.close(); printLOG(int2str(c)+" read from [ "+par::genome_setlist2 +" ]\n"); //////////////////////////////////// // Remove individuals as appropriate int n_removed = deleteIndividuals(del); } void Plink::zeroOnCluster() { // Make map of locus name with 'l' number map mlocus; vector del(nl_all); for (int l=0;lname,l)); map::iterator ilocus; /////////////////////////////////////////// // Get a list of SNPs/clusters from a file string filename = par::zero_cluster_filename; checkFileExists(filename); ifstream INFILE(filename.c_str(),ios::in); INFILE.clear(); printLOG("Reading list of SNP/clusters to zero out [ "); printLOG(par::zero_cluster_filename + " ]\n"); int c=0; while (!INFILE.eof()) { string m; string k; INFILE >> m >> k; if (m=="") continue; ilocus = mlocus.find(m); if (ilocus != mlocus.end()) { int l = ilocus->second; map::iterator ki = kmap.find(k); if ( ki == kmap.end() ) { continue; // cluster does not exist //printLOG("Cluster [ "+k+" ] not found \n"); } else { int k2 = ki->second; // Zero out the following for (int i=0; isol == k2 ) { if ( par::SNP_major ) { SNP[l]->one[i] = true; SNP[l]->two[i] = false; } else { sample[i]->one[l] = true; sample[i]->two[l] = false; } c++; } } } } } INFILE.close(); printLOG(int2str(c)+" genotypes zeroed\n"); } void Plink::setObligMissing() { // Temporaily use the cluster (sol) variables to store information // about oblig missing (i.e. 
call this *before* we read any // subsequent cluster variables printLOG("Reading list of SNP/clusters that are obligatory missing [ "); printLOG(par::oblig_missing_filename + " ]\n"); printLOG("Reading the clusters that define obligatory missingness [ "); printLOG(par::oblig_clusters_filename + " ]\n"); string stored_name = par::include_cluster_filename; par::include_cluster_filename = par::oblig_clusters_filename; readClusterFile(); par::include_cluster_filename = stored_name; // Make map of locus name with 'l' number map mlocus; vector del(nl_all); for (int l=0;lname,l)); map::iterator ilocus; /////////////////////////////////////////// // Get a list of SNPs/clusters from a file string filename = par::oblig_missing_filename; checkFileExists(filename); ifstream INFILE(filename.c_str(),ios::in); INFILE.clear(); int c=0; while (!INFILE.eof()) { string m; string k; INFILE >> m >> k; if (m=="") continue; ilocus = mlocus.find(m); if (ilocus != mlocus.end()) { int l = ilocus->second; map::iterator ki = kmap.find(k); if ( ki == kmap.end() ) continue; // cluster does not exist else { int2 p; p.p1 = l; p.p2 = ki->second; oblig_missing.insert(p); ++c; } } } INFILE.close(); printLOG(int2str(c)+" SNP/cluster combinations set as obligatory missing\n"); } void Plink::display_recoded_PEDFILE() { string f = par::output_file_name + ".ped"; printLOG("Writing recoded ped file to [ " + f + " ] \n"); ofstream PED(f.c_str(), ios::out); PED.clear(); string missingCode = par::out_missing_phenotype; if ( par::recode_HV ) missingCode = "0"; for (int i=0;ifid << par::recode_delimit << person->iid << par::recode_delimit << person->pat << par::recode_delimit << person->mat << par::recode_delimit << person->sexcode << par::recode_delimit; if ( person->missing ) { PED << missingCode; } else { if (par::bt) PED << (int)person->phenotype; else PED << person->phenotype; } for (int l=0;lchr; if ( par::recode_HV || par::recode_whap ) { if ( nl_all > 5000 ) printLOG(" *** WARNING : you are exporting a large number of SNPs (>5000) for\n" " input into Haploview/WHAP -- make sure you have adequate\n" " system resources to handle this file\n\n"); } if ( par::recode_HV) for (int l=0;lchr != HV_chr ) HV_okay = false; MAP << locus[l]->name << "\t" << locus[l]->bp << "\n"; } else if ( par::recode_whap ) { for (int l=0;lchr << "\t" << locus[l]->name << "\t" << locus[l]->bp << "\n"; } } else for (int l=0;lchr << "\t" << locus[l]->name << "\t" << locus[l]->pos << "\t" << locus[l]->bp << "\n"; } MAP.close(); if ( par::recode_HV ) { if ( ! HV_okay ) printLOG(" *** WARNING : you've created a Haploview file containing\n" " *** SNPs from more than 1 chromosome -- these will\n" " *** not be read properly in Haploview\n\n"); } if ( par::recode_whap ) { // WHAP also needs a DAT file string f = par::output_file_name + ".dat"; printLOG("Writing whap-format DAT file [ " + f + " ] \n"); ofstream DAT(f.c_str(), ios::out); DAT.clear(); if ( par::qt ) DAT << "T trait\n"; else if ( par::coding01 ) DAT << "B trait\n"; else DAT << "A trait\n"; for (int l=0;lname << "\n"; DAT.close(); } } void Plink::display_recoded_MUTLIST() { string f = par::output_file_name + ".rlist"; printLOG("Writing rare-list file to [ " + f + " ] \n"); ofstream PED(f.c_str(), ios::out); for (int l=0;l het; set hom; set missing; for (int i=0;ione[i] : person->one[l]; bool a2 = par::SNP_major ? SNP[l]->two[i] : person->two[l]; if ( (!a1) && (!a2) ) hom.insert(i); else if ( (!a1) && a2 ) het.insert(i); else if ( a1 && ! 
a2 ) missing.insert(i); } // Hets if ( het.size() > 0 ) { PED << locus[l]->name << par::recode_delimit << "HET" << par::recode_delimit << locus[l]->allele1 << par::recode_indelimit << locus[l]->allele2; set::iterator j = het.begin(); while ( j != het.end() ) { PED << par::recode_delimit << sample[*j]->fid << par::recode_delimit << sample[*j]->iid; ++j; } PED << "\n"; } // Homozygotes if ( hom.size() > 0 ) { PED << locus[l]->name << par::recode_delimit << "HOM" << par::recode_delimit << locus[l]->allele1 << par::recode_indelimit << locus[l]->allele1; set::iterator j = hom.begin(); while ( j != hom.end() ) { PED << par::recode_delimit << sample[*j]->fid << par::recode_delimit << sample[*j]->iid; ++j; } PED << "\n"; } // Missing genotypes if ( missing.size() > 0 ) { PED << locus[l]->name << par::recode_delimit << "NIL" << par::recode_delimit << par::missing_genotype << par::recode_indelimit << par::missing_genotype; set::iterator j = missing.begin(); while ( j != missing.end() ) { PED << par::recode_delimit << sample[*j]->fid << par::recode_delimit << sample[*j]->iid; ++j; } PED << "\n"; } } PED.close(); // And a corresponding MAP file f = par::output_file_name + ".map"; printLOG( "Writing new map file to [ " + f + " ] \n"); ofstream MAP(f.c_str(), ios::out); for (int l=0;lchr << "\t" << locus[l]->name << "\t" << locus[l]->pos << "\t" << locus[l]->bp << "\n"; } MAP.close(); printLOG( "Writing pedigree information to [ " + par::output_file_name + ".fam ] \n"); ofstream FAM((par::output_file_name+".fam").c_str(), ios::out); for (int i=0;ifid << " " << person->iid << " " << person->pat << " " << person->mat << " " << person->sexcode << " "; if ( person->missing ) FAM << par::out_missing_phenotype << "\n"; else { if (par::bt) FAM << (int)person->phenotype << "\n"; else FAM << person->phenotype << "\n"; } } FAM.clear(); FAM.close(); } void Plink::display_recoded_LONG() { string f = par::output_file_name + ".lgen"; printLOG("Writing recoded LGEN file to [ " + f + " ] \n"); ofstream PED(f.c_str(), ios::out); string missingCode = par::out_missing_phenotype; for (int l=0;lone[i] : person->one[l]; bool a2 = par::SNP_major ? SNP[l]->two[i] : person->two[l]; if ( a1 && a2 ) continue; } PED << person->fid << par::recode_delimit << person->iid << par::recode_delimit << locus[l]->name << par::recode_delimit << genotypeToFile( *this , i, l ) << "\n"; } PED.close(); // And a corresponding MAP file f = par::output_file_name + ".map"; printLOG( "Writing new map file to [ " + f + " ] \n"); ofstream MAP(f.c_str(), ios::out); for (int l=0;lchr << "\t" << locus[l]->name << "\t" << locus[l]->pos << "\t" << locus[l]->bp << "\n"; } MAP.close(); ////////////////////////////////////////// // And a corresponding reference file? if ( par::recode_long_ref ) { f = par::output_file_name + ".ref"; printLOG( "Writing new reference file to [ " + f + " ] \n"); ofstream MAP(f.c_str(), ios::out); for (int l=0;lallele1 == par::missing_genotype && locus[l]->allele2 == par::missing_genotype ) continue; MAP << locus[l]->name; if ( locus[l]->allele2 != par::missing_genotype ) MAP << " " << locus[l]->allele2; if ( locus[l]->allele1 != par::missing_genotype ) MAP << " " << locus[l]->allele1; MAP << "\n"; } MAP.close(); } ////////////////////////////////////////// // And a corresponding FAM file? 
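  // Illustrative note (the example row is made up): the long-format export
  // above writes one genotype per line to the .lgen file as
  //
  //    FID IID SNP GENOTYPE      e.g.   fam1  ind1  rs1001  A C
  //
  // together with a .map file (CHR, SNP, genetic position, BP) and, when
  // par::recode_long_ref is set, a .ref file listing each SNP followed by its
  // non-missing allele codes. The block below adds the matching .fam file
  // with the six standard pedigree columns (FID, IID, PAT, MAT, SEX,
  // PHENOTYPE).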
printLOG( "Writing pedigree information to [ " + par::output_file_name + ".fam ] \n"); ofstream FAM((par::output_file_name+".fam").c_str(), ios::out); for (int i=0;ifid << " " << person->iid << " " << person->pat << " " << person->mat << " " << person->sexcode << " "; if ( person->missing ) FAM << par::out_missing_phenotype << "\n"; else { if (par::bt) FAM << (int)person->phenotype << "\n"; else FAM << person->phenotype << "\n"; } } FAM.clear(); FAM.close(); } void Plink::output_fastphase_format() { string f = par::output_file_name + ".recode.phase.inp"; printLOG("Writing fastphase-format file to [ " + f + " ] \n"); ofstream OUT(f.c_str(), ios::out); OUT.clear(); // Sample size and number of SNPs OUT << n << "\n" << nl_all << "\nP "; // Positions bool output_okay = true; int output_chr = locus[0]->chr; for (int l=0;lchr != output_chr ) output_okay = false; OUT << locus[l]->bp << " "; } OUT << "\n"; if ( ! output_okay ) printLOG(" *** WARNING : you've created a fastphase file containing\n" " *** SNPs from more than 1 chromosome -- these will\n" " *** not be read properly in fastphase\n\n"); for (int i=0;iiid << "\n"; for (int l=0;lone[i] : person->one[l]; bool a2 = par::SNP_major ? SNP[l]->two[i] : person->two[l]; if ( ! a1 ) OUT << "0"; else { if ( a2 ) OUT << "1"; else OUT << "?"; } } OUT << "\n"; // Second allele for (int l=0;lone[i] : person->one[l]; bool a2 = par::SNP_major ? SNP[l]->two[i] : person->two[l]; if ( ! a1 ) { if ( a2 ) OUT << "1"; else OUT << "0"; } else { if ( a2 ) OUT << "1"; else OUT << "?"; } } // Next individual OUT << "\n"; } OUT.close(); } void Plink::output_bimbam_format() { // One file of SNP locations (SNP BP) *.pos.txt // One file of phenotypes (column 6 from FAM) // One file containing individual IDs and genotypes (transposed format) // N_ind N_snp // "IND,"IID // 'SNP Name' genotypes... // Position file string f = par::output_file_name + ".recode.pos.txt"; printLOG("Writing BIMBAM position file to [ " + f + " ] \n"); ofstream POS(f.c_str(), ios::out); for (int l=0;lname << " " << locus[l]->bp << "\n"; POS.close(); // Phenotype information f = par::output_file_name + ".recode.pheno.txt"; printLOG("Writing BIMBAM phenotype file to [ " + f + " ] \n"); ofstream PHE(f.c_str(), ios::out); if (par::bt) for (int i=0;iphenotype << "\n"; else for (int i=0;iphenotype << "\n"; PHE.close(); // Genotype file f = par::output_file_name + ".recode.geno.txt"; printLOG("Writing BIMBAM genotype file to [ " + f + " ] \n"); ofstream GEN(f.c_str(), ios::out); par::missing_genotype = "?"; par::recode_delimit = ","; par::recode_indelimit = ""; GEN << n << "\n" << nl_all << "\n" << "IND"; for (int i=0;iiid; GEN << "\n"; for (int l=0;lname; for (int i=0;iname << " "; GEN << "\n"; for (int l=0;lchr != locus[l]->chr ? 
-1 : locus[l]->bp - locus[l-1]->bp ) << " "; GEN << "\n"; par::missing_genotype = "0"; par::recode_delimit = " "; par::recode_indelimit = " "; par::recode_12 = true; map fmap; int cnt = 0; for (int i=0;ifid ) == fmap.end() ) fmap.insert(make_pair(person->fid,++cnt)); } for (int i=0;iiid << " " << fmap.find(sample[i]->fid)->second; for (int l=0;lchr << par::recode_delimit << locus[l]->name << par::recode_delimit << locus[l]->pos << par::recode_delimit << locus[l]->bp; for (int i=0;ifid << par::recode_delimit << person->iid << par::recode_delimit << person->pat << par::recode_delimit << person->mat << par::recode_delimit << person->sexcode << par::recode_delimit; if ( person->missing ) FAM << par::out_missing_phenotype; else { if (par::bt) FAM << (int)person->phenotype; else FAM << person->phenotype; } FAM << "\n"; } FAM.close(); } void Plink::display_recoded_PEDFILE_AD() { string f = par::output_file_name + ".raw"; printLOG( "Writing recoded file to [ " + f + " ] \n"); ofstream PED(f.c_str(), ios::out); PED.clear(); map amap; if ( par::recode_allele_coding ) { string f = par::recode_allele_coding_file; printLOG( "Reading allele coding list from [ " + f + " ] \n"); ifstream AMAP; checkFileExists(f); AMAP.open(f.c_str(), ios::in); while ( ! AMAP.eof() ) { string snp, allele; AMAP >> snp >> allele; if (snp=="") break; amap.insert(make_pair(snp,allele)); } printLOG("Read allele codes for " + int2str( amap.size() ) + " SNPs\n"); AMAP.close(); } // Header row PED << "FID" << par::recode_delimit << "IID" << par::recode_delimit << "PAT" << par::recode_delimit << "MAT" << par::recode_delimit << "SEX" << par::recode_delimit << "PHENOTYPE"; for (int l=0;lallele1; if ( par::recode_allele_coding ) { map::iterator a = amap.find(locus[l]->name); if ( a != amap.end() ) aname = a->second; } PED << par::recode_delimit << locus[l]->name+"_"+aname; if ( ! par::recode_AD_Aonly ) PED << par::recode_delimit << locus[l]->name+"_HET"; } PED << "\n"; for (int i=0;ifid << par::recode_delimit << person->iid << par::recode_delimit << person->pat << par::recode_delimit << person->mat << par::recode_delimit << person->sexcode; if ( person->missing ) PED << par::recode_delimit << par::out_missing_phenotype; else { if (par::bt) PED << par::recode_delimit << (int)person->phenotype; else PED << par::recode_delimit << person->phenotype; } string g0 = par::recode_AD_Aonly ? par::recode_delimit + "2" : par::recode_delimit + "2" + par::recode_delimit + "0"; string g1 = par::recode_AD_Aonly ? par::recode_delimit + "1" : par::recode_delimit + "1" + par::recode_delimit + "1"; string g2 = par::recode_AD_Aonly ? par::recode_delimit + "0" : par::recode_delimit + "0" + par::recode_delimit + "0"; string gX = par::recode_AD_Aonly ? par::recode_delimit + "NA" : par::recode_delimit + "NA" + par::recode_delimit + "NA"; for (int l=0;lone[i] : sample[i]->one[l]; bool a2 = par::SNP_major ? 
SNP[l]->two[i] : sample[i]->two[l]; if (par::recode_AD_fixed && locus[l]->allele1=="1") { if ( (!a1) && (!a2) ) PED << g2; else if ( (!a1) && a2) PED << g1; else if ( a1 && a2 ) PED << g0; else PED << gX; } else if ( par::recode_allele_coding ) { Locus * loc = locus[l]; map::iterator a = amap.find(loc->name); bool flip = false; if ( a != amap.end() ) { if ( a->second == loc->allele2 ) flip = true; } if ( flip ) { if ( (!a1) && (!a2) ) PED << g2; else if ( (!a1) && a2) PED << g1; else if ( a1 && a2 ) PED << g0; else PED << gX; } else { if ( (!a1) && (!a2) ) PED << g0; else if ( (!a1) && a2) PED << g1; else if ( a1 && a2 ) PED << g2; else PED << gX; } } else { if ( (!a1) && (!a2) ) PED << g0; else if ( (!a1) && a2) PED << g1; else if ( a1 && a2 ) PED << g2; else PED << gX; } } PED << "\n"; } PED.close(); } void Plink::write_covariates() { printLOG( "Writing covariate information to [ " + par::output_file_name + ".cov ] \n"); ofstream COV((par::output_file_name+".cov").c_str(), ios::out); // First, make dummy-variables for any multi-category variables; up // to a limit (10 levels) vector downcoding_level(par::clist_number); vector< map > levels(par::clist_number); vector< map > backcode(par::clist_number); if ( par::dump_covar_dummy_coding ) { for (int c=0; cclistMissing[c] ) { int covariate = (int)sample[i]->clist[c]; if ( levels[c].find( covariate ) == levels[c].end() ) { int t = levels[c].size(); levels[c].insert( make_pair(covariate, t)); backcode[c].insert( make_pair(t,covariate)); } } } if ( levels[c].size() > 2 && levels[c].size() < 50 ) downcoding_level[c] = levels[c].size(); else downcoding_level[c] = 0; } } // Always write a header row COV << "FID IID "; if (par::dump_covar_with_phenotype) COV << "PAT MAT SEX PHENOTYPE "; // Covariate names in header row (possibly with dummy-variable // downcoding) if ( par::dump_covar_dummy_coding ) { for (int c=0; csecond << " "; } } } else for (int c=0; cfid << " " << person->iid << " "; if (par::dump_covar_with_phenotype) { COV << person->pat << " " << person->mat << " " << person->sex << " "; if ( person->missing ) COV << par::out_missing_phenotype << " "; else COV << person->phenotype << " "; } if ( par::dump_covar_dummy_coding ) { for (int c=0; cclistMissing[c] ) COV << par::out_missing_phenotype << " "; else COV << person->clist[c] << " "; } else { for (int d=1; dclistMissing[c] ) COV << par::out_missing_phenotype << " "; else { if ( levels[c].find( (int)person->clist[c] )->second == d ) COV << "1 "; else COV << "0 "; } } } } } else { for (int c=0; cclist[c] << " "; } COV << "\n"; } COV.close(); } void Plink::write_clusters() { printLOG( "Writing cluster information to [ " + par::output_file_name + ".clst ] \n"); ofstream COV((par::output_file_name+".clst").c_str(), ios::out); // Do not write a header row // COV << "FID IID CLST"; // COV << "\n"; // Output values for each individual for (int i=0;ifid << " " << person->iid << " "; if ( person->sol == -1 ) COV << "NA\n"; else COV << kname[ person->sol ] << "\n"; } COV.close(); } void Plink::write_snplist() { printLOG( "Writing list of SNPs to [ " + par::output_file_name + ".snplist ] \n"); ofstream SL((par::output_file_name+".snplist").c_str(), ios::out); for (int l=0;lname << "\n"; SL.close(); } void Plink::write_BITFILE() { printLOG( "Writing pedigree information to [ " + par::output_file_name + ".fam ] \n"); ofstream BIT((par::output_file_name+".fam").c_str(), ios::out); // For each individual for (int i=0;ifid << " " << person->iid << " " << person->pat << " " << person->mat << " " << 
person->sexcode << " "; if ( person->missing ) BIT << par::out_missing_phenotype << "\n"; else { if (par::bt) BIT << (int)person->phenotype << "\n"; else BIT << person->phenotype << "\n"; } } BIT.clear(); BIT.close(); printLOG( "Writing map (extended format) information to [ " + par::output_file_name + ".bim ] \n"); BIT.open((par::output_file_name+".bim").c_str(), ios::out); for (int l=0;lallele1=="") { if (locus[l]->allele2!="0") locus[l]->allele1="0"; else locus[l]->allele1="X"; } if (locus[l]->allele2=="") locus[l]->allele2="0"; BIT << locus[l]->chr << "\t" << locus[l]->name << "\t" << locus[l]->pos << "\t" << locus[l]->bp << "\t" << locus[l]->allele1 << "\t" << locus[l]->allele2 << "\n"; } BIT.clear(); BIT.close(); ////////////////////////////////////// // Save genotype data in BITFILE format printLOG("Writing genotype bitfile to [ " + par::output_file_name + ".bed ] \n"); BIT.open((par::output_file_name+".bed").c_str(), ios::out | ios::binary); if (par::out_SNP_major) printLOG("Using (default) SNP-major mode\n"); else printLOG("Using individual-major mode\n"); bitset<8> b; char ch[1]; // Magic numbers for .bed file: 00110110 11011000 = v1.00 bed file b.reset(); b.set(2); b.set(3); b.set(5); b.set(6); ch[0] = (char)b.to_ulong(); BIT.write(ch,1); b.reset(); b.set(0); b.set(1); b.set(3); b.set(4); ch[0] = (char)b.to_ulong(); BIT.write(ch,1); // BIT represents status of SNP-major (true) or Ind-major (false) b.reset(); if (par::out_SNP_major) b.set(0); ch[0] = (char)b.to_ulong(); BIT.write(ch,1); ////////////////////////////////////////// // Now consider genotypes: SNP-major mode if (par::out_SNP_major) { // Create the SNP-major dataset, if need be if (!par::SNP_major) Ind2SNP(); vector::iterator s = SNP.begin(); // Outer loop over SNPs while ( s != SNP.end() ) { vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); // vector::iterator person = sample.begin(); // Inner loop over individuals while ( i1 != (*s)->one.end() ) { bitset<8> b; b.reset(); int c=0; while (c<8 && i1 != (*s)->one.end() ) { if ( *(i1) ) b.set(c); i1++; c++; if ( *(i2) ) b.set(c); i2++; c++; //person++; } char ch[1]; ch[0] = (char)b.to_ulong(); BIT.write(ch,1); } // next SNP s++; } } //////////////////////////////////////////////////////////// // Alternatively, consider genotypes: Individual-major mode else { // Create the individual-major dataset, if need be if (par::SNP_major) SNP2Ind(); vector::iterator person = sample.begin(); // Outer loop over individuals while ( person != sample.end() ) { vector::iterator i1 = (*person)->one.begin(); vector::iterator i2 = (*person)->two.begin(); // Inner loop over SNPs while ( i1 != (*person)->one.end() ) { bitset<8> b; b.reset(); int c=0; while (c<8 && i1 != (*person)->one.end() ) { if ( *(i1) ) b.set(c); i1++; c++; if ( *(i2) ) b.set(c); i2++; c++; } char ch[1]; ch[0] = (char)b.to_ulong(); BIT.write(ch,1); } // next person person++; } } BIT.close(); } void Plink::outputSetFile() { if ( par::make_set_complement ) { if ( !par::make_set_ignore_group ) printLOG("Making sets of SNPs not in each group\n"); else printLOG("Making a set of all SNPs not in region(s)\n"); } else if ( par::make_set_collapse ) { if ( !par::make_set_ignore_group ) printLOG("Collapsing all regions into groups\n"); else printLOG("Collapsing all regions into a single set\n"); } // We need a scaffold in place now, for range lookup makeScaffold(*this); map > ranges = readRange( par::make_set_file ); map >::iterator r = ranges.begin(); int eset = 0 , fset = 0; // Clear set storage for 
(int i=0; i > inSet; if ( par::make_set_complement || par::make_set_collapse ) { // Collapse to a single group... if ( par::make_set_ignore_group ) { setname.resize(1); inSet.resize(1); setname[0] = par::make_set_collapse_label; } else { // ...or collapse to "group" label level? setname.resize( Range::groupNames.size() ); inSet.resize( Range::groupNames.size() ); map::iterator i1 = Range::groupNames.begin(); while ( i1 != Range::groupNames.end() ) { setname[i1->second] = par::make_set_collapse ? i1->first : "C_" + i1->first; ++i1; } } } // Loop over all ranges int cnt = -1; while ( r != ranges.end() ) { set * theseRanges = &( r->second ); // Assume a single, unqiue named range (and so set-size=1) set::iterator thisRangeSet = theseRanges->begin(); const Range * thisRange = &(*thisRangeSet); // Get group name/label if ( ! ( par::make_set_complement || par::make_set_collapse ) ) { set t; inSet.push_back(t); ++cnt; setname.push_back( thisRange->name ); } else { // Collapsing, but by gorup if ( ! par::make_set_ignore_group ) { cnt = thisRange->group; } } // Consider all inner ranges in this set while ( thisRangeSet != theseRanges->end() ) { const Range * thisRange = &(*thisRangeSet); // Assign SNPs int2 srange = mapSNPs2Range(*this, thisRange); // Write SNPs if ( srange.p1 != -1 ) { if ( par::make_set_complement || par::make_set_collapse ) { if ( par::make_set_ignore_group ) for ( int l = srange.p1; l<= srange.p2; l++) inSet[0].insert(l); else for ( int l = srange.p1; l<= srange.p2; l++) inSet[cnt].insert(l); } else { for ( int l = srange.p1; l<= srange.p2; l++) inSet[cnt].insert(l); } ++fset; } else ++eset; // Next range in this set ++thisRangeSet; } // Next set of ranges ++r; } printLOG("Found " + int2str( fset ) + " ranges with at least 1 SNP; " + int2str(eset) + " empty ranges\n"); // Copy SNPs into sets snpset.resize( inSet.size() ); for (int j=0; j & thisSet = inSet[j]; for (int l=0; l::iterator i = thisSet.find(l); if ( i == thisSet.end() ) { snpset[j].push_back( l ); } } } else { set::iterator i = inSet[j].begin(); while ( i != inSet[j].end() ) { snpset[j].push_back( *i ); ++i; } } } } void Plink::setTable() { // Write SNPs out, scoring with sets (0/1) printLOG("Writing set in table form to [ " + par::output_file_name + ".set.table ]\n"); ofstream O( ( par::output_file_name+".set.table" ).c_str() , ios::out ); O << "SNP\tCHR\tBP"; for (int i=0; i < setname.size();i++) O << "\t" << setname[i]; O << "\n"; pS->initialiseSetMapping(); for (int l=0; lname << "\t" << locus[l]->chr << "\t" << locus[l]->bp << "\t"; for (int i=0;i >::iterator si = pS->setMapping.find(l); set::iterator si2 = si->second.find(i); if ( si2 == si->second.end() ) O << "\t" << "0"; else O << "\t" << "1"; } O << "\n"; } } void Plink::writeSetFile() { // Write SNPs out, scoring with sets (0/1) printLOG("Writing set file to [ " + par::output_file_name + ".set ]\n"); ofstream O( ( par::output_file_name+".set" ).c_str() , ios::out ); for (int j=0; jname << "\n"; O << "END\n\n"; } O.close(); } plink-1.07-src/clumpld.cpp0000644000265600020320000006706711264127625014661 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "clumpld.h" #include "phase.h" #include "options.h" #include "plink.h" ////////////////////////////////////////////////// // Set user-defined parameters within constructor clump_LD::clump_LD(Plink * pp, HaploPhase * hp_, double sig, double dist, double secondp, float r2c) { P = pp; hp = hp_; pval_cutoff = sig; ld_distance = dist; second_pval_cutoff = secondp; r2_cutoff = r2c; } ///////////// // helper functions string returnFullRangeList(Range & r1, map > & ranges, bool verbose); ///////////// // accessors void clump_LD::set_pval( double sig ){ pval_cutoff = sig; } void clump_LD::set_second_pval( double secondp ){ second_pval_cutoff = secondp; } void clump_LD::set_ld( double dist ){ ld_distance = dist; } void clump_LD::set_r2( double r2c ){ r2_cutoff = r2c; } ////////////////////////////////////////////////// // read association file and pull out significant // results sort by pvalue vector clump_LD::read_assoc_file(string fileList) { vector sp; // We may be reading a single file, or more than one file fileList = searchAndReplace(fileList,","," "); // Tokenize filename.clear(); string buf; stringstream ss(fileList); while (ss >> buf) filename.push_back(buf); // Read each file for (int f=0; f annot_field; // Get header row char cline[par::MAX_LINE_LENGTH]; RESIN.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline=="") error("Problem reading [ " + par::clumpld_results + " ]\n"); vector tok_annot; if ( par::clumpld_annot ) { string afields = searchAndReplace(par::clumpld_annot_fields,","," "); string buf; stringstream ss(afields); while (ss >> buf) tok_annot.push_back(buf); } string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); for (int i=0; iprintLOG("Reading results for clumping from [ " + filename[f] + " ]\n"); P->printLOG("Extracting fields SNP and " + par::clumpld_column + "\n"); while (!RESIN.eof()) { char cline[par::MAX_LINE_LENGTH]; RESIN.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if ( tokens.size() <= snp_column || tokens.size() <= pval_column ) continue; ResultTrio pt; // Keep track of which file this is from pt.f = f+1; if ( ! 
from_string( pt.p, tokens[pval_column] , std::dec)) continue; pt.s = tokens[snp_column]; // Create clumped vector, based just on SNP name if( pt.p <= pval_cutoff ) clumped.insert(make_pair(pt.s,false)); ClumpPair cp; cp.snp = pt.s; cp.f = f+1; ClumpResults cr; cr.p = pt.p; cr.annot = ""; if ( par::clumpld_annot ) { for (int f=0; fprintLOG("\nParameters for --clump:\n"); P->printLOG(" p-value threshold for index SNPs = " + dbl2str(pval_cutoff) + "\n"); P->printLOG(" Physical (kb) threshold for clumping = " + dbl2str(ld_distance/1000.0) + "\n"); P->printLOG(" LD (r-squared) threshold for clumping = " + dbl2str(r2_cutoff) + "\n"); P->printLOG(" p-value threshold for clumped SNPs = " + dbl2str(second_pval_cutoff) + "\n\n"); if ( par::clumpld_annot ) P->printLOG("Including annotation fields: " + par::clumpld_annot_fields +"\n"); vector sp = read_assoc_file( par::clumpld_results ); if ( par::clumpld_index1 ) P->printLOG("Indexing only on [ " + filename[0] + " ]\n"); else P->printLOG("Indexing on all files\n"); if ( par::clumpld_only_show_replications ) P->printLOG("Only showing cross-file clumps\n"); if ( par::clumpld_only_show_replications_list ) P->printLOG("Only showing non-index clumped SNPs\n"); int zero, one, two, three, four; map mlocus; map grouped_snps; ofstream CLMP; CLMP.open( (par::output_file_name + ".clumped").c_str() , ios::out); CLMP.precision(3); P->printLOG("Writing clumped results file to [ " + par::output_file_name + ".clumped ]\n"); ofstream CLMP2; if ( par::clumpld_range_annotate && ! par::clumpld_verbose ) { P->printLOG("Writing clumped ranges file to [ " + par::output_file_name + ".clumped.ranges ]\n"); CLMP2.open( (par::output_file_name + ".clumped.ranges").c_str() , ios::out); CLMP2 << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "P" << " " << setw(6) << "N" << " " << setw(28) << "POS" << " " << setw(10) << "KB" << " " << "RANGES" << "\n"; } ofstream BEST; if ( par::clumpld_best ) { BEST.open( (par::output_file_name + ".clumped.best").c_str() , ios::out); BEST.precision(3); P->printLOG("Writing best per clump to [ " + par::output_file_name + ".clumped.best" + " ]\n"); BEST << setw(par::pp_maxsnp) << "INDEX" << " " << setw(par::pp_maxsnp) << "PSNP" << " " << setw(6) << "RSQ" << " " << setw(8) << "KB" << " " << setw(8) << "P" << " " << setw(8) << "ALLELES" << " " << setw(8) << "F" << "\n"; } ////////////////////////// // Read a list of ranges? map > ranges; map > snp2range; if ( par::clumpld_range_annotate ) { // Helper function to map ranges to SNPs mapRangesToSNPs( par::clumpld_range_file, ranges, snp2range ); } string vmessage; ///////////////// // Output header if ( ! par::clumpld_verbose ) CLMP << setw(4) << "CHR" << " " << setw(4) << "F" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "BP" << " " << setw(8) << "P" << " " << setw(8) << "TOTAL" << " " << setw(6) << "NSIG" << " " << setw(6) << "S05" << " " << setw(6) << "S01" << " " << setw(6) << "S001" << " " << setw(6) << "S0001" << " " << setw(6) << "SP2" << "\n"; ////////////////// // Build SNP map for (int l=0; lnl_all; l++) mlocus.insert(make_pair(P->locus[l]->name, l)); if (sp.size()==0) { P->printLOG("No significant results given --clump parameters\n"); return; } //////////////////////////////////////////////// // Iterate through all association results by // decreasing p-value until cutoff is reached int clumpCount = 0; for (int i = 0; i < sp.size(); i++) { // if ( ! 
par::silent) // cout << "Tested " << i << " of " << sp.size() << " SNPs \r"; zero=one=two=three=four=0; //////////////////////////////////// // End if p-value cutoff is reached if( sp[i].p > pval_cutoff ) break; ////////////////////////////////// // Skip already clumped this SNP // (unless running in "best" mode // where we always want to pull the // best SNP if( clumped[sp[i].s] && ! par::clumpld_best ) { continue; } ///////////////////////////////////////////// // Are we only indexing based on first file? if ( par::clumpld_index1 && sp[i].f > 1 ) { continue; } /////////////////////////////////////////// // Record dataset of index SNP int indexF = sp[i].f; /////////////////////////////////////////// // Make sure associated SNP is in SNP map map::iterator ilocus; ilocus = mlocus.find(sp[i].s); int l = -1; if (ilocus != mlocus.end()) { l = ilocus->second; } else { if ( !par::clumpld_verbose ) CLMP << setw(4) << "NA" << " " << setw(4) << "NA" << " " << setw(par::pp_maxsnp) << sp[i].s << " " << setw(10) << "NA" << " " << setw(10) << sp[i].p << " " << setw(8) << "NA" << " " << setw(6) << "NA" << " " << setw(6) << "NA" << " " << setw(6) << "NA" << " " << setw(6) << "NA" << " " << setw(6) << "NA" << " " << setw(6) << "NA" << "\n"; else vmessage += sp[i].s + " not found in dataset\n"; if ( par::clumpld_best ) { BEST << setw(par::pp_maxsnp) << sp[i].s << " " << setw(par::pp_maxsnp) << "NF" << " " << setw(6) << "NF" << " " << setw(8) << "NF" << " " << setw(8) << "NF" << " " << setw(8) << "NF" << " " << setw(8) << "NF" << " "; BEST << "\n"; } continue; } /////////////////////////////// // Check all SNPs in LD range // Set at reference SNP, l, and move out // left and right (l1,l2) int l1 = l; int l2 = l; // If multiple files, allow for self comparison also if ( filename.size()>1 ) --l1; set willClump; map inPhaseAllele; while(1) { bool failed1 = false, failed2 = false; // Expand outwards to physical limits/SNP sets // Moving right if( l1 < P->locus.size()-1 ) { // Advance a position l1++; double r2a = -1; // Compute r-squared if this SNP is close // enough to the index SNP, physically if( P->locus[l1]->chr == P->locus[l]->chr && P->locus[l1]->bp - P->locus[l]->bp < ld_distance ) r2a = hp->rsq( l, l1 ); else failed1 = true; ////////////////////////////////// // Skip already clumped this SNP if( par::clumpld_indep && clumped[ P->locus[l1]->name ] ) continue; // If in LD with association result if( r2a > r2_cutoff ) { // Record which alleles are correlated inPhaseAllele.insert(make_pair(l1, allelePairs(l,l1))); // Record that this SNP has been clumped willClump.insert(P->locus[l1]->name); // Now look at the assocations (in the multiple files) // with this SNP for (int f=1; f<=filename.size(); f++) { // Do not clump with self if ( l == l1 && f == sp[i].f ) continue; // Are we requiring that only cross-file // (i.e. no index) SNPs are listed? 
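  // (indexF, recorded above, is the file number of the current index
  // SNP.  When par::clumpld_only_show_replications_list is set, any
  // association coming from that same file is skipped by the test
  // below, so only support from the *other* input files is counted
  // and listed in SP2 -- e.g., hypothetically, with two results files
  // supplied to --clump and the index SNP drawn from file 1, only
  // file-2 associations for the clumped SNPs would be reported.)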
if ( par::clumpld_only_show_replications_list && f == indexF ) continue; ClumpPair result; result.snp = P->locus[l1]->name; result.f = f; // Result not found if ( assoc_results.find(result) == assoc_results.end() ) continue; double pval = assoc_results[result].p; if( pval < second_pval_cutoff ) { int2 t(l1,f); grouped_snps.insert(make_pair(t,r2a)); } if( pval < .0001 ) four++; else if( pval < .001 ) three++; else if( pval < .01 ) two++; else if( pval < .05 ) one++; else zero++; } } } else failed1 = true; ////////////// // Move left if( l2 > 0 ) { l2--; double r2b = -1; if( P->locus[l2]->chr == P->locus[l]->chr && P->locus[l]->bp - P->locus[l2]->bp < ld_distance ) r2b = hp->rsq( l, l2 ); else failed2 = true; ////////////////////////////////// // Skip already clumped this SNP if( par::clumpld_indep && clumped[ P->locus[l2]->name ] ) continue; ///////////////////////////////////// // Does this SNP meet r-sq threshold? if( r2b > r2_cutoff ) { // Find the allele in phase with rare allele for l inPhaseAllele.insert(make_pair(l2, allelePairs(l,l2))); // Record that this SNP is clumped willClump.insert(P->locus[l2]->name); // Now look at associations for (int f=1; f<=filename.size(); f++) { // Are we requiring that only cross-file // (i.e. no index) SNPs are listed? if ( par::clumpld_only_show_replications_list && f == indexF ) continue; ClumpPair result; result.snp = P->locus[l2]->name; result.f = f; // Result not found if ( assoc_results.find(result) == assoc_results.end() ) continue; double pval = assoc_results[result].p; if( pval < second_pval_cutoff ) { int2 t(l2,f); grouped_snps.insert(make_pair(t,r2b)); } if( pval < .0001 ) four++; else if( pval < .001 ) three++; else if( pval < .01 ) two++; else if( pval < .05 ) one++; else zero++; } } } else failed2 = true; // No point in looking further? if( failed1 && failed2 ) break; } ////////////////////////////////////////////////////////// // Report results // Are we only interested in cross-file clumpings? if ( par::clumpld_only_show_replications ) { bool seen_replication = false; map::iterator gi = grouped_snps.begin(); int cnt=0; int lastf; while( gi != grouped_snps.end() ) { if ( cnt > 0 ) { if ( gi->first.p2 != lastf ) seen_replication = true; else if ( gi->first.p2 != indexF ) seen_replication = true; } lastf = gi->first.p2; ++cnt; ++gi; } // If no cross-file results, then do not report this // clump -- clear all flags for it if ( ! par::clumpld_best ) if ( ! 
seen_replication ) { grouped_snps.clear(); continue; } } // Indicate that this SNP has now been clumped set::iterator si = willClump.begin(); while ( si != willClump.end() ) { clumped[ *si ] = true; ++si; } int total = zero+one+two+three+four; if ( par::clumpld_verbose ) { // Repeat header CLMP << "\n" << setw(4) << "CHR" << " " << setw(4) << "F" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "BP" << " " << setw(10) << "P" << " " << setw(8) << "TOTAL" << " " << setw(6) << "NSIG" << " " << setw(6) << "S05" << " " << setw(6) << "S01" << " " << setw(6) << "S001" << " " << setw(6) << "S0001" << "\n"; } ClumpPair cp; cp.snp = P->locus[l]->name; cp.f = sp[i].f; CLMP << setw(4) << P->locus[l]->chr << " " << setw(4) << cp.f << " " << setw(par::pp_maxsnp) << P->locus[l]->name << " " << setw(10) << P->locus[l]->bp << " " << setw(10) << assoc_results[cp].p << " " << setw(8) << total << " " << setw(6) << zero << " " << setw(6) << one << " " << setw(6) << two << " " << setw(6) << three << " " << setw(6) << four << " "; ///////////////////////////////////// // Convenient format that just gives // the single best SNP if ( par::clumpld_best ) { // BEST, index SNP, best SNP (or NA), r^2, KB int bestSNP = -1; double bestRsq = -1; double bestP = -1; string bestAllele = "NA"; int bestF = -1; string bestAnnot = ""; double bestKB = 0; bool foundSelf = false; map::iterator gi = grouped_snps.begin(); while( gi != grouped_snps.end() ) { int l0 = gi->first.p1; int f = gi->first.p2; // The same SNP? bool isSelf = false; if ( l == l0 ) { foundSelf = true; isSelf = true; } ClumpPair cp; cp.snp = P->locus[l0]->name; cp.f = f; if ( par::clumpld_only_show_replications && f == indexF ) { ++gi; continue; } if ( assoc_results.find(cp) != assoc_results.end() ) { if ( isSelf || ( ( ! 
foundSelf ) && gi->second > bestRsq ) ) { bestRsq = gi->second; bestSNP = l0; bestF = f; bestAllele = inPhaseAllele.find(l0)->second; bestP = assoc_results[cp].p; if ( par::clumpld_annot ) bestAnnot = assoc_results[cp].annot; bestKB = (double)(P->locus[l0]->bp - P->locus[l]->bp)/1000.0; } } // Consider next proxy SNP ++gi; } // Report Best SNP if ( bestSNP > -1 ) { BEST << setw(par::pp_maxsnp) << P->locus[l]->name << " " << setw(par::pp_maxsnp) << P->locus[bestSNP]->name << " "; if ( foundSelf ) BEST << setw(6) << "*" << " "; else BEST << setw(6) << bestRsq << " "; BEST << setw(8) << bestKB << " " << setw(8) << bestP << " " << setw(8) << bestAllele << " " << setw(8) << bestF << " "; if ( par::clumpld_annot ) BEST << bestAnnot; } else { BEST << setw(par::pp_maxsnp) << P->locus[l]->name << " " << setw(par::pp_maxsnp) << "NA" << " " << setw(6) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; } BEST << "\n"; } ////////////////////////////////// // verbose output if ( par::clumpld_verbose ) { int minBP = P->locus[l]->bp; int maxBP = P->locus[l]->bp; set range_notes; // Now list one per line, sorted by distance CLMP << "\n"; if ( grouped_snps.size() > 0 ) { CLMP << "\n" << setw(4) << " " << " " << setw(4) << " " << " " << setw(par::pp_maxsnp) << " " << " " << setw(10) << "KB" << " " << setw(8) << "RSQ" << " " << setw(8) << "ALLELES" << " " << setw(4) << "F" << " " << setw(12) << "P" << " "; if ( par::clumpld_annot ) CLMP << setw(12) << "ANNOT" << "\n"; else CLMP << "\n"; CLMP << setw(4) << " (INDEX) " << setw(par::pp_maxsnp) << P->locus[l]->name << " "; CLMP << setw(10) << (double)(P->locus[l]->bp - P->locus[l]->bp)/1000.0 << " "; CLMP << setw(8) << "1.000" << " " << setw(8) << P->locus[l]->allele1 << " " << setw(4) << sp[i].f << " " << setw(12) << assoc_results[cp].p << " "; if ( par::clumpld_annot ) CLMP << setw(12) << assoc_results[cp].annot << "\n"; else CLMP << "\n"; CLMP << "\n"; ////////////////////////////////////////////////// // Track if SNP info already listed in this group set infoDisplayed; // Are we tracking ranges? if ( par::clumpld_range_annotate ) { map >::iterator mi = snp2range.find(l); if ( mi != snp2range.end() ) { set::iterator si = mi->second.begin(); while ( si != mi->second.end() ) { range_notes.insert( (*si)->name ); ++si; } } } map::iterator gi = grouped_snps.begin(); while( gi != grouped_snps.end() ) { int l0 = gi->first.p1; int f = gi->first.p2; ClumpPair cp; cp.snp = P->locus[l0]->name; cp.f = f; if ( assoc_results.find(cp) != assoc_results.end() ) { if ( infoDisplayed.find(l0) == infoDisplayed.end() ) { CLMP << setw(4) << " " << " " << setw(4) << " " << " " << setw(par::pp_maxsnp) << P->locus[l0]->name << " " << setw(10) << (double)(P->locus[l0]->bp - P->locus[l]->bp)/1000.0 << " "; CLMP << setw(8) << gi->second << " "; CLMP << setw(8) << inPhaseAllele.find(l0)->second << " " << setw(4) << f << " " << setw(12) << assoc_results[cp].p << " "; if ( par::clumpld_annot ) CLMP << setw(12) << assoc_results[cp].annot << "\n"; else CLMP << "\n"; // Note that we have now seen it infoDisplayed.insert(l0); // Track overall range of this clump if ( P->locus[l0]->bp < minBP ) minBP = P->locus[l0]->bp; if ( P->locus[l0]->bp > maxBP ) maxBP = P->locus[l0]->bp; //////////////////////////////// // Are we also tracking ranges? 
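      // (snp2range, filled by mapRangesToSNPs() above, maps a locus
      // index to the set of Range objects from the clump range file
      // whose start..stop interval covers that SNP's base-pair
      // position; the lookups below simply collect the names of the
      // ranges hit by the index SNP and by each clumped SNP, so they
      // can be printed in the "GENES w/SNPs" line further down.)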
if ( par::clumpld_range_annotate ) { map >::iterator mi = snp2range.find(l0); if ( mi != snp2range.end() ) { set::iterator si = mi->second.begin(); while ( si != mi->second.end() ) { range_notes.insert( (*si)->name ); ++si; } } } } else { CLMP << setw(4) << " " << " " << setw(4) << " " << " " << setw(par::pp_maxsnp) << " " << " " << setw(10) << " " << " " << setw(8) << " " << " " << setw(8) << " " << " " << setw(4) << f << " " << setw(12) << assoc_results[cp].p << " "; if ( par::clumpld_annot ) CLMP << setw(12) << assoc_results[cp].annot << "\n"; else CLMP << "\n"; } } gi++; } // Output range of p2-passing SNPs in UCSC cut-and-paste friendly format CLMP << "\n RANGE: " << "chr" << chromosomeName( P->locus[l]->chr ) << ":" << minBP << ".." << maxBP << "\n"; CLMP << " SPAN: " << ( maxBP - minBP ) / 1000 << "kb\n"; } //////////////////////////////////// // Print ranges encountered here if ( par::clumpld_range_annotate ) { if ( grouped_snps.size() > 0 ) { set::iterator ri = range_notes.begin(); bool first = true; CLMP << " GENES w/SNPs: "; while ( ri != range_notes.end() ) { if ( first ) { CLMP << *ri; first = false; } else { CLMP << "," << *ri; } ++ri; } CLMP << "\n"; } // Now list all genes in region Range r1; r1.start = minBP; r1.stop = maxBP; r1.chr = P->locus[l]->chr; if ( grouped_snps.size() == 0 ) CLMP << "\n"; CLMP << " GENES: "; set intRanges = rangeIntersect(r1,ranges); set::iterator ri2 = intRanges.begin(); int cnt = 0; while ( ri2 != intRanges.end() ) { if ( cnt == 0 ) { CLMP << (*ri2)->name; } else if ( cnt % 8 == 0 ) { CLMP << "\n " << (*ri2)->name; } else { CLMP << "," << (*ri2)->name; } ++ri2; ++cnt; } CLMP << "\n"; } } else { // Just list of SNP names if( grouped_snps.size() == 0 ) CLMP << "NONE"; map::iterator gi = grouped_snps.begin(); int j = 0; while( gi != grouped_snps.end() ) { CLMP << P->locus[ gi->first.p1 ]->name << "(" << gi->first.p2 << ")"; if( j < grouped_snps.size()-1) CLMP << ","; j++; gi++; } // Non-verbose mode gene information // goes to a separate file (CLMP2) if ( par::clumpld_range_annotate ) { int minBP = P->locus[l]->bp; int maxBP = P->locus[l]->bp; map::iterator gi = grouped_snps.begin(); while( gi != grouped_snps.end() ) { int l0 = gi->first.p1; if ( P->locus[l0]->bp < minBP ) minBP = P->locus[l0]->bp; if ( P->locus[l0]->bp > maxBP ) maxBP = P->locus[l0]->bp; ++gi; } Range r1; r1.start = minBP; r1.stop = maxBP; r1.chr = P->locus[l]->chr; string glist = returnFullRangeList(r1,ranges,false); if ( glist == "" ) glist = "(NONE)"; CLMP2 << setw(4) << P->locus[l]->chr << " " << setw(par::pp_maxsnp) << P->locus[l]->name << " " << setw(10) << assoc_results[cp].p << " " << setw(6) << grouped_snps.size()+1 << " " << setw(28) << ("chr"+chromosomeName(P->locus[l]->chr)+ ":"+int2str(minBP)+".."+int2str(maxBP)) << " " << setw(10) << ( maxBP - minBP ) / 1000.0 << " " << "["<nh; h++) if ( hp->haplotypeName(h) == P->locus[l1]->allele2 + P->locus[l2]->allele2 ) ch = h; // Is D positive or negative? 
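// (A sketch of the reasoning used just below: for the two-locus
// haplotype made of allele2 at both SNPs, D = f(A2B2) - p(A2)*p(B2).
// Since locus->freq stores the allele-1 frequency, p(A2) is (1 - freq),
// so comparing hp->f[ch] against (1-freq1)*(1-freq2) is simply a test
// of whether D > 0.  If it is, allele1 of the first SNP travels with
// allele1 of the second (and allele2 with allele2); otherwise the
// alleles are in repulsion and the pairing is flipped.  E.g.,
// hypothetically, p(A2)=0.6, p(B2)=0.7 and f(A2B2)=0.5 gives
// D = 0.5 - 0.42 = 0.08 > 0.)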
string s; if ( hp->f[ch] > (1 - P->locus[l1]->freq)*(1 - P->locus[l2]->freq) ) s = P->locus[l1]->allele1 + P->locus[l2]->allele1 + "/" + P->locus[l1]->allele2 + P->locus[l2]->allele2; else s = P->locus[l1]->allele1 + P->locus[l2]->allele2 + "/" + P->locus[l1]->allele2 + P->locus[l2]->allele1; return s; } string returnFullRangeList(Range & r1, map > & ranges, bool verbose) { string rlist = ""; set intRanges = rangeIntersect(r1,ranges); set::iterator ri2 = intRanges.begin(); int cnt = 0; while ( ri2 != intRanges.end() ) { if ( cnt == 0 ) { rlist += (*ri2)->name; } else if ( verbose && cnt % 8 == 0 ) { rlist += "\n " + (*ri2)->name; } else { rlist += "," + (*ri2)->name; } ++ri2; ++cnt; } return rlist; } plink-1.07-src/Rsrv.h0000644000265600020320000003710411264127626013610 0ustar tilleaadmin/* * Rsrv.h : constants and macros for Rserve client/server architecture * Copyright (C) 2002-8 Simon Urbanek * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; version 2.1 of the License * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Note: This header file is licensed under LGPL to allow other * programs to use it under LGPL. Rserve itself is licensed under GPL. * * $Id: Rsrv.h 253 2009-01-15 19:30:11Z urbanek $ */ /* external defines: MAIN - should be defined in just one file that will contain the fn definitions and variables */ #ifndef __RSRV_H__ #define __RSRV_H__ #ifndef NO_CONFIG_H #include "config.h" #endif #define RSRV_VER 0x000503 /* Rserve v0.5-3 */ #define default_Rsrv_port 6311 /* Rserve communication is done over any reliable connection-oriented protocol (usually TCP/IP or local sockets). After the connection is established, the server sends 32 bytes of ID-string defining the capabilities of the server. Each attribute of the ID-string is 4 bytes long and is meant to be user-readable (i.e. don't use special characters), and it's a good idea to make "\r\n\r\n" the last attribute the ID string must be of the form: [0] "Rsrv" - R-server ID signature [4] "0100" - version of the R server [8] "QAP1" - protocol used for communication (here Quad Attributes Packets v1) [12] any additional attributes follow. \r\n and '-' are ignored. optional attributes (in any order; it is legitimate to put dummy attributes, like "----" or " " between attributes): "R151" - version of R (here 1.5.1) "ARpt" - authorization required (here "pt"=plain text, "uc"=unix crypt, "m5"=MD5) connection will be closed if the first packet is not CMD_login. if more AR.. 
methods are specified, then client is free to use the one he supports (usually the most secure) "K***" - key if encoded authentification is challenged (*** is the key) for unix crypt the first two letters of the key are the salt required by the server */ /* QAP1 transport protocol header structure all int and double entries throughout the transfer are in Intel-endianess format: int=0x12345678 -> char[4]=(0x78,0x56,x34,0x12) functions/macros for converting from native to protocol format are available below Please note also that all values muse be quad-aligned, i.e. the length must be divisible by 4. This is automatically assured for int/double etc., but care must be taken when using strings and byte streams. */ struct phdr { /* always 16 bytes */ int cmd; /* command */ int len; /* length of the packet minus header (ergo -16) */ int dof; /* data offset behind header (ergo usually 0) */ int res; /* high 32-bit of the packet length (since 0103 and supported on 64-bit platforms only) aka "lenhi", but the name was not changed to maintain compatibility */ }; /* each entry in the data section (aka parameter list) is preceded by 4 bytes: 1 byte : parameter type 3 bytes: length parameter list may be terminated by 0/0/0/0 but doesn't have to since "len" field specifies the packet length sufficiently (hint: best method for parsing is to allocate len+4 bytes, set the last 4 bytes to 0 and trverse list of parameters until (int)0 occurs since 0102: if the 7-th bit (0x40) in parameter type is set then the length is encoded in 7 bytes enlarging the header by 4 bytes. */ /* macros for handling the first int - split/combine (24-bit version only!) */ #define PAR_TYPE(X) ((X)&255) #define PAR_LEN(X) ((X)>>8) #define PAR_LENGTH PAR_LEN #define SET_PAR(TY,LEN) ((((LEN)&0xffffff)<<8)|((TY)&255)) #define CMD_STAT(X) (((X)>>24)&127) /* returns the stat code of the response */ #define SET_STAT(X,s) ((X)|(((s)&127)<<24)) /* sets the stat code */ #define CMD_RESP 0x10000 /* all responses have this flag set */ #define RESP_OK (CMD_RESP|0x0001) /* command succeeded; returned parameters depend on the command issued */ #define RESP_ERR (CMD_RESP|0x0002) /* command failed, check stats code attached string may describe the error */ /* stat codes; 0-0x3f are reserved for program specific codes - e.g. for R connection they correspond to the stat of Parse command. the following codes are returned by the Rserv itself codes <0 denote Rerror as provided by R_tryEval */ #define ERR_auth_failed 0x41 /* auth.failed or auth.requested but no login came. in case of authentification failure due to name/pwd mismatch, server may send CMD_accessDenied instead */ #define ERR_conn_broken 0x42 /* connection closed or broken packet killed it */ #define ERR_inv_cmd 0x43 /* unsupported/invalid command */ #define ERR_inv_par 0x44 /* some parameters are invalid */ #define ERR_Rerror 0x45 /* R-error occured, usually followed by connection shutdown */ #define ERR_IOerror 0x46 /* I/O error */ #define ERR_notOpen 0x47 /* attempt to perform fileRead/Write on closed file */ #define ERR_accessDenied 0x48 /* this answer is also valid on CMD_login; otherwise it's sent if the server deosn;t allow the user to issue the specified command. (e.g. some server admins may block file I/O operations for some users) */ #define ERR_unsupportedCmd 0x49 /* unsupported command */ #define ERR_unknownCmd 0x4a /* unknown command - the difference between unsupported and unknown is that unsupported commands are known to the server but for some reasons (e.g. 
platform dependent) it's not supported. unknown commands are simply not recognized by the server at all. */ /* The following ERR_.. exist since 1.23/0.1-6 */ #define ERR_data_overflow 0x4b /* incoming packet is too big. currently there is a limit as of the size of an incoming packet. */ #define ERR_object_too_big 0x4c /* the requested object is too big to be transported in that way. If received after CMD_eval then the evaluation itself was successful. optional parameter is the size of the object */ /* since 1.29/0.1-9 */ #define ERR_out_of_mem 0x4d /* out of memory. the connection is usually closed after this error was sent */ /* since 0.4-0 */ #define ERR_session_busy 0x50 /* session is still busy */ #define ERR_detach_failed 0x51 /* unable to detach seesion (cannot determine peer IP or problems creating a listening socket for resume) */ /* availiable commands */ #define CMD_login 0x001 /* "name\npwd" : - */ #define CMD_voidEval 0x002 /* string : - */ #define CMD_eval 0x003 /* string : encoded SEXP */ #define CMD_shutdown 0x004 /* [admin-pwd] : - */ /* file I/O routines. server may answe */ #define CMD_openFile 0x010 /* fn : - */ #define CMD_createFile 0x011 /* fn : - */ #define CMD_closeFile 0x012 /* - : - */ #define CMD_readFile 0x013 /* [int size] : data... ; if size not present, server is free to choose any value - usually it uses the size of its static buffer */ #define CMD_writeFile 0x014 /* data : - */ #define CMD_removeFile 0x015 /* fn : - */ /* object manipulation */ #define CMD_setSEXP 0x020 /* string(name), REXP : - */ #define CMD_assignSEXP 0x021 /* string(name), REXP : - ; same as setSEXP except that the name is parsed */ /* session management (since 0.4-0) */ #define CMD_detachSession 0x030 /* : session key */ #define CMD_detachedVoidEval 0x031 /* string : session key; doesn't */ #define CMD_attachSession 0x032 /* session key : - */ /* 'internal' commands (since 0.1-9) */ #define CMD_setBufferSize 0x081 /* [int sendBufSize] this commad allow clients to request bigger buffer sizes if large data is to be transported from Rserve to the client. (incoming buffer is resized automatically) */ #define CMD_setEncoding 0x082 /* string (one of "native","latin1","utf8") : -; since 0.5-3 */ /* special commands - the payload of packages with this mask does not contain defined parameters */ #define CMD_SPECIAL_MASK 0xf0 #define CMD_serEval 0xf5 /* serialized eval - the packets are raw serialized data without data header */ #define CMD_serAssign 0xf6 /* serialized assign - serialized list with [[1]]=name, [[2]]=value */ #define CMD_serEEval 0xf7 /* serialized expression eval - like serEval with one additional evaluation round */ /* data types for the transport protocol (QAP1) do NOT confuse with XT_.. values. */ #define DT_INT 1 /* int */ #define DT_CHAR 2 /* char */ #define DT_DOUBLE 3 /* double */ #define DT_STRING 4 /* 0 terminted string */ #define DT_BYTESTREAM 5 /* stream of bytes (unlike DT_STRING may contain 0) */ #define DT_SEXP 10 /* encoded SEXP */ #define DT_ARRAY 11 /* array of objects (i.e. first 4 bytes specify how many subsequent objects are part of the array; 0 is legitimate) */ #define DT_LARGE 64 /* new in 0102: if this flag is set then the length of the object is coded as 56-bit integer enlarging the header by 4 bytes */ /* XpressionTypes REXP - R expressions are packed in the same way as command parameters transport format of the encoded Xpressions: [0] int type/len (1 byte type, 3 bytes len - same as SET_PAR) [4] REXP attr (if bit 8 in type is set) [4/8] data .. 
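   A minimal client-side sketch (an illustration only, not part of
   Rserve itself) of unpacking that leading int once it has been read
   and converted to native byte order; 'raw_header' is a hypothetical
   variable holding those 4 bytes:

       int hdr   = ptoi(raw_header);
       int xt    = GET_XT(hdr);      (low 6 bits: expression type)
       int len   = PAR_LEN(hdr);     (24-bit data length)
       int attr  = HAS_ATTR(hdr);    (0x80 set: attribute REXP precedes the data)
       int large = IS_LARGE(hdr);    (0x40 set: 56-bit length, header 4 bytes longer)

   (ptoi, GET_XT, PAR_LEN, HAS_ATTR and IS_LARGE are the macros/functions
   defined elsewhere in this header.)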
*/ #define XT_NULL 0 /* P data: [0] */ #define XT_INT 1 /* - data: [4]int */ #define XT_DOUBLE 2 /* - data: [8]double */ #define XT_STR 3 /* P data: [n]char null-term. strg. */ #define XT_LANG 4 /* - data: same as XT_LIST */ #define XT_SYM 5 /* - data: [n]char symbol name */ #define XT_BOOL 6 /* - data: [1]byte boolean (1=TRUE, 0=FALSE, 2=NA) */ #define XT_S4 7 /* P data: [0] */ #define XT_VECTOR 16 /* P data: [?]REXP,REXP,.. */ #define XT_LIST 17 /* - X head, X vals, X tag (since 0.1-5) */ #define XT_CLOS 18 /* P X formals, X body (closure; since 0.1-5) */ #define XT_SYMNAME 19 /* s same as XT_STR (since 0.5) */ #define XT_LIST_NOTAG 20 /* s same as XT_VECTOR (since 0.5) */ #define XT_LIST_TAG 21 /* P X tag, X val, Y tag, Y val, ... (since 0.5) */ #define XT_LANG_NOTAG 22 /* s same as XT_LIST_NOTAG (since 0.5) */ #define XT_LANG_TAG 23 /* s same as XT_LIST_TAG (since 0.5) */ #define XT_VECTOR_EXP 26 /* s same as XT_VECTOR (since 0.5) */ #define XT_VECTOR_STR 27 /* - same as XT_VECTOR (since 0.5 but unused, use XT_ARRAY_STR instead) */ #define XT_ARRAY_INT 32 /* P data: [n*4]int,int,.. */ #define XT_ARRAY_DOUBLE 33 /* P data: [n*8]double,double,.. */ #define XT_ARRAY_STR 34 /* P data: string,string,.. (string=byte,byte,...,0) padded with '\01' */ #define XT_ARRAY_BOOL_UA 35 /* - data: [n]byte,byte,.. (unaligned! NOT supported anymore) */ #define XT_ARRAY_BOOL 36 /* P data: int(n),byte,byte,... */ #define XT_RAW 37 /* P data: int(n),byte,byte,... */ #define XT_ARRAY_CPLX 38 /* P data: [n*16]double,double,... (Re,Im,Re,Im,...) */ #define XT_UNKNOWN 48 /* P data: [4]int - SEXP type (as from TYPEOF(x)) */ /* | +--- interesting flags for client implementations: P = primary type s = secondary type - its decoding is identical to a primary type and thus the client doesn't need to decode it separately. - = deprecated/removed. if a client doesn't need to support old Rserve versions, those can be safely skipped. Total primary: 4 trivial types (NULL, STR, S4, UNKNOWN) + 6 array types + 3 recursive types */ #define XT_LARGE 64 /* new in 0102: if this flag is set then the length of the object is coded as 56-bit integer enlarging the header by 4 bytes */ #define XT_HAS_ATTR 128 /* flag; if set, the following REXP is the attribute */ /* the use of attributes and vectors results in recursive storage of REXPs */ #define BOOL_TRUE 1 #define BOOL_FALSE 0 #define BOOL_NA 2 #define GET_XT(X) ((X)&63) #define GET_DT(X) ((X)&63) #define HAS_ATTR(X) (((X)&XT_HAS_ATTR)>0) #define IS_LARGE(X) (((X)&XT_LARGE)>0) #if defined sun && ! defined ALIGN_DOUBLES #define ALIGN_DOUBLES #endif /* functions/macros to convert native endianess of int/double for transport currently ony PPC style and Intel style are supported */ /* Since 0.4-5 we no longer use configure-time endianness tests to allow cross-compilation. Either BS_xx_ENDIAN constant is defined by configure and thus should be relied upon only if the compiler contants don't work */ // Added by Dean Snyder, CIDR, for Solaris support #ifdef SOLARIS #define __BIG_ENDIAN__ #endif #if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN_ #define SWAPEND 1 #elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN_ || defined BS_LITTLE_ENDIAN /* #undef SWAPEND */ #elif defined BS_BIG_ENDIAN #define SWAPEND 1 #elif __ia64__ || __i386__ || __x86_64__ /* take a guess based on the architecture (Intel-like) */ #define __LITTLE_ENDIAN__ 1 #elif __ppc__ || __ppc64__ /* any ppc */ #define __BIG_ENDIAN__ 1 #define SWAPEND 1 #elif ! 
defined Win32 /* Windows is little-endian is most cases, anywhere else we're stuck */ #error "Cannot determine endianness. Make sure config.h is included or __{BIG|LITTLE}_ENDIAN__ is defined ." #endif /* FIXME: all the mess below needs more efficient implementation - the current one is so messy to work around alignment problems on some platforms like Sun and HP 9000 */ #ifdef SWAPEND /* swap endianness - for PPC and co. */ #ifdef MAIN unsigned int itop(unsigned int i) { char b[4]; b[0]=((char*)&i)[3]; b[3]=((char*)&i)[0]; b[1]=((char*)&i)[2]; b[2]=((char*)&i)[1]; return *((unsigned int*)b); } double dtop(double i) { char b[8]; b[0]=((char*)&i)[7]; b[1]=((char*)&i)[6]; b[2]=((char*)&i)[5]; b[3]=((char*)&i)[4]; b[7]=((char*)&i)[0]; b[6]=((char*)&i)[1]; b[5]=((char*)&i)[2]; b[4]=((char*)&i)[3]; return *((double*)b); } void fixdcpy(void *t,void *s) { int i=0; while (i<8) { ((char*)t)[7-i]=((char*)s)[i]; i++; } } #else extern unsigned int itop(unsigned int i); extern double dtop(double i); extern void fixdcpy(void *t,void *s); #endif #define ptoi(X) itop(X) /* itop*itop=id */ #define ptod(X) dtop(X) #else #define itop(X) (X) #define ptoi(X) (X) #define dtop(X) (X) #define ptod(X) (X) #define fixdcpy(T,S) ((double*)(T))[0]=((double*)(S))[0]; #endif #ifndef HAVE_CONFIG_H /* this tiny function can be used to make sure that the endianess is correct (it is not included if the package was configured with autoconf since then it should be fine anyway) */ #ifdef MAIN int isByteSexOk() { int i; i=itop(0x12345678); return (*((char*)&i)==0x78); } #else extern int isByteSexOk(); #endif #else #define isByteSexOk 1 #endif #endif /*--- The following makes the indenting behavior of emacs compatible with Xcode's 4/4 setting ---*/ /* Local Variables: */ /* indent-tabs-mode: t */ /* tab-width: 4 */ /* c-basic-offset: 4 */ /* End: */ plink-1.07-src/zed.cpp0000644000265600020320000001103711264127626013766 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "zed.h" #include "helper.h" #include "nlist.h" extern Plink * PP; ZInput::ZInput(string f, bool cmode) { open(f,cmode); } ZInput::ZInput() { // } void ZInput::open(string f, bool cmode) { filename = f; compressed = cmode; #ifndef WITH_ZLIB if ( compressed ) { error("ZLIB support is not currently compiled in"); } #endif if ( compressed ) { zinf.open( filename.c_str() ); if ( ! zinf.is_open() ) error("Problem opening " + filename + "\n"); } else { inf.open( filename.c_str() ); if ( ! 
inf.is_open() ) error("Problem opening " + filename + "\n"); } } string ZInput::readLine() { if ( compressed ) { zinf.getline(buf,MAX_LINE_LENGTH,'\n'); } else { inf.getline(buf,MAX_LINE_LENGTH,'\n'); } return buf; // std::cerr << buf // << "\t(" << inf.rdbuf()->in_avail() // << " chars left in buffer) "; } char ZInput::readChar() { char c; if ( compressed ) { zinf.get(c); } else { inf.get(c); } return c; } vector ZInput::tokenizeLine() { string s = readLine(); vector tok; string buf; stringstream ss(s); while (ss >> buf) tok.push_back(buf); return tok; } void ZInput::close() { if ( compressed ) inf.close(); else zinf.close(); } bool ZInput::endOfFile() { // Check -- eof() doesn't work here -- look up the differences between // these different file states if ( compressed ) return zinf.fail() || ( ! zinf.good() ) ; else return inf.fail() || ( ! inf.good() ) ; } void ZInput::unbuffered() { if ( compressed ) zinf.rdbuf()->pubsetbuf(0,0); } void ZOutput::open(string f, bool cmode) { filename = f; compressed = cmode; #ifndef WITH_ZLIB if ( compressed ) { PP->printLOG("Warning: ZLIB support not enabled, so writing uncompressed file\n"); compressed = false; } #endif if ( compressed ) { zoutf.open( filename.c_str() ); if ( ! zoutf.is_open() ) { error("Problem opening " + filename ); } } else { outf.open( filename.c_str() ); if ( ! outf.is_open() ) { error("Problem opening " + filename ); } } } ZOutput::ZOutput(string f, bool cmode) { open(f,cmode); } ZOutput::ZOutput() { // } void ZOutput::write(string s) { if ( compressed ) zoutf << s; else outf << s; } void ZOutput::writeLine(string s) { if ( compressed ) zoutf << s << endl; else outf << s << endl; } void ZOutput::close() { if ( compressed ) zoutf.close(); else outf.close(); } void ZOutput::unbuffered() { if ( compressed ) zoutf.rdbuf()->pubsetbuf(0,0); } void fileCompress() { #ifndef WITH_ZLIB error("ZLIB support is not compiled in"); #endif PP->printLOG("Compressing [ " + par::compress_filename + " ]...\n"); ZInput zin( par::compress_filename , false ); ZOutput zout( par::compress_filename+".gz", true ); while ( ! zin.endOfFile() ) { string line = zin.readLine(); if ( zin.endOfFile() ) break; zout.writeLine(line); } zin.close(); zout.close(); PP->printLOG("Wrote compressed file to [ " + par::compress_filename + ".gz ]\n"); } void fileUncompress() { #ifndef WITH_ZLIB error("ZLIB support is not compiled in"); #endif PP->printLOG("Uncompressing [ " + par::compress_filename + " ]...\n"); int s = par::compress_filename.size(); if ( s < 3 || par::compress_filename.substr(s-3,3) != ".gz" ) error("Filename must end if .gz"); ZInput zin( par::compress_filename , true ); ZOutput zout( par::compress_filename.substr(0,s-3), false ); while ( ! zin.endOfFile() ) { string line = zin.readLine(); if ( zin.endOfFile() ) break; zout.writeLine(line); } zin.close(); zout.close(); PP->printLOG("Wrote uncompressed file to [ " + par::compress_filename.substr(0,s-3) + " ]\n"); } plink-1.07-src/crandom.cpp0000644000265600020320000000352111264127624014624 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "crandom.h" int CRandom::iy=0; vector CRandom::iv; const int CRandom::IA=16807; const int CRandom::IM=2147483647; const int CRandom::IQ=127773; const int CRandom::IR=2836; const int CRandom::NTAB=32; const int CRandom::NDIV=(1+(IM-1)/NTAB); const double CRandom::EPS=3.0e-16; const double CRandom::AM=1.0/IM; const double CRandom::RNMX=(1.0-EPS); int CRandom::idum=0; // Set seed void CRandom::srand ( long unsigned i ) { idum = -i; CRandom::iv.resize(NTAB); if (idum <= 0 || !iy) { if (-idum < 1) idum=1; else idum = -idum; for (int j=NTAB+7;j>=0;j--) { int k=idum/IQ; idum=IA*(idum-k*IQ)-IR*k; if (idum < 0) idum += IM; if (j < NTAB) iv[j] = idum; } iy=iv[0]; } } // Return the next random number double CRandom::rand () { int j,k; double temp; k=idum/IQ; idum=IA*(idum-k*IQ)-IR*k; if (idum < 0) idum += IM; j=iy/NDIV; iy=iv[j]; iv[j] = idum; if ((temp=AM*iy) > RNMX) return RNMX; else return temp; } // Return a random integer between 0 and fac-1 int CRandom::rand (int n) { int r = int(rand() * n); if (r == n) r--; return r; } plink-1.07-src/qualscores.cpp0000644000265600020320000001204511264127626015365 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "stats.h" #include "helper.h" #include "options.h" using namespace std; void Plink::filterQualSNPs() { // Remove entire SNPs if they do not pass the qual score // threshold vector del( nl_all, false ); ///////////////////////////// // Look-up table by SNP name map mlocus; map::iterator ilocus; for ( int l = 0 ; l < nl_all ; l++ ) mlocus.insert(make_pair( locus[l]->name, l ) ); checkFileExists( par::snp_qual_file ); printLOG("Reading SNP quality scores from [ " + par::snp_qual_file + " ]\n"); ifstream I( par::snp_qual_file.c_str() , ios::in ); int ndel = 0; int nfound = 0; while ( ! I.eof() ) { string snp; double qual; I >> snp >> qual; if ( snp == "" ) continue; ilocus = mlocus.find( snp ); if ( ilocus != mlocus.end() ) { ++nfound; if ( qual < par::snp_qual_min || qual > par::snp_qual_max ) { ++ndel; del[ ilocus->second ] = true; } } } I.close(); printLOG("Read quality scores for " + int2str(nfound) + " of " + int2str(nl_all) + " SNPs\n"); printLOG("Removing " + int2str(ndel) + " SNPs based on quality scores\n"); //////////////////////////////////////// // Remove selected loci from locus list, deleteSNPs(del); } void Plink::filterQualGenotypes() { // Only blank out genotypes if they do not meet the quality // score threshold // ?To add: an automatic option to treat these as obligatory missing? 
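  // A hypothetical input fragment illustrating the accepted line
  // formats (documented in full just below):
  //
  //   Q FAM1 IND1 rs0001 0.98          (one person, one SNP)
  //   Q * rs0002 0.91 0.87 0.99 ...    (all people for rs0002, file order)
  //   Q FAM2 IND2 * 0.95 0.90 ...      (all SNPs for one person, map order)
  //
  // Any score falling outside [par::geno_qual_min, par::geno_qual_max]
  // causes the corresponding genotype(s) to be set to missing below.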
//////////////////////////// // Look-up table by SNP name map mpeople; map mlocus; for ( int l = 0 ; l < nl_all ; l++ ) mlocus.insert(make_pair( locus[l]->name, l ) ); for ( int i = 0 ; i < n ; i++ ) mpeople.insert(make_pair( sample[i]->fid+"_"+sample[i]->iid, i ) ); checkFileExists( par::geno_qual_file ); printLOG("Reading genotype quality scores from [ " + par::geno_qual_file + " ]\n"); ifstream I( par::geno_qual_file.c_str() , ios::in ); long int ndel = 0; long int nfound = 0; // Format 0) Q // 1) FID/IID // 2) SNP // 3) QUAL // But can wild card, either person or SNP // Q * rs12345 {list all qual scores for rs12345 for all people (order as file) // Q P1 I1 * {list all qual scores for person P1 I1 (order as file) } // Q * * { list all qual scores for each person, for each SNP } printLOG("Acceptable genotype quality score range is " + dbl2str( par::geno_qual_min ) + " to " + dbl2str( par::geno_qual_max ) + "\n"); string p, m; while (!I.eof()) { // Expecting a "Q" string q; I >> q; if ( q == "" ) continue; if ( q != "Q" && q != "q" ) error("Problem with file format: leading 'Q' not found\n"); // Read person int pcode = -1; bool person_wildcard = false; string fid , iid; I >> fid; if ( fid == "*" ) person_wildcard = true; else { I >> iid; string pstring = fid + "_" + iid; map::iterator i = mpeople.find( pstring ); if ( i != mpeople.end() ) pcode = i->second; } // Read SNP int scode = -1; string sstring; bool snp_wildcard = false; I >> sstring; if ( sstring == "*" ) snp_wildcard = true; else { map::iterator i = mlocus.find( sstring ); if ( i != mlocus.end() ) scode = i->second; } // Now read quality score int pstart = pcode; int pstop = pcode; if ( person_wildcard ) { pstart = 0; pstop = n - 1; } int sstart = scode; int sstop = scode; if ( snp_wildcard ) { sstart = 0; sstop = nl_all - 1; } for ( int p = pstart ; p <= pstop ; p++ ) for ( int s = sstart ; s <= sstop ; s++ ) { // Read qual score double q; I >> q; if ( p >= 0 && s >= 0 ) { if ( q < par::geno_qual_min || q > par::geno_qual_max ) { ++ndel; // Assume SNP major // Set to missing SNP[s]->one[p] = true; SNP[s]->two[p] = false; } } } } I.close(); printLOG(int2str(ndel) + " genotypes did not meet quality score, set to missing\n"); } plink-1.07-src/filters.cpp0000644000265600020320000007430211264127624014656 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "stats.h" #include "crandom.h" #define MISSING(i,l) ( SNP[l]->one[i] && ( ! SNP[l]->two[i] ) ) void Plink::filterSNPs() { ////////////////////////////////////////////////////// // This functions applies the following filters and // functions: // Counts of number of founders, nonfounders // Per-individual genotyping rate // Read in, or calculate, allele frequencies (and save these) // Optionally write allele frequencies, then close // Exclude SNPs with too many missing genotypes // Identify/correct heterozygote haploid // Identify SNPs with no founder genotypes // Calculate/report/filter on HWE tests // Calculate/report genotyping rate per SNP/per individual // Filter on MAF // Remove filtered-out SNPs bool original_SNP_major = par::SNP_major; if ( ! 
par::SNP_major ) Ind2SNP(); // Which SNPs to delete vector del(locus.size(),false); // Which individuals to delete vector indel(sample.size(),false); ////////////////////////////////////////// // Display number of founders/nonfounders cnt_f=0; vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->founder ) cnt_f++; person++; } printLOG(int2str(cnt_f)+" founders and "+int2str(n-cnt_f)+ " non-founders found\n"); if (cnt_f::iterator p = oblig_missing.begin(); while ( p != oblig_missing.end() ) { int l = p->p1; int k = p->p2; for (int i=0; isol == k ) { SNP[l]->one[i] = true; SNP[l]->two[i] = false; } } ++p; } } ///////////////////////////////////////////////// // Remove individuals with too many missing calls double total_genotyping = 0; if ( par::MAX_IND_MISSING < 1 ) { int n_removed = 0; int n_orig = n; // Consider each individual if ( ! par::oblig_missing ) { for (int i=0;isex; // Sum missingness over all SNPs int m=0; // Missing SNPs int nsnps=0; // All non-obligatory missing SNPs for (int l=0; lchr] ) { continue; } ++nsnps; if ( MISSING(i,l) ) m++; } // Too much missingness? if ( (double)m/(double)nsnps > par::MAX_IND_MISSING ) { indel[i] = true; n_removed++; } } // next individual } else // ... allow oblig missing values { for (int i=0;isex; // Sum missingness over all SNPs int m=0; // Missing SNPs int nsnps=0; // All non-obligatory missing SNPs for (int l=0; lchr] ) continue; if ( ! obligMissing(i,l) ) { if ( MISSING(i,l) ) ++m; ++nsnps; } } // Too much missingness? if ( (double)m/(double)nsnps > par::MAX_IND_MISSING ) { indel[i] = true; n_removed++; } } // next individual } // end if oblig-missing section //////////////////////////////////////// // Save list of any removed individuals if (n_removed>0) { string f = par::output_file_name + ".irem"; printLOG("Writing list of removed individuals to [ " + f + " ]\n"); ofstream REM; REM.open(f.c_str(), ifstream::out); for (int i=0;ifid << "\t" << sample[i]->iid << "\n"; REM.close(); // And now remove these individuals, so that // SNP-based statistics are calculated with // these samples already excluded n_removed = deleteIndividuals(indel); } printLOG(int2str(n_removed)+" of "+int2str(n_orig)); printLOG(" individuals removed for low genotyping ( MIND > "); printLOG(dbl2str(par::MAX_IND_MISSING)+" )\n"); } // end of remove people conditional ///////////////////////////////// // Calculate or read from file? if (par::af_read) { checkFileExists(par::af_file); printLOG( "Reading allele frequencies from [ " + par::af_file + " ] \n"); // Make hash of original SNP names map mlocus; map::iterator ilocus; vector::iterator loc = locus.begin(); int l=0; while ( loc != locus.end() ) { mlocus.insert(make_pair( (*loc)->name,l)); loc++; l++; } // Read allele frequencies ifstream FRQ; FRQ.open(par::af_file.c_str()); FRQ.clear(); string dum1, dum2, dum3, dum4, dum5, dum6; string snpname; double freq; int nm; loc = locus.begin(); while ( loc != locus.end() ) { (*loc)->freq = -1; (*loc)->nm = 0; loc++; } // Skip header line FRQ >> dum1 >> dum2 >> dum3 >> dum4 >> dum5 >> dum6; while(!FRQ.eof()) { vector tokens = tokenizeLine(FRQ); if (tokens.size() == 0) continue; else if (tokens.size() != 6) { string sline=""; for (int i=0; isecond]; if( ! from_string( loc->freq, tokens[4],std::dec)) { loc->freq = 0; loc->nm = 0; } else if( ! from_string(loc->nm,tokens[5],std::dec)) { loc->freq = 0; loc->nm = 0; } // But was that pointing to the correct allele? 
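// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// The branch that follows answers the question in the comment above: a
// frequency read back from a .frq file refers to that file's A1 column, so
// if the dataset stores the two alleles the other way around the value must
// be flipped so that locus->freq always refers to locus->allele1.
// orientFreq() is a hypothetical helper showing just that rule.
// ---------------------------------------------------------------------
#include <string>

static double orientFreq(const std::string & fileA1,   // A1 as read from file
                         const std::string & locusA1,  // allele1 in the dataset
                         const std::string & locusA2,  // allele2 in the dataset
                         double freq)                   // frequency of fileA1
{
  if (fileA1 == locusA1) return freq;        // already oriented to allele1
  if (fileA1 == locusA2) return 1.0 - freq;  // file counted the other allele
  return -1.0;                               // allele mismatch: caller flags error
}
// e.g. orientFreq("A", "C", "A", 0.12) == 0.88
// ---------------------------------------------------------------------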
if ( rareAllele == loc->allele2 && rareAllele != par::missing_genotype && loc->allele2 != par::missing_genotype ) loc->freq = 1 - loc->freq; else if ( commonAllele == loc->allele1 && commonAllele != par::missing_genotype && loc->allele1 != par::missing_genotype ) loc->freq = 1 - loc->freq; } } FRQ.clear(); FRQ.close(); } ///////////////////////////////// // Calculate allele frequencies vector hetlist(0); vector::iterator d = del.begin(); vector::iterator loc = locus.begin(); vector::iterator s = SNP.begin(); int l = 0; // Main locus counter int exc_maf = 0; int exc_miss = 0; vector no_founders_found_list; while ( loc != locus.end() ) { if (!par::af_read) { (*loc)->freq = 0; // count 1 per allele, for frequency (*loc)->nm = 0; } // count 1 per genotype, for missingness int geno_nm = 0; // count 1 per non-obligatory missing genotype // (or set to N individuals) int geno_real = 0; bool X = false; bool haploid = false; // Determine type of SNP if (par::chr_sex[(*loc)->chr]) X=true; else if (par::chr_haploid[(*loc)->chr]) haploid=true; /////////////////////////////// // Iterate over each individual vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator person = sample.begin(); int i = 0; while ( person != sample.end() ) { bool s1 = *i1; bool s2 = *i2; // Check female Y genotypes if ( par::chr_Y[(*loc)->chr] && ! (*person)->sex ) { // Set to missing, unless in a RECODE mode if ( ! par::preserve_all_genotypes ) { s1 = *i1 = true; s2 = *i2 = false; } // But in any case, do not include this marker in // any genotype counts: skip to next person ++person; ++i; ++i1; ++i2; continue; } // For haploid heterozygosity check, also consider all individuals if ( haploid || ( X && (*person)->sex ) ) { if ( (!s1) && s2 ) { hetlist.push_back( (*person)->fid + "\t" + (*person)->iid + "\t" + (*loc)->name ); // Set to missing, unless in a RECODE mode if ( ! par::preserve_all_genotypes ) { s1 = *i1 = true; s2 = *i2 = false; } } } // For missing genotypes if ( ! ( s1 && (!s2) ) ) geno_nm++; // But is this a real genotype in any case? if ( par::oblig_missing ) { if ( ! obligMissing(i,l) ) ++geno_real; } else ++geno_real; // Do not recount alleles if we have read in allele frequencies if (!par::af_read) { // For allele frequencies // only consider founders if ( par::summ_nonfounders || (*person)->founder ) { if ( haploid || ( X && (*person)->sex ) ) { ////////////////// // Haploid counts // "1" allele count if ( (!s1) && (!s2) ) // FF = hom(11) { (*loc)->freq++; (*loc)->nm++; } else if ( s1 && s2 ) // TT = hom(22) { (*loc)->nm++; } } else { ////////////////// // Autosomal count // "1" allele count if (!s1) { if (!s2) // 00 = hom(11) { (*loc)->freq+=2; (*loc)->nm+=2; } else // 01 = het(12) { (*loc)->freq+=1; (*loc)->nm+=2; } } else if ( s2 ) // 11 = hom(22) { (*loc)->nm+=2; } } } } // Next individual ++person; ++i; ++i1; ++i2; } //////////////////////////////// // Calculate allele frequencies if (!par::af_read) { if ( par::af_count) // Allele counts... { // Use freq to store count (keep as is) // Use "bp" to store number of allele 2 (*loc)->bp = (long int)((*loc)->nm - (*loc)->freq); // Use "pos" to store number missing genotypes (*loc)->pos = geno_real - geno_nm; } else // ... 
or frequencies { if ((*loc)->nm>0) (*loc)->freq /= (double)(*loc)->nm; else { (*loc)->freq = 1; // If we aren't getting rid of it anyway if ( (double)geno_nm/(double)geno_real >= (1-par::MAX_GENO_MISSING)) no_founders_found_list.push_back(*loc); } } } ////////////////////////////////////////// // Record total proportion of missingness double snp_genotyping = n>0 ? (double)geno_nm/(double)geno_real : 0; total_genotyping += snp_genotyping; ///////////////////////////////////////////////// // Exclude if SNP has too many missing genotypes if ( snp_genotyping < (1-par::MAX_GENO_MISSING) ) { *d = true; exc_miss++; } //////////////////////////////////////////////// // Make allele1 always the least common allele if ( par::make_minor_allele && (!par::af_count) && (*loc)->freq > 0.5 ) { // then we need to swap alleles (*loc)->freq = 1 - (*loc)->freq; string tmp = (*loc)->allele2; (*loc)->allele2 = (*loc)->allele1; (*loc)->allele1 = tmp; vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); while ( i1 != (*s)->one.end() ) { if ( (*i1) == (*i2) ) { *i1 = ! (*i1); *i2 = ! (*i2); } i1++; i2++; } } // Next SNP ++d; ++loc; ++l; ++s; } ///////////////////////////////////////////////// // Save list of any heterozygous haploid alleles if (hetlist.size()>0) { printLOG(int2str( hetlist.size()) + " heterozygous haploid genotypes; set to missing\n"); string f = par::output_file_name + ".hh"; printLOG("Writing list of heterozygous haploid genotypes to [ " + f + " ]\n"); ofstream REM; REM.open(f.c_str(), ifstream::out); for (int i=0; i0) { printLOG(int2str( no_founders_found_list.size()) + " SNPs with no founder genotypes observed\n"); printLOG("Warning, MAF set to 0 for these SNPs (see --nonfounders)\n"); string f = par::output_file_name + ".nof"; printLOG( "Writing list of these SNPs to [ " + f + " ]\n"); ofstream NOF; NOF.open(f.c_str(), ifstream::out); for (int i=0; iname << "\n"; NOF.close(); } no_founders_found_list.clear(); ////////////////////////// // Write allele freq file if (par::af_write) { if (par::include_cluster_from_file) calcStratifiedAlleleFreqs(); else { ofstream FRQ; string f = par::output_file_name + ".frq"; if (par::af_count) f += ".count"; if (par::summ_nonfounders) printLOG("Writing allele frequencies (all individuals) to [ " + f + " ] \n"); else printLOG("Writing allele frequencies (founders-only) to [ " + f + " ] \n"); if (par::af_count) printLOG("Display counts rather than frequencies\n"); FRQ.open(f.c_str(), ifstream::out); FRQ.precision(4); FRQ << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " "; if (par::af_count) FRQ << setw(6) << "C1" << " " << setw(6) << "C2" << " " << setw(6) << "G0" << "\n"; else FRQ << setw(12) << "MAF" << " " << setw(8) << "NCHROBS" << "\n"; vector::iterator loc = locus.begin(); while (loc != locus.end() ) { string a1 = (*loc)->allele1; string a2 = (*loc)->allele2; if (a1=="") a1="0"; if (a2=="") a2="0"; FRQ << setw(4) << (*loc)->chr << " " << setw(par::pp_maxsnp) << (*loc)->name << " " << setw(4) << a1 << " " << setw(4) << a2 << " "; if (par::af_count) { FRQ << setw(6) << int( (*loc)->freq ) << " " << setw(6) << int( (*loc)->bp ) << " " << setw(6) << int( (*loc)->pos ) << "\n"; } else { if ( (*loc)->nm > 0 ) FRQ << setw(12) << (*loc)->freq << " "; else FRQ << setw(12) << "NA" << " "; FRQ << setw(8) << (*loc)->nm << "\n"; } loc++; } FRQ.close(); } // Close after we've done alle freqs, shutdown(); } ///////////////////////// // Write HWE statistics if (par::HWD_test 
|| par::HWD_report) { ofstream HWD; if (par::HWD_report) { if (par::summ_nonfounders) printLOG("Writing Hardy-Weinberg tests (all individuals) to [ " + par::output_file_name + ".hwe ] \n"); else printLOG("Writing Hardy-Weinberg tests (founders-only) to [ " + par::output_file_name + ".hwe ] \n"); string f = par::output_file_name + ".hwe"; HWD.open(f.c_str(), ifstream::out); HWD.precision(4); HWD << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(8) << "TEST" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(20) << "GENO" << " " << setw(8) << "O(HET)" << " " << setw(8) << "E(HET)" << " " << setw(12) << "P" << " " << "\n"; } int cnt=0, cnt_a=0, cnt_u=0; //////////////////////// // Consider each locus vector::iterator d = del.begin(); vector::iterator loc = locus.begin(); for ( int l = 0 ; l < locus.size() ; l++ ) { // Compute p-values for HWE test in cases, controls & all // Only consider founders int a11, a12, a22; int u11, u12, u22; int b11, b12, b22; a11=a12=a22=0; u11=u12=u22=0; b11=b12=b22=0; bool X = false, haploid = false; if (par::chr_sex[(*loc)->chr]) X=true; else if (par::chr_haploid[(*loc)->chr]) haploid=true; /////////////////////////////////////////////// // Iterate over each individual, founders only for ( int i = 0 ; i < sample.size() ; i++ ) { Individual * person = sample[i]; /////////////////////////////////////////////// // Only consider founders, & diploid genotypes if ( par::summ_nonfounders || person->founder ) if ( ! ( haploid || ( X && person->sex ) ) ) { bool s1 = SNP[l]->one[i]; bool s2 = SNP[l]->two[i]; // Consider everybody, irrespective of phenotype // (QT, C/C or missing) if (!s1) { if (!s2) b11++; // 00 = hom(11) else b12++; // 01 = het(12) } else if ( s2 ) b22++; // 11 = hom(22) if (par::bt) // for binary trait, separately for cases/controls { if (person->phenotype == 1) { if (!s1) { if (!s2) u11++; // 00 = hom(11) else u12++; // 01 = het(12) } else if ( s2 ) u22++; // 11 = hom(22) } else if (person->phenotype == 2) { if (!s1) { if (!s2) a11++; // 00 = hom(11) else a12++; // 01 = het(12) } else if ( s2 ) a22++; // 11 = hom(22) } } } // Next individual } // Allele frequencies double afreq = 0, ufreq = 0, freq = 0; bool include_cases = true; bool include_controls = true; if (par::qt) freq = ( b11 + (double)b12/2.0 ) / (double)( b11+b12+b22 ); else { afreq = ( a11 + (double)a12/2.0 ) / (double)( a11+a12+a22 ); ufreq = ( u11 + (double)u12/2.0 ) / (double)( u11+u12+u22 ); freq = ( b11 + (double)b12/2.0 ) / (double)( b11+b12+b22 ); if ( a11+a12+a22 == 0 ) include_cases = false; if ( u11+u12+u22 == 0 ) include_controls = false; } if (par::qt) { double p; if (par::HWD_standard) { double tot = b11 + b12 + b22; double exp_11 = freq * freq * tot; double exp_12 = 2 * freq * (1-freq) * tot; double exp_22 = (1-freq) * (1-freq) * tot; double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ; p = chiprobP(chisq,1); } else p = SNPHWE( b12, b11, b22 ); if (par::HWD_report) { HWD << setw(4) << (*loc)->chr << " " << setw(par::pp_maxsnp) << (*loc)->name << " " << setw(8) << "ALL(QT)" << " " << setw(4) << (*loc)->allele1 << " " << setw(4) << (*loc)->allele2 << " " << setw(20) << (int2str(b11)+ "/"+int2str(b12)+ "/"+int2str(b22)) << " " << setw(8) << (double)b12/(double)(b11+b12+b22) << " " << setw(8) << 2 * freq * (1-freq) << " "; if ( realnum(p) ) HWD << setw(12) << p << "\n"; else HWD << setw(12) << "NA" << "\n"; } if ( p <= par::HWD_limit && p > -1 ) { cnt++; *d 
= true; } } else { // For case/control data double p, p_a, p_u; if (par::HWD_standard) { double exp_a11 = afreq * afreq * (a11+a12+a22); double exp_a12 = 2 * afreq * (1-afreq) * (a11+a12+a22); double exp_a22 = (1-afreq) * (1-afreq) * (a11+a12+a22); double exp_u11 = ufreq * ufreq * (u11+u12+u22); double exp_u12 = 2 * ufreq * (1-ufreq) * (u11+u12+u22); double exp_u22 = (1-ufreq) * (1-ufreq) * (u11+u12+u22); double exp_11 = freq * freq * (b11+b12+b22); double exp_12 = 2 * freq * (1-freq) * (b11+b12+b22); double exp_22 = (1-freq) * (1-freq) * (b11+b12+b22); double chisq_a = ( (a11-exp_a11)*(a11-exp_a11) ) / exp_a11 + ( (a12-exp_a12)*(a12-exp_a12) ) / exp_a12 + ( (a22-exp_a22)*(a22-exp_a22) ) / exp_a22 ; double chisq_u = ( (u11-exp_u11)*(u11-exp_u11) ) / exp_u11 + ( (u12-exp_u12)*(u12-exp_u12) ) / exp_u12 + ( (u22-exp_u22)*(u22-exp_u22) ) / exp_u22 ; double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ; p = chiprobP(chisq,1); p_a = chiprobP(chisq_a,1); p_u = chiprobP(chisq_u,1); } else { p = SNPHWE( b12, b11, b22 ); p_a = SNPHWE( a12, a11, a22 ); p_u = SNPHWE( u12, u11, u22 ); } if (par::HWD_report) { HWD << setw(4) << (*loc)->chr << " " << setw(par::pp_maxsnp) << (*loc)->name << " " << setw(8) << "ALL" << " " << setw(4) << (*loc)->allele1 << " " << setw(4) << (*loc)->allele2 << " " << setw(20) << int2str(b11)+"/"+int2str(b12)+"/"+int2str(b22) << " " << setw(8) << (double)b12/(double)(b11+b12+b22) << " " << setw(8) << 2 * freq * (1-freq) << " "; if ( p > -1 ) HWD << setw(12) << p << "\n"; else HWD << setw(12) << "NA" << "\n"; HWD << setw(4) << (*loc)->chr << " " << setw(par::pp_maxsnp) << (*loc)->name << " " << setw(8) << "AFF" << " " << setw(4) << (*loc)->allele1 << " " << setw(4) << (*loc)->allele2 << " " << setw(20) << int2str(a11)+"/"+int2str(a12)+"/"+int2str(a22) << " " << setw(8) << (double)a12/(double)(a11+a12+a22) << " " << setw(8) << 2 * afreq * (1-afreq) << " "; if (include_cases && p_a > -1 ) HWD << setw(12) << p_a << "\n"; else HWD << setw(12) << "NA" << "\n"; HWD << setw(4) << (*loc)->chr << " " << setw(par::pp_maxsnp) << (*loc)->name << " " << setw(8) << "UNAFF" << " " << setw(4) << (*loc)->allele1 << " " << setw(4) << (*loc)->allele2 << " " << setw(20) << int2str(u11)+"/"+int2str(u12)+"/"+int2str(u22) << " " << setw(8) << (double)u12/(double)(u11+u12+u22) << " " << setw(8) << 2 * ufreq * (1-ufreq) << " "; if (include_controls && p_u > -1 ) HWD << setw(12) << p_u << "\n"; else HWD << setw(12) << "NA" << "\n"; } // Increase counts: in cases if ( include_cases && p_a < par::HWD_limit && p_a > -1 ) cnt_a++; // Controls (and, if possible, exclude on this value) if ( include_controls && p_u < par::HWD_limit && p_u > -1 ) { cnt_u++; if ( ! par::HWD_filter_on_all ) { *d = true; cnt++; } } // In total sample, and if needed, exclude here if ( p < par::HWD_limit && p>-1 ) { if ( par::HWD_filter_on_all || ! include_controls ) { *d = true; cnt++; } } } // next locus ++loc; ++d; } // Finish the report... 
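// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// The asymptotic Hardy-Weinberg test used above (when par::HWD_standard is
// set; otherwise SNPHWE is called) compares observed genotype counts with
// the counts expected under HWE at the estimated allele frequency, giving a
// 1-df chi-square. hweChiSq() is a hypothetical stand-alone version of that
// calculation.
// ---------------------------------------------------------------------
#include <cmath>

static double hweChiSq(int hom11, int het, int hom22)
{
  double n = hom11 + het + hom22;
  if (n == 0) return 0;
  double p = (hom11 + 0.5 * het) / n;     // estimated frequency of allele 1
  if (p <= 0 || p >= 1) return 0;         // monomorphic: nothing to test

  double e11 = p * p * n;                 // expected hom(11)
  double e12 = 2 * p * (1 - p) * n;       // expected het(12)
  double e22 = (1 - p) * (1 - p) * n;     // expected hom(22)

  double chisq = 0;
  chisq += (hom11 - e11) * (hom11 - e11) / e11;
  chisq += (het   - e12) * (het   - e12) / e12;
  chisq += (hom22 - e22) * (hom22 - e22) / e22;
  return chisq;                           // refer to chi-square with 1 df
}
// e.g. counts 30/40/30 give p = 0.5, expected 25/50/25, chisq = 1+2+1 = 4.
// ---------------------------------------------------------------------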
if (par::HWD_report) HWD.close(); // ...or finish pruning printLOG( int2str(cnt) + " markers to be excluded based on HWE test ( p <= " + dbl2str(par::HWD_limit) + " )\n"); if (par::bt) { printLOG("\t" + int2str(cnt_a) + " markers failed HWE test in cases\n"); printLOG("\t" + int2str(cnt_u) + " markers failed HWE test in controls\n"); } } /////////////////////////////////////////////////// // Summary statistics for genotyping/missing rates if (par::report_missing) { /////////////////////////////////////////// // Report by genotyping rate by individual // possibly allowing for obligatory missingness printLOG( "Writing individual missingness information to [ " + par::output_file_name + ".imiss ] \n"); ofstream MIS; string f = par::output_file_name + ".imiss"; MIS.open(f.c_str(), ifstream::out); MIS.precision(4); MIS << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(10) << "MISS_PHENO" << " " << setw(8) << "N_MISS" << " "; MIS << setw(8) << "N_GENO" << " "; MIS << setw(8) << "F_MISS" << "\n"; for (int i=0; ifid << " " << setw(par::pp_maxiid) << sample[i]->iid << " "; if (sample[i]->missing) MIS << setw(10) << "Y" << " "; else MIS << setw(10) << "N" << " " ; // Sum missingness over all SNPs int m=0; // Missing SNPs int nsnps=0; // All non-obligatory missing SNPs bool female = ! sample[i]->sex; if ( ! par::oblig_missing ) { for (int l=0; lchr] ) continue; if ( MISSING(i,l) ) ++m; ++nsnps; } } else // ... allow oblig missing values { for (int l=0; lchr] ) continue; if ( ! obligMissing(i,l) ) { if ( MISSING(i,l) ) ++m; ++nsnps; } } } MIS << setw(8) << m << " "; MIS << setw(8) << nsnps << " "; MIS << setw(8) << (double)m/(double)nsnps << "\n"; } MIS.close(); /////////////////////////////////////////// // Report by genotyping rate by locus // possibly allowing for sample strata // possibly allowing for obligatory missingness printLOG("Writing locus missingness information to [ " + par::output_file_name +".lmiss ] \n"); f = par::output_file_name + ".lmiss"; MIS.open(f.c_str(), ifstream::out); MIS.clear(); MIS.precision(4); MIS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " "; if (par::include_cluster_from_file) MIS << setw(10) << "CLST" << " "; MIS << setw(8) << "N_MISS" << " "; MIS << setw(8) << "N_GENO" << " "; if (par::include_cluster_from_file) MIS << setw(8) << "N_CLST" << " "; MIS << setw(8) << "F_MISS" << "\n"; for (int l=0; lchr]; // nk==1 for basic missingness (i.e. not stratified by // cluster) for (int k=0; kchr << " " << setw(par::pp_maxsnp) << loc->name << " "; if (par::include_cluster_from_file) MIS << setw(10) << kname[k] << " "; int m=0; // Number of missing genotypes int c=0; // Number of people in cluster int nsnps=0; // Number of actual genotypes in cluster for ( int i=0; isex ) continue; if (par::include_cluster_from_file) { if ( sample[i]->sol == k ) { if ( ( ! par::oblig_missing ) || ( ! obligMissing(i,l) ) ) { if ( MISSING(i,l) ) ++m; ++nsnps; } ++c; } } else // ... ignore cluster strata { if ( ( ! par::oblig_missing ) || ( ! 
obligMissing(i,l) ) ) { if ( MISSING(i,l) ) ++m; ++nsnps; } } // Next individual } MIS << setw(8) << m << " "; if (par::include_cluster_from_file) MIS << setw(8) << c << " "; MIS << setw(8) << nsnps << " "; MIS << setw(8) << (double)m / (double)nsnps << "\n"; } // Next SNP } MIS.close(); } ///////////////////////////////// // Remove rare SNPs loc = locus.begin(); d = del.begin(); while ( loc != locus.end() ) { // Note epsilon correction for MAF, due to floating point // issues: only apply to the lower MAF range if ( (*loc)->freq < 0 || (*loc)->freq + par::epsilon < par::min_af || (*loc)->freq > par::max_af ) { *d = true; exc_maf++; } d++; loc++; } ///////////////////////////////////////// // Remove SNPs based on thresholds if ( locus.size() > 0 ) printLOG("Total genotyping rate in remaining individuals is " + dbl2str(total_genotyping/(double)locus.size())+"\n"); printLOG(int2str(exc_miss)+" SNPs failed missingness test ( GENO > " +dbl2str(par::MAX_GENO_MISSING)+" )\n"); printLOG(int2str(exc_maf)+" SNPs failed frequency test ( MAF < "+dbl2str(par::min_af)); if (par::max_af < 0.5 ) printLOG(" or MAF > " + dbl2str(par::max_af)); printLOG(" )\n"); int tmp = deleteSNPs(del); ////////////////////////////////////////// // Need to make back to individual major? if ( ! original_SNP_major ) SNP2Ind(); return; } void Plink::thinSNPs() { if ( par::thin_param <= 0 || par::thin_param >= 1 ) error("Parameter for --thin must be 0 toKeep; for (int l=0;l #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "phase.h" #include "helper.h" #include "genogroup.h" #include "haplowindow.h" ///////////////////////////////////////////////////////////////////// // For a given window, collapse all genotypes into a unique groups, // 'genoGroups' and perform subsequent EM on these entitities rather // than on individuals void HaploWindow::enumerateGenogroups() { // Consider each individual for (int i=0; i < P->n ; i++) { // Only phase non-missing founders if ( ! (P->sample[i]->founder && haplo->include[i])) continue; // Build a new multilocus genotype set MultiLocusGenotype * m = new MultiLocusGenotype; // Include sex here for X chr SNPs if ( haplo->X ) m->g.push_back(P->sample[i]->sex); // Genotypes for (int s=0; sSNP[ S[s] ]->one[i] : P->sample[i]->one[ S[s] ]; bool s2 = par::SNP_major ? P->SNP[ S[s] ]->two[i] : P->sample[i]->two[ S[s] ]; m->g.push_back(s1); m->g.push_back(s2); } // One individual, this individual m->count = 1; m->reference = i; // But have we already seen a similar genoGroup? set::iterator im = genotypes.find(m); if (im == genotypes.end() ) { genoGroup[i] = m; genotypes.insert( m ); } else { delete m; (*im)->count++; genoGroup[i] = *im; } } // Next individual } plink-1.07-src/elf.cpp0000644000265600020320000006175111264127625013761 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
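// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// HaploWindow::enumerateGenogroups() above collapses individuals who carry
// an identical multilocus genotype into a single 'genoGroup' with a count,
// so the EM phasing only works through each distinct pattern once. The
// hypothetical collapsePatterns() below shows the same idea with a map
// keyed on the packed bool genotype vector.
// ---------------------------------------------------------------------
#include <map>
#include <vector>
#include <cstdio>

typedef std::vector<bool> GenoPattern;    // two bools per SNP, as in PLINK

static void collapsePatterns(const std::vector<GenoPattern> & people)
{
  std::map<GenoPattern,int> groups;       // pattern -> number of carriers
  for (size_t i = 0; i < people.size(); i++)
    groups[ people[i] ]++;                // new pattern starts at 0, then counts up

  printf("%d individuals collapse to %d distinct genotype groups\n",
         (int)people.size(), (int)groups.size());
}
// ---------------------------------------------------------------------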
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" #include "stats.h" #include "linear.h" #include "logistic.h" extern Plink * PP; class RCount { public: RCount(Plink * p_, map * rl_) { P = p_; // not used now: rangeLookup = rl_; rval.resize(P->n,0); // Potentially weighted score aval.resize(P->n,0); // 0/1/2 for present rare allele gval.resize(P->n,0); // 0/1 for genotyped or not nsnps = nalleles = 0; npc = 0; pcMode = false; domModel = true; } Plink * P; map * rangeLookup; vector_t rval; vector gval; vector aval; // Current SNPs in window -> SNP specific counts map rwin; map > gwin; map > awin; double acnt, ucnt; int acnt2, ucnt2; int nsnps; int nalleles; int npc; bool pcMode; bool domModel; bool addSNP(int l); bool removeSNP(int l); bool setWindow(int chr, int bp); void displayWindow(); void loadCovariate(); void loadPCACovariate(); void mainStats(); bool ignorePosition(); }; bool RCount::ignorePosition() { if ( nsnps < 1 ) return true; return nalleles < 5; } void RCount::loadCovariate() { // Place as last covariate for (int i=0; in; i++) { P->sample[i]->clist[ par::clist_number - 1 ] = gval[i] > 0 ? rval[i] / (double)(gval[i]) : 0; } } void RCount::loadPCACovariate() { // In place of simply counting all the rare SNPs, perform a PCA on the // rare SNP data matrix, then sum the standardized PCs above a certian // threshold (i.e. this way giving equal weight to equal independently // detected component of rare variation) vector snplist; map::iterator i = rwin.begin(); while ( i != rwin.end() ) { snplist.push_back( i->first ); ++i; } boolmatrix_t mask; matrix_t g; bool dominantModel = true; geno2matrix( snplist , g , mask , dominantModel ); vector_t pc; matrix_t ps; matrix_t pv; matrix_t g0 = g; // Setting last flag to false implies no mean-centering // This version of PCA return U.W*.V' in 'g', where // W* is an editted eigen-value set, such that they // equal eithe 0 or 1 bool meanCentre = par::elf_pcmode_2sided; int pcn = pca( g , mask , pc , ps , pv, meanCentre); int ntot = g[0].size(); if ( ! par::elf_pcmode_2sided ) { // Calculate score which is sum of squares of U.W*.V' vector_t sc(P->n,0); for (int i=0; in; i++) { for (int p = 0 ; p < ntot ; p++ ) sc[i] += g[i][p] * g[i][p]; } // Standardize and threshold at 4SD double m = 0; double ssq = 0; double v = 0; for (int i=0; in; i++) { m += sc[i]; ssq += sc[i] * sc[i]; } m /= P->n; ssq /= P->n; v = ssq - m*m; double sd = sqrt(v); // Now assign... 
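// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// The standardise-and-cap step performed around this point converts the
// per-individual PCA scores to z-scores and truncates the upper tail at
// +4 SD before loading them as the covariate. zClamp() is a hypothetical
// stand-alone version of that transformation.
// ---------------------------------------------------------------------
#include <vector>
#include <cmath>

static void zClamp(std::vector<double> & x, double cap = 4.0)
{
  int n = (int)x.size();
  if (n == 0) return;

  double m = 0, ssq = 0;
  for (int i = 0; i < n; i++) { m += x[i]; ssq += x[i] * x[i]; }
  m   /= n;
  ssq /= n;
  double sd = sqrt(ssq - m * m);          // population SD, as in the code above

  for (int i = 0; i < n; i++)
  {
    double z = sd > 0 ? (x[i] - m) / sd : 0;
    x[i] = z > cap ? cap : z;             // only the upper tail is capped here
  }
}
// ---------------------------------------------------------------------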
for (int i=0; in; i++) { double z = (sc[i]-m)/sd; if ( z > 4) z = 4; P->sample[i]->clist[ par::clist_number - 1 ] = z; } } else { // Resize covariate load par::clist_number -= npc; // Place as set of covariates, 1 past last covariate int start = par::clist_number; par::clist_number += pcn; // End int end = par::clist_number-1; P->clistname.resize( par::clist_number ); // cout << "pcn, npc = " << pcn << " " << npc << "\n"; // cout << "Loading covars : " << start << " to " << end << "\n"; for (int i=0; in; i++) { P->sample[i]->clist.resize( par::clist_number ); int k=0; for (int j=start; j<=end; j++) { P->sample[i]->clist[ j ] = ps[i][k++]; } } // Track number of PCs so they can be accounted for in the next analysis npc = pcn; } } bool RCount::setWindow(int chr, int bp) { // Find all SNPs with x kb of bp, and add to rwin, if not already in // there Also, keep track of what we have added, and remove any SNPs // that should no longer be in the window // Return true if window actually changes since last position bool changed = false; set nwin; // Use a Range to lookup the SNPs in this range Range r; r.chr = chr; r.start = bp - (int)par::rarer_dist_threshold; r.stop = bp + (int)par::rarer_dist_threshold; // Start and stop sites for this range: int2 l2 = mapSNPs2Range( *PP , &r ); ///////////////////////////// // Need to add any new SNPs? if ( l2.p1 != -1 ) for (int l = l2.p1 ; l <= l2.p2 ; l++ ) { if ( addSNP(l) ) changed = true; nwin.insert(l); } //////////////////////// // Need to remove any? map::iterator iter = rwin.begin(); set toRemove; while ( iter != rwin.end() ) { if ( nwin.find( iter->first ) == nwin.end() ) toRemove.insert( iter->first ); ++iter; } set::iterator i2 = toRemove.begin(); while( i2 != toRemove.end() ) { if ( removeSNP( *i2 ) ) changed = true; ++i2; } return changed; } void RCount::displayWindow() { int rmin = 9999999; int rmax = -1; map::iterator iter = rwin.begin(); while ( iter != rwin.end() ) { if ( iter->first < rmin ) rmin = iter->first; if ( iter->first > rmax ) rmax = iter->first; ++iter; } acnt = 0; ucnt = 0; for (int i=0; in; i++) if ( P->sample[i]->pperson->aff ) acnt += rval[i]; else ucnt += rval[i]; cout << "Window from " << P->locus[rmin]->name << "(" << P->locus[rmin]->bp << ") to " << P->locus[rmax]->name << "(" << P->locus[rmax]->bp << ") to " << rwin.size() << " SNPs with vals (A/U) " << acnt << " and " << ucnt << "\n"; } void RCount::mainStats() { nsnps = rwin.size(); acnt = 0; ucnt = 0; acnt2 = 0; ucnt2 = 0; int acnt3 = 0; int ucnt3 = 0; for (int i=0; in; i++) { if ( ! P->sample[i]->missing ) { if ( par::bt ) { if ( P->sample[i]->pperson->aff ) { acnt += rval[i]; ++acnt2; acnt3 += aval[i]; } else { ucnt += rval[i]; ++ucnt2; ucnt3 += aval[i]; } } else { ucnt += rval[i]; ++ucnt2; ucnt3 += aval[i]; } } } // Number of low-frequency alleles in // this window nalleles = (int)acnt3 + (int)ucnt3; // The proportion of alleles that are LF if ( par::bt && acnt2>0) acnt /= (double)acnt2; if ( ucnt2>0 ) ucnt /= (double)ucnt2; } bool RCount::addSNP(int l) { if ( rwin.find(l) != rwin.end() ) return false; if ( P->locus[l]->freq > par::rarer_maf_threshold ) { return false; } vector_t r(P->n,0); vector g(P->n,0); vector a(P->n,0); double wt; if ( par::rare_test_weight1 ) wt = 1/P->locus[l]->freq; CSNP * snp = P->SNP[l]; for (int i=0; in; i++) { ////////////////////////////////////////// // Get and parse genotypes bool one = snp->one[i]; bool two = snp->two[i]; // Skip if missing if ( one && !two ) continue; if ( domModel ) { // Dominant coding if ( ( ! one ) || ( ! 
two ) ) { r[i] += par::rare_test_weight1 ? wt : 1 ; ++a[i]; } g[i] += 1; } else { // Additive coding if ( ! one ) { r[i] += par::rare_test_weight1 ? wt : 1 ; ++a[i]; } if ( ! two ) { r[i] += par::rare_test_weight1 ? wt : 1 ; ++a[i]; } g[i] += 2; } // Add to current total per person rval[i] += r[i]; aval[i] += a[i]; if ( domModel ) gval[i] += 1; else gval[i] += 2; } rwin.insert(make_pair( l , r )); gwin.insert(make_pair( l , g )); awin.insert(make_pair( l , a )); return true; } bool RCount::removeSNP(int l) { map::iterator iter = rwin.find(l); // If SNP never added to window, nothing to do if ( iter == rwin.end() ) return false; map >::iterator giter = gwin.find(l); map >::iterator aiter = awin.find(l); for (int i=0; in; i++) { rval[i] -= iter->second[i]; gval[i] -= giter->second[i]; aval[i] -= aiter->second[i]; } rwin.erase(iter); gwin.erase(giter); awin.erase(aiter); return true; } // Other output helper functions void displayScoresPerson(ofstream & O, RCount & rc) { for (int i = 0 ; i < PP->n ; i++ ) { O << setw(par::pp_maxfid ) << PP->sample[i]->fid << " " << setw(par::pp_maxiid ) << PP->sample[i]->iid << " "; if ( PP->sample[i]->missing ) O << "NA" << "\t" << "NA" << "\t" << "NA" << "\n"; else O << PP->sample[i]->phenotype << "\t" << PP->sample[i]->clist[ par::clist_number - 1 ] << "\t" << rc.aval[i] << "\t" << rc.gval[i] << "\n"; } } void displayScoresRegion(ofstream & O, RCount & rc) { map >::iterator i = rc.awin.begin(); while ( i != rc.awin.end() ) { int count = 0; for ( int k = 0 ; k < i->second.size(); k++) count += i->second[k]; O << setw(4) << PP->locus[ i->first ]->chr << " " << setw(par::pp_maxsnp ) << PP->locus[ i->first ]->name << " " << setw(12) << PP->locus[ i->first ]->bp << " " << setw(12) << PP->locus[ i->first ]->freq << " " << setw(12) << PP->locus[ i->first ]->allele1 << " " << setw(12) << count << "\n"; ++i; } } void Plink::permTestRareDistribution(Perm & perm) { printLOG("Testing for Enrichment of Low Frequency variants "); if ( par::rare_test_weight1 ) printLOG(" (1/MAF weighting, "); else printLOG(" ( MAF < " +dbl2str(par::rarer_maf_threshold) +", "); printLOG("within " +int2str(int(par::rarer_dist_threshold/1000)) +" kb)\n"); //////////////////////////// // Use last covariate slot par::assoc_glm_without_main_snp = true; par::clist = true; if ( !par::elf_pcmode_2sided ) { ++par::clist_number; clistname.push_back("RCNT"); for (int i=0; iclist.push_back(0); } // NOTE: Not used now map ranges; /////////////////////// // Original vector_t original = testRareDistribution(perm, true, ranges); /////////////////////// // Set up permutations perm.setTests( original.size() ); perm.setPermClusters(*this); perm.originalOrder(); if ( ! 
par::permute ) return; if (par::mperm_rank) perm.setOriginalRanking(original); ////////////////////// // Begin permutations bool finished = false; while(!finished) { perm.permuteInCluster(); vector_t pr = testRareDistribution(perm,false, ranges); finished = perm.update(pr,original); } if (!par::silent) cout << "\n\n"; ///////////////////////////////////////////////////////////////////// // Write results to file ofstream ASC; string f; if (par::adaptive_perm) f = par::output_file_name + ".elf.perm"; else f = par::output_file_name + ".elf.mperm"; ASC.open(f.c_str(),ios::out); ASC.precision(4); printLOG("Writing permutation association results to [ " + f + " ] \n"); ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp)<< "SNP" << " " << setw(12)<< "STAT" << " " << setw(12) << "EMP1" << " "; if (par::adaptive_perm) ASC << setw(12)<< "NP" << " "; else if ( par::mperm_rank ) ASC << setw(12)<< "EMP3" << " " << setw(12)<< "RANK" << " "; else ASC << setw(12)<< "EMP2" << " "; ASC << "\n"; for (int l=0; l< original.size(); l++) { // Skip?, if filtering p-values if ( par::pfilter && perm.pvalue(l) > par::pfvalue ) continue; // ASC << setw(4) << locus[l]->chr << " " // << setw(par::pp_maxsnp) << locus[l]->name << " "; ASC << setw(8) << l << " "; ASC << setw(12) << original[l] << " " << setw(12) << perm.pvalue(l) << " "; if (par::adaptive_perm) ASC << setw(12) << perm.reps_done(l) << " "; else if ( par::mperm_rank ) ASC << setw(12) << perm.max_pvalue(l) << " " << setw(12) << perm.rank(l) << " "; else ASC << setw(12) << perm.max_pvalue(l) << " "; ASC << "\n"; } ASC.close(); } vector_t Plink::testRareDistribution(Perm & perm , bool disp, map & ranges) { ///////////////////////////////////////////////////////////////////// // Write results to file ofstream OUT; OUT.precision(4); ofstream SUM; const double pthresh = 0.01; bool one_sided = true; ofstream SDET_SNP; ofstream SDET_IND; if ( disp ) { string f = par::output_file_name + ".elf"; OUT.open(f.c_str(),ios::out); printLOG("Writing results to [ " + f + " ]\n"); OUT << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(12) << "BP" << " " << setw(6) << "NSNP" << " " << setw(8) << "NALLELE" << " "; if ( par::bt ) {OUT << setw(8) << "ACNT" << " " << setw(8) << "UCNT" << " "; if ( ! par::elf_pcmode_2sided ) OUT << setw(10) << "OR" << " "; } else { OUT << setw(8) << "CNT" << " "; if ( ! par::elf_pcmode_2sided ) OUT << setw(10) << "BETA" << " "; } OUT << setw(10) << "CHISQ" << " " << setw(10) << "P" << "\n"; } // Use regression model: put # of rare variants per individual as a // covariate, and use glmAssoc() // We do not know how many results we will obtain to start off with vector_t results; RCount rc(this,&ranges); if ( par::elf_pcmode ) rc.pcMode = true; vector_t b; double chisq; double pvalue; int srange_cnt = 0; bool inRange = false; int startChromosome = locus[ 0 ]->chr; int finalChromosome = locus[ nl_all - 1]->chr; for (int chr = startChromosome ; chr <= finalChromosome; chr++) { int bpstart = scaffold[chr].bpstart; int bpstop = scaffold[chr].bpstop; for ( int bp = bpstart; bp <= bpstop; bp += par::rarer_interval ) { bool windowMoved = rc.setWindow(chr,bp); //rc.displayWindow(); // If no new SNPs have been added or removed from window, // then just serve up the same results as last time if ( ! windowMoved ) { continue; } rc.mainStats(); // Enough to be bothering with? 
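// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// The scan above steps a window centre along each chromosome in fixed
// base-pair increments and, via setWindow()/mapSNPs2Range(), re-tests only
// when the set of SNPs inside the window actually changes. snpsInWindow()
// is a hypothetical helper showing the range lookup, assuming 'pos' holds
// one chromosome's SNP positions sorted in ascending order.
// ---------------------------------------------------------------------
#include <vector>
#include <algorithm>
#include <utility>

static std::pair<int,int> snpsInWindow(const std::vector<int> & pos,
                                       int centre, int dist)
{
  std::vector<int>::const_iterator lo =
      std::lower_bound(pos.begin(), pos.end(), centre - dist);
  std::vector<int>::const_iterator hi =
      std::upper_bound(pos.begin(), pos.end(), centre + dist);
  // half-open index range [first, last) of SNPs within +/- dist of centre
  return std::make_pair(int(lo - pos.begin()), int(hi - pos.begin()));
}
// Stepping 'centre' along the chromosome and re-testing only when this
// index range changes gives the same skip-if-unchanged behaviour as the
// main window loop above.
// ---------------------------------------------------------------------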
if ( rc.ignorePosition() ) { continue; } // Perform actual test if ( rc.pcMode ) rc.loadPCACovariate(); else rc.loadCovariate(); glmAssoc( false , perm ); // Get results double beta; if ( ! par::elf_pcmode_2sided ) { model->testParameter = par::clist_number; b = model->getCoefs(); chisq = model->getStatistic(); pvalue = chiprobP(chisq,1); beta = b[ par::clist_number ]; } else { b = model->getCoefs(); beta = 1; // no direction vector_t h; // dim = number of fixes (to =0) matrix_t H; // row = number of fixes; cols = np h.resize(rc.npc,0); sizeMatrix(H,rc.npc,model->getNP()); int startpc = par::clist_number - rc.npc; for (int i=0; iisValid() ? model->linearHypothesis(H,h) : 0; pvalue = model->isValid() ? chiprobP(chisq, rc.npc) : -1; } // Permutation test is 1-sided if ( beta < 0 ) results.push_back( 0 ); else results.push_back( chisq ); // Clean up delete model; // Write results to a file? if ( disp ) { double coef = par::bt ? exp( beta ) : beta ; int bp1 = bp - (int)par::rarer_dist_threshold < bpstart ? bpstart : bp - (int)par::rarer_dist_threshold; int bp2 = bp + (int)par::rarer_dist_threshold > bpstop ? bpstop : bp + (int)par::rarer_dist_threshold; OUT << setw(4) << chr << " " << setw(12) << bp1 << " " << setw(12) << bp2 << " " << setw(12) << (int)((bp1+bp2)/2.0) << " " << setw(6) << rc.nsnps << " " << setw(8) << rc.nalleles << " "; if ( par::bt ) OUT << setw(8) << rc.acnt << " "; OUT << setw(8) << rc.ucnt << " "; if ( ! par::elf_pcmode_2sided ) OUT << setw(10) << coef << " "; OUT << setw(10) << chisq << " " << setw(10) << pvalue << "\n"; OUT.flush(); if ( par::rare_test_print_details && int2str(chr)+":"+int2str( (int)((bp1+bp2)/2.0)) == par::rare_test_print_details_snp ) { printLOG("Printing details for region around " + par::rare_test_print_details_snp + "\n"); SDET_SNP.open( ( par::output_file_name+".elf.det." + par::rare_test_print_details_snp + ".snp" ).c_str() , ios::out ); SDET_IND.open( ( par::output_file_name+".elf.det." 
+ par::rare_test_print_details_snp + ".ind" ).c_str() , ios::out ); ////////////////////////////////// // Print scores per person, and per SNP displayScoresPerson( SDET_IND , rc ); SDET_IND.close(); displayScoresRegion( SDET_SNP , rc ); SDET_SNP.close(); } } // end if verbose display mode } // Next window location } // Finished, close any open streams if ( disp ) { OUT.close(); } return results; } void Plink::displayRareRange() { map > ranges = readRange( par::rare_test_score_range_file ); printLOG("Reading ELF results file from [ " + par::rare_test_score_results_file + " ]\n"); printLOG("Reading ELF ranges from [ " + par::rare_test_score_range_file + " ]\n"); checkFileExists( par::rare_test_score_results_file ); ifstream IN; IN.open( par::rare_test_score_results_file.c_str() ); // Read first row int pcol = -1; int bpcol = -1; int bcol = -1; // int snpcol = -1; int chrcol = -1; vector tokens = tokenizeLine(IN); for (int i = 0 ; i < tokens.size() ; i++) { if ( tokens[i] == "P" ) pcol = i; if ( tokens[i] == "OR" || tokens[i] == "BETA" ) bcol = i; // if ( tokens[i] == "SNP" ) // snpcol = i; if ( tokens[i] == "CHR" ) chrcol = i; if ( tokens[i] == "BP" ) bpcol = i; } int ncol = tokens.size(); if ( pcol == -1 ) error("Could not find P field in header"); // if ( snpcol == -1 ) // error("Could not find SNP field in header"); if ( bpcol == -1 ) error("Could not find BP field in header"); if ( chrcol == -1 ) error("Could not find CHR field in header"); bool no_beta = false; if ( bcol == -1 ) { no_beta = true; printLOG("Couldn't find OR/BETA field, so reporting all regions\n"); } // map snpmap; map pmap; map bmap; while ( !IN.eof() ) { vector tokens = tokenizeLine(IN); if ( tokens.size() == 0 ) continue; if ( tokens.size() != ncol ) error("Wrong number of columns in input file"); double p, b=1; int chr, bp; // string snp; //snp = tokens[snpcol]; chr = getChromosomeCode( tokens[chrcol ] ); if ( ! from_string( p , tokens[pcol] , std::dec ) ) p = 1; if ( ! no_beta ) { if ( ! from_string( b , tokens[bcol] , std::dec ) ) b = 1; } if ( ! from_string( bp , tokens[bpcol] , std::dec ) ) error("Problem converting BP value: " + tokens[bpcol] ); int2 t( chr , bp ); // snpmap.insert( make_pair( t , snp ) ); pmap.insert( make_pair( t , p ) ); bmap.insert( make_pair( t , b ) ); } IN.close(); ofstream SUM; printLOG("Writing range summary to [ " + par::output_file_name + ".elf.summary ]\n"); SUM.open( ( par::output_file_name + ".elf.summary").c_str() , ios::out ); SUM << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(12) << "BESTP" << " " << "GENES" << "\n"; map::iterator i = pmap.begin(); int srange_cnt = 0; bool inRange = false; Range srange; int l = 0; int ntot = pmap.size() - 1; double bestp = 1; while ( i != pmap.end() ) { double pvalue = i->second; double coef; if ( no_beta ) coef = 99 ; else coef = bmap.find( i->first )->second; // Look for control enrichment? If so, just flip coef here if ( par::rare_test_summary_controls ) coef = 1 / coef; if ( ( ! inRange ) && pvalue <= par::rare_test_score_range_threshold && coef > 1 ) { inRange = true; bestp = pvalue; srange.chr = i->first.p1; srange.start = srange.stop = i->first.p2; } else if ( inRange && ( pvalue > par::rare_test_score_range_threshold || coef < 1 || i->first.p1 != srange.chr || l == ntot ) ) { SUM << setw(4) << srange.chr << " " << setw(12) << srange.start << " " << setw(12) << srange.stop << " " << setw(12) << bestp << " "; ++srange_cnt; SUM.flush(); // Lookup genes in this region? 
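// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// displayRareRange() above locates the CHR, BP, P and OR/BETA columns by
// scanning the header row of the results file rather than assuming fixed
// positions. headerIndex() is a hypothetical helper showing that
// header-driven lookup for a whitespace-delimited file.
// ---------------------------------------------------------------------
#include <sstream>
#include <string>
#include <map>

static std::map<std::string,int> headerIndex(const std::string & headerLine)
{
  std::map<std::string,int> idx;          // field name -> column number
  std::istringstream ss(headerLine);
  std::string field;
  int col = 0;
  while (ss >> field) idx[field] = col++;
  return idx;                             // e.g. idx["CHR"], idx["BP"], idx["P"]
}
// A missing key (idx.find("P") == idx.end()) corresponds to the
// "Could not find P field in header" error raised above.
// ---------------------------------------------------------------------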
if ( true ) { Range r1(srange.chr, srange.start , srange.stop , "dummy"); set implicated = rangeIntersect(r1,ranges); set::iterator ri = implicated.begin(); while ( ri != implicated.end() ) { SUM << (*ri)->name << ","; ++ri; } } SUM << "\n"; inRange = false; if ( pvalue <= par::rare_test_score_range_threshold && coef > 1 ) { srange.chr = i->first.p1; srange.start = srange.stop = i->first.p2; inRange = true; bestp = pvalue; } } if ( inRange ) { srange.stop = i->first.p2; if ( pvalue < bestp && realnum(pvalue) ) bestp = pvalue; } ++l; ++i; } printLOG("Found " + int2str(srange_cnt) + " distinct regions of contiguous association\n"); SUM.close(); shutdown(); } void Plink::elfBaseline() { ofstream ELF; string f = par::output_file_name + ".elf.baseline"; ELF.open(f.c_str(),ios::out); ELF.precision(4); printLOG("Writing baseline LF SNP count to [ " + f + " ] \n"); ELF << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(4) << "CHR" << " " << setw(8) << "CNT" << " " << setw(8) << "GENO" << " " << setw(8) << "RATE" << "\n"; for (int i = 0 ; i < n ; i++ ) { Individual * person = sample[i]; int cnt = 0; int gcnt = 0; map chr_cnt; map chr_gcnt; int chr = -1; int * p_cnt; int * p_gcnt; for (int l = 0; l < nl_all; l++) { if ( locus[l]->freq > par::rarer_maf_threshold ) continue; if ( locus[l]->chr != chr ) { chr = locus[l]->chr; chr_cnt.insert( make_pair( chr , 0 ) ); chr_gcnt.insert( make_pair( chr , 0 ) ); p_cnt = &(chr_cnt.find(chr)->second); p_gcnt = &(chr_gcnt.find(chr)->second); } bool X=false, haploid=false; if ( par::chr_sex[locus[l]->chr] ) X=true; else if ( par::chr_haploid[locus[l]->chr] ) haploid=true; bool s1 = par::SNP_major ? SNP[l]->one[i] : person->one[l]; bool s2 = par::SNP_major ? SNP[l]->two[i] : person->two[l]; if ( s1 && !s2 ) continue; if ( haploid || ( X && person->sex ) ) { ++gcnt; ++(*p_gcnt); if ( !s1 ) { ++cnt; ++(*p_cnt); } } else { gcnt +=2; ++(*p_gcnt); ++(*p_gcnt); if ( !s1 ) { ++cnt; ++(*p_cnt); } if ( !s2 ) { ++cnt; ++(*p_cnt); } } } ELF << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(4) << "G" << " "; ELF << setw(8) << cnt << " " << setw(8) << gcnt << " " << setw(8) << (double)cnt / (double)gcnt << "\n"; map::iterator i = chr_cnt.begin(); while ( i != chr_cnt.end() ) { int c = i->first; int x = chr_cnt.find( c )->second; int y = chr_gcnt.find( c )->second; ELF << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(4) << c << " "; ELF << setw(8) << x << " " << setw(8) << y << " " << setw(8) << (double)x / (double)y << "\n"; ++i; } } ELF.close(); } plink-1.07-src/cfamily.cpp0000644000265600020320000001362511264127626014635 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include "plink.h" #include "options.h" #include bool isAncestorOf(Individual *indx, Individual * f); int mCount(Individual * indx, Individual *f); void listAllAncestors(Individual * a, set & anclist, int d) { anclist.insert( a ); if ( a->pm == NULL && a->pp == NULL ) return; if ( a->pp ) { anclist.insert( a->pp ); listAllAncestors( a->pp , anclist , d+1 ); } if ( a->pm ) { anclist.insert( a->pm ); listAllAncestors( a->pm , anclist , d+1 ); } return; } double genrel(Individual * a, Individual * b) { // cout << "GR for " << a->iid << " and " << b->iid << "\n"; double g = 0; // Same person? if ( a == b ) return 1; // Are both individuals founders or in different families? if ( a->fid != b->fid ) return 0; if ( a->founder && b->founder ) return 0; // Assuming no inbreeding, find the nearest common ancestor // For each possible individual, store the number of meioses that // separate a from k and b from k (in an int2, -1 for not a common // ancestor) map nca; // Start with ancestors of A set ancestorsA; set ancestorsB; listAllAncestors(a,ancestorsA,0); listAllAncestors(b,ancestorsB,0); multiset commonAncestors; set::iterator i = ancestorsA.begin(); while( i != ancestorsA.end() ) { commonAncestors.insert( *i ); ++i; } i = ancestorsB.begin(); while( i != ancestorsB.end() ) { commonAncestors.insert( *i ); ++i; } // Any individuals represented twice? set mrca; multiset::iterator j = commonAncestors.begin(); while ( j != commonAncestors.end() ) { if ( commonAncestors.count( *j ) == 2 ) mrca.insert( *j ); ++j; } // cout << "sizes = " << ancestorsA.size() << " " << ancestorsB.size() << "\n"; // cout << "MRCA for " << a->fid << " " << a->iid << " / " << b->iid << "\n"; // i = mrca.begin(); // while( i != mrca.end() ) // { // cout << (*i)->fid << " " << (*i)->iid << "\n"; // ++i; // } // cout << "\n"; // Iterate through common ancestors, finding # of // meioses back to the two founders individuals i = mrca.begin(); while( i != mrca.end() ) { int2 m( (*i)->countMeioses(a) , (*i)->countMeioses(b) ); nca.insert( make_pair( *i, m ) ); ++i; } int kmin = 9999; i = mrca.begin(); while( i != mrca.end() ) { int2 m( (*i)->countMeioses(a) , (*i)->countMeioses(b) ); nca.insert( make_pair( *i, m ) ); if ( m.p1 + m.p2 < kmin ) kmin = m.p1 + m.p2; ++i; } ////////////////////// // Calculate 'g' map::iterator k = nca.begin(); while( k != nca.end() ) { int2 m = k->second; int m2 = m.p1 + m.p2; if ( m2 == kmin ) { g += pow(0.5,m2); //cout << "adding " << k->first->iid << " : " << m.p1 << " + " << m.p2 << "\n"; } ++k; } return g; } int Individual::countMeioses(Individual *f) { if ( isAncestorOf(this,f ) ) return mCount(this,f); else if ( isAncestorOf( f,this ) ) return mCount(f,this); else return 0; } int mCount(Individual * indx, Individual *f) { vector inds; vector checked; bool finished = false; int nm = 0; // Add self to list inds.push_back(indx); checked.push_back(false); while (!finished) { // Check list for a match // needs changing if inbreeding for (int i = 0 ; i < inds.size() ; i++) { if (inds[i] == f) return nm; } // Increment meioses counter nm++; // Add children of unchecked inds int already = inds.size(); for (int i = 0 ; i < already ; i++) { if (!checked[i]) { for (int j = 0 ; j < inds[i]->kids.size() ; j++) { inds.push_back(inds[i]->kids[j]); checked.push_back(false); } checked[i] = true; } } // All done? 
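// ---------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of the PLINK sources]
// genrel() above computes the relationship coefficient by path counting,
// assuming no inbreeding: each most-recent common ancestor contributes
// 0.5^(m1+m2), where m1 and m2 are the meioses separating that ancestor
// from the two individuals (counted by countMeioses()/mCount()).
//
//   Full siblings: two shared parents, each one meiosis from both sibs:
//       g = 0.5^(1+1) + 0.5^(1+1) = 0.25 + 0.25 = 0.5
//   Uncle-nephew: two shared grandparents, 1 and 2 meioses away:
//       g = 2 * 0.5^(1+2) = 0.25
//
// meiosisContribution() is a hypothetical one-liner for a single ancestor.
// ---------------------------------------------------------------------
#include <cmath>

static double meiosisContribution(int m1, int m2)
{
  return pow(0.5, m1 + m2);   // contribution of one common ancestor to g
}
// ---------------------------------------------------------------------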
finished = true; for (int i = 0 ; i < inds.size() ; i++) if ( checked[i] == false ) finished = false; // loop back } return nm; } bool isAncestorOf(Individual *indx, Individual * f) { vector inds; vector checked; bool finished = false; int nm = 0; // Add self to list inds.push_back(indx); checked.push_back(false); while (!finished) { // Check list for a match for (int i = 0 ; i < inds.size() ; i++) if (inds[i] == f) return true; // Increment meioses counter nm++; // Add children of unchecked inds int already = inds.size(); for (int i = 0 ; i < already ; i++) { if (!checked[i]) { for (int j = 0 ; j < inds[i]->kids.size() ; j++) { inds.push_back( inds[i]->kids[j] ); checked.push_back(false); } checked[i] = true; } } // All done? finished = true; for (int i = 0 ; i < inds.size() ; i++) if ( checked[i] == false ) finished = false; // loop back } return false; } plink-1.07-src/plink.cpp0000644000265600020320000014601611264127624014325 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "crandom.h" #include "perm.h" #include "sets.h" #include "linear.h" #include "logistic.h" #include "phase.h" #include "clumpld.h" #include "nlist.h" #include "sets.h" #include "stats.h" #include "idhelp.h" #include "zed.h" using namespace std; ofstream LOG; string PVERSION; string PDATE; string PREL; Plink * PP; map Range::groupNames; int main(int argc, char* argv[]) { ///////////////////////// // Setup, display title cout.setf(ios::fixed); cout.precision(8); set_new_handler(NoMem); PVERSION = "1.07"; // 4 chars PREL = " "; // space or p (full, or prelease) PDATE = "10/Aug/2009"; // 11 chars ////////////////// // The major class Plink P; PP = &P; ///////////////////////////////////////////////////// // General class for all haplotype-related functions P.haplo = new HaploPhase(P); ////////////////////////// // Command line arguments CArgs a(argc,argv); getOutputFilename(a); ////////////////////////// // Start logging, title LOG.open(string(par::output_file_name + ".log").c_str()); P.printLOG("\n" "@----------------------------------------------------------@\n" "| PLINK! | v"+PVERSION+PREL+" | "+PDATE+" |\n" "|----------------------------------------------------------|\n" "| (C) 2009 Shaun Purcell, GNU General Public License, v2 |\n" "|----------------------------------------------------------|\n" "| For documentation, citation & bug-report instructions: |\n" "| http://pngu.mgh.harvard.edu/purcell/plink/ |\n" "@----------------------------------------------------------@\n" "\n"); ////////////////////////// // Fully parse command line setOptions(a); ///////////////////// // Permutation class if ( par::random_seed == 0 ) CRandom::srand(time(0)); else CRandom::srand( par::random_seed ); Perm perm(P); P.pperm = & perm; //////////////// // Check version if (par::web_check) P.webcheck(a); else P.printLOG("Skipping web check... 
[ --noweb ] \n"); ///////////// // Time stamp P.printLOG("Writing this text to log file [ "+ par::output_file_name + ".log ]\n"); time_t curr=time(0); string tdstamp = (string)ctime(&curr); P.printLOG("Analysis started: " + tdstamp +"\n"); ///////////////////////////////////// // Validate and record all arguments a.check_unused_options(P); if ( par::output_file_name.find(".",0) != string::npos ) P.printLOG("** For gPLINK compatibility, do not use '.' in --out **\n"); ////////////////////////// // Some basic definitions if (par::species_dog) defineDogChromosomes(); else if (par::species_sheep) defineSheepChromosomes(); else if (par::species_cow) defineCowChromosomes(); else if (par::species_horse) defineHorseChromosomes(); else if (par::species_rice) defineRiceChromosomes(); else if (par::species_mouse) defineMouseChromosomes(); else defineHumanChromosomes(); /////////////////////////////// // Web-based SNPServer lookup? if ( par::lookup ) { P.lookup(); shutdown(); } if ( par::lookup2 ) { P.lookup2(); shutdown(); } ///////////////////////// // ID helper? if ( par::idhelp ) { IDHelper ID; ID.idHelp(); shutdown(); } ///////////////////////// // File compression utility if ( par::compress_file ) { fileCompress(); shutdown(); } if ( par::uncompress_file ) { fileUncompress(); shutdown(); } ////////////////////////////////////////////////// // Main Input files // Simulate or read in data: // Binary or ASCII format; transposed/long/generic if (par::dummy) P.dummyLoader(); else if (par::greport) P.displayGeneReport(); else if (par::annot_file) P.annotateFile(); else if (par::meta_analysis) P.metaAnalysis(); else if (par::rare_test_score_range) P.displayRareRange(); else if (par::simul) { if ( par::simul_qt ) P.simulateSNPs_QT(); else P.simulateSNPs(); } else if (par::cnv_list) P.setUpForCNVList(); else if (par::read_bitfile) P.readBinData(); else if (par::lfile_input) P.readDataLongFormat(); else if (par::tfile_input) P.readTransposedData(); else if (par::read_ped) P.readData(); else if (par::gvar) { par::load_gvar=true; P.readGenericVariantData(); } else if ( par::dosage_assoc ) { P.readFamFile(par::famfile); if ( par::dosage_hasMap ) { checkFileExists( par::mapfile ); vector include; vector include_pos(0); int nvar = 0; P.readMapFile(par::mapfile, include, include_pos, nvar); } } // Set number of individuals P.n = P.sample.size(); // Set number of pairs P.np = (int)((double)(P.n*(P.n-1))/(double)2); // Total number of all (test+background) loci P.nl_all = P.locus.size(); // Number of permutations to store P.maxr2.resize(par::replicates); // Check for duplicate individual or SNP names checkDupes(P); ///////////////////////////////////// // Merge with a secondary data file // Standard (non-list) mode if (par::merge_data && !par::merge_list) { if (par::merge_binary) P.mergeBinaryData(); else P.mergeData(); // Reset number of individuals P.n = P.sample.size(); // Set number of pairs P.np = (int)((double)(P.n*(P.n-1))/(double)2); // Total number of all (test+background) loci P.nl_all = P.locus.size(); } ///////////////////////////////////// // Merge with a secondary data file // List mode if (par::merge_list) P.mergeList(); ////////////////////////////////////////// // A different phenotype file specified? 
if (par::pheno_file) P.readPhenoFile(); else if (par::make_pheno) P.makePhenotype(); else if (par::multiple_phenotypes) P.readMultiplePhenoFile(); //////////////////////////////// // Remove any individuals with // missing phenotypes if (!par::ignore_phenotypes) removeMissingPhenotypes(P); ////////////////////////////////// // Binary affection status coding if (par::bt) affCoding(P); ///////////////////////////////// // Update MAP file information? if (par::update_map) P.updateMapFile(); ///////////////////////////////// // Update FAM information? if (par::update_ids || par::update_parents || par::update_sex || par::update_pheno ) P.updateFamFile(); ///////////////////////////////// // Update allele file information? if (par::update_alleles) P.updateAlleles(); ///////////////////////////////// // Flip DNA strand for any SNPs? if (par::flip_strand) P.flipStrand(); ///////////////////////////////// // Recode any alleles? if (par::recode_ACGT || par::recode_1234) P.alleleRecoding(); ////////////////////////////////////////////////////////// // Output a specific set of SNPs (--extract or --exclude) if ( par::extract_before_exclude ) { if (par::extract_set) P.extractExcludeSet(false); if (par::exclude_set) P.extractExcludeSet(true); } else { if (par::exclude_set) P.extractExcludeSet(true); if (par::extract_set) P.extractExcludeSet(false); } ///////////////////////////////////////////////////////////// // Output a specific set of individuals --remove or --keep if ( par::remove_before_keep ) { if (par::remove_indiv) P.removeIndividuals(false); if (par::keep_indiv) P.removeIndividuals(true); } else { if (par::keep_indiv) P.removeIndividuals(true); if (par::remove_indiv) P.removeIndividuals(false); } /////////////////////////////////////////////// // Filter based on attribute files if ( par::snp_attrib_filter ) P.attribFilterSNP(); if ( par::ind_attrib_filter ) P.attribFilterInd(); /////////////////////////////////////////////// // Filter based on qualiy scores if ( par::read_snp_qual ) P.filterQualSNPs(); if ( par::read_geno_qual ) P.filterQualGenotypes(); ////////////////////////////////////////////////// // Pull a random subset of SNPs? if ( par::thin_snps ) P.thinSNPs(); ///////////////////////////////////////////////////////////// // If in --genome list mode, keep the two lists of individuals if (par::genome_2sets) { P.keep2SetsForGenome(); } /////////////////////////////////////////////// // Read a list of obligatory missing genotypes? if (par::oblig_missing) P.setObligMissing(); ////////////////////////////////////////////////// // Filter individuals based on external covariate? if (par::filter_on_covar) { P.filterOnCovariate(); // Reset number of individuals P.n = P.sample.size(); P.np = (int)((double)(P.n*(P.n-1))/(double)2); } //////////////////////////// // Any simple preset filters if (par::filter_males) P.filterOnMale(); else if (par::filter_females) P.filterOnFemale(); if (par::filter_cases) P.filterOnCase(); else if (par::filter_controls) P.filterOnControl(); if (par::filter_founders) P.filterOnFounder(); else if (par::filter_nonfounders) P.filterOnNonFounder(); //////////////////////////////// // A covariate file specified? if (par::covar_file) { // Multiple covariates? 
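// (Assumed layout, following the usual PLINK conventions for --covar; shown only
//  to illustrate the two readers invoked just below. Whitespace-delimited, keyed
//  by family and individual ID, one column per covariate; --covar-name or
//  --covar-number restricts which columns the multi-covariate reader loads into
//  the clist structures, whereas the single-covariate reader keeps one value per
//  person:)
//
//    FAM1  ID1   45   27.3
//    FAM1  ID2   52   24.1
//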
if (par::clist) { if (!P.readCovListFile()) error("Problem reading the covariates"); } else // a single covariate { if (!P.readCovariateFile()) error("Problem reading the specified covariate from the covariate file"); } } ////////////////////////////////////// // Assign cluster solution from file if (par::include_cluster_from_file) { P.printLOG("Reading clusters from [ " + par::include_cluster_filename+" ]\n"); if (!P.readClusterFile()) error("Problem reading from [ "+par::include_cluster_filename+" ]"); } else if ( par::sol_family ) { P.printLOG("Setting clusters based on family IDs\n"); vector famlist; P.kname.resize(0); for (int i=0; ifid == famlist[j]) { match=true; person->sol=j; } if (!match) { famlist.push_back(person->fid); person->sol=famlist.size()-1; P.kname.push_back(person->fid); } } // Set number of clusters/families P.nk = famlist.size(); // Set klist variable P.klist.clear(); for (int j=0;jsol > -1 ) P.klist[P.sample[i]->sol]->person.push_back(P.sample[i]); } else { P.klist.clear(); P.klist.push_back( new Cluster ); for (int i=0; iperson.push_back(P.sample[i]); } ///////////////////////////////////////// // Zero-out specific sets of genotypes? if ( par::zero_cluster ) P.zeroOnCluster(); ///////////////////////////////// // Fix reference allele? if ( par::set_reference_allele ) P.setReferenceAllele(); ////////////////////////////////// // Determine formats for printing P.prettyPrintLengths(); ////////////////////////////////////////////////// // // // Process a dosage file // // // ////////////////////////////////////////////////// if ( par::dosage_assoc ) { // Normal behavior is to load data, and perform // analysis; if the hard-call option is specified, // then this will generate a dataset, that we can // subsequent filter and save, etc, as usual, i.e. // in that case, do not halt P.processDosageFile(); if ( ! par::dosage_hard_call ) shutdown(); } ////////////////////////////////////////////////// // // // Handle CNV segments separately // // // ////////////////////////////////////////////////// if ( par::cnv_list ) { P.readCNVList(); P.processCNVList(); shutdown(); } ////////////////////////////////////////////////// // // // Handle non-SNP data separately // // // ////////////////////////////////////////////////// if ( par::gvar || par::gvar_write ) { // We might want to load generic variants on top // of existing SNP data; or afresh if none of the // above have been specified if ( ! par::load_gvar ) P.readGenericVariantData(); if ( par::gvar_write ) { P.outputGenericVariantFile(); shutdown(); } P.processGVAR(); shutdown(); } ////////////////////////////////////////////////// // // // Misc. .genome grouper utility // // // ////////////////////////////////////////////////// if ( par::genome_groups ) { P.groupGenome(); shutdown(); } ////////////////////////////////// // Missing code // if ( par::bt && ! 
par::missing_phenotype_explicit ) // par::missing_phenotype = "0"; ////////////////////////////////////////////////// // // // Basic MAF, genotyping filters & HWE/ME // // // ////////////////////////////////////////////////// P.printLOG("Before frequency and genotyping pruning, there are " +int2str(P.nl_all)+" SNPs\n"); if (!par::FIXED_p) { P.filterSNPs(); } else for (int i=0; iallele1=="1") P.locus[i]->freq = par::FIX_p; else P.locus[i]->freq = 1-par::FIX_p; } P.printLOG("After frequency and genotyping pruning, there are " +int2str(P.nl_all)+" SNPs\n"); if ( P.nl_all == 0 ) error("Stopping as there are no SNPs left for analysis\n"); if ( P.n == 0 ) error("Stopping as there are no individuals left for analysis\n"); ////////////////////////////////////////////////// // Re-report basic case/control counts, etc summaryBasics(P); ////////////////////////////////////////////////// // Any null allele codes (monomorhpics)? for (int l=0; lallele1 == "" ) P.locus[l]->allele1 = "0"; } ///////////////////////////////////////// // SET statistics? if ( par::read_set ) P.readSet(); else if (par::make_set) P.outputSetFile(); Set S( P.snpset ); P.pS = & S; // Remove any SNPs not in a set // unless using particular commands // (set-by-all epistasis, set-table) if ( par::read_set || par::make_set ) { if ( par::drop_sets ) P.pS->dropNotSet(P); } ////////////////////////////////////////////////// // Build final marker scaffold makeScaffold(P); ////////////////////////////////////////////////// // // // Create family units? // // // ////////////////////////////////////////////////// /////////////////////// // Create family units? if (par::MENDEL_test || par::MENDEL_report || par::TDT_test || par::QTDT_test || par::make_founders && !par::built_families) { map fnd; map idmap; P.linkRelateds(idmap, fnd); P.parseTrios(); par::built_families = true; // Perform now, so that the user has an option to // save a new fileset with mendel errors removed if (par::MENDEL_report || par::MENDEL_test) P.checkMendel(); } //////////////////////////////////////////////// // Reset PAT/MAT codes of any non- nonfounders? // i.e. if parents not actually present in sample? if ( par::make_founders ) { P.makeFounders(); } ////////////////////////////////////// // Sex check if (par::check_sex) { P.sexCheck(); } ////////////////////////////////////// // Split TDT units to case/controls if ( par::tucc ) { if ( !par::built_families ) { map fnd; map idmap; P.linkRelateds(idmap, fnd); P.parseTrios(); par::built_families = true; } P.checkMendel(); P.pseudoCaseControl(); } ////////////////////////////////////////////////// // // // Haplotype imputation methods // // // ////////////////////////////////////////////////// // Do not use this old IMPUTATION method // Restrict to --proxy-impute, or original // --hap-impute (i.e. 
based on multi-marker list) if ( par::meta_large_phase ) { // Automatically try to impute all one window per chromosome // We can put in some other restraints here if need be if (par::has_nonfounders && !par::built_families) { map fnd; map idmap; P.linkRelateds(idmap, fnd); P.parseTrios(); P.checkMendel(); par::built_families = true; } P.printLOG("Estimating haplotype frequencies/phases ( MHF >= " + dbl2str(par::min_hf)+" )\n"); P.printLOG("Considering phases P(H|G) >= " +dbl2str(par::hap_min_phase_prob)+"\n"); P.printLOG("Requiring per individual per haplotype missingness < " +dbl2str(par::hap_missing_geno)+" \n"); P.printLOG("Initial EM window size " + int2str(par::haplo_plem_window) + " SNPs with " + int2str(par::haplo_plem_overlap) + " SNP overlap\n"); // Count number of founders P.haplo->cnt_f = 0; vector::iterator person = P.sample.begin(); while ( person != P.sample.end() ) { if ( (*person)->founder ) P.haplo->cnt_f++; person++; } if (P.haplo->cnt_fnonfounders = true; P.printLOG("Initial phasing based on "+ int2str(P.haplo->cnt_f)+" founders ("+ int2str(P.n-P.haplo->cnt_f)+ " non-founders)\n"); } // Start off just with the autosomes // We assume that "--chr" has been specified on the command line, // and so we are only dealing with a single chromosome here if ( par::impute_verbose ) { P.printLOG("Writing verbose imputation output to [ " +par::output_file_name + ".phased.out ]\n"); P.haplo->HIMPUTE.open((par::output_file_name+".phased.out").c_str(), ios::out); P.haplo->HIMPUTE.setf(ios::fixed); P.haplo->HIMPUTE.precision(2); } // Run imputation in blocks of up to 1000 SNPs P.haplo->makeSlidingWindow( "20+20" ); P.haplo->phaseAllHaplotypes(true,perm); if ( par::impute_verbose ) P.haplo->HIMPUTE.close(); } //////////////////////////////////// // Proxy-based haplotype imputation if (par::proxy_impute) { P.proxyWrapper(); // Do not shut down: we assume a --make-bed will // be called below } ////////////////////////////////////////////////// // // // Generate dummy permuted phenotype file // // // ////////////////////////////////////////////////// if ( par::output_pheno_perm ) { P.outputPermedPhenotypes(perm); shutdown(); } ////////////////////////////////////////////////// // // // Output formats and transformations // // // ////////////////////////////////////////////////// // Covariate files can also be output (--covar) // for the major options: --make-bed, --recode* // and also just --write-covar option if (par::set_table) { P.setTable(); shutdown(); } if (par::write_set) { P.writeSetFile(); shutdown(); } if (par::dump_covar) { P.write_covariates(); shutdown(); } if (par::dump_clst) { P.write_clusters(); shutdown(); } if (par::write_snplist) { P.write_snplist(); shutdown(); } if (par::write_bitfile) { P.write_BITFILE(); if (par::clist) P.write_covariates(); shutdown(); } if ( par::recode_fastphase ) { P.output_fastphase_format(); shutdown(); } if ( par::recode_bimbam ) { P.output_bimbam_format(); shutdown(); } if ( par::recode_structure ) { P.output_structure_format(); shutdown(); } if (par::recode || par::recode_HV || par::recode_12 || par::recode_whap ) { if ( ! 
par::recode_transpose ) P.display_recoded_PEDFILE(); else P.display_recoded_PEDFILE_transpose(); if (par::clist) P.write_covariates(); shutdown(); } if (par::recode_AD) { P.display_recoded_PEDFILE_AD(); if (par::clist) P.write_covariates(); shutdown(); } if (par::recode_long) { P.display_recoded_LONG(); if (par::clist) P.write_covariates(); shutdown(); } if (par::recode_mutlist) { P.display_recoded_MUTLIST(); if (par::clist) P.write_covariates(); shutdown(); } if (par::list_by_allele) { P.display_listByAllele(); shutdown(); } if (par::plist) { P.display_pairList(); } if (par::indiv_report) { P.display_indivReport(); shutdown(); } if (par::list_twolocus) { P.display_twolocus(); shutdown(); } ////////////////////////////////////////////////// // // // LD-based lookups // // // ////////////////////////////////////////////////// //////////////////////////////////////////// // Set summary statistics if (par::set_screen) { P.setAssocSummary(); shutdown(); } //////////////////////////////////////////// // LD-based clumping if ( par::clumpld ) { clump_LD cld(&P,P.haplo, par::clumpld_p1, par::clumpld_kb, par::clumpld_p2, par::clumpld_r2); cld.clump(); shutdown(); } //////////////////////////////////////////// // Show tags if ( par::gettag_mode ) { P.tagMode(); shutdown(); } //////////////////////////////////////////// // Haplotype block action if ( par::make_blocks ) { P.mkBlks(0,P.nl_all-1); shutdown(); } ////////////////////////////////////////////////// // // // Main set of whole-genome tests // // // ////////////////////////////////////////////////// ///////////////////////////////// // Some initial set-up work here ///////////////////// // Conditioning SNPs if (par::conditioning_snps) { if (par::conditioning_snp_single) { // ** todo ** change this to allow a NList int x = getMarkerNumber( P, par::conditioning_snp_name ); if (x<0) error("Marker " +par::conditioning_snp_name +" does not exist in filtered data\n"); P.conditioner.push_back( x ); P.conditioner_mask.push_back( false ); } else P.readConditioningList(); } ////////////////////////////////////////// // Warn if not enough markers in analysis if ( par::plink || par::cluster || par::cluster_plot || par::outlier_detection || par::genome_output || par::inbreeding ) { if (P.nl_all < 10000) P.printLOG("\n **Warning** this analysis typically requires whole-genome level data\n" " to give accurate results \n\n"); } ////////////////////////////////////////// // Arbitrary external functions if (par::myfunction) { if (1) { if (par::has_nonfounders && !par::built_families) { map fnd; map idmap; P.linkRelateds(idmap, fnd); P.parseTrios(); P.checkMendel(); par::built_families = true; } } // P.callMe(); shutdown(); } ////////////////////////////////////////////////// // // // IBS and IBD genome-wide analyses // // // ////////////////////////////////////////////////// ////////////////////////////////////////////// // Perform a cluster analysis and/or MDS plot if (par::cluster || par::cluster_plot || par::outlier_detection ) { P.buildCluster(); shutdown(); } /////////////////////////////////// // Permutation test between groups // based on IBS diffeences if (par::ibs_test) { P.permutationIBSTest(perm); shutdown(); } //////////////////////////////////////////////////// // Precalculate frequency-averaged P(IBD|IBS) table if (par::plink || par::genome_output) { if (par::has_nonfounders && !par::built_families) { map fnd; map idmap; P.linkRelateds(idmap, fnd); P.parseTrios(); // P.checkMendel(); // skip this when in --rel-check mode par::built_families = 
true; } // So that correct IBD expectation is calculated, // we need to fill in empty slots for missing parents P.makeMissingParents(); P.preCalcGenomeIBD(); } ////////////////////////////// // Genome-wide output only if (par::genome_output) { P.displayGenomeWideInfo(); if ( par::genome_test ) P.testGenomeIBDByCovariate(perm); shutdown(); } ////////////////////////////////////// // Genome-wide inbreeding output only if (par::inbreeding) { if (par::SNP_major) P.SNP2Ind(); ofstream HET; string f = par::output_file_name + ".het"; HET.open(f.c_str(),ios::out); HET.precision(4); P.printLOG("Writing individual heterozygosity information to [ "+f+" ] \n"); HET << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(12) << "O(HOM)" << " " << setw(12) << "E(HOM)" << " " << setw(12) << "N(NM)" << " " << setw(12) << "F" << "\n"; for (int i1=0; i1makeSlidingWindow(par::sliding_window_size); else if (par::hap_specific_snps) P.haplo->setSpecificSNPs(par::hap_specific_snps_list); else P.haplo->readTagFile(); } if (par::has_nonfounders && !par::built_families) { map fnd; map idmap; P.linkRelateds(idmap, fnd); P.parseTrios(); P.checkMendel(); par::built_families = true; } P.printLOG("Estimating haplotype frequencies/phases ( MHF >= " + dbl2str(par::min_hf)+" )\n"); P.printLOG("Considering phases P(H|G) >= " +dbl2str(par::hap_min_phase_prob)+"\n"); P.printLOG("Requiring per individual per haplotype missingness < " +dbl2str(par::hap_missing_geno)+" \n"); // Count number of founders P.haplo->cnt_f = 0; vector::iterator person = P.sample.begin(); while ( person != P.sample.end() ) { if ( (*person)->founder ) P.haplo->cnt_f++; person++; } if (P.haplo->cnt_fnonfounders = true; P.printLOG("Initial phasing based on "+ int2str(P.haplo->cnt_f)+" founders ("+ int2str(P.n-P.haplo->cnt_f)+ " non-founders)\n"); } if (P.n == P.haplo->cnt_f && ( par::test_hap_TDT || par::proxy_TDT ) ) error("Can not perform TDT in sample with no non-founders"); } ///////////////////////// // Haplotype frequencies if (par::phase_snps && par::display_hap_freqs) { P.haplo->calculateHaplotypeFrequencies(); shutdown(); } //////////////////////////////// // Haplotype phase probabilities if (par::phase_snps && par::display_phase_probs) { P.haplo->calculateHaplotypeFrequencies(); shutdown(); } ///////////////////////////////////////////// // Haplotypic test of non-random missing data if (par::mishap_test) { P.performMisHapTests(); shutdown(); } //////////////////////////////////////////////////// // Haplotype tracking of an extended region, for an // individual or pair if (par::phase_snps && par::segment_haplotrack) { P.haplo->trackSharedHaplotypes(); shutdown(); } //////////////////////////////////////////////////// // Haplotype tracking of an extended region, for an // individual or pair if (par::phase_snps && par::impute_tags ) { P.haplo->imputeAllHaplotypes(); shutdown(); } /////////////////////////////////////////////////////// // Haplotypic test of SNP proxy (convenience function) // (we've already done the imputation step above) if (par::proxy_assoc && ! 
par::proxy_impute) { P.proxyWrapper(); shutdown(); } ////////////////////////////////////////////////// // // // Misc tests that do not fall within the // // main phenotype loop // // // ////////////////////////////////////////////////// ////////////////////////////// // Genome-wide IBS sharing test if (par::ibs_sharing_test) { P.perm_sharingIBSTest(perm); shutdown(); } ////////////////////////////// // Gene-based test of epistasis if (par::epi_genebased) { P.driverSCREEPI(); shutdown(); } ////////////////////////////// // Genome-wide epistasis tests if (par::epistasis) { P.calcEpistasis(); shutdown(); } ///////////////////////////////////////////// // Determine per-individual risk profiles if (par::score_risk) { P.scoreIndividuals(); shutdown(); } ////////////////////////////////// // Apply an R-script to the data? if ( par::run_R_script ) { #ifdef WITH_R_PLUGINS P.Rfunc(); shutdown(); #else error("R plugin support has not been compiled in"); #endif } ////////////////////////////////////////////////// // // // Genome-wide association tests // // // ////////////////////////////////////////////////// // Allow for the fact that we might be iterating // over multiple phenotypes string original_file_root = par::output_file_name; if ( ! par::plink ) while ( 1 ) { if ( par::all_pheno ) { if ( par::loop_over ) { P.phenoLabel = P.kname[ par::loop_counter ]; par::output_file_name = original_file_root + "." + P.phenoLabel; par::bt = true; par::qt = false; for (int i=0; imissing = false; P.sample[i]->aff = P.sample[i]->sol == par::loop_counter ? true : false ; } } else { if ( P.phenotype_name == "" ) P.phenoLabel = "P"+int2str(par::mult_pheno); else P.phenoLabel = P.phenotype_name; par::output_file_name = original_file_root + "." + P.phenoLabel; } } if (par::assoc_test) { if (par::CMH_test_2) P.calcMH(); else if (par::OR_homog_test) P.calcHomog(); else if (par::QTDT_test) { // Force a Mendel error check if (! (par::MENDEL_report || par::MENDEL_test) ) P.checkMendel(); P.perm_testQTDT(perm); } else if (par::boot) { // Redundant error("Bootstrap option is no longer supported\n"); P.calcAssociationWithBootstrap(); } else { // Includes // basic allelic test // model-based tests // linear & logistic models // 2x2xK Cochran-Mantel-Haenszel P.calcAssociationWithPermutation(perm); } if ( ! par::all_pheno ) shutdown(); } ///////////////////////////////// // Haplotype association analysis if (par::phase_snps && ( par::test_hap_CC || par::test_hap_GLM || par::test_hap_QTL || par::test_hap_TDT ) ) { // This is done separaytely, via the main // assoc. 
loop if (par::test_hap_GLM) P.calcAssociationWithPermutation(perm); else { //////////////////////////////////////////////// // Perform omnibus and haplotype-specific tests string f; if ( par::test_hap_CC ) f = par::output_file_name + ".assoc.hap"; else if ( par::test_hap_QTL ) f = par::output_file_name + ".qassoc.hap"; else if ( par::test_hap_TDT ) f = par::output_file_name + ".tdt.hap"; if (par::test_hap_CC) { P.printLOG("Writing haplotype association statistics to [ " + f + " ]\n"); P.haplo->HTEST.open(f.c_str(), ios::out); P.haplo->HTEST.precision(4); P.haplo->HTEST << setw(10) << "LOCUS" << " " << setw(12) << "HAPLOTYPE" << " " << setw(10) << "F_A" << " " << setw(10) << "F_U" << " " << setw(10) << "CHISQ" << " " << setw(4) << "DF" << " " << setw(10) << "P" << " " << "SNPS" << "\n"; } if ( par::test_hap_QTL ) { P.printLOG("Writing haplotype association statistics to [ " + f + " ]\n"); P.haplo->HTEST.open(f.c_str(), ios::out); P.haplo->HTEST.precision(4); P.haplo->HTEST << setw(10) << "LOCUS" << " " << setw(12) << "HAPLOTYPE" << " " << setw(8) << "NANAL" << " " << setw(10) << "BETA" << " " << setw(10) << "R2" << " " << setw(8) << "STAT" << " " << setw(10) << "P" << " " << "SNPS" << "\n"; } if (par::test_hap_TDT) { P.printLOG("Writing haplotype TDT statistics to [ " + f + " ]\n"); P.haplo->HTEST.open(f.c_str(), ios::out); P.haplo->HTEST.precision(4); P.haplo->HTEST << setw(10) << "LOCUS" << " " << setw(12) << "HAPLOTYPE" << " " << setw(10) << "T" << " " << setw(10) << "U" << " " << setw(10) << "CHISQ" << " " << setw(10) << "P" << " " << "SNPS" << "\n"; } P.haplo->phaseAllHaplotypes(true,perm); P.haplo->HTEST.close(); } if ( ! par::all_pheno ) shutdown(); } ////////////////////////////////////////////////////// // Haplotypic conditional tests (WHAP implementation, // now called CHAP, for conditional haplotype if (par::chap_test) { P.conditionalHaplotypeTest(true,perm); if ( ! par::all_pheno ) shutdown(); } ////////////////////////////// // QTL interaction test if (par::assoc_gxe) { P.perm_testGXE2(perm); if ( ! par::all_pheno ) shutdown(); } ///////////////////////////// // Rare allele test if ( par::elf_baseline ) { P.elfBaseline(); shutdown(); } if (par::rare_test) { P.permTestRareDistribution(perm); if ( ! par::all_pheno ) shutdown(); } ///////////////////////// // Hotelling's T^2 test if (par::hotel) { P.perm_testHotel(perm); if ( ! par::all_pheno ) shutdown(); } /////////////////////////////////// // Test difference in missing rates if (par::test_missing) { P.calcAssociationWithPermutation(perm); if ( ! par::all_pheno ) shutdown(); } ////////////////////////////////// // Genome-wide family-based (TDT) // and Parent-of-origin analysis if (par::TDT_test) { // Force a Mendel error check, if we have not // already if (! (par::MENDEL_report || par::MENDEL_test) ) P.checkMendel(); // Either basic TDT or Parent-Of-Origin analysis if (par::parent_of_origin) P.perm_testTDT_POO(perm); else if (par::sibTDT_test) P.perm_testTDT(perm); else P.perm_testTDT(perm); if ( ! par::all_pheno ) shutdown(); } // Read next phenotype: repeat, or shutdown if ( par::all_pheno ) { if ( par::loop_over ) { // Construct next phenotype from cluster file par::loop_counter++; if ( par::loop_counter == P.nk ) shutdown(); } else { // Read next phenotype from file par::mult_pheno++; if ( ! P.readPhenoFile() ) shutdown(); // and recode, if a binary affection status coding if (par::bt) affCoding(P); } } if ( ! 
par::all_pheno ) shutdown(); } // Next potential phenotype ////////////////////////////////////////////////// // // // PLINK segmental sharing analyses // // // ////////////////////////////////////////////////// // Stop now, unless a plink analysis is specified if (!par::plink) shutdown(); if (par::SNP_major) P.SNP2Ind(); ////////////////////////////////////////////// // Read pre-computed segment list and perform // segmental tests? if (par::read_segment_file) { ifstream SEG; SEG.open(par::read_segment_filename.c_str(),ios::in); P.printLOG("Reading IBD-segment information from [ " +par::read_segment_filename+" ]\n"); checkFileExists(par::read_segment_filename); if (par::segment_minimal) P.readSegmentFileMinimal(SEG); else P.readSegmentFile(SEG); SEG.close(); // IBS validation of segments (i.e. possibly in a larger // datafile? but one that must be a superset of all SNPs in // segment file) if (false) { P.validateSegments(); shutdown(); } // Find overlap in segments? if (par::segment_overlap) P.summariseHomoRuns(); // Per-individual summary/test? if ( par::segment_test_individual ) { P.segmentIndividualTest(perm); shutdown(); } // Perform pairwise summary/analysis of segments? P.summaryIBSsegments(perm); P.printLOG("Writing segment summary to [ " + par::output_file_name + ".segment.indiv ]\n\n"); P.indivSegmentSummary(); shutdown(); } ////////////////////////////// // Pair inclusion/exclusion // Number of informative pairs int c=0; // Read or calculate informative pairs? if (par::ibd_read) c = P.readInformative(); else c = P.calcInformative(); //////////////////////////////////////////////// // Test of genome-wide relatedness by covariate if ( par::genome_test ) { P.testGenomeIBDByCovariate(perm); shutdown(); } /////////////////////////////////// // Save pairs to be included? i.e. // after removing all pairs for // a) low IBD // b) not being an affected pair // c) being a concordant unaffected pair if (par::inc_write) P.writeInformative(); ///////////////////////////////////////////////////////////////// // Get and display information on chromosomal range to be tested // else if (par::singlepoint) // P.printLOG("Using singlepoint analysis mode\n"); else if (par::inter_grid>0) { stringstream s2; s2 << "Using multipoint analysis: step = " << par::inter_grid << " and fringe = " << par::fringe << " cM\n"; P.printLOG(s2.str()); } else { stringstream s2; s2 << "Using multipoint analysis: grid = " << par::grid << " and fringe = " << par::fringe << " cM\n"; P.printLOG(s2.str()); } vector chrs; if (par::run_chr==0) { vector r = getChromosomeRange(P); P.printLOG("\nScanning from autosomes from chromosome "+ chromosomeName( r[0] ) + " to "+ chromosomeName( r[1] ) + "\n\n"); for (int i=r[0];i<=r[1];i++) if ( ( !par::chr_haploid[i] ) && ( !par::chr_sex[i] ) ) chrs.push_back(i); } else chrs.push_back(par::run_chr); ofstream SEG; if (par::segment_output) { string f = par::output_file_name + ".segment"; SEG.open(f.c_str(),ios::out); P.printLOG("Writing IBD-segment information to [ "+f+" ]\n"); if (par::segment_minimal) P.printLOG("Minimal segment file format\n"); // Header row for non-minimal format if (! 
par::segment_minimal) { SEG << setw(par::pp_maxfid) << "FID1" << " " << setw(par::pp_maxiid) << "IID1" << " " << setw(par::pp_maxfid) << "FID2" << " " << setw(par::pp_maxiid) << "IID2" << " "; if (par::bt) SEG << setw(4) << "PHE" << " "; SEG << setw(4) << "CHR" << " " << setw(10) << "BP1" << " " << setw(10) << "BP2" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(par::pp_maxsnp) << "SNP2" << " " << setw(6) << "NSNP" << " " << setw(10) << "KB" << "\n"; } f = par::output_file_name + ".segment.summary"; P.printLOG("Writing IBD-segment summary to [ "+f+" ]\n\n"); P.printLOG("Minimum segment length is " +dbl2str((double)par::segment_length/(double)1000) +" kb and "+int2str(par::segment_snp)+" SNPs\n"); P.printLOG("Segment thresholds are "+dbl2str(par::segment_threshold_start) +" and "+dbl2str(par::segment_threshold_finish)+"\n"); P.printLOG("Maximum intra-segment inter-SNP distance is " +int2str(par::segment_inter_snp_distance) +"\n"); } ofstream MP; if (par::multi_output) { string f = par::output_file_name + ".multi"; MP.open(f.c_str(), ios::out); MP.setf(ios::fixed); MP.precision(5); P.printLOG("Writing multipoint IBD estimates to [ "+ f+" ]\n"); } ofstream GMULTI; if (par::gmulti_output) { string f = par::output_file_name + ".gmulti"; GMULTI.open(f.c_str(), ios::out); GMULTI.precision(4); P.printLOG("Writing genotype/multipoint IBD estimates to [ "+ f +" ]\n"); } ////////////////////////////// // Consider each chromosome for (int i=0;isol != p2->sol ) continue; } ///////////////////////////////// // 1. Calculate IBD(g) | IBS(g) Z IBDg = P.saved_IBDg[c2]; if (!par::silent) { cout << "IBD calculation: " << ++c2 << " of " << c << " \r"; cout.flush(); } ///////////////////////////////// // 2. Calculate IBD(l) - IBD(g) vector IBDl = P.calcLocusIBD(p1,p2,IBDg); ///////////////////////////////// // 3. Multipoint calculation P.pairid = itoa((int)p1->phenotype,10) +" "+itoa((int)p2->phenotype,10)+" "; P.pairid += itoa((int)c2,10) + " "; P.pairid += p1->fid+"_"+p1->iid+"_ "; P.pairid += p2->fid+"_"+p2->iid+"_"; vector_t p; ////////////////////////////////// // Perform either using // Singlepoint analysis // Multipoint analysis (default) // if (par::singlepoint) // p = P.calcSinglePoint(IBDl,IBDg); p = P.calcMultiPoint(IBDl,IBDg,MP); //////////////////////////////// // 3b. Verbose output: // genotypes for each pair if (par::gmulti_output) { for (int l=par::run_start; l<=par::run_end; l++) P.displayGMULTI(p1,p2,l,GMULTI); } /////////////////////////////// // 4. Scan for segments of IBD P.findSegments(i1,i2,p,SEG); ///////////////////////////// // 5. Add to list // Do not bother saving for now... // only save segments... 
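// (The pair indices recorded just below are later collapsed into the unique set
//  of informative individuals using the standard STL sort/unique/erase idiom.
//  A minimal stand-alone sketch of that idiom, assuming plain integer indices
//  and that <vector> and <algorithm> are available:)
//
//    std::vector<int> ids;                        // indices from pair1 then pair2
//    // ... fill ids with both members of every retained pair ...
//    std::sort(ids.begin(), ids.end());
//    ids.erase(std::unique(ids.begin(), ids.end()), ids.end());
//    // ids now lists each informative individual exactly once
//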
if (false) P.pihat.push_back(p); // And (A,B) pair to list P.pair1.push_back(i1); P.pair2.push_back(i2); } } if (!par::silent) cout << "\n"; ///////////////////////////////// // Make list of unique individuals // copy first set of individuals P.in_anal = P.pair1; for (unsigned int ind=0; ind::iterator new_end= unique(P.in_anal.begin(), P.in_anal.end()); // delete all elements past new_end P.in_anal.erase(new_end, P.in_anal.end()); P.printLOG(int2str(P.in_anal.size())+ " unique, informative individuals in analysis\n"); if ( P.in_anal.size() == 0 ) { error("No individuals left in analysis: halting"); } ///////////////////////////////// // Verbose output: summarise IBD if (par::segment_output) { // P.summaryIBDsegments(perm); } else if (par::summary_ibd_output) P.summaryIBD(); ///////////////////////////////// // Next chromosome par::done_global_pihat = true; if (!par::silent) cout << "\n"; } // Now do IBD segment (as IBS...) if (par::segment_output) { P.summaryIBSsegments(perm); P.indivSegmentSummary(); } ////////////////////////////// // Find overlap in segments? if (par::segment_overlap) P.summariseHomoRuns(); ////////////////////////////////////// // Shut segment and multipoint files if (par::segment_output) SEG.close(); if (par::multi_output) MP.close(); if (par::gmulti_output) GMULTI.close(); //////////////////////////////// // Output genome-wide p-values if (par::permute) if (chrs.size()>=1 && (!par::ignore_phenotypes)) P.displayGenomePV(); //////////////////////////////// // We're definitely done now shutdown(); } //////////////////////////////// // Clean-up void Plink::cleanUp() { for (int i=0; i #include #include #include #include #include #include #include "plink.h" #include "stats.h" #include "helper.h" #include "options.h" using namespace std; void Plink::tagMode() { //////////////////////////// // Look-up table by SNP name map mlocus; for (int l = 0 ; l < nl_all ; l++ ) mlocus.insert(make_pair( locus[l]->name, l )); set toTag; bool testAll = par::gettag_file == "all"; if ( !testAll ) { checkFileExists(par::gettag_file); printLOG("Reading SNPs to tag from [ " + par::gettag_file + " ] \n"); ifstream IN2(par::gettag_file.c_str(),ios::in); int cnt = 0; while ( ! IN2.eof() ) { string snp; string code; if ( par::gettag_mode1 ) IN2 >> snp; else if ( par::gettag_mode2 ) { vector tokens = tokenizeLine(IN2); if ( tokens.size() == 0 ) continue; if ( tokens.size() != 2 ) error("Expected two columns per line\n"); snp = tokens[0]; code = tokens[1]; if ( code == "0" ) continue; } if ( snp == "" ) continue; ++cnt; // Can we find this SNP? map::iterator i = mlocus.find( snp ); if ( i == mlocus.end() ) continue; // Add to list toTag.insert(i->second); } printLOG("Read " + int2str(cnt) + " SNPs to tag, of which " + int2str(toTag.size()) + " are unique and present\n"); IN2.close(); } else { printLOG("Setting to tag all " + int2str(nl_all) + " SNPs in dataset\n"); for (int l=0;l::iterator i = toTag.begin(); set tagged; while ( i != toTag.end() ) { // SNP to tag int l = *i; tagged.insert(l); set thisTagged; int dist_left = locus[l]->bp; int dist_right = locus[l]->bp; // Move forwards and backwards, within range, and // add any with r^2 above threshold to tagged set int j = l - 1; int chr = locus[l]->chr; int pos = locus[l]->bp; while (1) { if ( j < 0 ) break; if ( locus[j]->chr != chr ) break; if ( pos - locus[j]->bp > par::gettag_kb ) break; if ( ! 
par::gettag_listall ) { if ( tagged.find(j) != tagged.end() ) { --j; continue; } } double rsq = correlation2SNP( l,j,true,false); if ( realnum(rsq) && rsq >= par::gettag_r2 ) { tagged.insert(j); if ( par::gettag_listall ) { thisTagged.insert(j); if ( locus[j]->bp < dist_left ) dist_left = locus[j]->bp; } } --j; } // Now move right j = l+1; while (1) { if ( j == nl_all ) break; if ( locus[j]->chr != chr ) break; if ( locus[j]->bp - pos > par::gettag_kb ) break; if ( ! par::gettag_listall ) { if ( tagged.find(j) != tagged.end() ) { ++j; continue; } } double rsq = correlation2SNP( l,j,true,false); if ( realnum(rsq) && rsq >= par::gettag_r2 ) { tagged.insert(j); if ( par::gettag_listall ) { thisTagged.insert(j); if ( locus[j]->bp > dist_right ) dist_right = locus[j]->bp; } } ++j; } if ( par::gettag_listall ) { O2 << setw( par::pp_maxsnp ) << locus[l]->name << " " << setw(4) << locus[l]->chr << " " << setw(10) << locus[l]->bp << " " << setw(4) << thisTagged.size() << " " << setw(10) << dist_left << " " << setw(10) << dist_right << " " << setw(8) << (dist_right - dist_left ) / 1000.0 << " "; set::iterator s = thisTagged.begin(); bool first = true; while ( s != thisTagged.end() ) { if ( first ) { first = false; } else O2 << "|"; O2 << locus[*s]->name; ++s; } if ( first ) O2 << "NONE"; O2 << "\n"; } // Consider next SNP to tag ++i; } if ( par::gettag_listall ) O2.close(); if ( ! testAll ) { printLOG("In total, added " + int2str( tagged.size() - toTag.size() ) + " tag SNPs\n"); ofstream O1; printLOG("Writing tag list to [ " + par::output_file_name + ".tags ]\n"); O1.open( (par::output_file_name+".tags").c_str(), ios::out); // Mode 1 : just write list if ( par::gettag_mode1 ) { set::iterator l = tagged.begin(); while ( l != tagged.end() ) { O1 << locus[*l]->name << "\n"; ++l; } } // Mode 2 : Write all SNPs, with 0/1 code if ( par::gettag_mode2 ) { for (int l = 0; l < nl_all ; l++ ) { O1 << locus[l]->name << "\t"; if ( tagged.find(l) != tagged.end() ) O1 << "1\n"; else O1 << "0\n"; } } O1.close(); } } plink-1.07-src/crandom.h0000644000265600020320000000222311264127626014271 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __CRANDOM_H__ #define __CRANDOM_H__ #include using namespace std; class CRandom { public: static const int IA; static const int IM; static const int IQ; static const int IR; static const int NTAB; static const int NDIV; static const double EPS; static const double AM; static const double RNMX; // Current seed static int idum; static int iy; static vector iv; static void srand(long unsigned iseed = 0); static double rand(); static int rand (int); }; #endif plink-1.07-src/whap.cpp0000644000265600020320000010533111264127625014143 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "whap.h" #include "helper.h" #include "plink.h" #include "options.h" #include "perm.h" #include "nlist.h" #include "phase.h" #include "model.h" #include "linear.h" #include "logistic.h" #include "stats.h" ////////////////////////////////////////////////////////////// // // Conditional Haplotype tests (CHAP models) // // A null and alternate model specified in terms of // haplotypes (potentially grouped) // covariates // conditioning SNPs // // Only focus on allelic, autosomal main effects right now // // // SNPs done with --condition {list} // Covariates with --covar / --covar-name / --covar-number // Haplotypes with --hap-snps // // Covariates are always present under both alternate and null // SNPs can be dropped from the alternate (tested) // Haplotypes can be dropped (tested) and grouped // A helper function void displayHaploGroups(ofstream &, ChapModel &, HaploPhase *); string ci(double coef, double se) { string c = " ("; if ( (!par::bt) || par::return_beta ) { c += dbl2str( coef - par::ci_zt * se , 3 ) + "; " ; c += dbl2str( coef + par::ci_zt * se , 3 ) + " )" ; } else { c += dbl2str( exp( coef - par::ci_zt * se ), 3) + "; " ; c += dbl2str( exp( coef + par::ci_zt * se ), 3) + " )" ; } return c; } vector_t Plink::conditionalHaplotypeTest(bool print_results, Perm & perm) { /////////////////////////////////////////////// // // // Some basic setup first // // // /////////////////////////////////////////////// Chap thisCModel(this, haplo); whap = & thisCModel; ChapModel alternateModel; ChapModel nullModel; // Use basic GLM function to fit linear and logistic // models: although, let it know that there will not // be a 'main' SNP par::assoc_glm_without_main_snp = true; // Return a single result vector_t results(1); printLOG("Writing conditional haplotype tests to [ " + par::output_file_name + ".chap ]\n"); ofstream CH; CH.open((par::output_file_name+".chap").c_str(),ios::out); CH.precision(3); CH << "+++ PLINK conditional haplotype test results +++ \n\n"; /////////////////////////////////////////////// // // // Phase haplotypes // // // /////////////////////////////////////////////// haplo->phaseAllHaplotypes(false,*pperm); // Record the number of common haplotypes int nch = 0; for (int h=0; h < haplo->nh; h++) if ( haplo->f[h] >= par::min_hf ) ++nch; CH << haplo->ns << " SNPs, and " << nch << " common haplotypes ( MHF >= " << par::min_hf << " ) " << "from " << haplo->nh << " possible\n\n"; CH << setw(4) << "CHR" << " " << setw(12) << "BP" << " " << setw(12) << "SNP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(10) << "F" << "\n"; for (int s=0; s< haplo->ns; s++) CH << setw(4) << locus[haplo->S[s]]->chr << " " << setw(12) << locus[haplo->S[s]]->bp << " " << setw(12) << locus[haplo->S[s]]->name << " " << setw(4) << locus[haplo->S[s]]->allele1 << " " << setw(4) << locus[haplo->S[s]]->allele2 << " " << setw(10) << locus[haplo->S[s]]->freq << "\n"; CH << "\n"; if ( nch == 0 ) { results[0] = 0; CH << "Exiting... 
no common haplotypes to test\n"; CH.close(); return results; } /////////////////////////////////////////////// // // // Build models // // // /////////////////////////////////////////////// whap->setModels(alternateModel, nullModel); whap->build(alternateModel); whap->build(nullModel); /////////////////////////////////////////////// // // // Display models // // // /////////////////////////////////////////////// CH << "Haplogrouping: each {set} allowed a unique effect\n"; CH << "Alternate model\n"; displayHaploGroups(CH, alternateModel,haplo); CH << "Null model\n"; displayHaploGroups(CH, nullModel,haplo); CH << "\n"; if ( ! whap->isNested() ) error("The null model is not nested in the alternate: please respecify"); /////////////////////////////////////////////// // // // Fit alternate model // // // /////////////////////////////////////////////// whap->current = & alternateModel; glmAssoc(false,perm); Model * alternate = model; /////////////////////////////////////////////// // // // Fit null model // // // /////////////////////////////////////////////// // Ensure we have the same individuals vector missingInAlternate = alternate->getMissing(); for (int i=0; imissing2 = missingInAlternate[i]; whap->current = & nullModel; glmAssoc(false,perm); Model * null = model; /////////////////////////////////////////////// // // // Fit group-specific models // // // /////////////////////////////////////////////// // If there is more than 1 group, these models test each group // against all others, in both the alternate and null vector_t alt_specific_pval; bool hasAltSpecifics = alternateModel.group.size() > 2 && par::chap_add_grp_specifics; if ( hasAltSpecifics ) { //////////////////////////////////////////// // Create a model with no haplotype effects ChapModel simpleNullModel = nullModel; simpleNullModel.group.clear(); set t; for (int h=0; h< haplo->nh; h++) if ( haplo->f[h] >= par::min_hf ) t.insert(h); simpleNullModel.group.push_back(t); whap->current = & simpleNullModel; glmAssoc(false,perm); Model * simplenull = model; //////////////////////////////////////////// // Run all haplotype-specific models for (int h=0; h< haplo->nh; h++) { if ( haplo->f[h] < par::min_hf ) continue; ChapModel simpleAlternateModel = nullModel; simpleAlternateModel.group.clear(); simpleAlternateModel.group.resize(1); for (int h2=0; h2< haplo->nh; h2++) { if ( haplo->f[h2] < par::min_hf ) continue; if ( h2 == h ) { set t; t.insert(h); simpleAlternateModel.group.push_back(t); } else simpleAlternateModel.group[0].insert(h); } //////////////////////////////////////////// // Perform test, and model comparison whap->current = & simpleAlternateModel; glmAssoc(false,perm); Model * simplealternate = model; ////////////////////////// // Store test statistic alt_specific_pval.push_back( modelComparisonPValue(simplealternate, simplenull) ); // Next haplo-group } } /////////////////////////////////////////////// // // // Fit sub-null model // // // /////////////////////////////////////////////// // If the null model contains >1 group, then // perform the alternate:null comparisons // separately for each sub-group, if the null // still contains fewer parameters than the // alternate vector_t subnull_pval; int subnullModels = 0; // Worth doing? if ( nullModel.group.size() > 1 && alternateModel.group.size() - nullModel.group.size() > 1 ) { for (int g = 0; g < nullModel.group.size(); g++) { // For this null-group, do the haplotypes in the // alternate belong to >1 group? 
If so, perform a // separate test set::iterator ih = nullModel.group[g].begin(); set aGroup; while ( ih != nullModel.group[g].end() ) { aGroup.insert( alternateModel.haploGroup.find(*ih)->second ); ++ih; } if ( aGroup.size() > 1 ) { ++subnullModels; // Re-jig a new model, that is basically like the // alternate, except for this one group. ChapModel subnullModel = alternateModel; // Edit group only (haploGroup will not be used, // so ignore that for now...) set::iterator ai = aGroup.begin(); // Get first group: arbitrarily choose this group to // be the merged-to group int ng = *ai; // All other groups, merge with this first one ++ai; // Iterate over each alternate-model haplogroup to be merged while ( ai != aGroup.end() ) { // Iterate over all haplotypes in this haplogroup set::iterator s = alternateModel.group[ *ai ].begin(); while ( s != alternateModel.group[ *ai ].end() ) { subnullModel.group[ ng ].insert( *s ); ++s; } subnullModel.group[ *ai ].clear(); ++ai; } // Now erase empty groups for (int g=0; gcurrent = & subnullModel; glmAssoc(false,perm); Model * subnull = model; // Store test statistic subnull_pval.push_back( modelComparisonPValue(alternate, subnull) ); } else { subnull_pval.push_back(-1); } } } /////////////////////////////////////////////// // // // Report model comparisons // // // /////////////////////////////////////////////// // Check that both models converged if ( ! ( alternate->isValid() && null->isValid() ) ) error("Could not fit conditional haplotype models:\n " "collinearity issues from a badly-specified model\n"); vector_t coeff1 = alternate->getCoefs(); vector_t se1 = alternate->getSE(); vector_t coeff0 = null->getCoefs(); vector_t se0 = null->getSE(); vector label1 = alternate->label; vector label0 = null->label; ////////////////////////////////// // Convert to odds ratios? vector_t odds1 = coeff1; vector_t odds0 = coeff0; if ( par::bt ) { for (int h=0; hns+5 ) << "HAPLO" << " " << setw(10) << "FREQ" << " "; CH << setw( estimate_size ) << clabel_alternate << " "; if ( hasAltSpecifics ) CH << setw(12) << "SPEC(A)" << " "; CH << setw( estimate_size ) << clabel_null << " "; if ( subnullModels > 1 ) CH << setw(12) << "SUBNULL P" << " "; CH << "\n"; CH << setw( haplo->ns+5 ) << "-------" << " " << setw(10) << "------" << " "; CH << setw( estimate_size ) << cunder_alternate << " "; if ( hasAltSpecifics ) CH << setw(12) << "---------" << " "; CH << setw( estimate_size ) << cunder_null << " "; if ( subnullModels > 1 ) CH << setw(12) << "-----------" << " "; CH << "\n"; for ( int g=0; g::iterator ih = alternateModel.group[g2].begin(); bool printed = false; while ( ih != alternateModel.group[g2].end() ) { // Only display if this haplotype is in the // null group if ( nullModel.group[g].find( *ih ) == nullModel.group[g].end() ) { ++ih; continue; } CH << setw( haplo->ns+5 ) << haplo->haplotypeName( *ih ) << " " << setw(10) << haplo->f[ *ih ] << " "; if ( ! printed ) { if ( g2==0 ) CH << setw( estimate_size ) << "(-ref-)" << " "; else { int p = alternateModel.haploGroup.find(*ih)->second; if ( realnum( odds1[p] ) ) { string r = dbl2str( odds1[p] , 4 ); if ( par::display_ci ) r += ci( coeff1[p] , se1[ p ] ); CH << setw( estimate_size ) << r << " "; } else { CH << setw( estimate_size ) << "NA" << " "; } } if ( hasAltSpecifics ) CH << setw(12) << alt_specific_pval[ g2 ] << " "; printed = true; } else { CH << setw( estimate_size ) << "| " << " "; if ( hasAltSpecifics ) CH << setw(12) << " " << " "; } /////////////////////////////// // Display corresponding null if ( ! 
printed0 ) { if ( g==0 ) CH << setw( estimate_size ) << "(-ref-)" << " "; else { int p = nullModel.haploGroup.find(*ih)->second; if ( realnum( odds0[p] ) ) { string r = dbl2str( odds0[p] , 4 ); if ( par::display_ci ) r += ci( coeff0[p] , se0[ p ] ); CH << setw( estimate_size ) << r << " "; } else CH << setw( estimate_size ) << "NA" << " "; } } else CH << setw( estimate_size ) << "| " << " "; //////////////////////////////// // Display corresponding subnull? if ( subnullModels > 1 ) { if ( ! printed0 ) { if ( subnull_pval[g] < 0 ) CH << setw(12) << "n/a" << " "; else CH << setw(12) << subnull_pval[g] << " "; } } printed0 = true; CH << "\n"; ++ih; } // Delimiter alternate groups, unless we // are also about to delimit a null group } if ( g < nullModel.group.size() - 1 ) CH << "\n"; } CH << setw( haplo->ns+5 ) << "-------" << " " << setw(10) << "------" << " "; CH << setw( estimate_size ) << cunder_alternate << " "; if ( hasAltSpecifics ) CH << setw(12) << "---------" << " "; CH << setw( estimate_size ) << cunder_null << " "; if ( subnullModels > 1 ) CH << setw(12) << "-----------" << " "; CH << "\n"; ///////////////////////////////////////////////// // Display other covariates, conditioning SNPs // 0=intercept; 1 -> (H-1) haplotype-group coefficients; // then conditioning SNPs; then other covariates int p1 = alternateModel.group.size(); int p0 = nullModel.group.size(); // Only an intercept, then need to add 1 if ( p1 == 0 ) p1++; if ( p0 == 0 ) p0++; ///////////////////////////////////////////////////////// // Conditioning SNPs: will always feature in alternate; // may or may not feature in null if ( conditioner.size() > 0 ) { CH << "\n"; CH << setw( haplo->ns+5) << "SNPS" << " " << setw(12) << " " << " " << setw( estimate_size ) << clabel_alternate << " " << setw( estimate_size ) << clabel_null << "\n"; CH << setw( haplo->ns+5) << "-----" << " " << setw(12) << " " << " " << setw( estimate_size ) << cunder_alternate << " " << setw( estimate_size ) << cunder_null << "\n"; for (int s=0; s < conditioner.size(); s++) { CH << setw( haplo->ns+5 ) << label1[p1] << " " << setw(12) << " " << " "; if ( realnum( coeff1[ p1 ] ) ) { string r = dbl2str( coeff1[ p1 ] , 4 ); if ( par::display_ci ) r += ci( coeff1[ p1 ] , se1[ p1 ] ); CH << setw( estimate_size ) << r << " "; } else CH << setw( estimate_size ) << "NA" << " "; p1++; if ( nullModel.masked_conditioning_snps[s] ) { if ( realnum( coeff0[ p0 ] ) ) { string r = dbl2str( coeff0[ p0 ] , 4 ); if ( par::display_ci ) r += ci( coeff0[ p0 ] , se0[ p0 ] ); CH << setw( estimate_size ) << r << " "; } else CH << setw( estimate_size ) << "NA" << "\n"; p0++; } else CH << setw( estimate_size ) << "(dropped)" << "\n"; } } /////////////////////////////////////////////////////// // Other covariates: these will always feature in alternate // and null if ( par::clist && par::clist_number > 0 ) { CH << "\n"; CH << setw( haplo->ns+5) << "COVAR" << " " << setw(12) << " " << " " << setw( estimate_size ) << clabel_alternate << " " << setw( estimate_size ) << clabel_null << "\n"; CH << setw( haplo->ns+5) << "-----" << " " << setw(12) << " " << " " << setw( estimate_size ) << cunder_alternate << " " << setw( estimate_size ) << cunder_null << "\n"; for (int s=0; sns+5) << label1[p1] << " " << setw(12) << " " << " "; if ( ! 
par::display_ci ) { CH << setw( estimate_size ) << coeff1[ p1 ] << " " << setw( estimate_size ) << coeff0[ p0 ] << "\n"; } else { string r = dbl2str( coeff1[ p1 ] , 4 ) + ci( coeff1[ p1 ] , se1[ p1 ] ); CH << setw( estimate_size ) << r << " "; r = dbl2str( coeff0[ p0 ] , 4 ) + ci( coeff0[ p0 ] , se0[ p0 ] ); CH << setw( estimate_size ) << r << " "; } // Advance to next coefficient, ++p1; ++p0; } } if ( alternate->isSexInModel() ) { // Have we already displayed a header for covariates? if ( ! ( par::clist && par::clist_number > 0 ) ) { CH << "\n"; CH << setw( haplo->ns+5) << "COVAR" << " " << setw(12) << " " << " " << setw( estimate_size ) << clabel_alternate << " " << setw( estimate_size ) << clabel_null << "\n"; CH << setw( haplo->ns+5) << "-----" << " " << setw(12) << " " << " " << setw(estimate_size ) << cunder_alternate << " " << setw(estimate_size ) << cunder_null << "\n"; } CH << setw( haplo->ns+5 ) << label1[p1] << " " << setw(12) << " " << " "; if ( ! par::display_ci ) { CH << setw(estimate_size) << coeff1[ p1 ] << " " << setw(estimate_size) << coeff0[ p0 ] << "\n"; } else { string r = dbl2str( coeff1[ p1 ] , 4 ) + ci( coeff1[ p1 ] , se1[ p1 ] ); CH << setw( estimate_size ) << r << " "; r = dbl2str( coeff0[ p0 ] , 4 ) + ci( coeff0[ p0 ] , se0[ p0 ] ); CH << setw( estimate_size ) << r << " "; } ++p1; ++p0; } if ( p1 != coeff1.size() || p0 != coeff0.size() ) error("Internal error in whap.cpp -- p1,p0 do not align"); ///////////////////////////////////////////////// // Display overall model comparison statistics CH << "\n" << "Model comparison test statistics:\n\n"; CH << setw(25) << " " << " " << setw(10) << "Alternate" << " " << setw(10) << "Null" << "\n"; if ( par::bt ) { LogisticModel * lalternate = (LogisticModel*)alternate; LogisticModel * lnull = (LogisticModel*)null; CH << setw(25) << "-2LL : " << " " << setw(10) << lalternate->getLnLk() << " " << setw(10) << lnull->getLnLk() << " " << "\n\n"; if ( lalternate->getNP() - lnull->getNP() == 0 ) CH << setw(25) << "Likelihood ratio test: " << " ( not a valid comparison: identical models, df = 0 )\n"; else { double lrt = lnull->getLnLk() - lalternate->getLnLk(); if ( lrt < 0 || !realnum(lrt) ) lrt = 0; int df = lalternate->getNP() - lnull->getNP(); double pval = chiprobP( lrt, df); CH << setw(25) << "Likelihood ratio test: " << "chi-square = " << lrt << "\n" << setw(25) << " " << "df = " << df << "\n" << setw(25) << " " << "p = " ; if ( pval < 0 || ! 
realnum(pval) ) CH << "NA"; else CH << pval; CH << "\n"; } } else { // Quantitative traits LinearModel * lalternate = (LinearModel*)alternate; LinearModel * lnull = (LinearModel*)null; CH << setw(25) << "R-squared : " << " " << setw(10) << lalternate->calculateRSquared() << " " << setw(10) << lnull->calculateRSquared() << " " << "\n"; CH << setw(25) << "Adjusted R-squared : " << " " << setw(10) << lalternate->calculateAdjustedRSquared() << " " << setw(10) << lnull->calculateAdjustedRSquared() << " " << "\n\n"; // CH << setw(25) << "Mallow's C : " << " " // << lalternate->calculateMallowC(lnull) // << "\n"; double F = lalternate->calculateFTest(lnull); if ( F < 0 || !realnum(F) ) F = 0; double pvalue = pF( F, alternate->getNP() - null->getNP(), alternate->Ysize() - alternate->getNP() - 1 ); string df = int2str(alternate->getNP() - null->getNP()) +", "+int2str(alternate->Ysize() - alternate->getNP() - 1); CH << setw(25) << "F-statistic comparison : " << "F = " << F << "\n" << setw(25) << " " << "df = " << df << "\n" << setw(25) << " " << "p = "; if ( pvalue < 0 || !realnum(pvalue) ) CH << "NA"; else CH << pvalue; CH << "\n"; } /////////////////////////////////////////////// // // // We're done // // // /////////////////////////////////////////////// CH.close(); delete alternate; delete null; return results; } void Chap::determineTestType() { // REDUNDANT } void Chap::build(ChapModel & model) { bool isNull = (&model) == null; // Either: // 1) Grouping for alternate and/or null // 2) Specific SNPs for alternate and/or null // 3) Sole-variant framing // 4) Independent effects // 5) Haplotype-specific model.group.clear(); bool useDefault = false; string modelDescription = isNull ? par::chap_model0 : par::chap_model1; if ( par::chap_specified_groups ) { // Make comma as hash group-delimiter code modelDescription = searchAndReplace(modelDescription,","," # "); // Expand haplotype equality statements modelDescription = searchAndReplace(modelDescription,"="," "); // Tokenize vector tok; string buf; stringstream ss(modelDescription); while (ss >> buf) tok.push_back(buf); // Check the same haplotype isn't specified more than once set hapin; for (int h=0; h toAdd; for ( int h=0; h < H->nh; h++ ) { // Is this haplotype already explicitly listed? string hname = H->haplotypeName( h ); bool listed = false; for (int i=0; i< tok.size(); i++) { if ( tok[i] == hname ) { listed = true; break; } } if ( ! listed ) toAdd.insert(h); } // Is there anything to add? if ( toAdd.size() > 0 ) { if ( groupAll ) tok.push_back("#"); set::iterator ih = toAdd.begin(); while ( ih != toAdd.end() ) { if ( ungroupAll ) tok.push_back("#"); tok.push_back( H->haplotypeName(*ih) ); ++ih; } } } ///////////////////////////////////////////// // Now work out the wild-card expanded list set t; for ( int i = 0 ; i < tok.size() ; i++ ) { if ( tok[i] == "#" ) { if ( t.size() > 0 ) model.group.push_back( t ); t.clear(); } else if ( tok[i] == "*" || tok[i] == "%" ) continue; else { // find haplotype code the long way... 
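// (Illustrative alternative, not used in this source: the name lookup done
//  "the long way" by the scan below could instead use a table built once,
//  mapping haplotype name to code. With only a handful of common haplotypes
//  the direct scan is just as clear, so this is purely a sketch of the
//  trade-off, reusing the identifiers visible here:)
//
//    std::map<std::string,int> hcode;
//    for (int h = 0; h < H->nh; h++)
//      if ( H->f[h] >= par::min_hf )
//        hcode[ H->haplotypeName(h) ] = h;
//    // lookup: std::map<std::string,int>::iterator it = hcode.find( tok[i] );
//    //         if ( it != hcode.end() ) t.insert( it->second );
//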
for (int h=0; h< H->nh; h++) { if ( H->f[h] < par::min_hf ) continue; if ( tok[i] == H->haplotypeName(h) ) t.insert(h); } } } if ( t.size() > 0 ) model.group.push_back(t); } else if ( par::chap_specified_snps ) { // Assume that modelDescription contains a list of SNPs map mapping; for (int l=0; lnl_all; l++) mapping.insert(make_pair( P->locus[l]->name,l)); NList nl(P->nl_all); vector snplist = nl.deparseStringList(modelDescription,&mapping); if (snplist.size() == 0 ) useDefault = true; setSNPList(snplist, model); } else if ( par::chap_sole_variant && isNull ) { // This could be a list of SNPs, or a list of haplotypes // If SNPs, in NList form (i.e. allowing for // If haplotypes, just in common delimited form // Under the alternate, we do not do anything here (i.e. thus the // condition above, which means the default alternate coding // will be used) // We may also have specified, with "--control-alleles", that only // a subset of the implied groupings are constrained under the null modelDescription = par::chap_entity; // We need to determine whether haplotypes or SNPs specified map mapping; for (int s=0; sns; s++) mapping.insert(make_pair( P->locus[H->S[s]]->name,s)); for (int h=0; hnh; h++) mapping.insert(make_pair( H->haplotypeName(h),H->ns + h)); if ( mapping.size() != H->ns + H->nh ) error("Problem, as some SNPs and haplotypes appear not to have unique names"); NList nl(H->ns + H->nh); vector lst = nl.deparseStringList(modelDescription,&mapping); if (lst.size() == 0 ) useDefault = true; bool isSNP = false; bool isHAP = false; for (int i=0; i< lst.size(); i++) if ( lst[i] >= H->ns ) isHAP = true; else if ( lst[i] < H->ns ) isSNP = true; if ( isSNP && isHAP ) error("Cannot specify both SNPs and hapltoypes for --chap-control"); if ( isHAP && par::chap_sole_variant_specific_alleles ) error("Can only specify --control-alleles when SNPs are listed for --control"); if ( isSNP ) { // Convert to locus 0..nl_all coding for (int i=0; iS[lst[i]]; setSNPList(lst, model); } else { // For any haplotype found, make so that it has it's // own group model.group.clear(); model.group.resize(1); // main null group for (int h=0; h< H->nh; h++) { if ( H->f[h] < par::min_hf ) continue; bool found = false; for (int i=0; ins == h ) { set t; t.insert(h); model.group.push_back(t); found = true; } if ( ! found ) model.group[0].insert(h); } } } else if ( par::chap_independent_effect && isNull ) { // Set SNPs for all *except* the one(s) specified, under the null modelDescription = par::chap_entity; map mapping; for (int s=0; sns; s++) mapping.insert(make_pair( P->locus[H->S[s]]->name,s)); // Use NList to return negative complement of SNPs listed NList nl(H->ns,false); vector snplist = nl.deparseStringList(modelDescription,&mapping); if (snplist.size() == 0 ) useDefault = true; // Convert to locus 0..nl_all coding for (int i=0; iS[snplist[i]]; setSNPList(snplist, model); } else if ( par::chap_haplotype_specific && ! isNull ) { // Set a single haplotype under the alternative modelDescription = par::chap_entity; map mapping; for (int h=0; hnh; h++) mapping.insert(make_pair( H->haplotypeName(h),h)); NList nl(H->nh); vector lst = nl.deparseStringList(modelDescription,&mapping); if (lst.size() == 0 ) useDefault = true; model.group.clear(); model.group.resize(1); // main null group for (int h=0; h< H->nh; h++) { if ( H->f[h] < par::min_hf ) continue; bool found = false; for (int i=0; i t; t.insert(h); model.group.push_back(t); found = true; } if ( ! 
found ) model.group[0].insert(h); } } } else { // If nothing else done by now... useDefault = true; } ///////////////////////////////////////////////// // Use defaults? Default null model is all // haplotypes in one group. if ( useDefault ) { model.group.clear(); if ( isNull ) { set t; for (int h=0; h< H->nh; h++) if ( H->f[h] >= par::min_hf ) t.insert(h); model.group.push_back(t); } else { for (int h=0; h< H->nh; h++) if ( H->f[h] >= par::min_hf ) { set t; t.insert(h); model.group.push_back(t); } } } //////////////////////////////////////////////////// // Also make haploGroup's also -- might not need these? model.haploGroup.clear(); for (int g=0; g< model.group.size(); g++) { set::iterator ih = model.group[g].begin(); while ( ih != model.group[g].end() ) { // cout << H->haplotypeName(*ih) << " " << g << "\n"; model.haploGroup.insert(make_pair( *ih, g)); ++ih; } } ///////////////////////////////////////////// // What about dropping conditioning SNPs? // By default, include all conditioning SNPs model.masked_conditioning_snps.clear(); model.masked_conditioning_snps.resize(P->conditioner.size(),true); // *Unless* this is the null, and we have requested that some be // dropped if ( isNull && par::chap_drop_snps ) { // List of conditioning SNPs if ( P->conditioner.size() == 0 ) error("No conditioning SNPs in the analysis:\n cannot test " + par::chap_drop_snps_list ); map mapping; for (int c=0; c < P->conditioner.size(); c++) mapping.insert(make_pair( P->locus[P->conditioner[c]]->name,c)); // List of SNPs to drop: NList nl( P->conditioner.size() ); vector lst = nl.deparseStringList(par::chap_drop_snps_list,&mapping); for (int l=0; lnh-1 ; h1++) for (int h2 = h1+1; h2 < H->nh; h2++) { if ( alternate->haplotypesInSameGroup(h1,h2) ) if ( ! null->haplotypesInSameGroup(h1,h2) ) return false; } return true; } bool ChapModel::haplotypesInSameGroup(int h1, int h2) { map::iterator i1 = haploGroup.find(h1); map::iterator i2 = haploGroup.find(h2); if ( i1 == haploGroup.end() || i2 == haploGroup.end() ) return true; if ( i1 == haploGroup.end() || i2 == haploGroup.end() ) error("Internal problem in ChapModel..."); return ( i1->second == i2->second ); } void Chap::setSNPList(vector & snplist, ChapModel & model) { boolvec_t snpmask(H->ns,false); for (int l=0;lns; s++) if ( H->S[s] == snplist[l] ) snpmask[s] = true; map subhaplotypes = H->makeSubHaplotypeSet(snpmask); model.group.clear(); int cnt=0; map added; for (int h = 0; h < H->nh; h++) { if ( H->f[h] >= par::min_hf ) { int g = subhaplotypes.find(h)->second; map::iterator gi = added.find(g); if ( gi == added.end() ) { added.insert(make_pair(g,cnt)); cnt++; set t; t.insert(h); model.group.push_back(t); } else model.group[ added.find(g)->second ].insert(h); } } } void displayHaploGroups(ofstream & CH, ChapModel & m, HaploPhase * haplo) { bool closeThenEnd = false; int cnt = 0; CH << " "; for (int h=0; h < m.group.size(); h++) { CH << "{ "; set::iterator ih = m.group[h].begin(); int cnt2 = m.group[h].size(); while ( ih != m.group[h].end() ) { if ( cnt>0 && cnt2 < m.group[h].size() ) CH << ", "; CH << haplo->haplotypeName( *ih ); ++cnt; --cnt2; if ( cnt * haplo->ns > 40 ) { if ( cnt2 == 0) closeThenEnd = true; else { CH << "\n "; cnt = 0; } } ++ih; } // Close group CH << " } "; if ( closeThenEnd ) { CH << "\n "; closeThenEnd = false; cnt=0; } } CH << "\n"; } plink-1.07-src/epi.cpp0000644000265600020320000004050711264127624013763 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // 
// // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "crandom.h" #include "linear.h" #include "logistic.h" #include "stats.h" extern ofstream LOG; using namespace std; //////////////////////////////////////// // Epistasis tests (no permutation) void Plink::calcEpistasis() { /////////////////////////////////////////// // SNP major mode or individual major mode? if (par::fast_epistasis) { if ( ! par::SNP_major ) Ind2SNP(); } else { if ( par::SNP_major ) SNP2Ind(); } ////////////////////////////////////////////// // Set up results files ofstream EPI; string f = par::output_file_name; if (par::qt) f += ".epi.qt"; else { if (par::epi_caseonly) f += ".epi.co"; else f += ".epi.cc"; } EPI.open(f.c_str(),ios::out); printLOG("Writing epistasis pairwise results to [ " + f + " ] \n"); EPI.precision(4); if ( !par::epi_quickscan ) { EPI << setw(4) << "CHR1" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(4) << "CHR2" << " " << setw(par::pp_maxsnp) << "SNP2" << " "; if (!par::fast_epistasis) { if (par::bt) EPI << setw(12) << "OR_INT" << " "; else EPI << setw(12) << "BETA_INT" << " "; } EPI << setw(12) << "STAT" << " " << setw(12) << "P" << " " << "\n"; } else EPI << setw(4) << "CHR1" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(4) << "CHR2" << " " << setw(par::pp_maxsnp) << "SNP2" << " " << setw(12) << "CHISQ" << " " << "\n"; //////////////////////////////////////////////////////////////////// // epi1 and epi2 thresholds were given in terms of 0.01 (two-sided) // calculate appropriate absolute Z scores printLOG("Threshold for displaying epistatic result (--epi1) : p <= "+dbl2str(par::epi_alpha1)+"\n"); printLOG("Threshold for counting epistatic result (--epi2) : p <= "+dbl2str(par::epi_alpha2)+"\n"); par::epi_alpha1 = fabs(ltqnorm(par::epi_alpha1 / 2)); par::epi_alpha2 = fabs(ltqnorm(par::epi_alpha2 / 2)); // Fast epistasis: caae-only or case/control // Regression based test: case/control or quantitative trait // Take a list of SNPs, or all SNPs (vector epi1) // Test these against either themselves, or all SNPs (vector epi2) // A B // ALL x ALL skip e1>e2 // SET1 x ALL // SET1 x SET1 skip e1>e2 // SET1 x SET2 bool skip_symm = false; // Only output epistatic tests that have p < par::epi_alpha1; // Do not even attempt to save any epistatic results -- go straight to STDOUT // Also present summary results for all epi1 SNPs // (i.e. average / proportion of significant epistatic tests // at a certain alpha level, par::epi_alpha2 vector sA(nl_all,false); vector sB(nl_all,false); // Are we using a test set? 
If so, construct now if (par::set_test) { if (snpset.size()>2) error("Can only specify one or two SETs when testing for epistasis"); if (snpset.size()==0) error("There are no valid sets specified"); for (int e=0;e::iterator e1 = sA.begin(); e1 != sA.end(); e1++) if (*e1) epc++; int epcc = 0; // Keep track of how many epistatic tests actually performed long int nepi = 0; vector summary_sig(nl_all,0); vector summary_good(nl_all,0); vector best_score(nl_all,0); vector best_partner(nl_all); ////////////////////////////////////////// // Begin iterating over pairs : SET x SET for (int e1=0;e1=e2 && skip_symm) continue; // Same SNP if (e1==e2) continue; // Skip X chromosome for now if (par::chr_sex[locus[e1]->chr] || par::chr_sex[locus[e2]->chr] || par::chr_haploid[locus[e1]->chr] || par::chr_haploid[locus[e2]->chr]) continue; // SNPs too close (case-only analysis) if (par::epi_caseonly) if ( locus[e1]->chr == locus[e2]->chr) if ( fabs((double)(locus[e1]->bp - locus[e2]->bp)) < par::epi_caseonly_kb_gap*1000 ) continue; ////////////////////////////////// // Perform test of epistasis here if (par::bt && par::fast_epistasis) { double z; // statistic from either method // Odds ratio test // make two 2x2 tables int a11, a12, a21, a22; int u11, u12, u21, u22; a11=a12=a21=a22=0; u11=u12=u21=u22=0; vector::iterator a1 = SNP[e1]->one.begin(); vector::iterator a2 = SNP[e1]->two.begin(); vector::iterator b1 = SNP[e2]->one.begin(); vector::iterator b2 = SNP[e2]->two.begin(); vector::iterator person = sample.begin(); while ( person != sample.end() ) { if( (*person)->missing ) { // Next person a1++; a2++; b1++; b2++; person++; continue; } if ((*person)->aff) // if affected { if ( ! *b1 ) { if ( ! *b2 ) // ??x00 { if ( ! *a1 ) { if ( ! *a2 ) a11+=4; // 00 x 00 else { a11+=2; a21+=2; } // 01 x 00 } else if ( *a2 ) a21+=4; // 11 x 00 } else // ??x01 { if ( ! *a1 ) { if ( ! *a2 ) { a11+=2; a12+=2; } // 00 x 01 else { a11++; a21++; a12++; a22++; } // 01x01 } else if ( *a2 ) { a21+=2; a22+=2; } // 11 x 01 } } else if ( *b2 ) // ?? x 11 { if ( ! *a1 ) { if ( ! *a2 ) a12+=4; // 00 x 01 else { a12+=2; a22+=2; } // 01 x 01 } else if ( *a2 ) a22+=4; // 11 x 01 } } // Unaffecteds? else if ( !par::epi_caseonly ) // unaffected { if ( ! *b1 ) { if ( ! *b2 ) // ??x00 { if ( ! *a1 ) { if ( ! *a2 ) u11+=4; // 00 x 00 else { u11+=2; u21+=2; } // 01 x 00 } else if ( *a2 ) u21+=4; // 11 x 00 } else // ??x01 { if ( ! *a1 ) { if ( ! *a2 ) { u11+=2; u12+=2; } // 00 x 01 else { u11++; u21++; u12++; u22++; } // 01x01 } else if ( *a2 ) { u21+=2; u22+=2; } // 11 x 01 } } else if ( *b2 ) // ?? x 11 { if ( ! *a1 ) { if ( ! *a2 ) u12+=4; // 00 x 01 else { u12+=2; u22+=2; } // 01 x 01 } else if ( *a2 ) u22+=4; // 11 x 01 } } // Next person a1++; a2++; b1++; b2++; person++; } // Calculate log(OR) and SEs double or_aff, v_aff, or_unf, v_unf; or_aff = log( (double)(a11*a22)/ (double)(a12*a21) ); v_aff = 1/(double)a11 + 1/(double)a12 + 1/(double)a21 + 1/(double)a22; // Case-only z-score (if requested) if (par::epi_caseonly) z = fabs( or_aff / sqrt(v_aff) ); else // Standard case-control analysis { or_unf = log( (double)(u11*u22)/ (double)(u12*u21) ); v_unf = 1/(double)u11 + 1/(double)u12 + 1/(double)u21 + 1/(double)u22; z = fabs( (or_aff - or_unf) / sqrt ( v_aff + v_unf ) ); } ////////////////////////////// // --nop option in effect // Just output z score, if valid & above threshold if (par::epi_quickscan) { // Is this worth recording? 
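// Illustrative note (a sketch, not part of the original logic): the z
// statistic just computed compares the log odds ratios of the two 2x2
// allele-count tables built above.  For example, with hypothetical
// case counts a11..a22 = 10,20,20,10 and control counts u11..u22 =
// 15,15,15,15:
//
//   or_aff = log( (10.0*10.0) / (20.0*20.0) ) = log(0.25) ~ -1.39
//   v_aff  = 1/10.0 + 1/20.0 + 1/20.0 + 1/10.0 = 0.30
//   or_unf = log( (15.0*15.0) / (15.0*15.0) ) = 0
//   v_unf  = 4 / 15.0 ~ 0.27
//   z      = |or_aff - or_unf| / sqrt( v_aff + v_unf ) ~ 1.84
//
// (the case-only variant simply uses |or_aff| / sqrt(v_aff));
// z*z is then reported as a 1-df chi-square in the output below.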
if ( realnum(z) ) { nepi++; if (z >= par::epi_alpha1) EPI << setw(4) << locus[e1]->chr << " " << setw(par::pp_maxsnp) << locus[e1]->name << " " << setw(4) << locus[e2]->chr << " " << setw(par::pp_maxsnp) << locus[e2]->name << " " << setw(12) << z*z << "\n"; EPI.flush(); continue; } } ///////////////////////////////// // More full parsing of results double zero = 0; // Check this is a proper result if ( par::epi_filter && realnum(z) ) { // One more test performed nepi++; // Count as a good result summary_good[e1]++; if (sA[e2]) summary_good[e2]++; // Do we want to record this as part of the summary for the first set? if (z >= par::epi_alpha2) { // first variable will always be in A set summary_sig[e1]++; // but the second may also be in A set if (sA[e2]) summary_sig[e2]++; } // Is this result the best scrore yet for marker in set A? if (z > best_score[e1]) { best_score[e1] = z; best_partner[e1] = e2; } // The second marker might also be in set A if (sA[e2]) { if (z > best_score[e2]) { best_score[e2] = z; best_partner[e2] = e1; } } // Is this worth recording? if (z >= par::epi_alpha1) { EPI << setw(4) << locus[e1]->chr << " " << setw(par::pp_maxsnp) << locus[e1]->name << " " << setw(4) << locus[e2]->chr << " " << setw(par::pp_maxsnp) << locus[e2]->name << " " << setw(12) << z*z << " " << setw(12) << normdist(-z) * 2 << " " << "\n"; EPI.flush(); } else continue; // skip to next pair (skip logistic test) } else if (!par::epi_filter) { // Record all results here, whether NA or otherwise EPI << setw(4) << locus[e1]->chr << " " << setw(par::pp_maxsnp) << locus[e1]->name << " " << setw(4) << locus[e2]->chr << " " << setw(par::pp_maxsnp) << locus[e2]->name << " " << setw(12) << z*z << " " << setw(12) << normdist(-z) * 2 << " " << "\n"; EPI.flush(); } else continue; // if bad statistic for this test, do not try logistic } // End of binary OR test /////////////////////////////////////////////// // Logistic or linear regression epistasis test if ( !par::fast_epistasis ) { Model * lm; if (par::bt) { LogisticModel * m = new LogisticModel(this); lm = m; } else { LinearModel * m = new LinearModel(this); lm = m; } // Set missing data lm->setMissing(); // Main effect of SNP 1 lm->addAdditiveSNP(e1); lm->label.push_back("ADD1"); // Main effect of SNP 2 lm->addAdditiveSNP(e2); lm->label.push_back("ADD2"); // Epistasis lm->addInteraction(1,2); lm->label.push_back("EPI"); // Build design matrix lm->buildDesignMatrix(); // Prune out any remaining missing individuals // No longer needed // lm->pruneY(); // Fit linear model lm->fitLM(); // Did model fit okay? lm->validParameters(); // Obtain estimates and statistic lm->testParameter = 3; // interaction vector_t b = lm->getCoefs(); double chisq = lm->getStatistic(); double pvalue = chiprobP(chisq,1); double z = sqrt(chisq); // Is this result worth displaying? if (lm->isValid()) { // One more valid test performed nepi++; // Count as a good result summary_good[e1]++; if (sA[e2]) summary_good[e2]++; // Do we want to record this as part of the summary for the first set? if ( z >= par::epi_alpha2) { // first variable will always be in A set summary_sig[e1]++; // but the second may also be in A set if (sA[e2]) summary_sig[e2]++; } // Is this result the best scrore yet for marker in set A? if (z > best_score[e1]) { best_score[e1] = z; best_partner[e1] = e2; } // The second marker might also be in set A if (sA[e2]) { if (z > best_score[e2]) { best_score[e2] = z; best_partner[e2] = e1; } } } // Is this result worth displaying? 
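// Recap of the model just fitted (inferred from the calls above, as a
// reading aid rather than new functionality): the design is
//
//   trait ~ b0 + b1*ADD1 + b2*ADD2 + b3*(ADD1 x ADD2)
//
// built via addAdditiveSNP(e1), addAdditiveSNP(e2) and
// addInteraction(1,2).  Setting testParameter = 3 selects b3, so
// 'chisq' above is a 1-df (Wald-type) statistic for the interaction
// term, and for a binary trait exp(b[3]) printed below is the
// interaction odds ratio.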
if ( z >= par::epi_alpha1 ) { EPI << setw(4) << locus[e1]->chr << " " << setw(par::pp_maxsnp) << locus[e1]->name << " " << setw(4) << locus[e2]->chr << " " << setw(par::pp_maxsnp) << locus[e2]->name << " "; if (lm->isValid()) { if ( par::bt) EPI << setw(12) << exp(b[3]) << " " << setw(12) << chisq << " " << setw(12) << pvalue << " " << "\n"; else EPI << setw(12) << b[3] << " " << setw(12) << chisq << " " << setw(12) << pvalue << " " << "\n"; } else EPI << setw(12) << "NA" << " " << setw(12) << "NA" << " " << setw(12) << "NA" << " " << "\n"; EPI.flush(); } // Clean up delete lm; } } // Next pair of SNPs } } if (!par::silent) cout << "\n"; EPI.close(); ////////////////////// // Summary of results // Skip this for now if (true) { f += ".summary"; EPI.open(f.c_str(),ios::out); EPI.clear(); printLOG("Performed a total of "+int2str(nepi)+" valid SNPxSNP tests\n"); printLOG("Writing epistasis summary results to [ " + f + " ] \n"); EPI.precision(4); EPI << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "N_SIG" << " " << setw(12) << "N_TOT" << " " << setw(12) << "PROP" << " " << setw(12) << "BEST_CHISQ" << " " << setw(4) << "BEST_CHR" << " " << setw(par::pp_maxsnp) << "BEST_SNP" << " " << "\n"; int c=0; for (int e1=0;e1chr << " " << setw(par::pp_maxsnp) << locus[e1]->name << " " << setw(12) << summary_sig[e1] << " " << setw(12) << summary_good[e1] << " " << setw(12) << (double)summary_sig[e1] / (double)summary_good[e1] << " " << setw(12) << best_score[e1] * best_score[e1] << " " << setw(4) << locus[best_partner[e1]]->chr << " " << setw(par::pp_maxsnp) << locus[best_partner[e1]]->name << " " << "\n"; } } EPI.close(); } } plink-1.07-src/genogroup.h0000644000265600020320000000256111264127626014660 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __GENOGROUP_H_ #define __GENOGROUP_H__ class MultiLocusGenotype { public: vector g; int count; int reference; vector skip; bool operator<(const MultiLocusGenotype & b) const { for (int i=0; i class less { public: bool operator()(MultiLocusGenotype const* p1, MultiLocusGenotype const* p2) { if (!p1) return true; if (!p2) return false; if (p1->g < p2->g) return true; return false; } }; }; #endif plink-1.07-src/mds.cpp0000644000265600020320000001747711264127624014003 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "stats.h" #ifdef WITH_LAPACK #include "lapackf.h" #endif void Plink::generateMDS() { // Take this solution, (i) // 1) Average clusters to generate points // 2) Perform multidimensional scaling // 3) Dump information into Haploview-friendly file for visualisation string f = par::output_file_name + ".mds"; printLOG("Writing MDS solution to [ " + f + " ] \n"); if (par::mds_by_individual) printLOG("MDS plot of individuals (not clusters)\n"); else printLOG("MDS plot of clusters (not individuals)\n"); vector< vector > cl; // Re-Populate the cl cluster information list, if need be if ( ! par::mds_by_individual ) { set clnum; for (int i=0; isol >= 0 ) clnum.insert( sample[i]->sol ); } cl.resize( clnum.size() ); for (int i=0; isol >= 0 ) cl[ sample[i]->sol ].push_back(i); } } int nc; if (par::mds_by_individual) nc = n; else nc = cl.size(); // Now we have built the between-cluster distance matrix (which will // typically be smaller than the between-individual matrix, we // should be able to apply visualisation (note: for samples of under // 5000 individuals, should be okay to apply standard per-individual // clustering // B = - 1/2 Z D^2 Z // // where Z = I - 1/n U // // I identity matrix (n x n) // U is unit matix (n x n) // A double-centered matrix B // b_ij = -1/2 [ d^2_ij - d^2_.j - d^2_i. + d^2_.. ] #ifdef WITH_LAPACK // Full, symm matrix (1D format) vector_t D(nc*nc,0); for (int c1 = 0 ; c1 < nc ; c1++) for (int c2 = c1 ; c2 < nc ; c2++) { if (c1==c2) D[ c1 + c2*nc ]=0; else { if (par::mds_by_individual) { if (c1>c2) D[c1 + c2*nc] = D[ c2 + c1*nc ] = (1-mdist[c1][c2]) * (1-mdist[c1][c2]); else D[c1 + c2*nc] = D[ c2 + c1*nc ] = (1-mdist[c2][c1]) * (1-mdist[c2][c1]); } else { // Average over all pairs between cluster double avg = 0; for (int i1=0; i1 cl[c2][i2] ) avg += 1-mdist[cl[c1][i1]][cl[c2][i2]]; else avg += 1-mdist[cl[c2][i2]][cl[c1][i1]]; } avg /= cl[c1].size() * cl[c2].size(); // Symmetric matrix of squared distances D[c1 + c2*nc] = D[c2 + c1*nc] = avg * avg; } } } double mean = 0; vector_t M(nc,0); for (int c1 = 0 ; c1 < nc ; c1++) { for (int c2 = 0 ; c2 < nc ; c2++) { M[c1] += D[c1 + c2*nc]; } M[c1] /= (double)nc; mean += M[c1]; } mean /= (double)nc; // For each element for D, double center for (int c1 = 0 ; c1 < nc ; c1++) for (int c2 = c1 ; c2 < nc ; c2++) D[c1 + c2*nc] = D[c2 + c1*nc] = - 0.5 * ( D[c1 + c2*nc] - M[c1] - M[c2] + mean ); // Calculate only required eigen-vectors vector_t eigenvalue(nc); matrix_t eigenvector; sizeMatrix(eigenvector,nc,nc); //svd_lapack(n,D,eigenvalue,eigenvector); eigen_lapack(n,D,eigenvalue,eigenvector); // cout << "EVAL = \n"; // display(eigenvalue); // cout << "EVEC = \n"; // display(eigenvector); #else // Not using LAPACK matrix_t D; sizeMatrix(D,nc,nc); for (int c1 = 0 ; c1 < nc ; c1++) for (int c2 = c1 ; c2 < nc ; c2++) { if (c1==c2) D[c1][c2]=0; else { if (par::mds_by_individual) { if (c1>c2) D[c1][c2] = D[c2][c1] = (1-mdist[c1][c2]) * (1-mdist[c1][c2]); else D[c1][c2] = D[c2][c1] = (1-mdist[c2][c1]) * (1-mdist[c2][c1]); } else { // Average over all pairs between cluster double avg = 0; for (int i1=0; i1 cl[c2][i2] ) avg += 1-mdist[cl[c1][i1]][cl[c2][i2]]; else avg += 1-mdist[cl[c2][i2]][cl[c1][i1]]; } avg /= cl[c1].size() * cl[c2].size(); // Symmetric matrix of squared distances D[c1][c2] = D[c2][c1] = avg * avg; } } 
} double mean = 0; vector_t M(nc,0); for (int c1 = 0 ; c1 < nc ; c1++) { for (int c2 = 0 ; c2 < nc ; c2++) { M[c1] += D[c1][c2]; } M[c1] /= (double)nc; mean += M[c1]; } mean /= (double)nc; // For each element for D, double center for (int c1 = 0 ; c1 < nc ; c1++) for (int c2 = c1 ; c2 < nc ; c2++) D[c1][c2] = D[c2][c1] = - 0.5 * ( D[c1][c2] - M[c1] - M[c2] + mean ); vector_t eigenvalue(nc); matrix_t eigenvector; sizeMatrix(eigenvector,nc,nc); // cout << "*---------\n"; // for (int i=0; i emap; for (int i=0; i::reverse_iterator e = emap.rbegin(); int inc = par::cluster_mds_dim; vector elist; while ( e != emap.rend() && inc > 0 ) { elist.push_back(e->second); inc--; e++; } if (par::cluster_mds_dim < 1) par::cluster_mds_dim = 1; if (par::cluster_mds_dim > nc) par::cluster_mds_dim = nc; if ( elist.size() != par::cluster_mds_dim ) { error("Internal problem extracting MDS solution\n"); elist.resize(par::cluster_mds_dim); } // Sqrt(D) for (int i=0; i= 0 ? sqrt(eigenvalue[i]) : 0 ; // Make solution // EVEC * sqrt(EVAL) but filter on rows that are in solution // with EVAL as diagonal matrix matrix_t mds; sizeMatrix(mds,nc,par::cluster_mds_dim); for (int c1=0; c1fid << " " << setw(par::pp_maxiid) << sample[i]->iid << " " << setw(6) << sample[i]->sol << " "; for (int c=0; csol >= 0 ) MDS << setw(12) << mds[ sample[i]->sol ][c] << " "; else MDS << setw(12) << "NA" << " "; } } MDS << "\n"; } MDS.close(); } plink-1.07-src/test.map0000645000265600020320000000002710652403213014141 0ustar tilleaadmin1 snp1 0 1 1 snp2 0 2 plink-1.07-src/test.ped0000645000265600020320000000021210652403213014130 0ustar tilleaadmin1 1 0 0 1 1 A A G T 2 1 0 0 1 1 A C T G 3 1 0 0 1 1 C C G G 4 1 0 0 1 2 A C T T 5 1 0 0 1 2 C C G T 6 1 0 0 1 2 C C T T plink-1.07-src/zfstream.h0000644000265600020320000002772011264127626014512 0ustar tilleaadmin/* * A C++ I/O streams interface to the zlib gz* functions * * by Ludwig Schwardt * original version by Kevin Ruland * * This version is standard-compliant and compatible with gcc 3.x. */ #ifndef ZFSTREAM_H #define ZFSTREAM_H #include // not iostream, since we don't need cin/cout #include #include "zlib.h" /*****************************************************************************/ /** * @brief Gzipped file stream buffer class. * * This class implements basic_filebuf for gzipped files. It doesn't yet support * seeking (allowed by zlib but slow/limited), putback and read/write access * (tricky). Otherwise, it attempts to be a drop-in replacement for the standard * file streambuf. */ class gzfilebuf : public std::streambuf { public: // Default constructor. gzfilebuf(); // Destructor. virtual ~gzfilebuf(); /** * @brief Set compression level and strategy on the fly. * @param comp_level Compression level (see zlib.h for allowed values) * @param comp_strategy Compression strategy (see zlib.h for allowed values) * @return Z_OK on success, Z_STREAM_ERROR otherwise. * * Unfortunately, these parameters cannot be modified separately, as the * previous zfstream version assumed. Since the strategy is seldom changed, * it can default and setcompression(level) then becomes like the old * setcompressionlevel(level). */ int setcompression(int comp_level, int comp_strategy = Z_DEFAULT_STRATEGY); /** * @brief Check if file is open. * @return True if file is open. */ bool is_open() const { return (file != NULL); } /** * @brief Open gzipped file. * @param name File name. * @param mode Open mode flags. * @return @c this on success, NULL on failure. 
*/ gzfilebuf* open(const char* name, std::ios_base::openmode mode); /** * @brief Attach to already open gzipped file. * @param fd File descriptor. * @param mode Open mode flags. * @return @c this on success, NULL on failure. */ gzfilebuf* attach(int fd, std::ios_base::openmode mode); /** * @brief Close gzipped file. * @return @c this on success, NULL on failure. */ gzfilebuf* close(); protected: /** * @brief Convert ios open mode int to mode string used by zlib. * @return True if valid mode flag combination. */ bool open_mode(std::ios_base::openmode mode, char* c_mode) const; /** * @brief Number of characters available in stream buffer. * @return Number of characters. * * This indicates number of characters in get area of stream buffer. * These characters can be read without accessing the gzipped file. */ virtual std::streamsize showmanyc(); /** * @brief Fill get area from gzipped file. * @return First character in get area on success, EOF on error. * * This actually reads characters from gzipped file to stream * buffer. Always buffered. */ virtual int_type underflow(); /** * @brief Write put area to gzipped file. * @param c Extra character to add to buffer contents. * @return Non-EOF on success, EOF on error. * * This actually writes characters in stream buffer to * gzipped file. With unbuffered output this is done one * character at a time. */ virtual int_type overflow(int_type c = traits_type::eof()); /** * @brief Installs external stream buffer. * @param p Pointer to char buffer. * @param n Size of external buffer. * @return @c this on success, NULL on failure. * * Call setbuf(0,0) to enable unbuffered output. */ virtual std::streambuf* setbuf(char_type* p, std::streamsize n); /** * @brief Flush stream buffer to file. * @return 0 on success, -1 on error. * * This calls underflow(EOF) to do the job. */ virtual int sync(); // // Some future enhancements // // virtual int_type uflow(); // virtual int_type pbackfail(int_type c = traits_type::eof()); // virtual pos_type // seekoff(off_type off, // std::ios_base::seekdir way, // std::ios_base::openmode mode = std::ios_base::in|std::ios_base::out); // virtual pos_type // seekpos(pos_type sp, // std::ios_base::openmode mode = std::ios_base::in|std::ios_base::out); private: /** * @brief Allocate internal buffer. * * This function is safe to call multiple times. It will ensure * that a proper internal buffer exists if it is required. If the * buffer already exists or is external, the buffer pointers will be * reset to their original state. */ void enable_buffer(); /** * @brief Destroy internal buffer. * * This function is safe to call multiple times. It will ensure * that the internal buffer is deallocated if it exists. In any * case, it will also reset the buffer pointers. */ void disable_buffer(); /** * Underlying file pointer. */ gzFile file; /** * Mode in which file was opened. */ std::ios_base::openmode io_mode; /** * @brief True if this object owns file descriptor. * * This makes the class responsible for closing the file * upon destruction. */ bool own_fd; /** * @brief Stream buffer. * * For simplicity this remains allocated on the free store for the * entire life span of the gzfilebuf object, unless replaced by setbuf. */ char_type* buffer; /** * @brief Stream buffer size. * * Defaults to system default buffer size (typically 8192 bytes). * Modified by setbuf. */ std::streamsize buffer_size; /** * @brief True if this object owns stream buffer. * * This makes the class responsible for deleting the buffer * upon destruction. 
*/ bool own_buffer; }; /*****************************************************************************/ /** * @brief Gzipped file input stream class. * * This class implements ifstream for gzipped files. Seeking and putback * is not supported yet. */ class gzifstream : public std::istream { public: // Default constructor gzifstream(); /** * @brief Construct stream on gzipped file to be opened. * @param name File name. * @param mode Open mode flags (forced to contain ios::in). */ explicit gzifstream(const char* name, std::ios_base::openmode mode = std::ios_base::in); /** * @brief Construct stream on already open gzipped file. * @param fd File descriptor. * @param mode Open mode flags (forced to contain ios::in). */ explicit gzifstream(int fd, std::ios_base::openmode mode = std::ios_base::in); /** * Obtain underlying stream buffer. */ gzfilebuf* rdbuf() const { return const_cast(&sb); } /** * @brief Check if file is open. * @return True if file is open. */ bool is_open() { return sb.is_open(); } /** * @brief Open gzipped file. * @param name File name. * @param mode Open mode flags (forced to contain ios::in). * * Stream will be in state good() if file opens successfully; * otherwise in state fail(). This differs from the behavior of * ifstream, which never sets the state to good() and therefore * won't allow you to reuse the stream for a second file unless * you manually clear() the state. The choice is a matter of * convenience. */ void open(const char* name, std::ios_base::openmode mode = std::ios_base::in); /** * @brief Attach to already open gzipped file. * @param fd File descriptor. * @param mode Open mode flags (forced to contain ios::in). * * Stream will be in state good() if attach succeeded; otherwise * in state fail(). */ void attach(int fd, std::ios_base::openmode mode = std::ios_base::in); /** * @brief Close gzipped file. * * Stream will be in state fail() if close failed. */ void close(); private: /** * Underlying stream buffer. */ gzfilebuf sb; }; /*****************************************************************************/ /** * @brief Gzipped file output stream class. * * This class implements ofstream for gzipped files. Seeking and putback * is not supported yet. */ class gzofstream : public std::ostream { public: // Default constructor gzofstream(); /** * @brief Construct stream on gzipped file to be opened. * @param name File name. * @param mode Open mode flags (forced to contain ios::out). */ explicit gzofstream(const char* name, std::ios_base::openmode mode = std::ios_base::out); /** * @brief Construct stream on already open gzipped file. * @param fd File descriptor. * @param mode Open mode flags (forced to contain ios::out). */ explicit gzofstream(int fd, std::ios_base::openmode mode = std::ios_base::out); /** * Obtain underlying stream buffer. */ gzfilebuf* rdbuf() const { return const_cast(&sb); } /** * @brief Check if file is open. * @return True if file is open. */ bool is_open() { return sb.is_open(); } /** * @brief Open gzipped file. * @param name File name. * @param mode Open mode flags (forced to contain ios::out). * * Stream will be in state good() if file opens successfully; * otherwise in state fail(). This differs from the behavior of * ofstream, which never sets the state to good() and therefore * won't allow you to reuse the stream for a second file unless * you manually clear() the state. The choice is a matter of * convenience. 
*/ void open(const char* name, std::ios_base::openmode mode = std::ios_base::out); /** * @brief Attach to already open gzipped file. * @param fd File descriptor. * @param mode Open mode flags (forced to contain ios::out). * * Stream will be in state good() if attach succeeded; otherwise * in state fail(). */ void attach(int fd, std::ios_base::openmode mode = std::ios_base::out); /** * @brief Close gzipped file. * * Stream will be in state fail() if close failed. */ void close(); private: /** * Underlying stream buffer. */ gzfilebuf sb; }; /*****************************************************************************/ /** * @brief Gzipped file output stream manipulator class. * * This class defines a two-argument manipulator for gzofstream. It is used * as base for the setcompression(int,int) manipulator. */ template class gzomanip2 { public: // Allows insertor to peek at internals template friend gzofstream& operator<<(gzofstream&, const gzomanip2&); // Constructor gzomanip2(gzofstream& (*f)(gzofstream&, T1, T2), T1 v1, T2 v2); private: // Underlying manipulator function gzofstream& (*func)(gzofstream&, T1, T2); // Arguments for manipulator function T1 val1; T2 val2; }; /*****************************************************************************/ // Manipulator function thunks through to stream buffer inline gzofstream& setcompression(gzofstream &gzs, int l, int s = Z_DEFAULT_STRATEGY) { (gzs.rdbuf())->setcompression(l, s); return gzs; } // Manipulator constructor stores arguments template inline gzomanip2::gzomanip2(gzofstream &(*f)(gzofstream &, T1, T2), T1 v1, T2 v2) : func(f), val1(v1), val2(v2) { } // Insertor applies underlying manipulator function to stream template inline gzofstream& operator<<(gzofstream& s, const gzomanip2& m) { return (*m.func)(s, m.val1, m.val2); } // Insert this onto stream to simplify setting of compression level inline gzomanip2 setcompression(int l, int s = Z_DEFAULT_STRATEGY) { return gzomanip2(&setcompression, l, s); } #endif // ZFSTREAM_H plink-1.07-src/genedrop.cpp0000644000265600020320000002451611264127625015014 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include "plink.h" #include "perm.h" #include "options.h" #include "helper.h" #include "crandom.h" void Perm::preGeneDrop() { // Note -- minor issue, this routine ignores // issue of linkage between sibs // Idea: to use standard case/control or QT test, but permute only // transmissions from founders, and to all offspring: to give a // within-family test. 
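// A minimal sketch of the gene-drop itself (using the same
// CRandom::rand() call that geneDrop() and dropAlleles() use below):
// for every non-founder, two fair coin flips decide which allele a
// heterozygous father and a heterozygous mother each transmit, e.g.
//
//   bool takeSecondPaternalAllele = CRandom::rand() > 0.5;
//   bool takeSecondMaternalAllele = CRandom::rand() > 0.5;
//
// (variable names here are illustrative only).  Homozygous parents
// transmit their only allele, and offspring genotypes are rebuilt from
// the chosen alleles; under the null the transmissions are
// exchangeable, which is what makes the recomputed statistic a valid
// within-family permutation test.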
// Set up parent-offspring structure, but not related to // nuclear families // par::perm_genedrop // If true, perform gene-dropping permutation instead of label-swapping // par::perm_genedrop_founders // If true, non-founder parents always drop one of their gene-dropped alleles // If false, non-founder parents always drop one of their true alleles // par::perm_genedrop_parents // If true, we also perform a label-swapping permutation within all parents // par::perm_genedrop_sibships // If true, we also perform a label-swapping permutation within all full // sibships without parents map fnd; // Link up parents and offspring P.linkRelateds(idmap, fnd); P.printLOG("Allocated family structure for gene-dropping\n"); // Set initial permutation structure -- no label-swapping for (int i=0; isol = -1; // Label-swapping cluster count int cc=0; int cc_par=0; map parent; map parent_pat; map parent_mat; // Set up clusters for within-parent label-swapping permutation if (par::perm_genedrop_parents) { // Parents must be pairs for simple nuclear families // i.e. watch out for half-sib relations for (int i=0; ifid+"_"+person->iid) == fnd.end() ) { string spat = person->pp->fid+"_"+person->pp->iid; string smat = person->pm->fid+"_"+person->pm->iid; string pair = spat+" x "+smat; // If this parent pair has not previously featured, // AND if neither parent has previously featured in // any other pairing, then make this parental set // a pair if ( parent.find(pair) == parent.end() && parent_pat.find(spat) == parent_pat.end() && parent_mat.find(smat) == parent_mat.end() ) { person->pp->sol = cc; person->pm->sol = cc; cc++; parent.insert(make_pair(pair,cc)); parent_pat.insert(make_pair(spat,cc)); parent_mat.insert(make_pair(smat,cc)); } } } P.printLOG("Allocated "+int2str(cc)+" clusters for within-parent permutation\n"); cc_par = cc; } // Set up clusters for within-sibship permutation if (par::perm_genedrop_sibships) { // i.e for individuals for whom pat and mat != 0 but the parent is // no longer in the dataset (i.e. removed for low genotyping, as it // was a dummy parent). map sibs; for (int i=0; ifounder && fnd.find(person->fid+"_"+person->iid) != fnd.end() && parent_pat.find(person->fid+"_"+person->iid) == parent_pat.end() && parent_mat.find(person->fid+"_"+person->iid) == parent_mat.end() ) { string pair = person->fid+"_"+person->pat+"_"+person->mat; // If we haven't seen this sibship before, add a new cluster code map::iterator sit = sibs.find(pair); if (sit == sibs.end()) { person->sol = cc; sibs.insert(make_pair(pair,cc)); cc++; } else { // ...otherwise, assign to existing one person->sol = sit->second; } } } P.printLOG("Allocated "+int2str(cc-cc_par)+" clusters for within-sibship permutation\n"); } // Label-swapping permutation of all unrelated individuals? 
// everybody else, who is a family size 1 // this means if (par::perm_genedrop_unrel) { map unrel; for (int i=0; ifid; if ( unrel.find(f) == unrel.end() ) { unrel.insert(make_pair(f,1)); } else { (unrel.find(f)->second)++; } } for (int i=0; ifid << "\t" << unrel.find(P.sample[i]->fid)->second << "\n"; // If no parents, assign unique cluster // if (fnd.find(person->fid+"_"+person->iid) != fnd.end()) // person->sol = cc; P.printLOG("Allocated cluster for between-founder permutation\n"); } } void Perm::geneDrop() { // Transmissions vector pat(P.n,true); vector mat(P.n,false); vector done(P.n,false); // Consider each individual for (int i=0; ifounder ) { pat[i] = mat[i] = false; if (CRandom::rand() > 0.5) pat[i] = true; if (CRandom::rand() > 0.5) mat[i] = true; } } // Now we have constructed the gene-dropping matrix, // we proceed to consider each SNP at a time for (int l=0; lfounder) dropAlleles(P,P.sample[i],i,l,pat,mat,done,idmap); } } void Perm::dropAlleles(Plink & P, Individual * person, int i, int l, vector & pat, vector & mat, vector & done, map & idmap) { // If founder, leave genotype as is; also, if either parent has // missing genotype data, then do not permute vector::iterator s1; vector::iterator s2; bool pat1, pat2; bool mat1, mat2; if (par::SNP_major) { s1 = P.SNP[l]->one.begin()+i; s2 = P.SNP[l]->two.begin()+i; pat1 = P.SNP[l]->one[person->ip]; pat2 = P.SNP[l]->two[person->ip]; mat1 = P.SNP[l]->one[person->im]; mat2 = P.SNP[l]->two[person->im]; } else { s1 = person->one.begin()+l; s2 = person->two.begin()+l; pat1 = P.sample[person->ip]->one[l]; pat2 = P.sample[person->ip]->two[l]; mat1 = P.sample[person->im]->one[l]; mat2 = P.sample[person->im]->two[l]; } if ( ! ( person->founder || // founder ( pat1 && !pat2 ) || // pat missing ( mat1 && !mat2 ) || // mat missing ( (*s1) && ! *s2 ) ) ) // self missing { // For pat/mat : // false = paternal/slot1, // true = maternal/slot2 // i.e. if parent is heterozygous, then pat T/F says which // allele to take (F/T), otherwise just take the homozygous // allele bool d1 = false; bool d2 = false; // Is father heterozygous? if ( pat1 != pat2 ) d1 = pat[i]; else d1 = pat1; // Is mother heterozygous? if ( mat1 != mat2 ) d2 = mat[i]; else d2 = mat1; // Set new genotype: FF, FT or TT? 
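// Key to the two-bit genotype coding assumed throughout this routine
// (inferred from checks such as "( pat1 && !pat2 )" for a missing
// parental genotype above, so a reading aid rather than a change):
//
//   one=F, two=F  ->  homozygous for allele 1
//   one=F, two=T  ->  heterozygous
//   one=T, two=T  ->  homozygous for allele 2
//   one=T, two=F  ->  missing genotype
//
// The assignments below write the dropped genotype back into the
// one/two bit vectors using this scheme.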
// (Missing will be left as is) if ( (!d1) && (!d2) ) { *s1 = false; *s2 = false; } else if ( d1 != d2 ) { *s1 = false; *s2 = true; } else if ( d1 && d2 ) { *s1 = true; *s2 = true; } done[i]=true; } // Now also update any kids of this person that still need doing for (int k=0; kkids.size(); k++) if (!done[idmap.find(person->kids[k])->second]) dropAlleles(P,person->kids[k],person->ikids[k],l,pat,mat,done,idmap); return; } void Plink::linkRelateds(map & idmap, map & fnd) { map imap; map imap2; map::iterator iit; map::iterator iit2; // Populate map, clear any existing family-related information for (int i=0; ifid+"_"+ sample[i]->iid, sample[i])); imap2.insert(make_pair(sample[i],i)); idmap.insert(make_pair(sample[i],i)); sample[i]->kids.clear(); sample[i]->ikids.clear(); sample[i]->family = NULL; } // Link up parents and offspring for (int i=0; ifounder) { person->pp = person->pm = NULL; person->ip = person->im = -1; fnd.insert(make_pair(sample[i]->fid+"_"+sample[i]->iid,sample[i])); } else { // Father (if does not exist, treat as founder) iit = imap.find(person->fid+"_"+person->pat); if (iit == imap.end()) { person->pp = NULL; fnd.insert(make_pair(sample[i]->fid+"_"+sample[i]->iid,sample[i])); } else { person->pp = iit->second; iit2 = imap2.find(iit->second); person->ip = iit2->second; } // Mother (if does not exist, treat as founder) iit = imap.find(person->fid+"_"+person->mat); if (iit == imap.end()) { person->pm = NULL; fnd.insert(make_pair(sample[i]->fid+"_"+sample[i]->iid,sample[i])); } else { person->pm = iit->second; iit2 = imap2.find(iit->second); person->im = iit2->second; } // Otherwise, add this person as a child of mother and father if ( ! (person->pp == NULL || person->pm == NULL ) ) { person->pp->kids.push_back(person); person->pm->kids.push_back(person); person->pp->ikids.push_back(i); person->pm->ikids.push_back(i); } } } } plink-1.07-src/tdt.cpp0000644000265600020320000005542611264127624014007 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "crandom.h" #include "sets.h" #include "perm.h" #include "fisher.h" #include "helper.h" #include "stats.h" void Plink::perm_testTDT(Perm & perm) { //////////////////////////////////////// // This function is the entry point for // both TDT and DFAM tests (this is the // wrapper around the test functions for // permutation, set-based tests, etc). ////////////////////////////////// // Individual-major mode analysis if (par::SNP_major) SNP2Ind(); if ( ! par::bt ) error("This analysis requires a binary disease phenotype"); if ( par::set_r2 ) { printLOG("Performing LD-based set test, with parameters:\n"); printLOG(" r-squared (--set-r2) = " + dbl2str( par::set_r2_val ) + "\n" ); printLOG(" p-value (--set-p) = " + dbl2str( chiprobP(par::set_chisq_threshold,1) ) + "\n" ); printLOG(" max # SNPs (--set-max) = " + int2str( par::set_max ) + "\n" ); pS->makeLDSets(); } if ( par::sibTDT_test ) { //////////////////////////////////////////////////////////////// // Parse family and cluster sets, to ensure we do not count // anybody twice -- i.e. 
remove anybody who is in a family from // CMH-like analysis // Make a temporary set of Individuals* set plist; for (int i=0; iperson.size(); i++) { // Exclude non-singletons from list of people // Individual might have been filtered out, in which case // we should revise klist[] in any case; if ( plist.find( klist[k]->person[i] ) == plist.end() ) { klist[k]->person.erase(klist[k]->person.begin() + i); i--; } else { Individual * person = klist[k]->person[i]; Family * fam = klist[k]->person[i]->family; if ( fam->parents || fam->sibship ) { klist[k]->person[i]->sol = -1; klist[k]->person.erase(klist[k]->person.begin() + i); i--; } } } } } /////////////////////////////////////////// // Calculate original results for true data vector dummy(family.size(),false); /////////////////////////////////////////// // Create cluster for permutation // a) Permute within family // b) Use any existing --within cluster scheme for unrelateds. // The preGeneDrop() function will blank sol for all individuals in // families; if a cluster has been loaded in, certain clusters will // be set to zero possibly. This is fine -- all we need to do is now // go through and add new clusters for each family. Start adding // from nk onwards (we just keep any zero-sized clusters in the // analysis, they will not harm anything). We only need to put // siblings in clusters (i.e. parents do not come into this) if ( par::sibTDT_test ) for (int f=0; fsingleton ) continue; if ( fam->kid.size() < 2 ) continue; klist.push_back( new Cluster ); for (int c=0; c < fam->kid.size(); c++) { fam->kid[c]->sol = nk; klist[nk]->person.push_back( fam->kid[c] ); } nk++; } ///////////////////////////////// // Determine the number of tests int ntests = nl_all; if ( par::set_r2 || par::set_score ) ntests = pS->snpset.size(); ///////////////////////////////// // Empirical p-values perm.setTests(ntests); perm.setPermClusters(*this); string testname = ".tdt"; if (par::sibTDT_test) testname = ".dfam"; vector original; if (par::sibTDT_test) original = testSibTDT(true, false, perm, dummy, dummy); else original = testTDT(true, false, perm, dummy, dummy); //////////////////////////// // Display corrected p-values? if (par::multtest) { vector obp(0); for (int l=0; l setsigsize; if (par::set_test) { if ( par::set_r2 ) { original = pS->fitLDSetTest(original,true); // ...and save # of significant SNPs setsigsize.clear(); for (int i=0; iprofileSNPs.size(); i++) setsigsize.push_back( pS->s_min[i] ); } pS->cumulativeSetSum_WITHLABELS(*this,original); } ////////////////////// // Begin permutations bool finished = false; while(!finished) { /////////////////////////////////// // Set up permutation list for TDT // Permutations are constant across family and markers // flipA/B[permutation][family] vector fA(family.size(),false); vector fP(family.size(),false); for (int f=0; f pr; if (par::sibTDT_test) pr = testSibTDT(false, true, perm, fA, fP); else pr = testTDT(false, true, perm, fA, fP); ////////////////////// // Make sets? if (par::set_test) { if ( par::set_r2 ) pr = pS->fitLDSetTest(pr,false); else pS->cumulativeSetSum_WITHOUTLABELS(pr,perm.current_reps()+1); } //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } // next permutation if (!par::silent) cout << "\n\n"; /////////////////////////////////////////// // Calculate SET-based empirical p-values if (par::set_test && ! 
(par::set_r2 || par::set_score) ) { printLOG("Calculating empirical SET-based p-values\n"); pS->empiricalSetPValues(); } //////////////////// // Display results ofstream TDT; string f; if ( par::set_r2 ) { if (par::adaptive_perm) f = par::output_file_name + testname + ".set.perm"; else f = par::output_file_name + testname + ".set.mperm"; TDT.open(f.c_str(),ios::out); TDT.precision(4); printLOG("Writing set-based results to [ " + f + " ] \n"); TDT << setw(12) << "SET" << " " << setw(6) << "NSNP" << " " << setw(6) << "NSIG" << " " << setw(6) << "ISIG" << " " << setw(12)<< "STAT" << " " << setw(12) << "EMP1" << " " << "SNPS" << "\n"; vector pv(0); for (int l=0; l par::pfvalue ) continue; TDT << setw(12) << setname[l] << " " << setw(6) << pS->snpset[l].size() << " " << setw(6) << pS->numSig[l] << " " << setw(6) << pS->selectedSNPs[l].size() << " "; TDT << setw(12) << original[l] << " " << setw(12) << perm.pvalue(l) << " "; if ( pS->selectedSNPs[l].size() == 0 ) TDT << "NA"; else for (int j=0; jselectedSNPs[l].size(); j++) { TDT << locus[ snpset[l][pS->selectedSNPs[l][j]] ]->name; if ( j < pS->selectedSNPs[l].size() - 1 ) TDT << "|"; } TDT << "\n"; } } else { // Standard empirical p-value reports string f; if (par::adaptive_perm) f = par::output_file_name + testname + ".perm"; else f = par::output_file_name + testname + ".mperm"; TDT.open(f.c_str(),ios::out); printLOG("Writing TDT permutation results to [ " + f + " ] \n"); TDT.precision(4); TDT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " "; if (par::perm_TDT_basic) TDT << setw(12) << "CHISQ_TDT" << " "; else if (par::perm_TDT_parent) TDT << setw(12) << "CHISQ_PAR" << " "; else TDT << setw(12) << "CHISQ_COM" << " "; TDT << setw(12) << "EMP1" << " "; if (par::adaptive_perm) TDT << setw(12) << "NP" << " " << "\n"; else TDT << setw(12) << "EMP2" << " " << "\n"; for (int l=0; l par::pfvalue ) continue; TDT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; if (original[l] < -0.5) TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " " << setw(12) << "NA"; else { TDT << setw(12) << original[l] << " " << setw(12) << perm.pvalue(l) << " "; if (par::adaptive_perm) TDT << setw(12) << perm.reps_done(l); else TDT << setw(12) << perm.max_pvalue(l); } TDT << "\n"; } } TDT.close(); //////////////////////////// // Display SET-based results if (par::set_test && ! par::set_r2 ) { f = par::output_file_name + testname + ".set"; TDT.open(f.c_str(),ios::out); printLOG("Writing set-based TDT results to [ " +f+ " ] \n"); TDT.clear(); // Header row TDT << setw(12) << "SET" << " " << setw(6) << "S" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "T" << " " << setw(12) << "P_0" << " " << setw(12) << "P_1" << " " << setw(12) << "P_2" << " " << "\n"; for (int i=0;ipv_set.size();i++) { TDT << "\n"; for (int j=0;jpv_set[i].size();j++) { TDT << setw(12) << setname[i] << " " << setw(6) << string("S"+int2str(j+1+pS->s_min[i])) << " " << setw(par::pp_maxsnp) << pS->setsort[i][j] << " " << setw(12) << pS->stat_set[i][j][0] << " " << setw(12) << pS->pv_set[i][j][0] << " " << setw(12) << pS->pv_maxG_set[i][j]/(par::replicates+1) << " " << setw(12) << pS->pv_maxE_set[i][j]/(par::replicates+1) << " " << "\n"; } } TDT.close(); } } vector Plink::testTDT(bool print_results, bool permute, Perm & perm, vector & flipA, vector & flipP) { // TDT and X chromosome: males are coded as homozygous i.e. 
father // should always be uninformative; // male child will always receive his X from father // females as usual /////////////////////////// // Vector to store results vector res(nl_all); double zt; ofstream TDT, MT; if (print_results) { string f = par::output_file_name + ".tdt"; TDT.open(f.c_str(),ios::out); printLOG("Writing TDT results (asymptotic) to [ " + f + " ] \n"); TDT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(3) << "A1" << " " << setw(3) << "A2" << " " << setw(6) << "T" << " " << setw(6) << "U" << " " << setw(12) << "OR" << " "; if (par::display_ci) TDT << setw(12) << string("L"+dbl2str(par::ci_level*100)) << " " << setw(12) << string("U"+dbl2str(par::ci_level*100)) << " "; TDT << setw(12) << "CHISQ" << " " << setw(12) << "P" << " "; if (par::discordant_parents) TDT << setw(12) << "A:U_PAR" << " " << setw(12) << "CHISQ_PAR" << " " << setw(12) << "P_PAR" << " " << setw(12) << "CHISQ_COM" << " " << setw(12) << "P_COM" << " "; TDT << "\n"; if ( par::mating_tests ) { MT.open( (par::output_file_name + ".mt").c_str(), ios::out); MT.precision(3); } if (par::display_ci) zt = ltqnorm( 1 - (1 - par::ci_level) / 2 ) ; } /////////////////////////////////// // Perform analysis for each locus for (int l=0; lTDT ) continue; int trA = 0; // transmitted allele from first het parent int unA = 0; // untransmitted allele from first het parent int trB = 0; // transmitted allele from second het parent int unB = 0; // untransmitted allele from second het parent Individual * pat = family[f]->pat; Individual * mat = family[f]->mat; vector kid = family[f]->kid; bool pat1 = pat->one[l]; bool pat2 = pat->two[l]; bool mat1 = mat->one[l]; bool mat2 = mat->two[l]; // We need two genotyped parents, with // at least one het if ( pat1 == pat2 && mat1 == mat2 ) continue; if ( ( pat1 && !pat2 ) || ( mat1 && !mat2 ) ) continue; // Consider all offspring in nuclear family for (int c=0; caff ) continue; bool kid1 = kid[c]->one[l]; bool kid2 = kid[c]->two[l]; // Skip if offspring has missing genotype if ( kid1 && !kid2 ) continue; // We've now established: no missing genotypes // and at least one heterozygous parent // Kid is 00 if ( (!kid1) && (!kid2) ) { if ( ( (!pat1) && pat2 ) && ( (!mat1) && mat2 ) ) { trA=1; unA=2; trB=1; unB=2; } else { trA=1; unA=2; } } else if ( (!kid1) && kid2 ) // Kid is 01 { // het dad if (pat1 != pat2 ) { // het mum if ( mat1 != mat2 ) { trA=1; trB=2; unA=2; unB=1; } else if ( !mat1 ) { trA=2; unA=1; } else { trA=1; unA=2; } } else if ( !pat1 ) { trA=2; unA=1; } else { trA=1; unA=2; } } else // kid is 1/1 { if ( ( (!pat1) && pat2 ) && ( (!mat1) && mat2 ) ) { trA=2; unA=1; trB=2; unB=1; } else { trA=2; unA=1; } } // We have now populated trA (first transmission) // and possibly trB also //////////////////////////////////////// // Permutation? 
50:50 flip (precomputed) if (permute) { if (flipA[f]) { int t=trA; trA=unA; unA=t; t=trB; trB=unB; unB=t; } } // Increment transmission counts if (trA==1) t1++; if (trB==1) t1++; if (trA==2) t2++; if (trB==2) t2++; if ( par::verbose) { cout << "TDT\t" << locus[l]->name << " " << pat->fid << " : " << trA << " " << trB << "\n"; } } // next offspring in family } // next nuclear family ///////////////////////////////////////////// // Consider parental discordance information double p1 = 0; double p2 = 0; double d1 = 0; double d2 = 0; if (par::discordant_parents) { // Count over families for (int f=0; fdiscordant_parents ) continue; Individual * pat = family[f]->pat; Individual * mat = family[f]->mat; bool pat1 = pat->one[l]; bool pat2 = pat->two[l]; bool mat1 = mat->one[l]; bool mat2 = mat->two[l]; // ...and that both are genotyped if ( ( pat1 && !pat2 ) || ( mat1 && !mat2 ) ) continue; //////////////////////////////////////// // Permutation? 50:50 flip (precomputed) if (permute) { if (flipP[f]) { if (pat->aff) { pat->aff = false; mat->aff = true; } else { pat->aff = true; mat->aff = false; } } } // Get number of 'F' alleles that the affected parent has // above the unaffected; this count is p1/d1 // excess T alleles in affected -> p1/d1 // excess F alleles in unaffected -> p2/d2 if ( pat1 == mat1 && pat2 == mat2 ) continue; // d = 0; else if ( pat->aff ) // affected pat { if ( (!pat1) && (!pat2) ) // F/F { // mat will either be T/T or F/T if ( mat1 ) d1++; // two extra T else p1++; // one extra T } else if ( (!pat1 ) && pat2 ) // pat F/T { // mat either T/T or F/F if ( mat1 ) p1++; // one extra T else p2++; // one less T } else // pat must be T/T { // mat will either be F/F or F/T if ( ! mat2 ) d2++; // two less T else p2++; // one less T } } else // affected mat / score other direction { if ( (!pat1) && (!pat2) ) // F/F { // mat will either be T/T or F/T if ( mat1 ) d2++; else p2++; } else if ( (!pat1 ) && pat2 ) // pat F/T { // mat either T/T or F/F if ( mat1 ) p2++; else p1++; } else // pat must be T/T { // mat will either be F/F or F/T if ( ! 
mat2 ) d1++; else p1++; } } } } /////////////////////////////////// // General family test if ( par::mating_tests ) { table_t parenMT; sizeTable(parenMT,3,3); // Count over families for (int f=0; fpat; Individual * mat = family[f]->mat; if ( pat == NULL || mat == NULL ) continue; bool pat1 = pat->one[l]; bool pat2 = pat->two[l]; bool mat1 = mat->one[l]; bool mat2 = mat->two[l]; // ...and that both are genotyped if ( ( pat1 && !pat2 ) || ( mat1 && !mat2 ) ) continue; int i=0, j=0; if ( pat1 ) ++i; if ( pat2 ) ++i; if ( mat1 ) ++j; if ( mat2 ) ++j; ++parenMT[i][j]; } double mean1 = 0, mean2 = 0; int total = 0; for(int i=0; i<=2; i++) for (int j=0; j<=2; j++) { mean1 += parenMT[i][j] * i; mean2 += parenMT[i][j] * j; total += parenMT[i][j]; } mean1 /= (double)total; mean2 /= (double)total; double var1 = 0, var2 = 0, covar = 0; for(int i=0; i<=2; i++) for (int j=0; j<=2; j++) { var1 += ( i - mean1 )*(i-mean1)*parenMT[i][j]; var2 += ( j - mean2 )*(j-mean2)*parenMT[i][j]; covar += ( i - mean1 )*(j-mean2)*parenMT[i][j]; } var1 /= (double)total - 1.0; var2 /= (double)total - 1.0; covar /= (double)total - 1.0; double r = covar / sqrt( var1 * var2 ); // double t = fisher(parenMT); double t = chiTable(parenMT); MT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(8) << locus[l]->freq << " " << setw(12) << t << " "; t = symTable(parenMT); MT << setw(12) << t << " "; MT << setw(12) << r << " "; for(int i=0; i<=2; i++) for (int j=0; j<=2; j++) MT << setw(5) << parenMT[i][j] << " "; MT << "\n"; } ///////////////////////////// // Finished counting: now compute // the statistics double tdt_chisq, par_chisq, com_chisq; tdt_chisq = par_chisq = com_chisq = -1; // Basic TDT test if (t1+t2 > 0) tdt_chisq = ((t1-t2)*(t1-t2))/(t1+t2); if (par::discordant_parents) { // parenTDT if ( p1+p2+d1+d2 > 0 ) par_chisq = (((p1+2*d1)-(p2+2*d2))*((p1+2*d1)-(p2+2*d2))) /(p1+p2+4*(d1+d2)); // Combined test if ( t1+p1+4*d1+t2+p2+4*d2 > 0 ) com_chisq = ( ( (t1+p1+2*d1) - (t2+p2+2*d2) ) * ( (t1+p1+2*d1) - (t2+p2+2*d2) ) ) / ( t1+p1+4*d1+t2+p2+4*d2 ) ; } // Display asymptotic results if (print_results) { double pvalue = chiprobP(tdt_chisq,1); // Skip?, if filtering p-values if ( par::pfilter && pvalue > par::pfvalue ) continue; TDT.precision(4); TDT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(3) << locus[l]->allele1 << " " << setw(3) << locus[l]->allele2 << " " << setw(6) << t1 << " " << setw(6) << t2 << " "; // Odds ratio for T:U double OR = t1 / t2; if ( ! 
realnum(OR) ) { TDT << setw(12) << "NA" << " "; if (par::display_ci) TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; } else { TDT << setw(12) << OR << " "; if (par::display_ci) { double OR_lower = exp( log(OR) - zt * sqrt(1/t1+1/t2)) ; double OR_upper = exp( log(OR) + zt * sqrt(1/t1+1/t2)) ; TDT << setw(12) << OR_lower << " " << setw(12) << OR_upper << " "; } } if (tdt_chisq>=0) TDT << setw(12) << tdt_chisq << " " << setw(12) << chiprobP(tdt_chisq,1) << " "; else TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; if (par::discordant_parents) { TDT << setw(12) << dbl2str(p1+2*d1)+":"+dbl2str(p2+2*d2) << " "; if (par_chisq>=0) TDT << setw(12) << par_chisq << " " << setw(12) << chiprobP(par_chisq,1) << " "; else TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; if (com_chisq>=0) TDT << setw(12) << com_chisq << " " << setw(12) << chiprobP(com_chisq,1) << " "; else TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; } TDT << "\n"; } /////////////////////////////////////////// // Choose which statistic for permutation if (par::perm_TDT_basic) res[l] = tdt_chisq; else if (par::perm_TDT_parent) res[l] = par_chisq; else res[l] = com_chisq; } // next locus ////////////////////////////// // Close output file, if open if (print_results) { TDT.close(); if ( par::mating_tests ) MT.close(); } /////////////////////////////////////////// // Return chosen statistic for permutation return res; } plink-1.07-src/bmerge.cpp0000644000265600020320000005364611264127625014460 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" void Plink::mergeBinaryData() { // Function to merge a text file with an exisiting data set // either SNP-major or individual-major modes if (!par::merge_list) { printLOG( "Using merge mode " + int2str( par::merge_mode ) + " : "); if (par::merge_mode==1) printLOG("consensus call (default)\n"); else if (par::merge_mode==2) printLOG("overwrite if missing in original\n"); else if (par::merge_mode==3) printLOG("overwrite unless missing in new\n"); else if (par::merge_mode==4) printLOG("overwrite none\n"); else if (par::merge_mode==5) printLOG("overwrite all\n"); else if (par::merge_mode==6) printLOG("diff mode: all differences\n"); else if (par::merge_mode==7) printLOG("diff mode: non-missing differences\n"); diff_overlap = 0; diff_nonmissing_overlap = 0; diff_concordant_overlap = 0; } // We've already loaded in the first file // Do not overwrite any existing phenotype information checkFileExists(par::merge_bedfile); checkFileExists(par::merge_bimfile); checkFileExists(par::merge_famfile); // Make hash of original SNP names map mlocus; for (int l=0;lname,l)); map::iterator ilocus; // A temporary hash for the names of any markers that // do not match in terms of strand set misstrand; map misstrand_dummy; bool fatal_error = false; /////////////////////////////////////// // .bim vector ordered(0); map exists; vector locus2(0); set flip_alleles; ifstream MAP(par::merge_bimfile.c_str(), ios::in); MAP.clear(); int exist_cnt=0; int c=0; while(!MAP.eof()) { Locus * loc = new Locus; long int inc; MAP >> loc->chr // will automatically by numeric >> loc->name >> 
loc->pos // will automatically be in M units >> loc->bp >> loc->allele1 >> loc->allele2; inc = loc->bp; // Use the frequency slot temporarily to // store order information loc->freq = c++; // Check that cM/M specification looks correct, if // we want to perform a plink-based analysis if (par::plink && (!par::cm_map) && (loc->pos > 50) ) error("Looks like you need to specify --cm ??"); // Convert cM to M map distances if (par::cm_map) loc->pos /= 100; // Including all loci in merge-mode if (loc->name!="") { ilocus = mlocus.find(loc->name); /////////////////////////////////////////////////// // Check whether or not this Locus already exists? if (ilocus != mlocus.end()) { Locus * loc2 = locus[ilocus->second]; // Check same chromosome and positions, etc if ( loc2->chr != loc->chr ) { cerr << "Warning: different chromosome for " << loc->name << "\n"; loc->chr = loc2->chr; } // Check same chromosome and positions, etc if ( loc2->bp != loc->bp ) { cerr << "Warning: different physical position for " << loc->name << "\n"; loc->bp = loc2->bp; } if ( loc2->pos != loc->pos ) { cerr << "Warning: different genetic position for " << loc->name << "\n"; loc->pos = loc2->pos; } exists[loc->name] = true; exist_cnt++; locus2.push_back(loc2); // Keep the new file order (would have been in freq) int t = (int)loc->freq; if ( loc2->allele1 == "" ) loc2->allele1 = par::missing_genotype; if ( loc2->allele2 == "" ) loc2->allele2 = par::missing_genotype; ///////////////////////////////////////// // Add allele names to list, if needed // and check if allele codes need flipping? // e.g. if A/C SNP in memory but C/A in file? // or had one allele missing // and check that alleles match up correctly // 0 A A 0 -> 0 A + flip // 0 0 0 0 -> 0 0 // 0 0 0 A -> {0 A} // 0 0 A B -> {A B} // 0 A 0 A -> 0 A // 0 A 0 B -> {B A} + flip // 0 A A B -> {B A} + flip // 0 A B A -> {B A} // 0 A 0 0 -> 0 A // A B A B -> A B // A B B A -> A B + flip // A B 0 A -> A B + flip // A B 0 0 -> A B // 1) Are there any empty slots in the existing allele? // If so, fill them in with any new alleles // 2) Do we have a strand problem? // 3) Or do we need a flip? // New codes string one = loc->allele1; string two = loc->allele2; // cout << "LOCUS " << loc->name << " OLD, NEW = [" // << loc2->allele1 << "] [" // << loc2->allele2 << "] [" // << one << "] [" // << two << "]\n"; set alleleCount; if ( one != par::missing_genotype ) alleleCount.insert( one ); if ( two != par::missing_genotype ) alleleCount.insert( two ); if ( loc2->allele1 != par::missing_genotype ) alleleCount.insert( loc2->allele1 ); if ( loc2->allele2 != par::missing_genotype ) alleleCount.insert( loc2->allele2 ); // More than 2 obseved alleles? if ( alleleCount.size() > 2 ) { misstrand.insert(loc2->name); fatal_error = true; } else { ////////////////////////////// // 1) Fill in empty slots // Fill slot 2 first (i.e. 0 A code for monomorphic, not A 0) // If first new allele is not missing... 
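// --------------------------------------------------------------
// A rough, stand-alone sketch of the reconciliation rule described
// above (illustrative only; reconcileAlleleCodes and MergeAlleleAction
// are hypothetical names, not part of PLINK): count the distinct
// non-missing codes, fill any empty stored slot, and treat a
// mirrored pair as a strand-safe allele flip.
//
// enum MergeAlleleAction { MERGE_OK, MERGE_FLIP, MERGE_MISMATCH };
//
// MergeAlleleAction reconcileAlleleCodes( string & old1, string & old2,
//                                         const string & new1,
//                                         const string & new2,
//                                         const string & miss )
// {
//   set<string> codes;
//   if ( old1 != miss ) codes.insert( old1 );
//   if ( old2 != miss ) codes.insert( old2 );
//   if ( new1 != miss ) codes.insert( new1 );
//   if ( new2 != miss ) codes.insert( new2 );
//   if ( codes.size() > 2 ) return MERGE_MISMATCH;        // e.g. A/C vs A/G
//
//   // Fill an empty stored slot (slot 2 first, so monomorphics stay "0 A")
//   if ( new1 != miss && new1 != old1 && new1 != old2 )
//     { if ( old2 == miss ) old2 = new1; else if ( old1 == miss ) old1 = new1; }
//   if ( new2 != miss && new2 != old1 && new2 != old2 )
//     { if ( old2 == miss ) old2 = new2; else if ( old1 == miss ) old1 = new2; }
//
//   // A mirrored pair now means the incoming genotypes need flipping
//   if ( ( new1 == old2 && new1 != miss ) ||
//        ( new2 == old1 && new2 != miss ) )
//     return MERGE_FLIP;                                   // e.g. A/C coded as C/A
//
//   return MERGE_OK;
// }
//
// The merge code carries out the same steps in place on loc2->allele1/2:
// --------------------------------------------------------------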
if (one!=par::missing_genotype) { // ...and not already listed if (one!=loc2->allele1 && one!=loc2->allele2) { // ...then add to first empty slot if(loc2->allele2=="" || loc2->allele2==par::missing_genotype) loc2->allele2=one; else if(loc2->allele1=="" || loc2->allele1==par::missing_genotype) loc2->allele1=one; } } if (two!=one && two!=par::missing_genotype) // ...and not already listed if (two!=loc2->allele1 && two!=loc2->allele2) { // ...then add to first empty slot if(loc2->allele2=="" || loc2->allele2==par::missing_genotype) loc2->allele2=two; else if(loc2->allele1=="" || loc2->allele1==par::missing_genotype ) loc2->allele1=two; } ////////////////////////////// // 2) Need a flip? if ( ( one == loc2->allele2 && one != par::missing_genotype ) || ( two == loc2->allele1 && two != par::missing_genotype ) || ( one != loc2->allele1 && one != par::missing_genotype && loc2->allele1 != par::missing_genotype ) || ( two != loc2->allele2 && two != par::missing_genotype && loc2->allele2 != par::missing_genotype ) ) { if ( one == loc2->allele2 || two == loc2->allele1 ) { flip_alleles.insert(loc2->name); } else { ////////////////////////////// // 3) Strand, wrong coding? misstrand.insert(loc2->name); fatal_error = true; } } } // Clean up what we do not need delete loc; // Replace with old locus (but swap back in new file position) loc = loc2; loc->freq = t; } else { // Locus does not exist -- add to locus list exists[loc->name] = false; locus2.push_back(loc); } ordered.push_back(*loc); } } MAP.clear(); MAP.close(); /////////////////////////////////////////////////////// // Did we encounter any fatal errors from flipped SNPs? if (fatal_error) { ofstream MSNP; string f = par::output_file_name+".missnp"; MSNP.open(f.c_str(), ios::out); set::iterator ilocus; for ( ilocus = misstrand.begin() ; ilocus != misstrand.end() ; ilocus++) { MSNP << *ilocus << "\n"; } MSNP.close(); printLOG("\nFound " + int2str(misstrand.size()) + " SNPs that do not match in terms of allele codes\n"); printLOG("Might include strand flips, although flipped A/T and C/G SNPs will be undetected)\n"); printLOG("Writing problem SNPs to [ " + f + " ]\n"); error("Stopping due to mis-matching SNPs -- check +/- strand?"); } if (!par::merge_list) { printLOG("\n" +int2str(locus2.size()) + " markers to be merged from [ " +par::merge_bimfile + " ]\n"); printLOG("Of these, "+int2str( locus2.size()-exist_cnt ) + " are new, " + int2str( exist_cnt ) + " already exist in current data\n"); } /////////////////////////////////////////////// // Build ordered table, so that genotypes can be inserted // in correct order; then swap locus file over // Sorting a vector of pointers, so we need this special fix stable_sort(locus2.begin(),locus2.end(),less()); // Sorting a normal vector stable_sort(ordered.begin(),ordered.end()); c=0; for (int i=0; ifreq // p2 p3 p1 p5 p4 : genetic position // 0 1 2 3 4 : file order // sort by cM // p1 p2 p3 p4 p5 : genetic // 2 0 1 4 3 : file order // 0 1 2 : add genetic order: nonmissing... 
// // sort by file order again // p2 p3 p1 p5 p4 : genetic // 0 1 2 3 4 : file // 1 0 2 : position to put in locus[l] // Add new locus2() to end of locus() for (int l=0; lname)->second ) { Locus * loc = new Locus; loc = locus2[l]; locus.push_back(loc); } } /////////////////////////////////////////////// // .fam // Make new hash of Locus names mlocus.clear(); for (int l=0;lname,l)); } if (mlocus.size() != locus.size() ) { cerr << "Problem encountered merging files, with the following markers:\n"; mlocus.clear(); for (int l=0;lname) != mlocus.end()) cerr << locus[l]->name << "\n"; mlocus.insert(make_pair(locus[l]->name,l)); } cerr << "[ dump info: sizes = " << mlocus.size() << " and " << locus.size() << " ]\n"; error("Cannot merge files. Check your MAP files."); } // Make hash of existing individuals map msample; for (int i=0;ifid+"_"+sample[i]->iid,i)); map::iterator isample; // Resize all existing individuals // and set new elements to missing genotype (TF) if (par::SNP_major) { // Add space for new SNPs for (int i=0; ione.resize(n,true); newlocus->two.resize(n,false); SNP.push_back(newlocus); } } else { // If using individual-major mode for (int i=0; ione.resize(locus.size(),true); sample[i]->two.resize(locus.size(),false); } } // An output file for diff mode ofstream MERD; if (par::merge_mode >=6) { string f = par::output_file_name+".diff"; MERD.open(f.c_str(), ios::out); MERD << setw(20) << "SNP" << " " << setw(20) << "FID" << " " << setw(20) << "IID" << " " << setw(8) << "NEW" << " " << setw(8) << "OLD" << " " << "\n"; } int new_person = 0; int old_person = 0; /////////////////////////////////////// // Read in FAM/BED file to new merge file // Initially, assume a binary trait par::qt = false; par::bt = true; ifstream FAM; FAM.open(par::merge_famfile.c_str()); FAM.clear(); vector existing_person_list; int original_sample_size = sample.size(); vector sample2; c=0; while(!FAM.eof()) { Individual * person = new Individual; // No comments allowed in BED/BIM/FAM files // First 6 obligatory fields string phenotype; FAM >> person->fid >> person->iid >> person->pat >> person->mat >> person->sexcode >> phenotype; // Skip last empty line that gets read if (person->fid=="") break; // Check for reserved family ID code if ( person->fid=="FID" ) error("FID is a reserved ID... please select a different family ID"); // Are we using 0/1 coding? if (par::coding01) { if ( phenotype == "1" ) phenotype = "2"; else if ( phenotype == "0" ) phenotype = "1"; else phenotype = "0"; } if (person->sexcode=="1") person->sex = true; // male else if (person->sexcode=="2") person->sex = false; // female (default) else if (!par::ignore_missing_sex) { person->missing = true; } // Have we already created this person? bool already_in = false; isample = msample.find(person->fid+"_"+person->iid); int indn = isample->second; if ( isample != msample.end() ) { already_in = true; delete person; person = sample[isample->second]; old_person++; } else new_person++; // Only look at phenotype if not already created if (!already_in) { ////////////////// // A non-founder? person->founder = (person->pat == "0" && person->mat == "0") ? true : false; ///////////////////////////////////////////////////// // Set missing status; test for quantitative traits? if (phenotype == par::missing_phenotype) person->missing = true; else { if ( ! 
from_string( person->phenotype, phenotype, std::dec ) ) person->missing = true; else if (phenotype != "0" && phenotype != "1" && phenotype != "2" ) { par::qt = true; par::bt = false; } } } /////////////////////////////////////// // Add necessary space for a new person // Missing genotypes by default if (!already_in) { if (par::SNP_major) { // Add a new missing person to each SNP vector::iterator s = SNP.begin(); while ( s != SNP.end() ) { (*s)->one.push_back(true); (*s)->two.push_back(false); s++; } // And set the individual number indn = n + new_person - 1; } else { // Add all new SNPs to this person person->one.resize(locus.size(),true); person->two.resize(locus.size(),false); } } // Record whether this individual is new or not existing_person_list.push_back(already_in); sample2.push_back(person); // Add individual to list, if need be if (!existing_person_list[c]) { sample.push_back(person); msample.insert(make_pair( person->fid+"_"+person->iid, sample.size()-1 ) ); } // Increase person counter c++; } if (!par::merge_list) { printLOG(int2str( existing_person_list.size() ) + " individuals merged from [ " + par::merge_famfile + " ] \n"); printLOG("Of these, " + int2str(new_person) + " were new, " + int2str( old_person ) + " were already in current data\n\n"); } //////////////////////////////////// // Read genotype information, merge ifstream BIT; bool bfile_SNP_major = openBinaryFile( par::merge_bedfile, BIT ); if (bfile_SNP_major != par::SNP_major) error("BED files must both be SNP-major or both individual-major for merging\n"); if ( (!par::SNP_major) || (!bfile_SNP_major) ) error("Cannot --bmerge individual-major BED files -- convert to SNP-major"); /////////////////////////// // SNP-major mode if (bfile_SNP_major) { // Person look-up table vector::iterator person = sample2.begin(); vector vindn; while ( person != sample2.end() ) { map::iterator isample = msample.find((*person)->fid+"_"+(*person)->iid); int indn; if ( isample != msample.end() ) indn = isample->second; else error("Internal error in --bmerge... 
should not happen...\n"); vindn.push_back(indn); ++person; } CSNP * snp; // Outer loop for SNPs int s=0; while (s::iterator ilocus = mlocus.find(locus2[ s ]->name); // int l2 = ilocus->second; int k0 = (int)ordered[s].freq; ilocus = mlocus.find(locus2[k0]->name); int k = ilocus->second; bool flipmode = flip_alleles.find( locus2[k0]->name ) != flip_alleles.end(); string snp1 = locus2[ k0 ]->allele1; string snp2 = locus2[ k0 ]->allele2; bool existence = exists.find(locus2[k0]->name)->second; ////////////////////////////////////// // Inner loop for individuals vector::iterator indn = vindn.begin(); while ( indn != vindn.end() ) { char ch[1]; BIT.read(ch,1); bitset<8> b; b = ch[0]; int c=0; if (!BIT) error("Problem with the BED file...has the FAM file been changed?\n"); while (c<7 && indn != vindn.end() ) { bool s1 = b[c++]; bool s2 = b[c++]; if ( flipmode && s1 == s2 ) { s1 = !s1; s2 = !s2; } string one = snp1; string two = snp2; if (s1 && s2) one=snp2; else if ( (!s1) && (!s2) ) two=snp1; else if ( s1 && !s2 ) one=two=par::missing_genotype; bool already_in = false; if ( *indn < original_sample_size ) already_in = true; else already_in = existing_person_list[*indn - original_sample_size]; bool e = reconcileMerge( *indn, k , one, two, already_in, existence, MERD, misstrand_dummy); if (e) fatal_error=true; // next person indn++; } } // next SNP s++; } // Set file mode par::SNP_major = true; } //////////////////////////////////// // Individual-major mode // else // { // // Outer loop for individuals // vector::iterator person = sample.begin(); // while ( person != sample.end() ) // { // // Inner loop for SNPs // int s=0; // while (s b; // b = ch[0]; // int c=0; // while (c<7 && sone[ s ] = b[c++]; // (*person)->two[ s ] = b[c++]; // s++; // } // } // person++; // } // // Set file mode // par::SNP_major = false; // } ////////////// // Close files FAM.clear(); FAM.close(); BIT.clear(); BIT.close(); // If a binary trait, now make 0 missing also // i.e. 
if we never saw other than missing, 0, 1 or 2 if (par::bt) for (int i=0; iphenotype == 0 ) sample[i]->missing = true; if (par::merge_mode >=6) { printLOG("Results from diff ( merge mode " + int2str(par::merge_mode) + " ) written to [ " + par::output_file_name + ".diff ]\n"); MERD.close(); printLOG("Of " + int2str(diff_overlap) + " overlapping SNPs, " + int2str( diff_nonmissing_overlap ) + " were both genotyped\nand " + int2str( diff_concordant_overlap ) + " were concordant\n"); printLOG("Concordance rate is " + dbl2str( (double)diff_concordant_overlap / (double)diff_nonmissing_overlap ) + "\n"); shutdown(); } // Phenotype statistics if (!par::merge_list) { int nm=0; for (int i=0;imissing) nm++; printLOG(int2str( nm ) + " individuals with nonmissing phenotypes\n"); if (par::bt) { printLOG("Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n"); if (par::missing_phenotype!="0") printLOG("Missing phenotype value is also " + par::missing_phenotype + "\n"); int ncase = 0; int ncontrol = 0; for (int i=0; iphenotype == 1 ) ncontrol++; else if ( sample[i]->phenotype == 2 ) ncase++; printLOG(int2str(ncase)+" cases and "+int2str(ncontrol)+" controls\n"); } else { printLOG("Assuming a quantitative trait\n"); printLOG("Missing phenotype value is " + par::missing_phenotype + "\n"); } } } plink-1.07-src/idhelp.h0000644000265600020320000001165211264127626014121 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __IDHELP_H__ #define __IDHELP_H__ #include #include #include "plink.h" using namespace std; class IDFile; class IDField { public: string name; string jointName; int width; bool null; bool attribute; bool equiv; bool joint; // Equivalence IDs map eqid; set aliasList; set masterList; IDField() { attribute = false; null = false; equiv = false; joint = false; width = 4; } bool operator< (const IDField & b) const { if ( name < b.name ) return true; return false; } bool operator== (const IDField & b) const { return name == b.name; } }; class IDFile { public: string filename; int uniqFieldCount; bool hasHeader; set missingValues; string delimit; string alias_delimit; vector fields; vector< vector > joint; vector< vector > equiv; map injections; IDFile() { hasHeader = false; missingValues.insert("."); uniqFieldCount = 0; delimit = " "; alias_delimit = ","; } }; class IDValue { public: IDValue() { value = ""; field = NULL; jointValue = ""; } void updateAlias() { map::iterator f = field->eqid.find( value ); if ( f != field->eqid.end() ) value = f->second; } IDField * field; string jointValue; // This is main default value string value; bool operator< (const IDValue & b) const { if ( field->name < b.field->name ) return true; else if ( field->name > b.field->name ) return false; if ( field->joint && b.field->joint ) return jointValue < b.jointValue; else return value < b.value; } bool operator!= (const IDValue & b) const { return ! 
( *this == b ); } bool operator== (const IDValue & b) const { // These must refer to the same field to be // comparable if ( field->name != b.field->name ) return false; if ( field->joint && b.field->joint ) { return jointValue == b.jointValue; } else { return value == b.value; } } bool singleMatch(const IDValue & b) const { // Do not take joint field values into // account here if ( field->name != b.field->name ) return false; return value == b.value; } }; class IDGroup { public: vector values; IDFile * file; bool resolved; IDGroup() { resolved = false; } void display() { cout << "File = " << file->filename << ", resolved = " << resolved << "\n"; for (int k=0; kfield->name << " = " << values[k]->value << " j= " << values[k]->jointValue << " joint=" << values[k]->field->joint << " attrib=" << values[k]->field->attribute << " null=" << values[k]->field->null << "\n"; cout << "\n"; } }; class IDHelper { public: // Files containing IDs vector files; // The actual IDs we are matching on set fields; map fieldMap; set::iterator iField; // The basic data we read in, then try to resolve into // a smaller set vector idgroup; // The lookup table map > idmap; map dict; set< set > uniqueFields; map > jointMap; vector< set > jointField; vector< vector > jointOrder; set attribFields; // Functions // Main wrapper void idHelp(); void idReplace(); void idMatch(); void idDump(); void idListAlias(); // Helper functions // Populate joint values for a given set void setJointValues( set & ); void setJointValues( IDGroup * ); // Map string ID1+ID2=V1+V2,ID3=V3 map > parseQuery(string ); // Find unique matching observation IDGroup * findUniqueIndividual( set & ); // Does this person match this template? bool matchIndividual(IDGroup * group, map > & ); // Find all matching observations set findAllIndividuals( map > & ); // Set an alias bool setAlias(IDField*, string, int, map&); }; #endif plink-1.07-src/binput.cpp0000644000265600020320000002627711264127624014517 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" extern ofstream LOG; void Plink::readBinData() { if ( par::do_not_load_snps ) { printLOG("Skipping SNP and genotype information...\n"); checkFileExists(par::famfile); readFamFile(par::famfile); return; } // We cannot assume the file will be in order, as it might have been // previusly created by a --merge/--bmerge command // Check files exist checkFileExists(par::famfile); checkFileExists(par::bitfilename_map); checkFileExists(par::bitfilename); printLOG("Reading map (extended format) from [ " + par::bitfilename_map + " ] \n"); vector ordered; ifstream MAP(par::bitfilename_map.c_str(), ios::in); MAP.clear(); int c=0; while(!MAP.eof()) { Locus * loc = new Locus; MAP >> loc->chr // will automatically by numeric >> loc->name >> loc->pos >> loc->bp >> loc->allele1 >> loc->allele2; if ( MAP.eof() ) { delete loc; continue; } else if ( MAP.fail() ) { delete loc; error("Problem reading BIM file, line " + int2str(c+1) + "\n"); } // Use the frequency slot temporarily to // store order information loc->freq = c++; // Check that cM/M specification looks correct, if // we want to perform a plink-based analysis if (par::plink && (!par::cm_map) && (loc->pos > 50) ) error("Looks like you need to specify --cm ??"); // Convert cM to M map distances if (par::cm_map) loc->pos /= 100; // Always included, but not always in correct order if (loc->name!="") { locus.push_back(loc); ordered.push_back(*loc); } else delete loc; } printLOG( int2str(locus.size()) + " markers to be included from [ " + par::bitfilename_map + " ]\n"); MAP.clear(); MAP.close(); if ( locus.size() == 0 ) shutdown(); /////////////////////////////////////////////// // Build ordered table, so that genotypes can // be inserted in correct order; then swap locus // file over // Sort vector of pointers to Locus stable_sort(locus.begin(),locus.end(),less()); // Sort normal vector Locus stable_sort(ordered.begin(),ordered.end()); c=0; for (int i=0; i include(0); int nl_actual = locus.size(); if ( (!par::plink) && (!par::run_chr==0) ) { // Get range setMarkerRange(); // And set to not import all markers outside range // 0..nl_all scale: par::run_start..par::run_end nl_actual = 0; for (int j=0; j par::run_end ) include.push_back(-1); else { include.push_back(fp); nl_actual++; } } // 0 1 2 3 4 5 6 7 8 9 // We now have -1 -1 -1 3 4 5 6 -1 -1 -1 // but we want -1 -1 -1 0 1 2 3 -1 -1 -1 for (int j=0; j -1 ) include[j] -= par::run_start ; } } else { // If we do want to look at all the data for (int j=0; j memblock; // if ( par::fast_binary ) // { // ifstream::pos_type fbegin = BIT.tellg(); // BIT.seekg(0, ios::end); // ifstream::pos_type fend = BIT.tellg(); // ifstream::pos_type size = fend-fbegin+1; // memblock.resize(size); // BIT.seekg(fbegin); // BIT.read(&memblock[0], size); // BIT.close(); // } ////////////////////////////// // Allocate space for SNPs if (bfile_SNP_major) { for (int i=0; ione.resize( sample.size() ); newlocus->two.resize( sample.size() ); SNP.push_back(newlocus); } } else { vector::iterator person = sample.begin(); while ( person != sample.end() ) { (*person)->one.resize(nl_actual); (*person)->two.resize(nl_actual); person++; } } /////////////////////////// // SNP-major mode if (bfile_SNP_major) { CSNP * snp; // Outer loop for SNPs int s=0; while (s -1 ) snp = SNP[ include[s] ]; else snp = 
NULL; // Inner loop for individuals // vector::iterator person = sample.begin(); int indx = 0; int ss = sample.size(); while ( indx < ss ) { bitset<8> b; // if ( par::fast_binary ) // { // b = memblock[indx++]; // } // else // { char ch[1]; BIT.read(ch,1); if (!BIT) error("Problem with the BED file...has the FAM/BIM file been changed?\n"); b = ch[0]; // } int c=0; while (c<7 && indx < ss ) { if (snp) { // snp->one.push_back( b[c++] ); // snp->two.push_back( b[c++] ); snp->one[indx] = b[c++]; snp->two[indx] = b[c++]; } else { c+=2; } // ++person; ++indx; } } // next SNP s++; } // Set file mode par::SNP_major = true; } //////////////////////////////////// // Individual-major mode else { // Outer loop for individuals vector::iterator person = sample.begin(); while ( person != sample.end() ) { // Inner loop for SNPs int s=0; while (s b; b = ch[0]; int c=0; while (c<7 && s -1 ) { (*person)->one[ include[s] ] = b[c++]; (*person)->two[ include[s] ] = b[c++]; } else { c+=2; } s++; } } person++; } // Set file mode par::SNP_major = false; } // Check that we got what we expected char ch[1]; BIT.read(ch,1); if (BIT) error("Problem with the BED file... has the FAM/BIM file been changed?\n"); BIT.clear(); BIT.close(); // Free any buffer memory used // if ( par::fast_binary ) // memblock.clear(); //////////////////////////////////////// // If need be, now prune the MAP file // i.e. if --chr or --from/--to were used if ( (!par::plink) && (!par::run_chr==0) ) { vector l0(0); for(int l=0; l < locus.size(); l++) { if ( !( l < par::run_start || l > par::run_end ) ) l0.push_back(locus[l]); else delete locus[l]; } locus.clear(); locus = l0; } } bool Plink::openBinaryFile(string s, ifstream & BIT) { BIT.open(s.c_str(), ios::in | ios::binary); // 1) Check for magic number // 2) else check for 0.99 SNP/Ind coding // 3) else print warning that file is too old char ch[1]; BIT.read(ch,1); bitset<8> b; b = ch[0]; bool bfile_SNP_major = false; bool v1_bfile = true; // If v1.00 file format // Magic numbers for .bed file: 00110110 11011000 = v1.00 bed file if ( ( b[2] && b[3] && b[5] && b[6] ) && ! ( b[0] || b[1] || b[4] || b[7] ) ) { // Next number BIT.read(ch,1); b = ch[0]; if ( ( b[0] && b[1] && b[3] && b[4] ) && ! ( b[2] || b[5] || b[6] || b[7] ) ) { // Read SNP/Ind major coding BIT.read(ch,1); b = ch[0]; if ( b[0] ) bfile_SNP_major = true; else bfile_SNP_major = false; if (bfile_SNP_major) printLOG("Detected that binary PED file is v1.00 SNP-major mode\n"); else printLOG("Detected that binary PED file is v1.00 individual-major mode\n"); } else v1_bfile = false; } else v1_bfile = false; // Reset file if < v1 if ( ! v1_bfile ) { printLOG("Warning, old BED file BED *** \n\n"); bfile_SNP_major = false; BIT.close(); BIT.clear(); BIT.open(s.c_str(), ios::in | ios::binary); } else if ( ! v1_bfile ) { if ( b[0] ) bfile_SNP_major = true; else bfile_SNP_major = false; printLOG("Binary PED file is v0.99\n"); if (bfile_SNP_major) printLOG("Detected that binary PED file is in SNP-major mode\n"); else printLOG("Detected that binary PED file is in individual-major mode\n"); } return bfile_SNP_major; } plink-1.07-src/pdriver.cpp0000644000265600020320000002273611264127625014666 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include "plink.h" #include "options.h" #include "helper.h" #include "phase.h" void Plink::proxyWrapper() { if (!par::SNP_major) Ind2SNP(); if ( par::proxy_glm ) par::assoc_glm_without_main_snp = true; ///////////////////////////////////////////////////// // Use 'pos' slot in Locus to store genotyping // rate information, as we need access to this often for (int l=0 ; lpos = genotypingRate(*this,l); string f = par::output_file_name; string f2 = par::output_file_name; if ( par::proxy_impute ) { f += ".proxy.impute"; f2 += ".proxy.impute.dosage"; } else if ( par::proxy_error ) { f += ".proxy.genocheck"; } else if ( par::proxy_all && ! par::proxy_full_report ) { if ( par::proxy_CC ) { if ( par::qt ) f += ".qassoc.proxy"; else f += ".assoc.proxy"; } else f += ".tdt.proxy"; } else f += ".proxy.report"; haplo->HTEST.open(f.c_str(),ios::out); haplo->HTEST.precision(3); if ( par::proxy_record_dosage ) { OUTFILE.open(f2.c_str(),ios::out); OUTFILE.precision(3); } printLOG("\n"); /* printLOG("Criteria for selecting proxy SNPs:\n"); printLOG(" Selecting at most " + int2str( par::proxy_snp_filter ) + " proxy SNPs (--proxy-maxsnp)\n"); printLOG(" Searching up to " + int2str( par::proxy_window ) + " SNPs around reference (--proxy-window)\n"); printLOG(" Searching within " + dbl2str(par::proxy_kb) +" kb around reference (--proxy-kb)\n"); printLOG(" Proxy genotype missingness threshold is "+dbl2str(par::proxy_geno)+" (--proxy-geno)\n"); printLOG(" Proxy MAF threshold is " + dbl2str(par::proxy_maf) +" (--proxy-maf)\n"); if ( par::proxy_r2_filter ) { printLOG(" Proxy r-sq filters of " + dbl2str( par::proxy_r2_filter_A ) + ", " + dbl2str( par::proxy_r2_filter_B ) + ", " + dbl2str( par::proxy_r2_filter_C ) + " (--proxy-r2)\n"); } else { printLOG(" No proxy r-sq filter selected (--proxy-no-r2-filter)\n"); } */ printLOG("Criteria for selecting proxy SNPs using frequency based metrics:\n"); printLOG("For SNPs with MAF above, then below " + dbl2str(par::proxy_planB_threshold) + ", \n"); printLOG(" Selecting at most " + int2str( par::proxy_snp_filter_planA ) + "," + int2str( par::proxy_snp_filter_planB ) + " proxy SNPs (--proxy-maxsnp)\n"); printLOG(" Searching up to " + int2str( par::proxy_window_planA ) + "," + int2str( par::proxy_window_planB ) + "," + " SNPs around reference (--proxy-window)\n"); printLOG(" Searching within " + dbl2str(par::proxy_kb_planA) +"," + dbl2str(par::proxy_kb_planB) +"," +" kb around reference (--proxy-kb)\n"); printLOG(" Proxy genotype missingness threshold is "+dbl2str(par::proxy_geno)+" (--proxy-geno)\n"); printLOG(" Proxy MAF threshold is " + dbl2str(par::proxy_maf) +" (--proxy-maf)\n"); printLOG(" Proxy r-sq filters of " + dbl2str( par::proxy_r2_filter_A_planA ) + ", " + dbl2str( par::proxy_r2_filter_B_planA ) + ", " + dbl2str( par::proxy_r2_filter_C_planA ) + " and " + dbl2str( par::proxy_r2_filter_A_planB ) + ", " + dbl2str( par::proxy_r2_filter_B_planB ) + ", " + dbl2str( par::proxy_r2_filter_C_planB ) + " (--proxy-r2)\n"); if ( par::proxy_impute ) { int reference_panel = 0; for ( int i = 0 ; i < n ; i++ ) if ( sample[i]->missing ) reference_panel++; if ( reference_panel == 0 ) error("No reference panel for imputation (i.e. individuals with missing phenotypes)\n"); printLOG("Imputation reference panel of " + int2str(reference_panel) + " individuals\n"); } if ( ( ! 
par::proxy_impute ) && ( ( !par::proxy_all ) || par::proxy_full_report ) ) { printLOG("Criteria for selecting proxy subhaplotypes:\n"); printLOG(" Haplotype frequency threshold " +dbl2str(par::proxy_mhf)+" (--proxy-mhf)\n"); printLOG(" r-squared threshold to reference " + dbl2str(par::proxy_r2) +" (--proxy-sub-r2)\n"); printLOG(" Maximum of " + int2str(par::proxy_maxhap) + " SNPs per subhaplotype (--proxy-sub-maxsnp)\n"); } printLOG("\n"); printLOG("Writing haplotype-based proxy tests to [ " + f + " ] \n"); if ( par::proxy_record_dosage ) printLOG("Writing dosage imputation information to [ " + f2 + " ]\n"); // Either condsider all markers, with reduced output (i.e. // just the haplotype based test for that SNP, or a single // marker with extended outout if ( par::proxy_all ) { set plocus; if ( par::proxy_all_list ) { // read list of SNPs to use as reference SNP // for proxy map mlocus; for (int l=0; lname,l)); ifstream PLIST; checkFileExists(par::proxy_all_list_file); PLIST.open(par::proxy_all_list_file.c_str(), ios::in); while ( ! PLIST.eof() ) { string marker ; PLIST >> marker; map::iterator mi = mlocus.find( marker ); if ( mi != mlocus.end() ) plocus.insert( mi->second ); } PLIST.close(); } // And we might want a full report for each? // Turn off 'all' mode in that case to get // verbose output if ( par::proxy_full_report ) par::proxy_all = false; else { if ( par::proxy_impute ) { haplo->HTEST << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(4) << "NPRX" << " " << setw(8) << "INFO" << " " << setw(8) << "TOTAL_N" << " " << setw(8) << "OBSERVD" << " " << setw(8) << "IMPUTED" << " " << setw(8) << "OVERLAP" << " " << setw(8) << "CONCORD" << " "; if ( par::proxy_impute_genotypic_concordance ) haplo->HTEST << setw(8) << "F_AA" << " " << setw(8) << "I_AA" << " " << setw(8) << "C_AA" << " " << setw(8) << "F_AB" << " " << setw(8) << "I_AB" << " " << setw(8) << "C_AB" << " " << setw(8) << "F_BB" << " " << setw(8) << "I_BB" << " " << setw(8) << "C_BB" << " "; if ( par::proxy_list_proxies ) haplo->HTEST << "PROXIES"; haplo->HTEST << "\n"; } else { //haplo->HTEST.setf(ios::scientific); haplo->HTEST << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(10) << "GENO" << " " << setw(4) << "NPRX" << " " << setw(8) << "INFO" << " "; if ( par::proxy_CC ) { if ( par::qt || par::proxy_glm ) { haplo->HTEST << setw(8) << "F" << " "; if ( par::qt ) haplo->HTEST << setw(8) << "BETA" << " "; else haplo->HTEST << setw(8) << "ODDS" << " "; } else haplo->HTEST << setw(8) << "F_A" << " " << setw(8) << "F_U" << " " << setw(8) << "OR" << " "; } else if ( par::proxy_TDT ) haplo->HTEST << setw(8) << "T" << " " << setw(8) << "U" << " " << setw(8) << "OR" << " "; haplo->HTEST << setw(10) << "P" << " "; if ( par::proxy_list_proxies ) haplo->HTEST << "PROXIES"; haplo->HTEST << "\n"; } } //////////////////////////////////////// // Set-up cache for LD values proxyLD.clear(); //////////////////////////////////////// // Iterate over all SNPs specified int similar = 0; for ( int l = 0 ; l < nl_all ; l++ ) { // cout << "testing " << locus[l]->name << "\n"; ///////////////////////// // Skip if not on list if ( par::proxy_all_list ) if ( plocus.find(l) == plocus.end() ) continue; ///////////////////////////////////////// // Have we already seen an identical SNP? 
// (SKIP THIS PART FOR NOW -- DOESN'T // PRACTICALLY ADD MUCH AT ALL) // if ( l != 0 && identicalSNPs( this, l-1, l ) ) // { // ++similar; // continue; // } if ( ! par::silent ) { cout << l+1 << " of " << nl_all << " performed \r"; cout.flush(); } /////////////////////////////////// // Perform actual proxy-based test performProxyTests(l); // cout << "done!\n"; /////////////////////////////////// // Clear the LD cache occassionally if ( proxyLD.size() > 50000 ) proxyLD.clear(); } if ( ! par::silent ) cout << "\n"; } else { int l = getMarkerNumber(*this,par::proxy_assoc_snp); if ( l < 0 ) error("Cannot find proxy SNP [ " + par::proxy_assoc_snp + " ]\n"); performProxyTests(l); } haplo->HTEST.close(); if ( par::proxy_record_dosage ) OUTFILE.close(); } plink-1.07-src/impute.cpp0000644000265600020320000002656211264127625014517 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "genogroup.h" #include "phase.h" #include "haplowindow.h" extern ofstream LOG; using namespace std; class probabilisticGenotype{ public: probabilisticGenotype() { AA = AB = BA = BB = 0; phased = genotype = false; genotype = phased_genotype = -1; calculated = false; } bool calculated; double AA, AB, BA, BB; bool phased; bool genotyped; int genotype; // 0,1,2 = AA, AB, BB int phased_genotype; // 0,1,2,3 = AA, AB, BA, BB }; void HaploPhase::updateForImputation() { // Goal: given results now in HaploPhase, can we eliminate // any HaploWindow phases for any individuals // For each window involved, look at each phase of each // individual: was this supported by a phase in HaploPhase? // Also, start trying to order hap1 and hap2 across windows // to be consistent ////////////////////////////////////////////////////// // Reconcile HaploPhase (waplotype) results back into // subhaplotypes of the HaploWindows int num_phase_0 = 0; int num_phase_1 = 0; for (int i=0; ifounder ) continue; // If there were no available / possible haplotypes at this position, // then just leave the windows as are. 
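// --------------------------------------------------------------
// The pruning applied in the window loop below can be summarised by
// this stand-alone sketch (a hypothetical helper, not a PLINK routine;
// it assumes int2 provides the map-key ordering already relied on in
// this file): region-level phase posteriors are pooled per window-level
// (h1,h2) pair, unlikely phases are dropped, and the survivors are
// renormalised so their posteriors sum to one.
//
// map<int2,double> collapsePhases( const vector<int> & h1,
//                                  const vector<int> & h2,
//                                  const vector<double> & pp,
//                                  double pruneThreshold )
// {
//   map<int2,double> pooled;
//   for (int z=0; z<(int)pp.size(); z++)
//     {
//       if ( pp[z] <= pruneThreshold ) continue;   // drop unlikely phases
//       int2 pair;
//       pair.p1 = h1[z];
//       pair.p2 = h2[z];
//       pooled[ pair ] += pp[z];                   // pool duplicate pairs
//     }
//   double psum = 0;
//   map<int2,double>::iterator j;
//   for ( j = pooled.begin(); j != pooled.end(); j++ ) psum += j->second;
//   if ( psum > 0 )
//     for ( j = pooled.begin(); j != pooled.end(); j++ ) j->second /= psum;
//   return pooled;                                 // surviving phase posteriors
// }
// --------------------------------------------------------------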
if ( hap1[i].size() == 0 ) continue; for (int w=startWindow; w<=finishWindow; w++) { HaploWindow * currentWindow = windows[w]; int wc = w - startWindow; // Track original number of phases for this window num_phase_0 += currentWindow->hap1[i].size(); currentWindow->hap1[i].clear(); currentWindow->hap2[i].clear(); currentWindow->pp[i].clear(); map added; for (int z=0; z par::haplo_plem_meta_prune_phase ) { // Add this to window, after checking to see // if it already exists int2 subhaplotype; subhaplotype.p1 = hapi[ hap1[i][z] ][wc]; subhaplotype.p2 = hapi[ hap2[i][z] ][wc]; map::iterator ia = added.find( subhaplotype ); if ( ia == added.end() ) { int2 t; t.p1 = hapi[ hap1[i][z] ][wc]; t.p2 = hapi[ hap2[i][z] ][wc]; currentWindow->hap1[i].push_back( t.p1 ); currentWindow->hap2[i].push_back( t.p2 ); added.insert( make_pair( t, pp[i][z] )); } else { ia->second += pp[i][z]; } } } if ( currentWindow->hap1[i].size() > 1 ) { currentWindow->pp[i].resize( currentWindow->hap1[i].size()); double psum = 0; for (int z=0; z< currentWindow->hap1[i].size(); z++) { int2 subhaplotype; subhaplotype.p1 = currentWindow->hap1[i][z]; subhaplotype.p2 = currentWindow->hap2[i][z]; map::iterator ia = added.find( subhaplotype ); if ( ia != added.end() ) // CAN REMOVE THIS CHECK { currentWindow->pp[i][z] = ia->second; psum += ia->second; } } currentWindow->ambig[i] = true; for (int z=0; z< currentWindow->pp[i].size(); z++) currentWindow->pp[i][z] /= psum; } else { currentWindow->pp[i].clear(); currentWindow->ambig[i] = false; } // Track updated number of phases for this window num_phase_1 += currentWindow->hap1[i].size(); } } if (par::haplo_plem_verbose) { double reduction = (double) num_phase_1 / (double) num_phase_0 ; P.printLOG(dbl2str(reduction) + " pruning, from " + int2str(num_phase_0) + " to " + int2str( num_phase_1 ) + " phases \n"); } } void HaploPhase::mainImputation() { // P.printLOG("Entering final genotype imputation stage\n"); ////////////////////////////////////////////////////////////////////////// // Calculate information weights based on empirical variance for each SNP vector_t info(ns); vector_t infoCount(ns); for( int w = 0; w < nw; w++) { HaploWindow * currentWindow = windows[w]; for (int s = 0; s < currentWindow->ns; s++) { int gs = currentWindow->start + s; calculateEmpiricalVariance(gs); info[gs] += ratio; infoCount[gs]++; } } // Normalise information score for (int s=0; sfounder ) continue; ////////////////////////////////////////////////////////// // Store all imputed/phased genotypes for this individual vector g(ns); /////////////////////////////////// // Consider each window for( int w = 0; w < nw; w++) { HaploWindow * currentWindow = windows[w]; //////////////////////////////// // Consider each possible phase for ( int z = 0 ; z < currentWindow->hap1[i].size(); z++) { double posterior = currentWindow->ambig[i] ? 
currentWindow->pp[i][z] : 1 ; // Consider each position for (int s = 0; s < currentWindow->ns; s++) { int gs = currentWindow->start + s; // Do not attempt to impute low-confidence SNPs if ( info[ gs ] < par::proxy_info_threshold ) continue; // Otherwise calculate dosage if ( currentWindow->hap[currentWindow->hap1[i][z]][s] ) { if ( currentWindow->hap[currentWindow->hap2[i][z]][s] ) g[gs].AA += posterior; else g[gs].AB += posterior; } else { if ( currentWindow->hap[currentWindow->hap2[i][z]][s] ) g[gs].BA += posterior; else g[gs].BB += posterior; } // Next SNP } } // Next phase } // Next window // Normalise dosage for (int s=0; s < g.size(); s++) { double psum = g[s].AA + g[s].AB + g[s].BA + g[s].BB; if ( psum > 0 ) { g[s].AA /= psum; g[s].AB /= psum; g[s].BA /= psum; g[s].BB /= psum; } // Impute into missing genotype data spaces; or give verbose // output to a file if ( g[s].AA > par::proxy_impute_threshold ) { g[s].genotype = 0; g[s].phased_genotype = 0; g[s].genotyped = g[s].phased = true; } else if ( g[s].BB > par::proxy_impute_threshold ) { g[s].genotype = 2; g[s].phased_genotype = 3; g[s].genotyped = g[s].phased = true; } else if ( g[s].AB > par::proxy_impute_threshold ) { g[s].genotype = 1; g[s].phased_genotype = 1; g[s].genotyped = g[s].phased = true; } else if ( g[s].BA > par::proxy_impute_threshold ) { g[s].genotype = 1; g[s].phased_genotype = 2; g[s].genotyped = g[s].phased = true; } else if ( g[s].AB + g[s].BA > par::proxy_impute_threshold ) { g[s].genotype = 1; g[s].genotyped = true; g[s].phased = false; } else { g[s].genotyped = false; g[s].phased = false; } //////////////////////////////////// // Impute any missing genotype data bool s1 = par::SNP_major ? P.SNP[S[s]]->one[i] : P.sample[i]->one[S[s]] ; bool s2 = par::SNP_major ? P.SNP[S[s]]->two[i] : P.sample[i]->two[S[s]] ; string original_genotype = genotype(P,i,S[s]); if ( s1 && ! 
s2 ) // Original data are missing { if ( g[s].genotyped ) { if ( par::SNP_major ) { if ( g[s].genotype == 0 ) { P.SNP[S[s]]->one[i] = false; P.SNP[S[s]]->two[i] = false; } else if ( g[s].genotype == 1 ) { P.SNP[S[s]]->one[i] = false; P.SNP[S[s]]->two[i] = true; } else { P.SNP[S[s]]->one[i] = true; P.SNP[S[s]]->two[i] = true; } } else { if ( g[s].genotype == 0 ) { P.sample[i]->one[S[s]] = false; P.sample[i]->two[S[s]] = false; } else if ( g[s].genotype == 1 ) { P.sample[i]->one[S[s]] = false; P.sample[i]->two[S[s]] = true; } else { P.sample[i]->one[S[s]] = true; P.sample[i]->two[S[s]] = true; } } } } ////////////////////// // Verbose output mode if ( par::impute_verbose ) { int l = S[s]; HIMPUTE << P.sample[i]->fid << "\t" << P.sample[i]->iid << "\t" << P.locus[l]->name << "\t"; string g1 = P.locus[l]->allele1; string g2 = P.locus[l]->allele2; // Assumption: par::proxy_impute_threshold must be at // least 50% HIMPUTE << setw(8) << g[s].AA << " " << setw(8) << g[s].AB << " " << setw(8) << g[s].BA << " " << setw(8) << g[s].BB << " " << setw(8) << info[s] << " " << setw(10) << g[s].AA + 0.5 * ( g[s].AB + g[s].BA ) << " "; if ( g[s].AA > par::proxy_impute_threshold ) { HIMPUTE << g1 << " " << g1 << " " << g1 << " " << g1 << "\t"; } else if ( g[s].BB > par::proxy_impute_threshold ) { HIMPUTE << g2 << " " << g2 << " " << g2 << " " << g2 << "\t"; } else if ( g[s].AB > par::proxy_impute_threshold ) { HIMPUTE << g1 << " " << g2 << " "; if ( g1 < g2 ) HIMPUTE << g1 << " " << g2 << "\t"; else HIMPUTE << g2 << " " << g1 << "\t"; } else if ( g[s].BA > par::proxy_impute_threshold ) { HIMPUTE << g2 << " " << g1 << " "; if ( g1 < g2 ) HIMPUTE << g1 << " " << g2 << "\t"; else HIMPUTE << g2 << " " << g1 << "\t"; } else if ( g[s].AB + g[s].BA > par::proxy_impute_threshold ) { HIMPUTE << par::missing_genotype << " " << par::missing_genotype << " "; if ( g1 < g2 ) HIMPUTE << g1 << " " << g2 << "\t"; else HIMPUTE << g2 << " " << g1 << "\t"; } else { HIMPUTE << par::missing_genotype << " " << par::missing_genotype << " " << par::missing_genotype << " " << par::missing_genotype << "\t"; } HIMPUTE << original_genotype << "\n"; // End of verbose output mode } // Next SNP } // Next individual } } plink-1.07-src/greport.cpp0000644000265600020320000001555611264127625014677 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "options.h" #include "plink.h" #include "helper.h" extern Plink * PP; void Plink::displayGeneReport() { // Simply read in any generic results file and list of SNPs by // ranges (which may be subsetted). 
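// In outline: every result row that carries a chromosome and base-pair
// position is attached to each range whose (border-extended) interval
// covers it, and is annotated with its distance from the range start.
// A minimal version of the two checks, with hypothetical names
// (inRange / distFromStartKb are not PLINK functions; the code below
// uses Range and rangeIntersect() for the same purpose):
//
// bool inRange( int chr, int bp, const Range & r )
// {
//   // stored ranges appear to include any border extension already
//   return chr == r.chr && bp >= r.start && bp <= r.stop;
// }
//
// double distFromStartKb( int bp, const Range & r, int border )
// {
//   // mirrors the DIST column: adding the border back recovers the
//   // original, unextended start of the range
//   return ( bp - ( r.start + border ) ) / 1000.0;
// }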
// if ( false ) // readMapFile(par::mapfile,include,include_pos,nl_actual); ofstream GREP; GREP.open( (par::output_file_name + ".range.report").c_str() , ios::out); map > ranges; // Read list of ranges ranges = readRange( par::greport_gene_list ); // Filter ranges if ( par::greport_subset ) ranges = filterRanges( ranges, par::greport_subset_file ); // Open a single results file ifstream RESIN; RESIN.open( par::greport_results.c_str() , ios::in ); // Read first (header) row char cline[par::MAX_LINE_LENGTH]; RESIN.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline=="") error("Problem reading [ " + par::greport_results + " ]\n"); string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); int chr_column = -1; int bp_column = -1; int pval_column = -1; int snp_column = -1; for (int i=0; i extractSNP; if ( par::extract_set ) { if ( snp_column == -1 ) error("Did not find a SNP field, so cannot use --extract"); checkFileExists( par::extract_file ); PP->printLOG("Only extracting SNPs listed in [ " + par::extract_file + " ]\n"); ifstream IN(par::extract_file.c_str(), ios::in); while ( ! IN.eof() ) { string snpname; IN >> snpname; if ( snpname=="" ) continue; extractSNP.insert(snpname); } IN.close(); PP->printLOG("Read " + int2str( extractSNP.size() ) + " SNPs to extract\n"); } if ( chr_column < 0 || bp_column < 0 ) error("Could not find CHR and BP fields in results file"); map > annotatedResults; string headerline = sline; int cnt = 0; while ( ! RESIN.eof() ) { // if ( ! par::silent ) // cout << "Processing results line " << ++cnt << " \r"; // vector tokens = tokenizeLine( RESIN ); char cline[par::MAX_LINE_LENGTH]; RESIN.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if ( tokens.size() <= chr_column || tokens.size() <= bp_column ) continue; // Using a p-value-filtering field? double pvalue = 0; if ( pval_column != -1 ) { if ( tokens.size() <= pval_column ) continue; if ( ! from_string( pvalue, tokens[pval_column] , std::dec)) continue; if ( par::pfilter && pvalue > par::pfvalue ) continue; } if ( par::extract_set ) { if ( tokens.size() <= snp_column ) continue; if ( extractSNP.find( tokens[snp_column] ) == extractSNP.end() ) continue; } int thisChr = -1; int thisBP = -1; if ( ! from_string( thisChr, tokens[chr_column] , std::dec)) continue; if ( ! from_string( thisBP, tokens[bp_column] , std::dec)) continue; // Do we need to store this? i.e. what ranges is it actually in? 
// This information is in snp2range Range r1(thisChr,thisBP,thisBP,"dummy"); set implicated = rangeIntersect(r1,ranges); set::iterator ri = implicated.begin(); while ( ri != implicated.end() ) { string distance = dbl2str(( thisBP - ((*ri)->start + par::make_set_border)) /1000.00 , 4 ) + "kb" ; if ( annotatedResults.find( *ri ) == annotatedResults.end() ) { vector t(2); t[0] = distance; t[1] = sline; annotatedResults.insert(make_pair( (Range *)(*ri) , t ) ); } else { vector & v = annotatedResults.find( *ri )->second; v.push_back(distance); v.push_back(sline); } ++ri; } // Read next line of results } // Iterate through these -- they will be in genomic order, hopefully map >::iterator ri = ranges.begin(); while ( ri != ranges.end() ) { set::iterator si = ri->second.begin(); while ( si != ri->second.end() ) { bool displayed = false; map >::iterator ari; ari = annotatedResults.find( (Range *)&(*si) ); if ( ari != annotatedResults.end() ) { for (int l=0; l< ari->second.size(); l+=2) { if ( ! displayed ) { GREP << ri->first << " -- chr" << chromosomeName( si->chr ) << ":" << si->start << ".." << si->stop << " ( " << (si->stop - si->start ) / 1000.00 << "kb ) "; if ( par::make_set_border > 0 ) GREP << " including " << par::make_set_border/1000.00 << "kb border "; GREP << "\n\n" << setw(12) << "DIST" << " " << headerline << "\n"; displayed = true; } GREP << setw(12) << ari->second[l] << " " << ari->second[l+1] << "\n"; } } if ( ! displayed ) { if ( par::greport_display_empty ) { GREP << ri->first << " -- chr" << chromosomeName( si->chr ) << ":" << si->start << ".." << si->stop << " ( " << (si->stop - si->start ) / 1000.00 << "kb ) "; if ( par::make_set_border > 0 ) GREP << " including " << par::make_set_border/1000.00 << "kb border "; GREP << " { nothing to report }\n\n"; } } else GREP << "\n\n"; ++si; } ++ri; } RESIN.close(); GREP.close(); if ( ! par::silent ) cout << "\n"; printLOG("Writing per-range report to [ " + par::output_file_name + ".range.report ]\n"); shutdown(); } plink-1.07-src/assoc.cpp0000644000265600020320000016613311264127624014322 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include "plink.h" #include "fisher.h" #include "stats.h" #include "helper.h" #include "options.h" #include "crandom.h" #include "sets.h" #include "perm.h" #include "phase.h" using namespace std; extern ofstream LOG; //////////////////////////////////////// // Standard 2x2 allelic association tests // Genotypic, and quantitative trait association // Permutation (within-cluster) void Plink::calcAssociationWithPermutation(Perm & perm) { // SNP-major mode analyses? if (par::assoc_glm) { if (par::SNP_major) SNP2Ind(); } else if (!par::SNP_major) Ind2SNP(); ////////////////////////////// // Profile-based set test? // if ( par::set_score ) // pS->profileTestInitialise(); ////////////////////////////// // LD-clump within each set? 
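// In brief: within each set, SNPs whose single-SNP p-value passes
// --set-p are chosen greedily, skipping any SNP in LD (r-squared above
// --set-r2) with an already-chosen SNP, up to --set-max picks; the
// chosen SNPs are reported per set (NSIG/ISIG/SNPS columns below) and
// set-level significance comes from the permutation procedure.
// A schematic of the selection step only; snpPValue and pairwiseRsq
// are hypothetical helpers, and the real work is done by
// pS->makeLDSets() and pS->fitLDSetTest():
//
// vector<int> pickIndependentSNPs( const vector<int> & snps, // best first
//                                  double pThreshold,        // --set-p
//                                  double r2Threshold,       // --set-r2
//                                  int maxSNPs )             // --set-max
// {
//   vector<int> chosen;
//   for (int i=0; i<(int)snps.size() && (int)chosen.size()<maxSNPs; i++)
//     {
//       if ( snpPValue( snps[i] ) > pThreshold ) continue;   // not significant
//       bool independent = true;
//       for (int j=0; j<(int)chosen.size(); j++)
//         if ( pairwiseRsq( snps[i], chosen[j] ) > r2Threshold )
//           { independent = false; break; }                  // in LD with a pick
//       if ( independent ) chosen.push_back( snps[i] );
//     }
//   return chosen;
// }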
if ( par::set_test && par::set_r2 ) { printLOG("Performing LD-based set test, with parameters:\n"); printLOG(" r-squared (--set-r2) = " + dbl2str( par::set_r2_val ) + "\n" ); printLOG(" p-value (--set-p) = " + dbl2str( chiprobP(par::set_chisq_threshold,1) ) + "\n" ); printLOG(" max # SNPs (--set-max) = " + int2str( par::set_max ) + "\n" ); pS->makeLDSets(); } ////////////////////////////// // Step-wise set tests if ( par::set_step ) { vector_t r = pS->fitStepwiseModel(); shutdown(); } // Basic association testing results vector original; ////////////////////// // Empirical p-values //////////////////// // Number of tests int ntests = nl_all; if ( par::set_test && ( par::set_r2 || par::set_score ) ) ntests = pS->snpset.size(); perm.setTests(ntests); // Case/control missingness test statistics vector missing; // Observed marginals int aff; int unf; // Odds ratio vector odds(nl_all); ////////////////////////////////////////// // Working vectors for assoc_test_alt_perm vector a1; vector a2; vector a0; // Expected values for the 2x2 test vector exp_afffreq1; vector exp_afffreq2; vector exp_unffreq1; vector exp_unffreq2; if (par::assoc_test_alt_perm) { a1.resize(nl_all); a2.resize(nl_all); a0.resize(nl_all); exp_afffreq1.resize(nl_all); exp_afffreq2.resize(nl_all); exp_unffreq1.resize(nl_all); exp_unffreq2.resize(nl_all); } //////////////////////////////// // Set up permutation structure // (we need to perform this step // whether or not we also // subsequently permute) perm.setPermClusters(*this); perm.originalOrder(); //////////////////////////////////////////////////////// // Perform a test of missingness by case/control status if (par::test_missing && par::bt) original = testMiss(perm,true); ////////////////////////////////////////// // Calculate original association results else if (par::bt) { if (par::full_model_assoc) { //////////////////////////////////// // Full model case/control test original = fullModelAssoc(true,perm); } else if (par::assoc_glm) { /////////////////////////// // Logistic regression test original = glmAssoc(true,perm); } else if (par::CMH_test_1) { ////////////////////////////////////////// // 2 x 2 x K Cochran-Mantel-Haenszel test original = calcMantelHaenszel_2x2xK(perm, true); } else if ( par::test_hap_GLM ) { /////////////////////////////////// // Haplotypic GLM tests (logistic) string f = par::output_file_name + ".assoc.hap.logistic"; printLOG("Writing haplotype results to [ " + f + " ]\n"); haplo->HTEST.open(f.c_str(), ios::out); haplo->HTEST.precision(3); haplo->HTEST << setw(4) << "NSNP" << " " << setw(4) << "NHAP" << " " << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(par::pp_maxsnp) << "SNP2" << " "; if ( ! 
par::test_hap_GLM_omnibus ) { haplo->HTEST << setw(12) << "HAPLOTYPE" << " " << setw(8) << "F" << " " << setw(8) << "OR" << " "; } haplo->HTEST << setw(8) << "STAT" << " " << setw(8) << "P" << "\n"; original = haplo->phaseAllHaplotypes(true,perm); haplo->HTEST.close(); } else { ////////////////////////////////////// // Standard allelic case/control test original = testAssoc(aff,unf, a1,a2,a0, odds, exp_afffreq1, exp_afffreq2, exp_unffreq1, exp_unffreq2, perm, true); } } else if (par::qt) { if (par::assoc_glm) { /////////////////////////// // Linear regression test original = glmAssoc(true,perm); } else if ( par::test_hap_GLM ) { /////////////////////////// // Haplotypic GLM tests (linear) string f = par::output_file_name + ".assoc.hap.linear"; printLOG("Writing haplotype results to [ " + f + " ]\n"); haplo->HTEST.open(f.c_str(), ios::out); haplo->HTEST.precision(3); haplo->HTEST << setw(4) << "NSNP" << " " << setw(4) << "NHAP" << " " << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(par::pp_maxsnp) << "SNP2" << " "; if ( ! par::test_hap_GLM_omnibus ) { haplo->HTEST << setw(12) << "HAPLOTYPE" << " " << setw(8) << "F" << " " << setw(8) << "BETA" << " "; } haplo->HTEST << setw(8) << "STAT" << " " << setw(8) << "P" << "\n"; original = haplo->phaseAllHaplotypes(true,perm); haplo->HTEST.close(); } else { //////////////////////////////////// // Quantitative trait regression original = testQAssoc(true,perm); } } // If we didn't know how many values to expect back, // resize now (i.e. from haplotype tests) if ( par::test_hap_GLM ) { ntests = original.size(); perm.setTests(ntests); } //////////////////////////// // Display corrected p-values? string f0 = ".assoc"; if (par::test_missing && par::bt) f0 = ".missing"; else if (par::qt && !par::assoc_glm) f0 = ".qassoc"; else if (par::bt && par::assoc_glm) f0 += ".logistic"; else if (par::qt && par::assoc_glm) f0 += ".linear"; else if (par::CMH_test_1) f0 = ".cmh"; else if (par::test_hap_GLM) f0 += par::bt ? ".hap.logistic" : ".hap.linear"; else if (par::full_model_assoc) { if (par::model_perm_best && par::permute) { f0 = ".model.best"; printLOG("Using BEST of ALLELIC, DOM and REC for --model permutation\n"); } else if (par::model_perm_gen && par::permute) {f0 = ".model.gen"; printLOG("Using GENO for --model permutation\n"); } else if (par::model_perm_dom) {f0 = ".model.dom"; printLOG("Using DOM for --model permutation/adjusted p-values\n"); } else if (par::model_perm_rec) {f0 = ".model.rec"; printLOG("Using REC for --model permutation/adjusted p-values\n"); } else if (par::model_perm_trend) {f0 = ".model.trend"; printLOG("Using CA-trend test for --model permutation/adjusted p-values\n"); } } if (par::fisher_test ) f0 += ".fisher"; // Profile-based set-test? if ( par::set_test && par::set_r2 ) f0 += ".set"; else if ( par::set_test && par::set_score ) f0 += ".set.score"; // Assumes we have 1 df chi-square statistics if (par::multtest) { vector obp(0); if ( par::fisher_test ) { for (int l=0; l 0 && realnum(original[l]) ? inverse_chiprob( 1-original[l] , 1) : -9; obp.push_back( chi ) ; } } else { for (int l=0; l setsigsize; if (par::set_test) { if ( par::set_r2 || par::set_score ) { // Score... 
// original = pS->profileTestScore(); original = pS->fitLDSetTest(original,true); // ...and save # of significant SNPs setsigsize.clear(); for (int i=0; iprofileSNPs.size(); i++) setsigsize.push_back( pS->s_min[i] ); } else pS->cumulativeSetSum_WITHLABELS(*this,original); } ///////////////////////////// // Ordered/rank permutation? if (par::mperm_rank) perm.setOriginalRanking(original); ////////////////////// // Verbose dumping? if (par::mperm_save_all) printLOG("Dumping all permutation statistics to [ " + par::output_file_name+".mperm.dump.all ]\n"); else if (par::mperm_save_best) printLOG("Dumping best permutation statistics to [ " + par::output_file_name+".mperm.dump.best ]\n"); ////////////////////// // Begin permutations bool finished = par::replicates == 0 ? true : false; while(!finished) { // Store permuted results vector pr(ntests); if (par::perm_genedrop) { if (par::perm_genedrop_and_swap) perm.permuteInCluster(); perm.geneDrop(); } else perm.permuteInCluster(); if (par::test_missing) pr = testMiss(perm,false); else if ((!par::assoc_test_alt_perm) || par::qt || par::full_model_assoc || par::CMH_test_1 || par::assoc_glm) { if (par::qt) { if (par::assoc_glm) pr = glmAssoc(false,perm); else if ( par::test_hap_GLM ) pr = haplo->phaseAllHaplotypes(false,perm); else pr = testQAssoc(false,perm); } else if (par::full_model_assoc) pr = fullModelAssoc(false,perm); else if (par::assoc_glm) pr = glmAssoc(false,perm); else if (par::CMH_test_1) pr = calcMantelHaenszel_2x2xK(perm, false); else if ( par::test_hap_GLM ) pr = haplo->phaseAllHaplotypes(false,perm); else pr = testAssoc(aff,unf, a1,a2,a0, odds, exp_afffreq1, exp_afffreq2, exp_unffreq1, exp_unffreq2, perm, false); } else { ///////////////////////// // For binary traits only // ------------- // | A | B | E | aff // ------------- // | C | D | F | unf // ------------- // a1 a2 a0 // a1 most likely to be common, followed by a2, then a0 // save aff+unf (do not alter by locus) // and a1,a2,a0 marginals (which do alter by locus) // then we only need count A and B in each subsequent replicate: // int A, B, C, D, M; /////////////////////////////// // Iterate over SNPs vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // In adaptive mode, possibly skip this test if (par::adaptive_perm && (!perm.snp_test[l])) { s++; l++; continue; } ///////////////// // clear counts D=M=0; ///////////////// // Autosomal or haploid? bool X=false, haploid=false; if (par::chr_sex[locus[l]->chr]) X=true; else if (par::chr_haploid[locus[l]->chr]) haploid=true; ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { // Phenotype for this person (i.e. might be permuted) Individual * pperson = (*gperson)->pperson; // SNP alleles bool s1 = *i1; bool s2 = *i2; if ( ! pperson->missing ) { if (! 
pperson->aff ) // unaffected { if ( haploid || ( X && (*gperson)->sex ) ) { if ( s2 ) D++; // (hemi, one count) else if ( s1 ) M++; // (missing, one count) } else { if ( s2 ) { if (!s1) D++; // (het, one A count) else D+=2; // (hom, two B count) } else if ( s1 ) M+=2; // (missing, two B count) } } } // Next individual gperson++; i1++; i2++; } // reconstruct rest of 2x2 table C = unf - D - M; A = a1[l] - C; B = a2[l] - D; pr[l] = ( (A - exp_afffreq1[l]) * ( A - exp_afffreq1[l] ) ) / exp_afffreq1[l] + ( (C - exp_unffreq1[l]) * ( C - exp_unffreq1[l] ) ) / exp_unffreq1[l] + ( (B - exp_afffreq2[l]) * ( B - exp_afffreq2[l] ) ) / exp_afffreq2[l] + ( (D - exp_unffreq2[l]) * ( D - exp_unffreq2[l] ) ) / exp_unffreq2[l]; // Next SNP s++; l++; } } ////////////////////// // Make sets? if (par::set_test) { if ( par::set_r2 ) pr = pS->fitLDSetTest(pr,false); else if ( par::set_score ) pr = pS->profileTestScore(); else pS->cumulativeSetSum_WITHOUTLABELS(pr,perm.current_reps()+1); } //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } // next permutation if (!par::silent) cout << "\n\n"; ///////////////////////////////////////////////////// // // // Calculate SET-based empirical p-values // // // ///////////////////////////////////////////////////// if (par::set_test && ! (par::set_r2 || par::set_score) ) { printLOG("Calculating empirical SET-based p-values\n"); pS->empiricalSetPValues(); } ///////////////////////////////////////////////////// // // // Display basic permutation results // // // ///////////////////////////////////////////////////// ofstream ASC; string f; if (par::adaptive_perm) f = par::output_file_name + f0 + ".perm"; else f = par::output_file_name + f0 + ".mperm"; ASC.open(f.c_str(),ios::out); ASC.precision(4); printLOG("Writing permutation association results to [ " + f + " ] \n"); if ( par::test_hap_GLM ) ASC << setw(10) << "TEST" << " "; else if ( ! ( par::set_score || par::set_r2 ) ) { ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp)<< "SNP" << " "; } else { ASC << setw(12) << "SET" << " " << setw(6) << "NSNP" << " " << setw(6) << "NSIG" << " " << setw(6) << "ISIG" << " "; } ASC << setw(12) << "EMP1" << " "; if ( !par::set_r2 ) { if (par::adaptive_perm) ASC << setw(12)<< "NP" << " "; else if ( par::mperm_rank ) ASC << setw(12)<< "EMP3" << " " << setw(12)<< "RANK" << " "; else ASC << setw(12)<< "EMP2" << " "; } else ASC << "SNPS"; ASC << "\n"; vector pv(0); for (int l=0; l par::pfvalue ) continue; if ( par::test_hap_GLM ) { ASC << setw(10) << ("T"+int2str(l)) << " "; } else if ( ! (par::set_score || par::set_r2 ) ) { ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; } else { ASC << setw(12) << setname[l] << " " << setw(6) << pS->snpset[l].size() << " " << setw(6) << pS->numSig[l] << " " << setw(6) << pS->selectedSNPs[l].size() << " "; } // All tests are 1 df double p = chiprobP(original[l],1); // ... except 2df genotypic test if ( par::model_perm_gen ) p = chiprobP(original[l],2); if (par::multtest) pv.push_back(p); ASC << setw(12) << perm.pvalue(l) << " "; if ( ! 
par::set_r2 ) { if (par::adaptive_perm) ASC << setw(12) << perm.reps_done(l) << " "; else if ( par::mperm_rank ) ASC << setw(12) << perm.max_pvalue(l) << " " << setw(12) << perm.rank(l) << " "; else ASC << setw(12) << perm.max_pvalue(l) << " "; } else { if ( pS->selectedSNPs[l].size() == 0 ) ASC << "NA"; else for (int j=0; jselectedSNPs[l].size(); j++) { ASC << locus[ snpset[l][pS->selectedSNPs[l][j]] ]->name; if ( j < pS->selectedSNPs[l].size() - 1 ) ASC << "|"; } } ASC << "\n"; } ASC.close(); //////////////////////////////////////////////////////// // // // Display SET-based results (sum statistics) // // // //////////////////////////////////////////////////////// if (par::set_test && ! (par::set_r2 || par::set_score) ) { f = par::output_file_name + f0 + ".set"; ASC.open(f.c_str(),ios::out); printLOG("Writing set-based association results to [ " + f + " ] \n"); ASC.clear(); ASC << setw(12) << "SET" << " " << setw(6) << "NSNP" << " " << setw(6) << "S" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "T" << " " << setw(12) << "P_0" << " " << setw(12) << "P_1" << " " << setw(12) << "P_2" << " " << "\n"; for (int i=0;ipv_set.size();i++) { if ( pS->pv_set[i].size()>0 ) ASC << "\n"; for (int j=0;jpv_set[i].size();j++) { ASC << setw(12) << setname[i] << " " << setw(6) << pS->snpset[i].size() << " " << setw(6) << string("S"+int2str(j+1+pS->s_min[i])) << " " << setw(par::pp_maxsnp) << pS->setsort[i][j] << " " << setw(12) << pS->stat_set[i][j][0] << " " << setw(12) << pS->pv_set[i][j][0] << " " << setw(12) << pS->pv_maxG_set[i][j]/(par::replicates+1) << " " << setw(12) << pS->pv_maxE_set[i][j]/(par::replicates+1) << "\n"; } } ASC.close(); } // Call destructor to close PDUMP file perm.closeDUMP(); } ///////////////////////////////////////////////// // Basic C/C association test vector Plink::testAssoc(int & aff, int & unf, vector & a1, vector & a2, vector & a0, vector & odds, vector & exp_afffreq1, vector & exp_afffreq2, vector & exp_unffreq1, vector & exp_unffreq2, Perm & perm, bool display) { ofstream ASC; if (display) { //////////////////// // Display results string f = par::output_file_name + ".assoc"; if ( par::fisher_test ) f += ".fisher"; ASC.open(f.c_str(),ios::out); ASC.precision(4); printLOG("Writing main association results to [ " + f + " ] \n"); ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp)<< "SNP" << " " << setw(10) << "BP" << " " << setw(4) << "A1" << " "; if ( par::assoc_counts ) ASC << setw(8) << "C_A" << " " << setw(8) << "C_U" << " "; else ASC << setw(8) << "F_A" << " " << setw(8) << "F_U" << " "; ASC << setw(4) << "A2" << " "; if ( ! 
par::fisher_test ) ASC << setw(12)<< "CHISQ" << " "; ASC << setw(12)<< "P" << " " << setw(12)<< "OR" << " " ; if (par::display_ci) ASC << setw(12)<< "SE" << " " << setw(12) << string("L"+dbl2str(par::ci_level*100)) << " " << setw(12) << string("U"+dbl2str(par::ci_level*100)) << " "; ASC << "\n"; } vector original(nl_all); /////////////////////////////// // Iterate over SNPs vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // In adaptive mode, possibly skip this test if (par::adaptive_perm && (!perm.snp_test[l])) { s++; l++; continue; } int A1 = 0, A2 = 0, A0 = 0; int U1 = 0, U2 = 0, U0 = 0; bool X=false, haploid=false; if (par::chr_sex[locus[l]->chr]) X=true; else if (par::chr_haploid[locus[l]->chr]) haploid=true; ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { // Phenotype for this person (i.e. might be permuted) Individual * pperson = (*gperson)->pperson; // Is this individual missing? if ( pperson->missing ) { // Next person gperson++; i1++; i2++; continue; } // SNP alleles bool s1 = *i1; bool s2 = *i2; // Type of marker if ( haploid || ( X && (*gperson)->sex ) ) { ///////////////////////////////////// // Haploid marker (or male X) if (pperson->aff) // if affected { if (!s1) { if (!s2) A1++; } else { if ( s2 ) A2++; else A0++; } } else // unaffected if not missing { if (!s1) { if (!s2) U1++; } else { if ( s2) U2++; else U0++; } } } else { ///////////////////////////////////// // Autosomal marker if (pperson->aff) // if affected { if (!s1) { if (!s2) A1+=2; else { A1++; A2++; } } else { if ( s2 ) A2+=2; else A0+=2; } } else // unaffected if not missing { if (!s1) { if (!s2) U1+=2; else { U1++; U2++; } } else { if ( s2 ) U2+=2; else U0+=2; } } } // Next person gperson++; i1++; i2++; } // Calculate standard association statistic // Total number of alleles double tot = A1+A2+U1+U2; double pvalue; // Total number of non-missing affecteds/unaffecteds if ( par::fisher_test ) { table_t t; sizeTable(t,2,2); t[0][0] = A1; t[0][1] = U1; t[1][0] = A2; t[1][1] = U2; pvalue = fisher(t); original[l] = 1 - pvalue; } else if (!par::assoc_test_alt_perm) { double Taff = A1+A2; double Tunf = U1+U2; double Ta1 = A1+U1; double Ta2 = A2+U2; double Texp_afffreq1 = (Taff*Ta1)/tot; double Texp_unffreq1 = (Tunf*Ta1)/tot; double Texp_afffreq2 = (Taff*Ta2)/tot; double Texp_unffreq2 = (Tunf*Ta2)/tot; original[l] = ((A1 - Texp_afffreq1) * (A1 - Texp_afffreq1)) / Texp_afffreq1 + ( (U1 - Texp_unffreq1) * ( U1 - Texp_unffreq1 ) ) / Texp_unffreq1 + ( (A2 - Texp_afffreq2) * ( A2 - Texp_afffreq2 ) ) / Texp_afffreq2 + ( (U2 - Texp_unffreq2) * ( U2 - Texp_unffreq2 ) ) / Texp_unffreq2 ; } else // if (par::assoc_test_alt_perm) { // Total number of non-missing affecteds/unaffecteds aff = A1+A2; unf = U1+U2; a1[l] = A1+U1; a2[l] = A2+U2; a0[l] = A0+U0; exp_afffreq1[l] = (aff*a1[l])/tot; exp_unffreq1[l] = (unf*a1[l])/tot; exp_afffreq2[l] = (aff*a2[l])/tot; exp_unffreq2[l] = (unf*a2[l])/tot; // Include missing alleles for final marginal values aff += A0; unf += U0; original[l] = ((A1 - exp_afffreq1[l]) * (A1 - exp_afffreq1[l])) / exp_afffreq1[l] + ( (U1 - exp_unffreq1[l]) * ( U1 - exp_unffreq1[l] ) ) / exp_unffreq1[l] + ( (A2 - exp_afffreq2[l]) * ( A2 - exp_afffreq2[l] ) ) / exp_afffreq2[l] + ( (U2 - exp_unffreq2[l]) * ( U2 - exp_unffreq2[l] ) ) / exp_unffreq2[l] ; } //////////////////////////////////////////////////// // Do we need to 
calculate odds ratio and p-value? // (Either for display (of original data) or for // other reasons (profile set-based test) if ( display || par::set_score ) { // Note: in set-score mode, use an adjusted form // of the odds-ratio, to ensure it is always valud if ( par::set_score ) odds[l] = (double)( (A1+0.5)*(U2+0.5) ) / (double)( (U1+0.5)*(A2+0.5) ) ; else { // with v. large sample N, better to use: ad/bc = a/b * d/c //odds[l] = (double)( A1*U2 ) / (double)( U1*A2 ) ; odds[l] = ( (double)A1 / (double)A2 ) * ( (double)U2 / (double)U1 ) ; } if ( ! par::fisher_test ) pvalue = chiprobP(original[l],1); } if ( par::set_score ) { if ( pvalue <= par::set_score_p && pvalue >= 0 ) pS->profileTestSNPInformation( l, log(odds[l]) * -log10( pvalue ) ); } if (display) { // Skip?, if filtering p-values if ( par::pfilter && ( pvalue > par::pfvalue || pvalue < 0 ) ) goto skip_p1; // Now display results for this SNP ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " "; if ( A1+A2 != 0 ) { if ( par::assoc_counts ) ASC << setw(8) << A1 << " "; else ASC << setw(8) << (double)A1/(double)(A1+A2) << " "; } else ASC << setw(8) << "NA" << " "; if ( U1+U2 != 0 ) { if ( par::assoc_counts ) ASC << setw(8) << U1 << " "; else ASC << setw(8) << (double)U1/(double)(U1+U2) << " "; } else ASC << setw(8) << "NA" << " "; ASC << setw(4) << locus[l]->allele2 << " " ; if ( par::fisher_test ) { if ( pvalue > -1 ) ASC << setw(12) << pvalue << " "; else ASC << setw(12) << "NA" << " "; } else { if ( pvalue > -1 ) ASC << setw(12) << original[l] << " " << setw(12) << pvalue << " "; else ASC << setw(12) << "NA" << " " << setw(12) << "NA" << " "; } double zero=0; if (odds[l] != odds[l] || odds[l] == 1/zero || odds[l] == -1/zero ) { ASC << setw(12) << "NA" << " "; if (par::display_ci) ASC << setw(12) << "NA" << " " << setw(12) << "NA" << " " << setw(12) << "NA" << " "; } else { ASC << setw(12) << odds[l] << " " ; if (par::display_ci) { double lOR = log(odds[l]); double SE = sqrt(1/(double)A1 + 1/(double)A2 + 1/(double)U1 + 1/(double)U2); double OR_lower = exp( lOR - par::ci_zt * SE ); double OR_upper = exp( lOR + par::ci_zt * SE ); ASC << setw(12) << SE << " " << setw(12) << OR_lower << " " << setw(12) << OR_upper << " "; } } ASC << "\n"; } skip_p1: // Next SNP s++; l++; } if (display) ASC.close(); return original; } ///////////////////////////////////////////// // Full model association tests vector Plink::fullModelAssoc(bool print_results, Perm & perm) { if (print_results) printLOG("Full-model association tests, minimum genotype count: --cell " + int2str(par::min_geno_cell) + "\n"); vector results(nl_all); ofstream ASC; if (print_results) { string f = par::output_file_name + ".model"; ASC.open(f.c_str(),ios::out); printLOG("Writing full model association results to [ " + f + " ] \n"); ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(8) << "TEST" << " " << setw(14) << "AFF" << " " << setw(14) << "UNAFF" << " "; if ( ! 
par::fisher_test ) ASC << setw(12) << "CHISQ" << " " << setw(4) << "DF" << " "; ASC << setw(12) << "P" << "\n"; ASC.precision(4); } /////////////////////////////// // Iterate over SNPs vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // In adaptive mode, possibly skip this test if (par::adaptive_perm && (!perm.snp_test[l])) { s++; l++; continue; } int A11=0, A12=0, A22=0; int U11=0, U12=0, U22=0; //////////////////////// // Autosomal or haploid? bool X=false, haploid=false; if (par::chr_sex[locus[l]->chr]) X=true; else if (par::chr_haploid[locus[l]->chr]) haploid=true; // Skip haploid markers if (haploid) { s++; l++; continue; } ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { // Phenotype for this person (i.e. might be permuted) Individual * pperson = (*gperson)->pperson; // SNP alleles bool s1 = *i1; bool s2 = *i2; if ( ! pperson->missing ) { // Only consider diploid chromosomes if ( ! ( X && (*gperson)->sex ) ) { if ( pperson->aff ) // cases { if ( ! s1 ) { if ( ! s2 ) // Homozyg 00 A11++; else // Hetero 01 A12++; } else if ( s2 ) // Homozyg 11 A22++; } else { if ( !s1 ) { if ( !s2 ) // Homozyg 00 U11++; else // Hetero 01 U12++; } else if ( s2 ) // Homozyg 11 U22++; } } } // Next individual gperson++; i1++; i2++; } /////////////////////////////////// // Calculate association statistics double obs_A = A11 + A12 + A22; double obs_U = U11 + U12 + U22; double obs_T = obs_A + obs_U; double obs_1 = 2*(A11+U11) + A12 + U12; double obs_2 = 2*(A22+U22) + A12 + U12; double obs_11 = A11+U11; double obs_12 = A12+U12; double obs_22 = A22+U22; bool invalid = false; if (A11 < par::min_geno_cell || A12 < par::min_geno_cell || A22 < par::min_geno_cell) invalid = true; else if (U11 < par::min_geno_cell || U12 < par::min_geno_cell || U22 < par::min_geno_cell) invalid = true; if ( par::trend_only ) invalid = true; /////////////////////// // Cochram-Armitage Trend test double CA = ( ( obs_U / obs_T * A12 ) - ( obs_A / obs_T * U12 ) ) + 2*( ( obs_U / obs_T * A22 ) - ( obs_A / obs_T * U22 ) ) ; double varCA = obs_A * obs_U * ( ( obs_T * ( obs_12 + 4*obs_22 ) - ( obs_12+2*obs_22 ) * ( obs_12+2*obs_22 ) ) / (obs_T * obs_T * obs_T )) ; double CA_chisq = (CA*CA) / varCA; double CA_p = chiprobP(CA_chisq,1); double mult_p, mult_chisq; /////////////////////// // Multiplicative model double obs_A1 = 2*A11 + A12; double obs_A2 = 2*A22 + A12; double obs_U1 = 2*U11 + U12; double obs_U2 = 2*U22 + U12; if ( par::fisher_test ) { table_t t; sizeTable(t,2,2); t[0][0] = (int)obs_A1; t[1][0] = (int)obs_A2; t[0][1] = (int)obs_U1; t[1][1] = (int)obs_U2; mult_p = fisher(t); } else { double exp_A1 = (obs_A * obs_1 ) / obs_T; // note 2's cancelled for obs_A and obs_T double exp_A2 = (obs_A * obs_2 ) / obs_T; // which are counts of individuals, not double exp_U1 = (obs_U * obs_1 ) / obs_T; // alleles double exp_U2 = (obs_U * obs_2 ) / obs_T; mult_chisq = ( ( obs_A1 - exp_A1 ) * ( obs_A1 - exp_A1 ) ) / exp_A1 + ( ( obs_A2 - exp_A2 ) * ( obs_A2 - exp_A2 ) ) / exp_A2 + ( ( obs_U1 - exp_U1 ) * ( obs_U1 - exp_U1 ) ) / exp_U1 + ( ( obs_U2 - exp_U2 ) * ( obs_U2 - exp_U2 ) ) / exp_U2; /////////////////////// // Multiplicative model mult_p = chiprobP(mult_chisq,1); } double gen_p, dom_p, rec_p; gen_p = dom_p = rec_p = -9; double dom_chisq, rec_chisq, gen_chisq; if (!invalid) { ////////////////////////////////////////////////////////////// // Standard 
chi-square test, or Fisher's exact if ( par::fisher_test ) { //////////// // General table_t t; sizeTable(t,3,2); t[0][0] = A11; t[1][0] = A12; t[2][0] = A22; t[0][1] = U11; t[1][1] = U12; t[2][1] = U22; gen_p = fisher(t); //////////// // Dominant sizeTable(t,2,2); t[0][0] = A11+A12; t[1][0] = A22; t[0][1] = U11+U12; t[1][1] = U22; dom_p = fisher(t); ///////////// // Recessive sizeTable(t,2,2); t[0][0] = A11; t[1][0] = A12+A22; t[0][1] = U11; t[1][1] = U12+U22; rec_p = fisher(t); } else { /////////////////////// // General model double exp_A11 = (obs_A * obs_11 ) / obs_T; double exp_A12 = (obs_A * obs_12 ) / obs_T; double exp_A22 = (obs_A * obs_22 ) / obs_T; double exp_U11 = (obs_U * obs_11 ) / obs_T; double exp_U12 = (obs_U * obs_12 ) / obs_T; double exp_U22 = (obs_U * obs_22 ) / obs_T; gen_chisq = ( ( A11 - exp_A11 ) * ( A11 - exp_A11 ) ) / exp_A11 + ( ( A12 - exp_A12 ) * ( A12 - exp_A12 ) ) / exp_A12 + ( ( A22 - exp_A22 ) * ( A22 - exp_A22 ) ) / exp_A22 + ( ( U11 - exp_U11 ) * ( U11 - exp_U11 ) ) / exp_U11 + ( ( U12 - exp_U12 ) * ( U12 - exp_U12 ) ) / exp_U12 + ( ( U22 - exp_U22 ) * ( U22 - exp_U22 ) ) / exp_U22; /////////////////////// // Dominant (minor allele) (1) model dom_chisq = ( ( (A11+A12) - (exp_A11+exp_A12) ) * ( (A11+A12) - (exp_A11+exp_A12) ) ) / (exp_A11+exp_A12) + ( ( A22 - exp_A22 ) * ( A22 - exp_A22 ) ) / exp_A22 + ( ( (U11+U12) - (exp_U11+exp_U12) ) * ( (U11+U12) - (exp_U11+exp_U12) ) ) / (exp_U11+exp_U12) + ( ( U22 - exp_U22 ) * ( U22 - exp_U22 ) ) / exp_U22; ////////////////////////////////////// // Recessive (minor allele) (1) model rec_chisq = ( ( (A22+A12) - (exp_A22+exp_A12) ) * ( (A22+A12) - (exp_A22+exp_A12) ) ) / (exp_A22+exp_A12) + ( ( A11 - exp_A11 ) * ( A11 - exp_A11 ) ) / exp_A11 + ( ( (U22+U12) - (exp_U22+exp_U12) ) * ( (U22+U12) - (exp_U22+exp_U12) ) ) / (exp_U22+exp_U12) + ( ( U11 - exp_U11 ) * ( U11 - exp_U11 ) ) / exp_U11; ////////////////////////////////// // p-values and model comparisons gen_p = chiprobP(gen_chisq,2); dom_p = chiprobP(dom_chisq,1); rec_p = chiprobP(rec_chisq,1); } } //////////////////////////////////////////// // Save best p-value for permutation test ////////////////////////// // Save the desired result int best = 0 ; if (par::model_perm_best) { double best_p = mult_p; if (!invalid) { // Skip general model (i.e. just compare ALLELIC, DOM, REC //if (gen_p < best_p && gen_p >= 0 ) { best = 2; best_p = gen_p; } // general if (dom_p < best_p && dom_p >= 0 ) { best_p = dom_p; } // dom if (rec_p < best_p && rec_p >= 0 ) { best_p = rec_p; } // rec } results[l] = 1-best_p; } else if ( par::model_perm_gen ) results[l] = gen_p >= 0 ? 1-gen_p : -9 ; else if ( par::model_perm_dom ) results[l] = dom_p >= 0 ? 1-dom_p : -9 ; else if ( par::model_perm_rec ) results[l] = rec_p >= 0 ? 1-rec_p : -9; else if ( par::model_perm_trend ) results[l] = CA_p >= 0 ? CA_chisq : -9; if (print_results) { ///////////// // Genotypic if ( ! par::trend_only ) { ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "GENO" << " " << setw(14) << int2str(A11)+"/"+int2str(A12)+"/"+int2str(A22) << " " << setw(14) << int2str(U11)+"/"+int2str(U12)+"/"+int2str(U22) << " " ; if (gen_p < -1) { if ( ! par::fisher_test ) ASC << setw(12) << "NA" << " " << setw(4) << "NA" << " "; ASC << setw(12) << "NA" << "\n" ; } else { if ( ! 
par::fisher_test ) ASC << setw(12) << gen_chisq << " " << setw(4) << "2" << " "; ASC << setw(12) << gen_p << "\n"; } } ///////////////// // CA trend test ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "TREND" << " " << setw(14) << int2str(A11*2+A12)+"/"+int2str(A12+A22*2) << " " << setw(14) << int2str(U11*2+U12)+"/"+int2str(U12+U22*2) << " "; if (CA_p < -1) { if ( ! par::fisher_test ) ASC << setw(12) << "NA" << " " << setw(4) << "NA" << " "; ASC << setw(12) << "NA" << "\n" ; } else { if ( ! par::fisher_test ) ASC << setw(12) << CA_chisq << " " << setw(4) << "1" << " "; ASC << setw(12) << CA_p << "\n" ; } if ( ! par::trend_only ) { ///////////// // Allelic ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "ALLELIC" << " " << setw(14) << int2str(A11*2+A12)+"/"+int2str(A12+A22*2) << " " << setw(14) << int2str(U11*2+U12)+"/"+int2str(U12+U22*2) << " "; if (mult_p < -1) { if ( ! par::fisher_test ) ASC << setw(12) << "NA" << " " << setw(4) << "NA" << " "; ASC << setw(12) << "NA" << "\n" ; } else { if ( ! par::fisher_test ) ASC << setw(12) << mult_chisq << " " << setw(4) << "1" << " "; ASC << setw(12) << mult_p << "\n" ; } ///////////// // Dominant ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "DOM" << " " << setw(14) << int2str(A11+A12)+"/"+int2str(A22) << " " << setw(14) << int2str(U11+U12)+"/"+int2str(U22) << " "; if (dom_p < -1) { if ( ! par::fisher_test ) ASC << setw(12) << "NA" << " " << setw(4) << "NA" << " "; ASC << setw(12) << "NA" << "\n" ; } else { if ( ! par::fisher_test ) ASC << setw(12) << dom_chisq << " " << setw(4) << "1" << " "; ASC << setw(12) << dom_p << "\n" ; } ///////////// // Recessive ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "REC" << " " << setw(14) << int2str(A11)+"/"+int2str(A12+A22) << " " << setw(14) << int2str(U11)+"/"+int2str(U12+U22) << " "; if (rec_p < -1) { if ( ! par::fisher_test ) ASC << setw(12) << "NA" << " " << setw(4) << "NA" << " "; ASC << setw(12) << "NA" << "\n" ; } else { if ( ! 
par::fisher_test ) ASC << setw(12) << rec_chisq << " " << setw(4) << "1" << " "; ASC << setw(12) << rec_p << "\n" ; } } } // Next SNP s++; l++; } if (print_results) { ASC.close(); } return results; } ///////////////////////////////////////////// // Simple quantitative trait association test // note: does not explicitly treat X/haploid // markers differently: see --linear vector Plink::testQAssoc(bool print_results , Perm & perm ) { vector results(nl_all); if ( print_results && par::multtest ) tcnt.resize(nl_all); ofstream ASC, QT_MEANS; if (print_results) { string f = par::output_file_name + ".qassoc"; printLOG("Writing QT association results to [ " + f + " ] \n"); ASC.open(f.c_str(),ios::out); ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "BP" << " " << setw(8) << "NMISS" << " " << setw(10) << "BETA" << " " << setw(10) << "SE" << " " << setw(10) << "R2" << " " << setw(8) << "T" << " " << setw(12) << "P" << " " << "\n"; ASC.precision(4); if ( par::qt_means ) { string f = par::output_file_name + ".qassoc.means"; printLOG("Writing QT genotypic means to [ " + f + " ] \n"); QT_MEANS.open(f.c_str(),ios::out); QT_MEANS.precision(4); QT_MEANS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(6) << "VALUE" << " " << setw(8) << "G11" << " " << setw(8) << "G12" << " " << setw(8) << "G22" << "\n"; } } //////////////////////////// // Iterate over each locus vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // Skip possibly if (par::adaptive_perm && !perm.snp_test[l]) { l++; s++; continue; } double g_mean=0, g_var=0; double qt_mean=0, qt_var=0; double qt_g_covar=0; // number of individuals in analysis int nanal = 0; ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { // Phenotype for this person (i.e. might be permuted) Individual * pperson = (*gperson)->pperson; // SNP alleles bool s1 = *i1; bool s2 = *i2; if (!pperson->missing) { if ( ! ( s1 && (!s2) ) ) // 10 = missing { qt_mean += pperson->phenotype; if (!s1) { if (!s2) // 00 = hom(11) g_mean+=2; else // 01 = het(12) g_mean++; } nanal++; } } // Advance to the next person (for phenotype information // and the two SNP alleles also) gperson++; i1++; i2++; } qt_mean /= (double)nanal; g_mean /= (double)nanal; ////////////////////////////////// // Iterate over individuals again i1 = (*s)->one.begin(); i2 = (*s)->two.begin(); gperson = sample.begin(); while ( gperson != sample.end() ) { // Phenotype for this person (i.e. might be permuted) Individual * pperson = (*gperson)->pperson; // SNP alleles bool s1 = *i1; bool s2 = *i2; if (!pperson->missing) { if ( ! ( (s1) && (!s2) ) ) { qt_var += (pperson->phenotype-qt_mean) * ( pperson->phenotype-qt_mean ) ; double g = 0; if (!s1) { if (!s2) g=2; else g=1; } g_var += (g-g_mean) * ( g-g_mean ) ; qt_g_covar += ( pperson->phenotype - qt_mean ) * ( g - g_mean ) ; } } // Advance to the next person gperson++; i1++; i2++; } // Summary statistics qt_var /= (double)nanal - 1; g_var /= (double)nanal - 1; qt_g_covar /= (double)nanal - 1; // Test statistics double beta = qt_g_covar / g_var; double vbeta = ( qt_var/g_var - (qt_g_covar*qt_g_covar)/(g_var*g_var) ) / (nanal-2); double t = beta / sqrt(vbeta); double t_p; // Display results? 
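// The summary statistics above are the closed-form simple regression of the
// phenotype on the 0/1/2 genotype score:
//
//   beta    = cov(g,y) / var(g)
//   V(beta) = [ var(y)/var(g) - cov(g,y)^2/var(g)^2 ] / (n-2)
//   t       = beta / sqrt(V(beta)) on n-2 df
//   r^2     = cov(g,y)^2 / ( var(y) * var(g) )
//
// A self-contained sketch of the same computation from the three sample
// moments (the function name and signature are illustrative only, and not
// used elsewhere):
//
//   double qassocT( double covGY, double varG, double varY, int n,
//                   double & beta, double & se, double & r2 )
//   {
//     beta         = covGY / varG;
//     double vbeta = ( varY/varG - (covGY*covGY)/(varG*varG) ) / ( n - 2 );
//     se           = sqrt( vbeta );
//     r2           = ( covGY * covGY ) / ( varY * varG );
//     return beta / se;                  // t statistic, n-2 df
//   }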
if (print_results) { // double wald = (beta*beta) / ( vbeta ) ; // double wald_p = chiprobP(wald,1); t_p = pT(t,nanal-2); double r2 = (qt_g_covar * qt_g_covar ) / ( qt_var * g_var ) ; // double lrt = -nanal * log(1-r2); // Skip?, if filtering p-values if ( par::pfilter && ( t_p > par::pfvalue || t_p < 0 ) ) goto skip_p2; ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << locus[l]->bp << " " << setw(8) << nanal << " "; if ( ! realnum(beta) ) { ASC << setw(10) << "NA" << " " << setw(10) << "NA" << " " << setw(10) << "NA" << " " ; } else { ASC << setw(10) << beta << " " << setw(10) << sqrt(vbeta) << " " << setw(10) << r2 << " " ; } if (t_p >= 0) ASC << setw(8) << t << " " << setw(12) << t_p << " " ; else ASC << setw(8) << "NA" << " " << setw(12) << "NA" << " " ; ASC << "\n"; if ( par::qt_means ) displayQTMeans(QT_MEANS, l); } skip_p2: // Store chi-sq results[l] = t*t; // Store original p-value (for --adjust) if ( print_results && par::multtest ) { tcnt[l] = nanal-2; } // Next SNP s++; l++; } if (print_results) { ASC.close(); if ( par::qt_means ) QT_MEANS.close(); } return results; } ///////////////////////////////////////////// // For a given SNP, calculate genotypic mean, // frequency and variance, and display void Plink::displayQTMeans(ofstream & QT_MEANS, int l) { vector::iterator s = SNP.begin()+ l; double g11=0, g12=0, g22=0; double x11=0, x12=0, x22=0; double xx11=0, xx12=0, xx22=0; ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator person = sample.begin(); while ( person != sample.end() ) { // SNP alleles bool s1 = *i1; bool s2 = *i2; if ( ! (*person)->missing ) { if ( ! s1 ) { if ( ! s2 ) { g11++; x11 += (*person)->phenotype; xx11 += (*person)->phenotype * (*person)->phenotype; } else { g12++; x12 += (*person)->phenotype; xx12 += (*person)->phenotype * (*person)->phenotype; } } else { if ( s2 ) { g22++; x22 += (*person)->phenotype; xx22 += (*person)->phenotype * (*person)->phenotype; } } } person++; i1++; i2++; } double nanal = g11 + g12 + g22; x11 /= g11; x12 /= g12; x22 /= g22; xx11 /= g11; xx12 /= g12; xx22 /= g22; double sd11 = g11>1 ? sqrt(xx11 - x11 * x11) * sqrt(g11/(g11-1)) : 0; double sd12 = g12>1 ? sqrt(xx12 - x12 * x12) * sqrt(g12/(g12-1)) : 0; double sd22 = g22>1 ? 
sqrt(xx22 - x22 * x22) * sqrt(g22/(g22-1)) : 0; string a1 = locus[l]->allele1; string a2 = locus[l]->allele2; if ( a1 == "" ) a1 = "*"; if ( a2 == "" ) a2 = "*"; QT_MEANS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << "GENO" << " " << setw(8) << a1+"/"+a1 << " " << setw(8) << a1+"/"+a2 << " " << setw(8) << a2+"/"+a2 << "\n"; QT_MEANS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << "COUNTS" << " " << setw(8) << g11 << " " << setw(8) << g12 << " " << setw(8) << g22 << "\n"; g11 /= nanal; g12 /= nanal; g22 /= nanal; QT_MEANS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << "FREQ" << " " << setw(8) << g11 << " " << setw(8) << g12 << " " << setw(8) << g22 << "\n"; QT_MEANS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << "MEAN" << " "; if ( g11>0 ) QT_MEANS << setw(8) << x11 << " "; else QT_MEANS << setw(8) << "NA" << " "; if ( g12>0 ) QT_MEANS << setw(8) << x12 << " "; else QT_MEANS << setw(8) << "NA" << " "; if ( g22>0) QT_MEANS << setw(8) << x22 << "\n"; else QT_MEANS << setw(8) << "NA" << "\n"; QT_MEANS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << "SD" << " "; if ( g11>0 ) QT_MEANS << setw(8) << sd11 << " "; else QT_MEANS << setw(8) << "NA" << " "; if ( g12>0 ) QT_MEANS << setw(8) << sd12 << " "; else QT_MEANS << setw(8) << "NA" << " "; if ( g22>0) QT_MEANS << setw(8) << sd22 << "\n"; else QT_MEANS << setw(8) << "NA" << "\n"; } ////////////////////////////////////////////////////////////// // Test difference in missingness between cases and controls vector Plink::testMiss(Perm & perm, bool display) { // Requires SNP-major mode if (!par::SNP_major) Ind2SNP(); ofstream MIS; if (display) { string f = par::output_file_name + ".missing"; MIS.open(f.c_str(),ios::out); MIS.precision(4); printLOG("Writing case/control missingness test to [ " + f + " ] \n"); MIS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp)<< "SNP" << " " << setw(12) << "F_MISS_A" << " " << setw(12) << "F_MISS_U" << " " << setw(12)<< "P" << " " << "\n"; } vector missing(0); vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // Skip possibly if (par::adaptive_perm && !perm.snp_test[l]) { l++; s++; continue; } int affmiss = 0; int affgeno = 0; int unfmiss = 0; int unfgeno = 0; /////////////////////////////// // Iterate over each individual vector::iterator gperson = sample.begin(); vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); while ( gperson != sample.end() ) { Individual * pperson = (*gperson)->pperson; // If we haven't excluded this individual // on the basis of a cluster solution if ( ! pperson->missing ) { // "1" allele count if ( pperson->aff) // affected { if ( *i1 && ! *i2 ) affmiss++; else affgeno++; } else { if ( *i1 && ! 
*i2 ) unfmiss++; else unfgeno++; } } // Next individual gperson++; i1++; i2++; } // Calculate association statistic table_t t; sizeTable(t,2,2); t[0][0] = affmiss; t[0][1] = affgeno; t[1][0] = unfmiss; t[1][1] = unfgeno; double pvalue = fisher(t); // Record 1-p as empirical statistic if ( pvalue > -1 ) missing.push_back(1-pvalue); else missing.push_back(1); if (display) { // Skip?, if filtering p-values if ( par::pfilter && pvalue > par::pfvalue ) { // Next SNP s++; l++; continue; } // Total number of affecteds/unaffecteds int aff = affmiss + affgeno; int unf = unfmiss + unfgeno; MIS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << affmiss / (double)aff << " " << setw(12) << unfmiss / (double)unf << " "; if ( pvalue > -1 ) MIS << setw(12) << pvalue << "\n"; else MIS << setw(12) << "NA" << "\n"; } // Next SNP s++; l++; } if (display) { MIS.close(); } return missing; } void Plink::calcLDStatistics() { /////////////////////////////////////////// // Calculate simple correlation (r or r^2) // based on 3x3 genotype counts // If --matrix option also specified, output as // one matrix, otherwise use ofstream LD; string f = par::output_file_name + ".ld"; LD.open(f.c_str(),ios::out); printLOG("Writing LD statistics to [ " + f + " ] \n"); if (!par::matrix) { LD << setw(6) << "CHR_A" << " " << setw(12) << "BP_A" << " " << setw(par::pp_maxsnp) << "SNP_A" << " " << setw(6) << "CHR_B" << " " << setw(12) << "BP_B" << " " << setw(par::pp_maxsnp) << "SNP_B" << " " ; if (par::disp_r1) LD << setw(12) << "R" << " " ; else LD << setw(12) << "R2" << " " ; LD << "\n"; } set ldAnchorSet; if ( par::ld_anchor_list ) { checkFileExists( par::ld_SNP1_file ); ifstream IN( par::ld_SNP1_file.c_str() , ios::in ); map mlocus; for (int l=0; lname , l ) ); while ( !IN.eof() ) { string snp; IN >> snp; if (snp == "") continue; map::iterator i = mlocus.find( snp ); if ( i != mlocus.end() ) ldAnchorSet.insert( i->second ); } IN.close(); } int ld_anchor_number = -1; if ( par::ld_anchor && ! par::ld_anchor_list ) { par::matrix = false; ld_anchor_number = getMarkerNumber(*this, par::ld_SNP1); if (ld_anchor_number == -1) error("--ld-snp {marker} not found"); } int end = nl_all; // Do we need to go up to the last SNP? if ( (!par::matrix) && (!par::ld_anchor) ) end--; /////////////////////////// // First locus for (int l1=0; l1= par::disp_r_window_snp ) continue; if ( l1 - l2 >= par::disp_r_window_snp ) continue; if ( locus[l2]->chr != locus[l1]->chr ) continue; if ( locus[l2]->bp - locus[l1]->bp > par::disp_r_window_kb ) continue; if ( locus[l1]->bp - locus[l2]->bp > par::disp_r_window_kb ) continue; } //////////////////////// // Calculate correlation double r = correlation2SNP(l1,l2,par::disp_r2,false); if (par::matrix) { LD << r << " "; } else { // Using a r^2 threshold? if ( par::disp_r1 || r >= par::disp_r_window_r2 ) { LD << setw(6) << locus[l1]->chr << " " << setw(12) << locus[l1]->bp << " " << setw(par::pp_maxsnp) << locus[l1]->name << " " << setw(6) << locus[l2]->chr << " " << setw(12) << locus[l2]->bp << " " << setw(par::pp_maxsnp) << locus[l2]->name << " " << setw(12) << r << " " << "\n"; } } } if (par::matrix) LD << "\n"; } LD.close(); } double Plink::correlation2SNP(int l1, int l2, bool squared, bool covariance, bool useflag) { // Calculate simple correlation based on 0,1,2 allele counts // i.e. the 3x3 genotypic table, rather than the 2x2 haplotypic // table. 
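// Each founder contributes an allele-count score per SNP (2/1/0 copies of
// allele "1" for diploid genotypes; 1/0 for haploid or male X genotypes),
// and r is the ordinary Pearson correlation of the two scores, accumulated
// from running sums below.  A minimal sketch over paired scores (the
// function and argument names are illustrative only):
//
//   double genotypicR( const vector<int> & gx, const vector<int> & gy )
//   {
//     double n = gx.size(), X=0, Y=0, XX=0, YY=0, XY=0;
//     for ( int i = 0 ; i < (int)gx.size() ; i++ )
//       { X += gx[i]; Y += gy[i];
//         XX += gx[i]*gx[i]; YY += gy[i]*gy[i]; XY += gx[i]*gy[i]; }
//     double cov = XY/n - (X/n)*(Y/n);
//     return cov / sqrt( ( XX/n - (X/n)*(X/n) ) * ( YY/n - (Y/n)*(Y/n) ) );
//   }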
// r = cov(1,2) / sqrt( var(1).var(2) ) double X = 0; double X2 = 0; double Y = 0; double Y2 = 0; double XY = 0; double count = 0; bool haploid_snp1 = par::chr_haploid[locus[l1]->chr]; bool X_snp1 = par::chr_sex[locus[l1]->chr]; bool haploid_snp2 = par::chr_haploid[locus[l2]->chr]; bool X_snp2 = par::chr_sex[locus[l2]->chr]; // Iterate over every individual // but only consider founders // Have a specicial case loop for when both SNPs // are autosomal (usual case) if ( haploid_snp1 || X_snp1 || haploid_snp2 || X_snp2 ) { // Allow for 1 or both SNPs to be non-autosomal for (int i=0; ifounder ) continue; // Are we using a flag? if ( useflag && ! person->flag ) continue; bool a1 = par::SNP_major ? SNP[l1]->one[i] : person->one[l1]; bool a2 = par::SNP_major ? SNP[l1]->two[i] : person->two[l1]; if ( a1 && (!a2) ) continue; bool b1 = par::SNP_major ? SNP[l2]->one[i] : person->one[l2]; bool b2 = par::SNP_major ? SNP[l2]->two[i] : person->two[l2]; if ( b1 && (!b2) ) continue; // Only consider if non-missing at both loci if ( a1 && (!a2) ) continue; if ( b1 && (!b2) ) continue; // Score individuals count++; int sx = 0, sy = 0; // Haploid/diploid marker 1 ? if ( haploid_snp1 || ( X_snp1 && person->sex ) ) { // Hemizygous "0" if ( ! a1 ) sx=1; } else { // Score 2,1,0 for 00,01,11 genotypes if ( ! a1 ) { if ( ! a2 ) sx=2; else sx=1; } } // Haploid/diploid marker 2 ? if ( haploid_snp2 || ( X_snp2 && person->sex ) ) { // Hemizygous "0" if ( ! b1 ) sy=1; } else { // Score 2,1,0 for 00,01,11 genotypes if ( ! b1 ) { if ( ! b2 ) sy=2; else sy=1; } } X += sx; Y += sy; XY += sx*sy; // Sum squares sx *= sx; sy *= sy; X2 += sx; Y2 += sy; // consider next person; } } else { // Autosomal only version for (int i=0; ifounder ) continue; // Are we using a flag? if ( useflag && ! person->flag ) continue; // Only consider if non-missing at both loci bool a1 = par::SNP_major ? SNP[l1]->one[i] : person->one[l1]; bool a2 = par::SNP_major ? SNP[l1]->two[i] : person->two[l1]; if ( a1 && (!a2) ) continue; bool b1 = par::SNP_major ? SNP[l2]->one[i] : person->one[l2]; bool b2 = par::SNP_major ? SNP[l2]->two[i] : person->two[l2]; if ( b1 && (!b2) ) continue; // Score individuals count++; int sx = 0, sy = 0; if ( ! a1 ) { if ( ! a2 ) sx=2; else sx=1; } if ( ! b1 ) { if ( ! b2 ) sy=2; else sy=1; } X += sx; Y += sy; XY += sx*sy; sx *= sx; sy *= sy; X2 += sx; Y2 += sy; // consider next person; } } // count refers to number of individuals X /= count; X2 /= count; Y /= count; Y2 /= count; XY /= count; double var1 = X2 - X*X; double var2 = Y2 - Y*Y; double cov12 = XY - X*Y; double r; // Return either: // covariance, correlation or correlation squared? if ( covariance ) r = cov12; else { if ( squared ) r = (cov12*cov12) / (var1*var2); else { r = cov12 / (sqrt(var1)*sqrt(var2)); // get sign of label: check minor allele assignment matches if ( ( (locus[l1]->allele1 > locus[l1]->allele2) && (locus[l2]->allele2 < locus[l2]->allele2) ) || ( (locus[l1]->allele1 < locus[l1]->allele2) && (locus[l2]->allele2 > locus[l2]->allele2) ) ) r *= -1; } } return r; } plink-1.07-src/cluster.cpp0000644000265600020320000012414611264127624014671 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" #include "stats.h" using namespace std; extern ofstream LOG; // Helper function: find the maximum distance between two clusters double cldist(vector > &, vector &, vector &); // Helper function: group average link double groupAvgLink(vector > &, vector &, vector &); // Helper function: are two clusters phenotypically homogeneous? bool homogeneous_clusters(Plink &, vector &, vector &); // Do two clusters conform to any --mcc Ncase Ncontrol specification? bool spec_clusters(Plink &, vector &, vector &); // Any members of the clusters that can't be matched? bool pairable_cluster(vector > & , vector&, vector&); // Have we already picked somebody from this category? bool selcon_inds(Plink&, vector&, vector&, set&); class Neighbour { public: double dist; Individual * neighbour; bool operator< (const Neighbour & s2) const { return (dist < s2.dist); } }; // Complete-linkage clustering based on average IBS distance // Extra constraints: // --pmerge P do not merge clusters containing two individuals who differ at this level // --mc N do not let clusters contain more than N individuals // --cc do not merge phenotypically identical clusters // --mcc N1 N2 do not let cluster contain more than N1 cases and N2 controls // --match external categorical matching criteria // --match-type positive or negative matches // --qmatch external quantitative threshold based // --qt define thresholds for QT matching // --pick1 only select one individual from each covariate group // --ibm X identity-by-missingness threshold void Plink::buildCluster() { /////////////////////////////////////// // This is an individual-mode analysis if (par::SNP_major) SNP2Ind(); ////////////////////////////////////// // Force an initial cluster solution? // Initially, # of clusters = # of people, unless // we are forcing a starting solution int ni = n; vector > cl; if ( par::force_initial_cluster ) { if (!readClusterFile()) error("Problem reading --within {file}"); printLOG("Forcing an initial starting solution from [ " + par::include_cluster_filename + " ]\n"); set added; for (int k=0;k t; for (int i=0;isol == k ) { t.push_back(i); added.insert(i); } } cl.push_back(t); } // And now add any remain individuals, in their own clusters, // starting from cluster nk onwards for (int i=0;i t(1); t[0] = i; cl.push_back(t); } } } else { for (int i=0;i t(1); t[0] = i; cl.push_back(t); } } // T/F matrix (lower diagonal) for whether // two people can be matched, based on p-value // constraint and any external criteria // pairable[i][j] (default = T) vector > pairable(n); for (int i=0; i tmp(n,true); pairable[i] = tmp; } /////////////////////////////// // External matching criteria // Determine, in advance, potential pairwise matching if (par::bmatch) { printLOG("Applying categorical matching criteria...\n"); // Read in each covariate one at a time, // and determine matching // we can use the covariate file, as the cluster // routine exits after clustering (i.e. so covariates // never used) // Has the user specified a match-type file? If not, // assume all are positive matches. 
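// A positive match ('+' or '1' in the match-type file) means two
// individuals may only end up in the same cluster if they share the same
// category for that variable; a negative match means they may only be
// clustered if their categories differ.  Missing values never block a
// pairing.  Sketch of the rule applied per variable below (the names are
// shorthand for the sample[i]->sol and sample[i]->missing fields used):
//
//   bool blocked = ( !missing_i ) && ( !missing_j ) &&
//                  ( positiveMatch ? sol_i != sol_j     // '+' : must agree
//                                  : sol_i == sol_j );  // '-' : must differ
//   if ( blocked ) pairable[i][j] = pairable[j][i] = false;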
vector btype(0); if (par::bmatch_usertype) { checkFileExists(par::bmatch_direction_filename); ifstream BT(par::bmatch_direction_filename.c_str(), ios::in); while (!BT.eof()) { string tmp; BT >> tmp; if(BT.eof()) break; if (tmp=="+" || tmp=="1") btype.push_back(true); else btype.push_back(false); } BT.close(); printLOG(int2str(btype.size())+" match-type definitions read from [ "+ par::bmatch_direction_filename+" ]\n"); } int c=0; // Swap b-match filename as cluster/within filename par::include_cluster_filename = par::bmatch_filename; while (1) { par::mult_clst = ++c; if (!readClusterFile()) break; if (!par::bmatch_usertype) btype.push_back(true); for (int i=0; imissing means missing on covariate in this context // Simple matching (no usertypes or +-match) if ( btype[c-1] ) { // +/match if (sample[i]->sol != sample[j]->sol && (!sample[i]->missing ) && (!sample[j]->missing ) ) pairable[i][j] = pairable[j][i] = false; } else { // -/match if (sample[i]->sol == sample[j]->sol && (!sample[i]->missing ) && (!sample[j]->missing ) ) pairable[i][j] = pairable[j][i] = false; } } } printLOG("Matched on "+int2str(c-1)+ " variables from [ "+par::bmatch_filename+" ]\n"); } if (par::qmatch) { printLOG("Applying quantitative matching criteria...\n"); vector qt; // number of thresholds specified checkFileExists(par::qmatch_threshold_filename); ifstream QT(par::qmatch_threshold_filename.c_str(), ios::in); while (!QT.eof()) { double tmp; QT >> tmp; if(QT.eof()) break; qt.push_back(tmp); } QT.close(); printLOG(int2str(qt.size())+" q-match thresholds read from [ "+ par::qmatch_threshold_filename+" ]\n"); // Swap q-match filename as covariate file par::covar_filename = par::qmatch_filename; int c=0; // counter for number of fields in qmatch file for (int z=1; z<=qt.size(); z++) { par::mult_covar = z; if (!readCovariateFile()) break; c++; for (int i=0; icovar - sample[j]->covar ) > qt[c-1] && (!sample[i]->missing) && (!sample[j]->missing) ) pairable[i][j] = pairable[j][i] = false; } } printLOG("Matched on "+ int2str(c)+" quantitative covariates from [ " +par::qmatch_filename +" ]\n"); } if (par::cluster_missing) { printLOG("Clustering individuals based on genome-wide IBM\n"); } else { printLOG("Clustering individuals based on genome-wide IBS\n"); stringstream s2; s2 << "Merge distance p-value constraint = " << par::merge_p << "\n"; printLOG(s2.str()); } if (par::outlier_detection) printLOG("Outlier detection based on neighbours "+int2str(par::min_neighbour)+ " to "+int2str(par::max_neighbour)+"\n"); ///////////////////////////////////////////////////////// // Also, if --pick1 is in effect, we need to read a list from which // we can pick only 1 individual if (par::cluster_selcon) { // Swap pick1 filename as covariate file par::include_cluster_filename = par::cluster_selcon_file; par::mult_clst = 1; if (!readClusterFile()) error("Problem reading for --pick1 option"); } // Keep track of what has been selected already set selcon; ///////////////////////////// // Set up distance matrices // Lower diagonal structure, requires that i > j mdist.resize(n); for (int j=0;j prop_sig_diff(n); // Calculate... 
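// Note on storage before the distances are calculated or read in: mdist
// only holds the strictly lower-triangular entries (i > j), so a pairwise
// IBS value must always be looked up with the larger index first, as is
// done throughout the clustering code below:
//
//   double ibs = ( i > j ) ? mdist[i][j] : mdist[j][i];   // i != j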
if (!par::ibd_read) { int c=0; int c2=0; for (int i1=0; i1 mperson; for (int i=0; ifid+"_"+sample[i]->iid , i )); map mcode; for (int i=0; i peeps; if ( par::ibd_read_minimal ) { // read in list of people here while ( 1 ) { vector ids = ZINC.tokenizeLine(); if ( ids.size() != 2 ) { string emsg = "Problem with line in [ " + par::ibd_file + " ]\n"; for (int i=0;i::iterator p = mperson.find(pcode); // Add NULL if this person actually not in // the current file -- in this case, they // will be ignored -- but remember we have // to check for NULLs below and skip those // numbers in that case... if ( p == mperson.end() ) peeps.push_back( NULL ); else peeps.push_back( sample[p->second] ); // Just in case we have a malformed file if ( ZINC.endOfFile() ) error("Problem with premature stop in file [ " + par::ibd_file + " ]\n"); } ////////////////////////////////////////////////////// // Now read the actual IBS/PPC values for these peeps if ( peeps.size() != sample.size() ) printLOG("Warning -- a different number of people in .genome.min that dataset\n"); int size = peeps.size(); int p1 = 0, p2 = 1; while ( 1 ) { double mydst, pv, ibd; vector val = ZINC.tokenizeLine(); if ( ZINC.endOfFile() ) { // Check that p1,p2 counts are as should be... break; } if ( val.size() != 3 ) { string emsg = "Problem with line in [ " + par::ibd_file + " ]\n"; for (int i=0;i( mydst, val[0], std::dec ) ) mydst = 0; if ( !from_string( pv, val[1], std::dec ) ) pv = 0; if ( !from_string( ibd, val[2], std::dec ) ) ibd = 0; Individual * person1 = peeps[p1]; Individual * person2 = peeps[p2]; int pn1 = mcode.find( person1 )->second; int pn2 = mcode.find( person2 )->second; if ( person1 == NULL || person2 == NULL || person1 == person2 ) { // Advance to next pair ++p2; if ( p2 == n ) { ++p1; p2=p1+1; } if ( p1==n ) break; continue; } // cout << "found " << pn1 << " and " << pn2 << " is " // << person1->fid << " " << person1->iid << " x " // << person2->fid << " " << person2->iid << "\t" // << " with " // << mydst << " " << pv << "\n"; // Record IBS distance if ( pn1 > pn2 ) mdist[pn1][pn2] = mydst; else mdist[pn2][pn1] = mydst; ////////////////////////// // Is this pair pairable? if (pv < par::merge_p && realnum(pv)) { // record pair as unpairable pairable[pn1][pn2] = false; pairable[pn2][pn1] = false; // record for both individuals a IBS-based mismatch prop_sig_diff[pn1]++; prop_sig_diff[pn2]++; } // Also calculate IBM as a constraint? if (par::cluster_ibm_constraint) { calcGenomeIBM(person1,person2); if ( dst < par::cluster_ibm_constraint_value ) { pairable[pn1][pn2] = false; pairable[pn2][pn1] = false; } } // Advance to next peep-pair ++p2; if ( p2 == n ) { ++p1; p2=p1+1; } // Finished? 
if ( p1==n ) break; } } else { // Read in .genome file in verbose mode // We only want FID1,IID1,FID2,IID2 (always first four) // DST and PPC // Get field codes from header int ppc_code = -1; int dst_code = -1; int col_length = 0; double mydst; vector tokens = ZINC.tokenizeLine(); col_length = tokens.size(); if ( tokens.size() < 4 || tokens[0] != "FID1" || tokens[1] != "IID1" || tokens[2] != "FID2" || tokens[3] != "IID2" ) error("Problem with header row of .genome file"); for ( int i = 4; i tokens = ZINC.tokenizeLine(); if ( tokens.size() == 0 ) continue; if ( col_length != tokens.size() ) { string strmsg = ""; for (int i=0;i( ibs0 , i0 , std::dec) && // from_string( ibs1 , i1 , std::dec) && // from_string( ibs2 , i2 , std::dec) ) ) // { // error("Problem with line in .genome file, IBS estimates: \n" // +i0+" "+i1+" "+i2+" "+ipv+"\n"); // } if ( ! from_string( mydst , idst , std::dec) ) mydst = 0; if ( ! from_string( pv , ipv , std::dec) ) pv = 1; // Calculate proportion IBS matching // if (par::cluster_euclidean) // mydst = (ibs2*2+ibs1*0.5)/(ibs2*2+ibs1+ibs0); // else // mydst = (ibs2+ibs1*0.5)/(ibs2+ibs1+ibs0); map::iterator person1 = mperson.find(fid1+"_"+iid1); map::iterator person2 = mperson.find(fid2+"_"+iid2); if ( person1 == mperson.end() || person2 == mperson.end() || person1 == person2 ) continue; // Record IBS distance if ( person1->second > person2->second ) mdist[person1->second][person2->second] = mydst; else mdist[person2->second][person1->second] = mydst; ////////////////////////// // Is this pair pairable? if (pv < par::merge_p && pv==pv) { // record pair as unpairable pairable[person1->second][person2->second] = false; pairable[person2->second][person1->second] = false; // record for both individuals a IBS-based mismatch prop_sig_diff[person1->second]++; prop_sig_diff[person2->second]++; } // Also calculate IBM as a constraint? if (par::cluster_ibm_constraint) { calcGenomeIBM(sample[person1->second],sample[person2->second]); if ( dst < par::cluster_ibm_constraint_value ) { pairable[person1->second][person2->second] = false; pairable[person2->second][person1->second] = false; } } } // Read next line in .genome } ZINC.close(); ///////////////////////////////////////////// // Check that every pair in the dataset has // actually been assigned a value -- i.e. check // for 0 IBS codes, etc. 
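// A minimal, purely illustrative sketch of such a check (not performed
// here) would scan the stored lower triangle for entries still at their
// initial value of zero, which would suggest a pair absent from the
// .genome file:
//
//   for (int i = 1; i < n; i++)
//     for (int j = 0; j < i; j++)
//       if ( mdist[i][j] == 0 )
//         printLOG( "Warning: no IBS value read for pair " +
//                   int2str(i) + "," + int2str(j) + "\n" );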
} /////////////////////////////////// // IBS permutation test if ( par::ibs_test ) { // If we were called by permutationIBSTest(), // now it is time to return return; } /////////////////////////////////// // Display matrix of IBS distances if (par::matrix) { string f; if (par::cluster_missing) f = par::output_file_name+ ".mdist.missing"; else if (par::distance_matrix) f = par::output_file_name+ ".mdist"; else f = par::output_file_name+ ".mibs"; if (!par::cluster_missing) { if (par::distance_matrix) printLOG("Writing IBS distance matrix to [ "+f + " ]\n"); else printLOG("Writing IBS similarity matrix to [ "+f + " ]\n"); } else printLOG("Writing IBM distance matrix to [ "+f + " ]\n"); ofstream MAT(f.c_str(),ios::out); MAT.clear(); for (int i=0;ij) MAT << 1 - mdist[i][j] << " "; else if (i==j) MAT << 0 << " "; else MAT << 1 - mdist[j][i] << " "; } else { // Similarities if (i>j) MAT << mdist[i][j] << " "; else if (i==j) MAT << 1 << " "; else MAT << mdist[j][i] << " "; } } MAT << "\n"; } MAT.close(); } //////////////////////////////////// // Determine how many pairable pairs // we have now if (!par::cluster_missing) { int paircount = 0; for (int i=0; i > min_dst(n); vector > zmin_dst(n); vector > min_ind(n); if (par::max_neighbour > n-1) error("Nearest neighbour range specified as [ "+int2str(par::max_neighbour) +" ] but only [ "+int2str(n)+" ] individuals in sample."); for (int k=par::min_neighbour;k<=par::max_neighbour;k++) { // Consider each person for (int i=0;i ibs(n-1); int c=0; for (int j=0;jj ) ibs[c].dist = mdist[i][j]; else ibs[c].dist = mdist[j][i]; ibs[c].neighbour = sample[j]; c++; } sort(ibs.begin(),ibs.end()); min_dst[i].push_back(ibs[ibs.size() - k].dist); min_ind[i].push_back(ibs[ibs.size() - k].neighbour); } // Calculate mean and variance of min_dst to // give Z-scores double mean = 0; double var = 0; for (int i=0; ifid << " " << setw(12) << sample[i]->iid << " " << setw(6) << par::min_neighbour+k << " " << setw(12) << min_dst[i][k] << " " << setw(12) << zmin_dst[i][k] << " " << setw(12) << min_ind[i][k]->fid << " " << setw(12) << min_ind[i][k]->iid << " "; if (!par::cluster_missing) MD << setw(12) << prop_sig_diff[i] << " "; MD << "\n"; } MD.close(); } ////////////////////////// // Cluster analysis if ( par::cluster ) { int c=1; bool done=false; // Matrix of solutions vector< vector > sol(ni); for (int i=0;i hist(1); // Build solution for (int i=0; idmin && pairable_cluster(pairable,cl[i],cl[j]) ) { // And will the max cluster size requirement be fulfilled? if (par::max_cluster_size==0 || (( cl[i].size()+cl[j].size()) <= par::max_cluster_size) ) { // And will the basic phenotypic matching requirement be fulfilled? if ( (!par::cluster_on_phenotype) || (!homogeneous_clusters((*this),cl[i],cl[j]))) { // What about the --mcc clustering if ( (!par::cluster_on_mcc) || spec_clusters( (*this),cl[i],cl[j]) ) { // And what about pick1 constrains? (this must be final constraint) if ( (!par::cluster_selcon) || selcon_inds( (*this),cl[i],cl[j],selcon)) { imin=i; jmin=j; dmin=d; } } } } } } // Did we get a merge? if (imin==-1) { done=true; //printLOG("Cannot make clusters that satisfy constraints at step "+int2str(c)+"\n"); goto done_making_clusters; } // Save merge distance hist.push_back(dmin); // Add to list of selected categories if (par::cluster_selcon) { if (cl[imin].size() == 1 ) selcon.insert( sample[cl[imin][0]]->sol ); if (cl[jmin].size() == 1 ) selcon.insert( sample[cl[jmin][0]]->sol ); } // 2. 
Join these clusters for(int j=0;jfid << "_" << sample[cl[i][j]]->iid; if (par::cluster_on_phenotype || par::cluster_on_mcc) CLST << "(" << (int)sample[cl[i][j]]->phenotype << ")"; } CLST << "\n"; } CLST.close(); printLOG("Writing cluster solution (2) [ " + par::output_file_name + ".cluster2 ]\n"); CLST.open((par::output_file_name+".cluster2").c_str(),ios::out); CLST.clear(); for (int j=0; jfid << " " << sample[j]->iid << "\t" << sol[j][best] << "\n"; // Keep track of this (might be needed if MDS plot done) sample[j]->sol = sol[j][best]; } CLST.close(); } if (!par::cluster_missing) { printLOG("Writing cluster solution (3) [ " + par::output_file_name + ".cluster3 ]\n"); CLST.open((par::output_file_name+".cluster3").c_str(),ios::out); } else { printLOG("Writing cluster solution (3) [ " + par::output_file_name + ".cluster3.missing ]\n"); CLST.open((par::output_file_name+".cluster3.missing").c_str(),ios::out); } CLST.clear(); for (int j=0; jfid << " " << sample[j]->iid << "\t"; for (int i=0; i > & d, vector & a, vector & b) { // Compare based on first metric, but also return paired second double l; l = a[0]>b[0] ? d[a[0]][b[0]] : d[b[0]][a[0]]; for (int i=0; i b[j] ) { if ( d[a[i]][b[j]] < l ) l = d[a[i]][b[j]]; } else { if ( d[b[j]][a[i]] < l ) l = d[b[j]][a[i]]; } } return l; } double groupAvgLink(vector > & d, vector & a, vector & b) { double s = 0; for (int i=0; i b[j] ) { s += d[a[i]][b[j]]; } else { s += d[b[j]][a[i]]; } } return 1.0 / ( a.size() * b.size() ) * s ; } bool homogeneous_clusters(Plink & P, vector & a, vector & b) { // Determine how to handle missing phenotypes? bool homogeneous = true; for (int i=0; iphenotype != P.sample[b[j]]->phenotype) && (!P.sample[a[i]]->missing) && (!P.sample[b[j]]->missing) ) homogeneous = false; } return homogeneous; } bool spec_clusters(Plink & P, vector & a, vector & b) { // Missing individuals will be treated as unaffected int ncase = 0, ncontrol = 0; for (int i=0; iaff) ncase++; else ncontrol++; for (int j=0; jaff) ncase++; else ncontrol++; if (ncase <= par::max_cluster_case && ncontrol <= par::max_cluster_control) return true; else return false; } bool pairable_cluster(vector > & pairable, vector & a, vector & b) { for (int i=0; i & a, vector & b, set & inc) { // Only need to check for singletons (i.e. once somebody is in a cluster, // they must have already passed this test) if ( a.size() == 1 ) { // Individual already in? if ( inc.find(P.sample[a[0]]->sol) != inc.end() ) return false; } else if ( b.size() == 1 ) { if ( inc.find(P.sample[b[0]]->sol) != inc.end() ) return false; } if ( a.size() == 1 && b.size() == 1 ) if ( P.sample[a[0]]->sol == P.sample[b[0]]->sol ) return false; return true; } void Plink::permutationIBSTest(Perm & perm) { // Take the IBS distance matrix, and ask (by permutation) // where the average difference between two groups is larger // than we would expect by chance // i.e. statistic = average between group IBS distance // permutation = label swapping // 1-sided test, asking whether people between // groups are *less* similar than we'd expect ///////////////////////////////// // Calculate distances // (will exit before clustering) buildCluster(); //////////////// // Perform test perm.setTests(12); perm.setPermClusters(*this); perm.originalOrder(); // Tests (1 sided), where ">" means less similar? // as tests are based on 1-f(mdist) // 0 1a. Case/control < all others // 1 1b. Case/control > all others // 2 2a. Case/case < control/control // 3 2b. Case/case > control/control // 4 3a. Case/case < all others // 5 3b. 
Case/case > all others // 6 4a. Control/control < all others // 7 4b. Control/control > all others // 8 5a. Case/case < Case/control // 9 5b. Case/case > Case/control // 10 6a. Control/control < Case/control // 11 6b. Control/control > Case/control /////////////////////////// // Original test statistics vector original(12,0); double bg_mean = 0; double ig1_mean = 0; double ig2_mean = 0; int bg_n = 0; int ig1_n = 0; int ig2_n = 0; double bg_var = 0; double ig1_var = 0; double ig2_var = 0; // Add addition 11 tests here... for (int i=0; iaff != sample[j]->aff ) { bg_mean += mdist[j][i]; bg_n++; } else if ( sample[i]->aff ) { ig2_mean += mdist[j][i]; ig2_n++; } else { ig1_mean += mdist[j][i]; ig1_n++; } } // 0 1a. Case/control < all others // 1 1b. Case/control > all others // 2 2a. Case/case < control/control // 3 2b. Case/case > control/control // 4 3a. Case/case < all others // 5 3b. Case/case > all others // 6 4a. Control/control < all others // 7 4b. Control/control > all others // 8 5a. Case/case < Case/control // 9 5b. Case/case > Case/control // 10 6a. Control/control < Case/control // 11 6b. Control/control > Case/control original[0] -= bg_mean; original[1] += bg_mean; original[2] += ig1_mean - ig2_mean; original[3] += ig2_mean - ig1_mean; original[4] -= ig2_mean; original[5] += ig2_mean; original[6] -= ig1_mean; original[7] += ig1_mean; original[8] += bg_mean - ig2_mean; original[9] += ig2_mean - bg_mean; original[10] += bg_mean - ig1_mean; original[11] += ig1_mean - bg_mean; if (bg_n==0) error("No between group individuals observed"); double tot_mean = (bg_mean+ig1_mean+ig2_mean) /(double)(bg_n+ig1_n+ig2_n); bg_mean /= (double)bg_n; ig1_mean /= (double)ig1_n; ig2_mean /= (double)ig2_n; for (int i=0; iaff != sample[j]->aff ) bg_var += ( mdist[j][i] - bg_mean ) * ( mdist[j][i] - bg_mean ); else if ( sample[i]->aff ) ig2_var += ( mdist[j][i] - ig2_mean ) * ( mdist[j][i] - ig2_mean ); else ig1_var += ( mdist[j][i] - ig1_mean ) * ( mdist[j][i] - ig1_mean ); } // Total sum of squares double total_ss = bg_var + ig2_var + ig1_var; // Between sum of squares double ig_mean = (ig1_mean * ig1_n + ig2_mean * ig2_n) / ( double ) ( ig1_n + ig2_n ); double between_ss = (double)bg_n * ( bg_mean - tot_mean ) * ( bg_mean - tot_mean ) + (double)(ig1_n+ig2_n) * ( ig_mean - tot_mean ) * ( ig_mean - tot_mean ); bg_var /= (double)(bg_n-1); ig2_var /= (double)(ig2_n-1); ig1_var /= (double)(ig1_n-1); printLOG("\nBetween-group IBS (mean, SD) = " +dbl2str(bg_mean)+", "+dbl2str(sqrt(bg_var))+"\n"); printLOG("In-group (2) IBS (mean, SD) = " +dbl2str(ig2_mean)+", "+dbl2str(sqrt(ig2_var))+"\n"); printLOG("In-group (1) IBS (mean, SD) = " +dbl2str(ig1_mean)+", "+dbl2str(sqrt(ig1_var))+"\n"); printLOG("Approximate proportion of variance between group = " +dbl2str(between_ss / total_ss)+"\n"); //////////////////// // Begin permutations bool finished = false; while(!finished) { vector pr(12,0); // Permute perm.permuteInCluster(); // Retest bg_mean = ig1_mean = ig2_mean = 0; for (int i=0; ipperson->aff != sample[j]->pperson->aff ) { bg_mean += mdist[j][i]; } else if ( sample[i]->pperson->aff ) { ig2_mean += mdist[j][i]; } else { ig1_mean += mdist[j][i]; } } pr[0] -= bg_mean; pr[1] += bg_mean; // are case/control more similar? 
pr[2] += ig1_mean - ig2_mean; pr[3] += ig2_mean - ig1_mean; pr[4] -= ig2_mean; pr[5] += ig2_mean; pr[6] -= ig1_mean; pr[7] += ig1_mean; pr[8] += bg_mean - ig2_mean; pr[9] += ig2_mean - bg_mean; pr[10] += bg_mean - ig1_mean; pr[11] += ig1_mean - bg_mean; //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } if (!par::silent) cout << "\n\n"; //////////////////////////// // Display permuted p-values printLOG("IBS group-difference empirical p-values:\n\n"); printLOG(" T1: Case/control less similar p = " + dbl2str(perm.pvalue(0)) +"\n"); printLOG(" T2: Case/control more similar p = " + dbl2str(perm.pvalue(1)) +"\n\n"); printLOG(" T3: Case/case less similar than control/control p = " + dbl2str(perm.pvalue(2)) +"\n"); printLOG(" T4: Case/case more similar than control/control p = " + dbl2str(perm.pvalue(3)) +"\n\n"); printLOG(" T5: Case/case less similar p = " + dbl2str(perm.pvalue(4)) +"\n"); printLOG(" T6: Case/case more similar p = " + dbl2str(perm.pvalue(5)) +"\n\n"); printLOG(" T7: Control/control less similar p = " + dbl2str(perm.pvalue(6)) +"\n"); printLOG(" T8: Control/control more similar p = " + dbl2str(perm.pvalue(7)) +"\n\n"); printLOG(" T9: Case/case less similar than case/control p = " +dbl2str(perm.pvalue(8)) +"\n" ); printLOG("T10: Case/case more similar than case/control p = " +dbl2str(perm.pvalue(9)) +"\n\n"); printLOG("T11: Control/control less similar than case/control p = " + dbl2str(perm.pvalue(10)) +"\n"); printLOG("T12: Control/control more similar than case/control p = " + dbl2str(perm.pvalue(11)) +"\n"); } void Plink::groupGenome() { // Read from a (non-verbose) genome file checkFileExists(par::ibd_file); if ( par::ibd_read_minimal ) printLOG("Reading IBS estimates (minimal format) from [ " +par::ibd_file+" ] \n"); else printLOG("Reading genome-wide IBS estimates from [ " +par::ibd_file+" ] \n"); ifstream INC; INC.open(par::ibd_file.c_str()); map mperson; for (int i=0; ifid+"_"+sample[i]->iid , i )); map mcode; for (int i=0; i peeps; // We wish to read in an NxN matrix, and convert it to a KxK one matrix_t dk( nk ); sizeMatrix(dk,nk,0); for (int j=0; j tokens = tokenizeLine(INC); col_length = tokens.size(); if ( tokens.size() < 4 || tokens[0] != "FID1" || tokens[1] != "IID1" || tokens[2] != "FID2" || tokens[3] != "IID2" ) error("Problem with header row of .genome file"); for ( int i = 4; i tokens = tokenizeLine(INC); if ( tokens.size() == 0 ) continue; if ( col_length != tokens.size() ) { string strmsg = ""; for (int i=0;i( mydst , idst , std::dec) ) mydst = 0; map::iterator person1 = mperson.find(fid1+"_"+iid1); map::iterator person2 = mperson.find(fid2+"_"+iid2); if ( person1 == mperson.end() || person2 == mperson.end() || person1 == person2 ) continue; int k1 = sample[ person1->second ]->sol; int k2 = sample[ person2->second ]->sol; if ( k1 < 0 || k2 < 0 || k1 == k2 ) continue; if ( k2 > k1 ) { int tmp = k2; k2 = k1; k1 = tmp; } // Record IBS distance dk[k1][k2] = mydst; ++dkn[k1][k2]; } // Read next line in .genome INC.close(); for (int i=0; i 0 ) dk[i][j] /= (double)dkn[i][j]; } // Output a dummy .genome file ofstream GOUT0; GOUT0.open( (par::output_file_name + ".plst").c_str(), ios::out); printLOG("Writing person include list to [ " + par::output_file_name + ".plst ]\n"); ofstream GOUT1; GOUT1.open( (par::output_file_name + ".clst").c_str(), ios::out); printLOG("Writing cluster list to [ " + par::output_file_name + ".clst ]\n"); map k2i; for (int i=0;isol; if ( j < 0 ) continue; if ( k2i.find(j) == k2i.end() ) { 
k2i.insert(make_pair(j,i)); GOUT0 << sample[i]->fid << " " << sample[i]->iid << "\n"; GOUT1 << sample[i]->fid << " " << sample[i]->iid << " " << kname[j] << "\n"; } } GOUT0.close(); GOUT1.close(); ofstream GOUT; GOUT.open( (par::output_file_name + ".genome").c_str(), ios::out); printLOG("Writing grouped .genome file to [ " + par::output_file_name + ".genome ]\n"); GOUT << setw(par::pp_maxfid) << "FID1" << " " << setw(par::pp_maxiid) << "IID1" << " " << setw(par::pp_maxfid) << "FID2" << " " << setw(par::pp_maxiid) << "IID2" << " " << setw(8) << "DST" << " " << setw(8) << "PPC" << "\n"; for (int i=0; isecond]; Individual * s2 = sample[k2i.find(j)->second]; GOUT << s1->fid << " " << s1->iid << " " << s2->fid << " " << s2->iid << " "; // cout << i << " " << j << " "; // cout << dk.size() << " " << dk[i].size() << "\n"; // cout << dkn.size() << " " << dkn[i].size() << "\n"; // cout << dk[i][j] << " of " << dkn[i][j] << "\n"; GOUT << dk[i][j] << " 1\n"; } GOUT.close(); } plink-1.07-src/stats.h0000644000265600020320000000643611264127626014016 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __STATS_H__ #define __STATS_H__ #include #include #include #include "plink.h" using namespace std; void sizeMatrix(matrix_t &, int,int); void sizeTable(table_t & , int, int); void multMatrix(matrix_t & a, matrix_t & b, matrix_t & c); matrix_t vec2diag(vector_t &); class Eigen { public: void set(int n) { d.resize(n,0); sizeMatrix(z,n,n); } vector_t d; // eigenvalues matrix_t z; // eigenvectors }; bool realnum(double); long double factorial(int); double normdist(double); double ltqnorm(double); double chi2x2(double,double,double,double); double chi2x2(table_t); double chi2x2(matrix_t); double chiTable(table_t); double chiprobP(double, double); double symTable(table_t); double inverse_chiprob(double, double); double gammp(double a, double x); void gser(double *gamser, double a, double x, double *gln); void gcf(double *gammcf, double a, double x, double *gln); double gammln(double xx); double rnorm(); void lubksb(vector > &a, vector &indx, vector &b); void ludcmp(vector > &a, vector &indx, double &d); vector< vector > inverse(vector< vector > & m ); vector eigenvalues(vector > & a); void tred2(vector >&,vector &,vector &); void tqli(vector &d, vector&e, vector > &z); Eigen eigenvectors(vector > & a); void EV_tred2(vector >&,vector &,vector &); void EV_tqli(vector &d, vector&e, vector > &z); vector< vector > svd_inverse(vector< vector > & , bool & ); bool svd(matrix_t &,vector_t &, matrix_t &); bool svdcmp(vector > &, vector &, vector > &); void svbksb(vector > &u, vector &w, vector > &v, vector &b, vector &x); vector > msqrt(vector > & u); double qromb(double func(const double), double a, double b); void polint(vector_t &xa, vector_t &ya, const double x, double &y, double &dy); double trapzd(double func(const double), const double a, const double b, const int n); void svdvar(vector > & v, vector & w, vector > & cvm); int pca(matrix_t & x, boolmatrix_t & mask, vector_t & p, matrix_t & s,matrix_t & v, bool); double pythag(const double a, const double b); double betacf(const double a, const double b, const double x); double betai(const double a, const double b, const double x); double pF(const double F, const int 
df1, const int df2); double pT(const double T, const double df); #endif plink-1.07-src/haploQTL.cpp0000644000265600020320000001401611264127625014667 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "phase.h" #include "helper.h" #include "stats.h" void HaploPhase::haplotypicQTL(map & tests, int nt, bool display_results ) { // Quantitative trait test based on a test vector; like TDT, assumes // only ever two groups, for now // No implementation of QTL omnibus test yet if (nt!=2) return; // Genotypic and phenotype mean, variance, covariance double genotypic_mean = 0; double genotypic_variance = 0; double qt_mean = 0; double qt_variance = 0; double covariance = 0; // number of individuals in analysis int numberIndividuals = 0; // For this, we will make the male X coding equivalent to // --xchr-model 1; except we will not add a covariate for // sex; // Females: 0, 1, 2 // Males: 0, 1 ///////////////////////////// // Iterate over individuals for (int i = 0 ; i < P.n; i++) { if ( hap1[i].size() == 0 ) continue; Individual * pperson = P.sample[i]->pperson; Individual * gperson = P.sample[i]; if (!pperson->missing) { qt_mean += pperson->phenotype; // Consider all possible phases for (int z = 0 ; z < hap1[i].size(); z++) { map::iterator i1 = tests.find(hap1[i][z]); map::iterator i2 = tests.find(hap2[i][z]); // i1 and i2 should always point to a 0/1 variable; // but the coding is reversed (for god knows what reason) // such as convention means the to-be-tested variant(s) // have a 0; therefore reverse here. int c1 = 1 - i1->second; int c2 = 1 - i2->second; if ( i1 != tests.end() ) { if (!ambig[i]) genotypic_mean += c1; else genotypic_mean += c1 * pp[i][z]; } if ( ! ( haploid || ( X && gperson->sex ) ) ) { if ( i2 != tests.end() ) { if (!ambig[i]) genotypic_mean += c2; else genotypic_mean += c2 * pp[i][z]; } } } numberIndividuals++; } } // Next individual qt_mean /= (double)numberIndividuals; genotypic_mean /= (double)numberIndividuals; ////////////////////////////////// // Iterate over individuals again for (int i=0; i< P.n; i++) { if ( hap1[i].size() == 0 ) continue; Individual * pperson = P.sample[i]->pperson; Individual * gperson = P.sample[i]; if (!pperson->missing) { double g = 0; // Consider all possible phases for (int z = 0 ; z < hap1[i].size(); z++) { map::iterator i1 = tests.find(hap1[i][z]); map::iterator i2 = tests.find(hap2[i][z]); // i1 and i2 should always point to a 0/1 variable; // but the coding is reversed (for god knows what reason) // such as convention means the to-be-tested variant(s) // have a 0; therefore reverse here. int c1 = 1 - i1->second; int c2 = 1 - i2->second; if ( i1 != tests.end() ) { if (!ambig[i]) g += c1; else g += c1 * pp[i][z]; } if ( ! 
( haploid || ( X && gperson->sex ) ) ) { if ( i2 != tests.end() ) { if (!ambig[i]) g += c2; else g += c2 * pp[i][z]; } } } qt_variance += (pperson->phenotype-qt_mean) * ( pperson->phenotype-qt_mean ) ; genotypic_variance += (g-genotypic_mean) * ( g-genotypic_mean ) ; covariance += ( pperson->phenotype - qt_mean ) * ( g - genotypic_mean ) ; } } // Next individual // Statistics qt_variance /= (double)numberIndividuals - 1; genotypic_variance /= (double)numberIndividuals - 1; covariance /= (double)numberIndividuals - 1; // Test statistic double beta = covariance / genotypic_variance; double vbeta = ( qt_variance/genotypic_variance - (covariance * covariance ) / (genotypic_variance* genotypic_variance) ) / (numberIndividuals-2); double t = beta / sqrt(vbeta); double t_p = pT(t,numberIndividuals-2); // Display results? if ( display_results ) { // Skip?, if filtering p-values if ( par::pfilter && ( t_p > par::pfvalue || t_p < 0 ) ) goto skip_p2; double r2 = (covariance * covariance ) / ( qt_variance * genotypic_variance ) ; HTEST << setw(10) << hname << " "; // Find test haplotype (assuming there is a single one; // otherwise we won't be in display mode, i.e. proxy // association has it's own display) int hh=0; map::iterator i1 = tests.begin(); while ( i1 != tests.end() ) { if ( i1->second == 0 ) hh = i1->first; i1++; } HTEST << setw(12) << haplotypeName(hh) << " "; HTEST << setw(8) << numberIndividuals << " " << setw(10) << beta << " " << setw(10) << r2 << " " ; if (t_p >= 0) HTEST << setw(8) << t << " " << setw(12) << t_p << " " ; else HTEST << setw(8) << "NA" << " " << setw(12) << "NA" << " " ; // Display SNPs for (int snps=0; snpsname << "|"; HTEST << P.locus[S[ns-1]]->name << "\n"; } skip_p2: // Store chi-sq and regression coefficient result = t; pvalue = t_p; odds = beta; } plink-1.07-src/model.cpp0000644000265600020320000003657211264127625014316 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
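//////////////////////////////////////////////////////////////////
// Illustrative sketch only (not PLINK source): the t test computed in
// haplotypicQTL() above is an ordinary least-squares simple regression
// of phenotype on expected haplotype dosage.  Given the accumulated
// sample variances and covariance (varY, varG, cov) and the number of
// individuals n, the slope and its t statistic reduce to the textbook
// formulas below.  The function name simpleRegressionT is hypothetical,
// assumes <cmath>, and omits guards against varG == 0 or n <= 2.
static double simpleRegressionT(double varY, double varG, double cov, int n)
{
  double beta  = cov / varG;                                    // slope
  double vbeta = ( varY / varG - (cov*cov) / (varG*varG) )
                 / (double)(n - 2);                             // Var(beta)
  return beta / sqrt(vbeta);                                    // t with n-2 df
}
// End of sketch
//////////////////////////////////////////////////////////////////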
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include "model.h" #include "options.h" #include "helper.h" #include "phase.h" #include "stats.h" #include Model::Model() { np = nind = 0; haploid.resize(0); xchr.resize(0); order.clear(); sex_effect = false; all_valid = true; has_snps = true; testParameter = 1; // Permutation test parameter // Automatically add intercept now label.push_back("M"); // Intercept type.push_back( INTERCEPT ); order.push_back(0); /////////////////////////// // Default additive coding mAA = 0; mAB = 1; mBB = 2; /////////////////////////// // Set X chromosome coding if ( par::xchr_model == 1 ) { mA = 0; mB = 1; } else if ( par::xchr_model == 2 ) { mA = 0; mB = 2; } else if ( par::xchr_model > 2 ) { mA = 0; mB = 1; } } void Model::setDominant() { mAA = 0; mAB = 1; mBB = 1; mA = 0; mB = 1; } void Model::setRecessive() { mAA = 0; mAB = 0; mBB = 1; // No haploid effect mA = mB = 0; } void Model::addSexEffect() { sex_effect = true; type.push_back( SEX ); order.push_back(0); } bool Model::isSexInModel() { return sex_effect; } void Model::hasSNPs(bool b) { has_snps = b; } void Model::setMissing() { // Fill in missing data with existing pattern // and also optional per-test missingness miss.clear(); miss.resize(P->n,false); for (int i=0; in; i++) if ( P->sample[i]->missing || P->sample[i]->missing2 ) miss[i] = true; } vector Model::getMissing() { return miss; } void Model::yokeMissing(Model * m) { // } void Model::setMissing(vector & include) { // Fill in missing data with existing pattern if ( include.size() != P->n ) error("A problem in setMissing()\n"); miss.resize(P->n,false); for (int i=0; in; i++) if ( P->sample[i]->missing || ! include[i] ) miss[i] = true; } void Model::addAdditiveSNP(int a) { if ( ! has_snps ) error("Cannot add SNP to this MODEL"); additive.push_back(a); if ( par::chr_sex[P->locus[a]->chr] ) xchr.push_back(true); else xchr.push_back(false); if ( par::chr_haploid[P->locus[a]->chr] ) haploid.push_back(true); else haploid.push_back(false); type.push_back( ADDITIVE ); order.push_back( additive.size() - 1 ); } void Model::addDominanceSNP(int d) { if ( ! has_snps ) error("Cannot add SNP to thie MODEL"); dominance.push_back(d); type.push_back( DOMDEV ); order.push_back( dominance.size() - 1 ); } void Model::addCovariate(int c) { covariate.push_back(c); type.push_back( COVARIATE ); order.push_back( covariate.size() - 1 ); } void Model::addHaplotypeDosage(set & h) { haplotype.push_back(h); type.push_back( HAPLOTYPE ); order.push_back( haplotype.size() - 1 ); } void Model::addInteraction(int a, int b) { int2 i; i.p1 = a; i.p2 = b; interaction.push_back(i); type.push_back( INTERACTION ); order.push_back( interaction.size() - 1 ); } void Model::buildDesignMatrix() { // Build X matrix (including intercept) // Iterate a person at a time, entering only // valid rows into X (non-missing); also build Y // at the same time if ( has_snps && par::SNP_major ) error("Internal error: must be individual-major to perform this...\n"); /////////////////////// // Number of parameters // Standard variables // Note: 'additive' really means 'main effect' here, // i.e. which can also be coded recessive or dominant // i.e. the distinction is between the 2df general model np = 1 + additive.size() + dominance.size() + haplotype.size() + covariate.size() + interaction.size(); // Sex effect? 
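//////////////////////////////////////////////////////////////////
// Illustrative sketch only (not PLINK source): how the coding
// constants set in the Model constructor above turn one genotype into
// one design-matrix entry.  Counting copies of the tested allele,
// zero copies takes the mAA code, one copy mAB and two copies mBB, so
// the default additive model codes 0/1/2, the dominant model 0/1/1
// and the recessive model 0/0/1; hemizygous males on the X take the
// mA/mB codes (0/1 under --xchr-model 1, 0/2 under model 2).  The
// helper name codeGenotype is hypothetical.
static double codeGenotype(int copies, bool maleX,
                           double mAA, double mAB, double mBB,
                           double mA,  double mB)
{
  if ( maleX )                         // hemizygous X: only two states
    return copies == 0 ? mA : mB;
  if ( copies == 0 ) return mAA;       // no copies of the tested allele
  if ( copies == 1 ) return mAB;       // heterozygote
  return mBB;                          // two copies of the tested allele
}
// End of sketch
//////////////////////////////////////////////////////////////////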
if ( sex_effect ) np++; // QFAM variables // if (par::QFAM_total) np++; // else if (par::QFAM_between || par::QFAM_within1 || par::QFAM_within2) np +=2; if (par::QFAM_total || par::QFAM_between || par::QFAM_within1 || par::QFAM_within2) { np++; type.push_back( QFAM ); order.push_back( 0 ); } /////////////////////////// // Consider each individual for (int i=0; i < P->n; i++) { Individual * person = P->sample[i]; // Ignore if missing phenotype, or the user set this to missing if ( miss[i] ) continue; ///////////////////////////// // 0) Intercept // 1) Main effects of SNPs // 2) Dominance effects of SNPs // 3) Haplotypes // 4) Covariates // 5) Interactions of the above // 6) QFAM variables // Populate this vector with terms for this // individual skip = false; vector_t trow(np); for (int p = 0; p < np; p++) { int pType = type[p]; switch ( pType ) { case INTERCEPT : trow[p] = buildIntercept(); break; case ADDITIVE : trow[p] = buildAdditive( person, order[p] ); break; case DOMDEV : trow[p] = buildDominance( person, order[p] ); break; case HAPLOTYPE : trow[p] = buildHaplotype(i, order[p] ); break; case SEX : trow[p] = buildSex(person); break; case COVARIATE : trow[p] = buildCovariate( person, order[p] ); break; case INTERACTION : trow[p] = buildInteraction( person, order[p], trow ); break; case QFAM : trow[p] = buildQFAM( person ); break; } } if (skip) { miss[i] = true; skip = false; continue; } //////////////////////////// // Add row to design matrix X.push_back(trow); } ///////////////////////////////////////////////// // Set number of non-missing individuals nind = X.size(); ////////////////////////////////// // Apply a parameter list filter? if ( par::glm_user_parameters ) { // Intercept always fixed in np = 1; vector label2 = label; label.clear(); label.push_back("M"); int np2 = label2.size(); for (int i=0; i< par::parameter_list.size(); i++) { if ( par::parameter_list[i] >= 1 && par::parameter_list[i] < np2 ) { np++; label.push_back(label2[ par::parameter_list[i] ]); } } // For each individual for ( int i = 0 ; i < X.size() ; i++) { vector_t X2(1); X2[0] = 1; for ( int j = 0 ; j < par::parameter_list.size() ; j++) { if ( par::parameter_list[j] >= 1 && par::parameter_list[j] < np2 ) { X2.push_back(X[i][ par::parameter_list[j] ]); } } X[i] = X2; } } ///////////////////////////////////////// // VIF-based check for multicollinearity all_valid = checkVIF(); /////////////////////// // Add Y variable also setDependent(); // Now we are ready to perform the analysis if (par::verbose) { cout << "X design matrix\n"; display(X); cout << "\n"; } } vector Model::validParameters() { // Empty model? 
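//////////////////////////////////////////////////////////////////
// Illustrative sketch only (not PLINK source): the multicollinearity
// screen performed by checkVIF() amounts to forming the correlation
// matrix R of the predictor columns (intercept excluded), inverting
// it, and comparing the variance inflation factors on its diagonal,
// VIF_j = [R^-1]_jj, against the --vif threshold.  The stand-alone
// helper below is hypothetical; matrix_t and svd_inverse() are the
// type and routine declared in plink.h / stats.h.
static bool vifOK(matrix_t R, double threshold)
{
  bool flag = true;
  matrix_t Rinv = svd_inverse(R, flag);        // generalised inverse of R
  if ( ! flag ) return false;                  // singular: treat as invalid
  for (int j = 0; j < (int)Rinv.size(); j++)
    if ( Rinv[j][j] > threshold )              // VIF_j too large
      return false;
  return true;
}
// End of sketch
//////////////////////////////////////////////////////////////////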
if (np==0 || nind==0) { vector v(np,false); all_valid = false; return v; } // Display covariance matrix in verbose mode if ( par::verbose ) { cout << "Covariance matrix of estimates\n"; display(S); cout << "\n"; } // Check for multicollinearity // For each term, see that estimate is not too strongly (r>0.99) // correlated with another, starting at last valid.resize(np); for (int i = 1; i0; i--) { for (int j = i-1; j>=0; j--) { if ( S[i][j] / sqrt( S[i][i] * S[j][j] ) > 0.99999 ) { valid[i] = false; all_valid = false; break; } } } return valid; } double Model::getStatistic() { if (all_valid) { return ( coef[testParameter] * coef[testParameter] ) / S[testParameter][testParameter]; } else return 0; } // ********************************************** // *** Function to aid testing linear // *** hypotheses after estimating of a // *** regression // ********************************************** double Model::linearHypothesis(matrix_t & H, vector_t & h) { // (H v - h)' (H S H')^-1 (H b - h) ~ X^2 with j df // where H = constraint matrix (j x (p+1) // h = null coefficient values // S = estimated covariance matrix of coefficients // return ( H*b - h ).transpose() // * ( H*v*H.transpose() ).inverse() // * ( H*b - h ) ; int nc = h.size(); // # of constraints // 1. Calculate Hb-h vector_t outer; outer.resize(nc,0); for (int r = 0; r < nc; r++) for (int c = 0; c < np; c++) outer[r] += H[r][c] * coef[c]; for (int r = 0; r < nc; r++) outer[r] -= h[r]; // 2. Calculate HVH' matrix_t tmp; sizeMatrix(tmp,nc,np); for (int r = 0; r < nc; r++) for (int c = 0; c < np; c++) for (int k = 0; k < np; k++) tmp[r][c] += H[r][k] * S[k][c]; matrix_t inner; sizeMatrix(inner,nc,nc); for (int r = 0; r < nc; r++) for (int c = 0; c < nc; c++) for (int k = 0; k < np; k++) inner[r][c] += tmp[r][k] * H[c][k]; bool flag = true; inner = svd_inverse(inner,flag); if ( !flag ) all_valid = false; vector_t tmp2; tmp2.resize(nc,0); for (int c = 0; c < nc; c++) for (int k = 0; k < nc; k++) tmp2[c] += outer[k] * inner[k][c]; double result = 0; for (int r = 0; r < nc; r++) result += tmp2[r] * outer[r]; return result; } bool Model::checkVIF() { // Calculate correlation matrix for X // Skip intercept int p = X.size(); if (p<2) return false; int q = X[0].size() - 1; if ( q < 2 ) return true; vector_t m(q); matrix_t c; sizeMatrix(c,q,q); for (int i=0; i 0.999 ) { if (par::verbose) cout << "individual element > 0.999, skipping VIF\n"; return false; } } // Any item with zero variance? for (int j=0; j par::vif_threshold ) return false; } return true; } double Model::buildIntercept() { return 1; } double Model::buildAdditive(Individual * person , int snp ) { // Additive effects (assuming individual-major mode) int s = additive[snp]; bool i1 = person->one[s]; bool i2 = person->two[s]; if ( xchr[snp] ) { ///////////////////////// // X chromosome coding if ( person->sex ) // male { if ( i1 ) { if ( ! i2 ) { skip = true; return 0; } else return mA; } else { if ( i2 ) { // This should not happen... skip = true; return 0; } else return mB; } } else // female x-chromosome { if ( i1 ) { if ( ! i2 ) { skip = true; return 0; } else return mAA; } else { if ( i2 ) return mAB; // het else return mBB; // hom } } } else if ( haploid[snp] ) { /////////////////// // Haploid coding if ( i1 ) { if ( ! i2 ) { skip = true; return 0; } else return 0; } else { if ( i2 ) { // haploid het skip = true; return 0; } else return 1; } } else { /////////////////////// // Autosomal coding if ( i1 ) { if ( ! 
i2 ) { skip = true; return 0; } else return mAA; } else { if ( i2 ) return mAB; // het else return mBB; // hom } } } double Model::buildDominance(Individual * person, int snp) { //////////////////// // Dominance effects int s = dominance[snp]; bool i1 = person->one[s]; bool i2 = person->two[s]; if ( i1 ) { if ( ! i2 ) { skip = true; return 0; } else return 0; } else { if ( i2 ) return 1; // het else return 0; // hom } } double Model::buildHaplotype(int i, int h ) { //////////////////// // Haplotype dosage if ( P->haplo->include[i] ) return P->haplo->dosage(i,haplotype[h]); // No valid haplotypes for this person skip = true; return 0; } double Model::buildSex(Individual * person ) { //////////////////////////////////// // Sex effect (automatically set for // X chromosome models) if ( person->sex ) return 1; else return 0; } double Model::buildCovariate(Individual * person, int j) { ///////////// // Covariates return person->clist[covariate[j]]; } double Model::buildInteraction(Individual * person, int j, vector_t & trow ) { /////////////// // Interactions return trow[ interaction[j].p1 ] * trow[ interaction[j].p2 ]; } double Model::buildQFAM(Individual * person) { /////////////// // QFAM if ( par::QFAM_total ) return person->T; else if ( par::QFAM_between ) return person->family->B; else if ( par::QFAM_within1 || par::QFAM_within2 ) return person->W; else error("Internal problem with QFAM model specification"); } void Model::setCluster() { cluster=true; clst.clear(); nc=0; map novelc; for (int i=0; in; i++) if (!miss[i]) { int c = P->sample[i]->sol; map::iterator m = novelc.find(c); if ( m == novelc.end() ) { clst.push_back( nc ); novelc.insert( make_pair( c, nc ) ); ++nc; } else { clst.push_back( m->second ); } } // We need at least two clusters: if ( novelc.size() == 1 ) noCluster(); } void Model::noCluster() { cluster=false; clst.clear(); nc=0; } plink-1.07-src/qfam.cpp0000644000265600020320000004724411264127625014140 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "linear.h" #include "perm.h" #include "crandom.h" #include "stats.h" void setCovariatesForSNP(Plink & P, int l) { vector::iterator gperson = P.sample.begin(); while ( gperson != P.sample.end() ) { // Assume a non-missing genotype (*gperson)->flag = true; bool s1 = (*gperson)->one[l]; bool s2 = (*gperson)->two[l]; if ( ! s1 ) { if ( ! s2 ) (*gperson)->T = 1; else (*gperson)->T = 0; } else { if ( ! 
s2 ) (*gperson)->flag = false; else (*gperson)->T = -1; } // Next individual gperson++; } } void scoreBetween(Plink & P , int l) { vector::iterator f = P.family.begin(); // Construct family B score for this SNP int fc=0; while ( f != P.family.end() ) { if (par::verbose) { if ( (*f)->singleton ) { cout << "SINGLETON(S)\t" << (*f)->kid[0]->fid << " : "; for (int k=0; k < (*f)->kid.size() ;k++) cout << (*f)->kid[k]->iid << " "; cout << "\n"; } else if ( (*f)->sibship ) { cout << "SIBSHIP \t" << (*f)->kid[0]->fid << " : "; for ( int k=0; k<(*f)->kid.size(); k++) cout << (*f)->kid[k]->iid << " "; cout << "\n"; } else if ( (*f)->parents ) { cout << "W/ PARENTS\t" << (*f)->pat->fid << " : "; cout << (*f)->pat->iid << " x " << (*f)->mat->iid << " -> "; for ( int k=0; k<(*f)->kid.size(); k++) cout << (*f)->kid[k]->iid << " "; cout << "\n"; } else cout << "UNDEFINED\t" << (*f)->pat->fid << " " << (*f)->pat->iid << "\n"; } double B = 0; bool Bset = false; // Include this entire family? (*f)->include = true; // Flag to indicate inclusion in parenQTDT (both parents genotyped) (*f)->discordant_parents = true; // Two theoretically genotyped parents? if ( (*f)->parents ) { // Two actually genotyped parents? if ( (*f)->pat->flag && (*f)->mat->flag ) { B = ( (*f)->pat->T + (*f)->mat->T ) * 0.5 ; Bset = true; } else (*f)->discordant_parents = false; } // Did this individual have parental genotype information to set B? If not... if ( !Bset ) { // Use sibling genotypes? This will default to one's own // genotype (i.e. sibship of size 1 (singletons are coded // offspring here) // Number of sibling int nsib = (*f)->kid.size(); // Number of genotyped sibling int nsib2 = nsib; for (int k=0; kkid[k]->flag ) B += (*f)->kid[k]->T; else nsib2--; } if (nsib2==0) { // No non-missing offspring in family // so does not matter what we set here (*f)->include = false; B = -9; } else B /= (double)nsib2; } // Store between family score (*f)->B = B; // Next family f++; fc++; } } void scoreBandW(Plink & P, int l , vector & include) { // Initially, everybody is included vector::iterator gperson = P.sample.begin(); int i=0; while ( gperson != P.sample.end() ) { Individual * pperson = (*gperson)->pperson; if ( ! pperson->family ) error("Internal problem: no family assigned for [ " + pperson->fid + " " + pperson->iid + " ]\n"); // Valid phenotype... if ( ( ! pperson->missing ) ) { // ... and genotype? if ( (*gperson)->flag ) { Family * f = (*gperson)->family; // Are we modelling parental phenotypes? if ( par::QFAM_total || // total association test... par::QFAM_between || // ...between association test... ( par::QFAM_within2 && f && f->discordant_parents ) || // ...parenQTDT... ! (*gperson)->founder ) // ...or, not a founder { // Between-family component (*gperson)->B = f ? f->B : 0; // Within-family component (*gperson)->W = (*gperson)->T - (*gperson)->B; } else include[i] = false; } else include[i] = false; } else include[i] = false; // Next person gperson++; i++; } } ////////////////////////////////////////////////////////////////////// // // For QFAM, and unlike all other tests, we use two different ways of // permuting: either standard (all SNPs per replicate) or on a per SNP // adaptive basis (i.e. all perms for a SNP; then move on to next // SNP). This saves the work of constructing the family, etc, as we // need to do each time. 
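//////////////////////////////////////////////////////////////////
// Illustrative sketch only (not PLINK source): the between/within
// decomposition that scoreBetween() and scoreBandW() above construct
// for QFAM.  Each genotype is first scored as T (coded 1/0/-1); the
// family "between" component B is the parental mean when both parents
// are genotyped, otherwise the mean of the genotyped offspring, and
// each child's "within" component is W = T - B.  The helper below is
// hypothetical, assumes <vector>, and assumes at least one genotyped
// offspring when parental scores are unavailable.
static void qfamScores(double patT, double matT, bool parentsGenotyped,
                       const std::vector<double> & kidT,
                       double & B, std::vector<double> & W)
{
  if ( parentsGenotyped )
    B = ( patT + matT ) * 0.5;                 // between: parental mean
  else
    {
      B = 0;
      for (int k = 0; k < (int)kidT.size(); k++) B += kidT[k];
      B /= (double)kidT.size();                // between: sibship mean
    }
  W.resize( kidT.size() );
  for (int k = 0; k < (int)kidT.size(); k++)
    W[k] = kidT[k] - B;                        // within-family deviation
}
// End of sketch
//////////////////////////////////////////////////////////////////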
// void Plink::perm_testQTDT(Perm & perm) { ////////////////////////////// // Use individual-major coding if (par::SNP_major) SNP2Ind(); // for now, no covariates if ( par::clist_number > 0 ) error("Cannot specify covariates with QFAM for now...\n"); //////////////////////////////////////////////// // Specify special adaptive QFAM mode (i.e. one SNP // at a time) ///////////////////////////// // Set up permutation indices vector pbetween(family.size()); vector pwithin(family.size(),false); for (int i=0; i < family.size(); i++) pbetween[i] = i; /////////////// // Output files string f = ".qfam"; if (par::QFAM_within1) f += ".within"; else if (par::QFAM_within2) f += ".parents"; else if (par::QFAM_between) f += ".between"; else if (par::QFAM_total) f += ".total"; printLOG("Writing QFAM statistics to [ " + par::output_file_name + f + " ]\n"); if (!par::permute) printLOG("** Warning ** QFAM results require permutation to correct for family structure\n"); else printLOG("Important: asymptotic p-values not necessarily corrected for family-structure:\n" " use empirical p-values for robust p-values from QFAM\n" " and consult the above file only for parameter estimates\n"); ofstream QOUT((par::output_file_name+f).c_str(),ios::out); // dummy QOUT.precision(4); QOUT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "BP" << " " << setw(4) << "A1" << " " << setw(10) << "TEST" << " " << setw(8) << "NIND" << " " << setw(10) << "BETA" << " "; if (par::display_ci) QOUT << setw(8) << "SE" << " " << setw(8) << "LOWER" << " " << setw(8) << "UPPER" << " "; QOUT << setw(12) << "STAT" << " " << setw(12) << "P\n"; ////////////////////// // Familial clustering // C holds which family an individual belongs to // (as element in the family[] array vector C; map famcnt; for (int f = 0 ; f < family.size() ; f++) famcnt.insert( make_pair( family[f] , f ) ); vector::iterator person = sample.begin(); while ( person != sample.end() ) { map::iterator f = famcnt.find( (*person)->family ); if ( f == famcnt.end() ) error("Internal error in QFAM, allocating families to individuals...\n"); else C.push_back( f->second ); person++; } printLOG(int2str(family.size())+" nuclear families in analysis\n"); if ( family.size()<2 ) error("Halting: not enough nuclear families for this analysis\n"); //////////////////// // Run original QFAM perm.setTests(nl_all); perm.setPermClusters(*this); // Force adaptive perm par::adaptive_perm = true; vector_t orig = calcQTDT(C, QOUT, false, perm, pbetween, pwithin); QOUT.close(); //////////////// // Permutation if ( ! par::permute ) return; // Adpative permutation will already have been conducted in original // function call for QFAM (i.e. 
per-SNP adaptive permutation) if (!par::silent) cout << "\n\n"; //////////////////// // Display results ofstream TDT; f += ".perm"; TDT.open((par::output_file_name+f).c_str(),ios::out); printLOG("Writing QFAM permutation results to [ " + par::output_file_name + f + " ] \n"); TDT.precision(4); TDT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " "; if (par::perm_TDT_basic) TDT << setw(12) << "STAT" << " "; TDT << setw(12) << "EMP1" << " "; TDT << setw(12) << "NP" << " " << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; if (orig[l] < -0.5) TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " " << setw(12) << "NA"; else { TDT << setw(12) << orig[l] << " " << setw(12) << perm.pvalue(l) << " " << setw(12) << perm.reps_done(l); } TDT << "\n"; } TDT.close(); // Adjusted p-values, assumes 1-df chi-squares if (par::multtest) { vector obp(0); for (int l=0; l & C, ofstream & QOUT, bool permuting, Perm & perm, vector & pbetween, vector & pwithin) { ///////////////////////// // Iterate over each SNP vector_t results(nl_all); for (int l=0; lchr] || par::chr_haploid[locus[l]->chr]) { results[l] = -1; continue; } if (par::verbose) cout << "\n ******************************************\n" << " LOCUS " << locus[l]->name << "\n\n"; //////////////////////////////////////////////////////////////// // Create X vector that encodes the genotype for each individual // as 1,0,-1 (or -9 for missing) // Use the per-person 'flag' variable to indicate a non-missing genotype // at this SNP (i.e. for gperson) // Use 'covar' to store the X= 1,0,-1 codes for this SNP setCovariatesForSNP(*this,l); /////////////////////////////////////// // Score between and within components scoreBetween(*this,l); // Now, for each individual, set B and W vector include(n,true); scoreBandW(*this,l,include); // Now we have created the family structure, B and W and flagged who is missing // in terms of genotype and phenotype // We can either proceed to return one value for this (in max(T) mode) // or to exhaust all permutations ///////////////////////// // Prune out missing data (already done?) vector::iterator f = family.begin(); while ( f != family.end() ) { if ( ! 
(*f)->include ) { if ( (*f)->pat ) (*f)->pat->flag = false; if ( (*f)->mat ) (*f)->mat->flag = false; for ( int k = 0 ; k < (*f)->kid.size() ; k++) (*f)->kid[k]->flag = false; } f++; } // Prune individuals for (int i=0; iflag) || sample[i]->missing ) include[i] = false; ///////////////////////// // Optional display if (par::verbose) { for (int i=0; ifid << " " << sample[i]->iid << "\t" << sample[i]->phenotype << "\t" << genotype(*this,i,l) << " " << sample[i]->T << " " << sample[i]->B << " " << sample[i]->W ; cout << "\n"; } cout << "\n\n"; } /////////////////////////////////// // Form linear model Model * lm; LinearModel * m = new LinearModel(this); lm = m; // Copy pattern of missing data over, with // some additional exclusions based on family // structure lm->setMissing(include); // Add independent variables: T, B and/or W // and set the test parameter // (intercept is 0) // Covariates Model // 0 Total // 1 Between // 2 Within // Model // 0 Intercept Intercept // 1 Total Between // 2 n/a Within if (par::QFAM_total) { lm->label.push_back("TOT"); lm->testParameter = 1; } else if (par::QFAM_between) { lm->label.push_back("BET"); // lm->label.push_back("WITH"); lm->testParameter = 1; } else if (par::QFAM_within1 || par::QFAM_within2) { // lm->label.push_back("BET"); lm->label.push_back("WITH"); lm->testParameter = 1; } // Build design matrix lm->buildDesignMatrix(); // Fit linear model if ( par::QFAM_total && par::qt ) lm->fitUnivariateLM(); else lm->fitLM(); // Check for multi-collinearity lm->validParameters(); // Calculate Original Test statistic results[l] = lm->getStatistic(); // Store,return and display this value? lm->displayResults(QOUT,locus[l]); /////////////////// // Now, permutation // 1) We have the complete, non-missing data: permute only this // i.e. we do not need to worry about missing data; we are // no longer controlling the correlation between SNPs, as we // are permuting genotype, so we do not need to worry about this // in any case. // 2) Keep the same Model in each case: directly re-state the X // variables in the design matrix, then re-fit model. This // will avoid the cost of building the model, pruning for missing // data, etc, each iteration // Store original, and set up permutations // (i.e. return pperson to original order) perm.nextSNP(); double original = results[l]; //////////////////////// // Adaptive permutation /////////////////////////////////////////////////// // Set up permutation indices, specific to this SNP int tc = 0; while ( true ) { // Permute between and within family components permute(pbetween); for (int i=0; iinclude ) && // Recipient family is not... family[f]->include ) { // ... then swap // F P(F) --> // 0 2 --> 0 2 // 1 0 --> 1 0 // 2 3* --> 2 4 // 3* 4 --> 3* 3* // 4 1 --> 4 1 // ... // e.g. 
3* is missing, so swap 3* and 4 in P(F), so 2 // and 4 end up together instead, 3* is invarint int missing_family = pbetween[f]; int swap_in_family = pbetween[pbetween[f]]; pbetween[missing_family] = missing_family; pbetween[f] = swap_in_family; // if (par::verbose) // { // cout << "FAM " << f << " (NOT MISS) has " << missing_family << " (MISS)\n"; // cout << "FAM " << missing_family << " (MISS) has " << swap_in_family << " (?)\n"; // cout << "SWAP MADE ..\n"; // cout << "FAM " << f << " has " << pbetween[f] << "\n"; // cout << "FAM " << missing_family << " has " << pbetween[missing_family] << "\n\n"; // } // And re-check this new pairing f--; } } // if (par::verbose) // for (int f=0; finclude ) // cout << " Permuted family is all missing " << f << "\t" << family[pbetween[f]]->kid[0]->fid << "\n"; // if ( ! family[f]->include ) // cout << " Recipient family is all missing " << f << "\t" << family[f]->kid[0]->fid << "\n"; // } // if (true) // { // for (int i=0; ifid << "\t" // << include[i] << "\t" // << C[i] << "\t" // << pbetween[C[i]] << "\t" // << sample[i]->family->include << "\t" // << family[C[i]]->include << "\t" // << family[pbetween[C[i]]]->include << "\n"; // } // } ////////////////////////////////// // Reconstitute genotypes // and fit back into LinearModel int c=0; for (int i=0; iX[c++][1] = pwithin[C[i]] ? pfam->B + person->W : pfam->B - person->W; else if ( par::QFAM_between ) { lm->X[c++][1] = pfam->B; } else { lm->X[c++][1] = pwithin[C[i]] ? person->W : - person->W; } // cout << "added " << person->fid << " " // << person->iid << " " // << lm->X[c-1][1] << "\n"; } } // cout << "\n\n"; //////////////////////////////////// // Re-fit model if ( par::QFAM_total && par::qt ) lm->fitUnivariateLM(); else lm->fitLM(); // Check for multi-collinearity lm->validParameters(); // Calculate Original Test statistic; // Should not encounter this too much, but if not valid, // count conservatively. double r = lm->isValid() ? lm->getStatistic() : original + 1 ; // cout << "Permutation ... \n"; // if ( ! lm->isValid() ) // cout << "NOT VALID>.. \n"; // int c2 = 0; // for (int i=0; ifid << " " << sample[i]->iid << "\t" // << sample[i]->phenotype << "\t" // << genotype(*this,i,l) << " "; // if ( include[i] ) // cout << lm->X[c2++][1] << " "; // else // cout << "NA" << " "; // cout << "\n"; // } // cout << "\n\n"; // Reset in case the previous model was not valid lm->setValid(); //////////////////////////////////// // Test / update / are we finished ? if ( perm.updateSNP( r , original , l ) ) { if ( ! par::silent ) { cout << "Adaptive permutation done for " << l+1 << " of " << nl_all << " SNPs \r"; cout.flush(); } break; // We are done for this SNP } } // Next adaptive permutation // Clear up delete lm; } // Next SNP return results; } plink-1.07-src/multiple.cpp0000644000265600020320000001443311264127625015041 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include "plink.h" #include "helper.h" #include "stats.h" #include "options.h" // ## QQ plots: concentration bands // > n <- 5000 // > j <- 1:n // > x <- (j-0.5)/n // > d <- sort( runif(n) ) // > plot( -log10(x) , -log10(d) , ylim = range(0,8) , xlim=range(0,8) ) // > lines(-log10(x),-log10(qbeta( 0.05 , j , ( n - j + 1 ) , ncp = d ) ),col="blue") // > lines(-log10(x),-log10(qbeta( 0.95 , j , ( n - j + 1 ) , ncp = d ) ),col="blue") class Pair { public: double p; int l; bool operator< (const Pair & p2) const { return ( p < p2.p ); } }; void pprint(ofstream & MT, double p) { MT << setw(10); if ( p == 0 ) MT << "INF"; else if( p < 0 ) MT << "NA"; else { if ( par::logscale ) { double lp = -log10(p); if ( lp <= 0 ) lp = 0; MT << lp; } else MT << p; } MT << " "; } void Plink::multcomp(vector & chi, string title) { if ( tcnt.size() > 0 && tcnt.size() != chi.size() ) error("Internal problem in multiple comparison routine"); printLOG("Computing corrected significance values (FDR, Sidak, etc)\n"); bool altern_pval = tcnt.size() > 0; vector sp; vector schi; for (int l=0; l=0 ) { double p = altern_pval ? pT(sqrt(chi[l]),tcnt[l]) : chiprobP(chi[l],1); if (p > -1) { Pair pt; pt.p = p; pt.l = l; sp.push_back(pt); Pair b; b.p = chi[l]; b.l = altern_pval ? (int)tcnt[l] : 0; schi.push_back(b); } } } if (schi.size()==0) { printLOG("Zero valid tests computed -- no adjusted values calculated\n"); return; } double t = (double)sp.size(); int ti = sp.size(); sort(sp.begin(),sp.end()); sort(schi.begin(),schi.end()); // Genomic control, based on median chi-square double lambda; double lambda_mean = 0; if (sp.size() % 2 == 0 ) { lambda = ( schi[ ti / 2 - 1 ].p + schi[ ti / 2 ].p ) / 2 ; } else { lambda = schi[ (ti-1) / 2 ].p ; } for (int i=0; i pv_GC(sp.size()); vector pv_sidakSS(sp.size()); vector pv_sidakSD(sp.size()); vector pv_holm(sp.size()); vector pv_BH(sp.size()); vector pv_BY(sp.size()); // Genomic control (reverse order) int i2=0; for (int i=ti-1;i>=0;i--) { pv_GC[i2++] = altern_pval ? pT(sqrt( schi[i].p / lambda ), schi[i].l ) : chiprobP( schi[i].p / lambda, 1 ); } // Base adjust values on GC p-values? if (par::use_GC) { printLOG("Using genomic-controlled p-values for adjusted p-values\n"); for (int i=0; i 1 ? 1 : sp[0].p*t; for (int i=1;i x ? pv_holm[i-1] : x; } // Sidak SS for (int i=0;i x ? pv_sidakSD[i-1] : x ; } // BH pv_BH[ti-1] = sp[ti-1].p; for (int i=ti-2;i>=0;i--) { double x = (t/(double)(i+1))*sp[i].p < 1 ? (t/(double)(i+1))*sp[i].p : 1 ; pv_BH[i] = pv_BH[i+1] < x ? pv_BH[i+1] : x; } // BY double a = 0; for (double i=1; i<=t; i++) a += 1/i; pv_BY[ti-1] = a * sp[ti-1].p < 1 ? a * sp[ti-1].p : 1 ; for (int i=ti-2;i>=0;i--) { double x = ((t*a)/(double)(i+1))*sp[i].p < 1 ? ((t*a)/(double)(i+1))*sp[i].p : 1 ; pv_BY[i] = pv_BY[i+1] < x ? 
pv_BY[i+1] : x; } // Output ofstream MT; string f = par::output_file_name + title + ".adjusted"; MT.open(f.c_str(),ios::out); MT.precision(4); printLOG("Writing multiple-test corrected significance values to [ "+f+" ] \n"); MT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "UNADJ" << " " << setw(10) << "GC" << " "; if ( par::qq_plot ) MT << setw(10) << "QQ" << " "; MT << setw(10) << "BONF" << " " << setw(10) << "HOLM" << " " << setw(10) << "SIDAK_SS" << " " << setw(10) << "SIDAK_SD" << " " << setw(10) << "FDR_BH" << " " << setw(10) << "FDR_BY" << "\n"; for (int l=0; l par::pfvalue ) continue; MT << setw(4) << locus[sp[l].l]->chr << " " << setw(par::pp_maxsnp) << locus[sp[l].l]->name << " "; // Unadjusted pprint(MT,sp[l].p); // Genomic control pprint(MT,pv_GC[l]); // Q-Q plot? if ( par::qq_plot ) { pprint(MT,(l+0.5)/(double)t); } // Bonferroni, etc double bonferroni = sp[l].p*t > 1 ? 1 : sp[l].p*t; pprint(MT,bonferroni); pprint(MT,pv_holm[l]); pprint(MT,pv_sidakSS[l]); pprint(MT,pv_sidakSD[l]); pprint(MT,pv_BH[l]); pprint(MT,pv_BY[l]); MT << "\n"; } MT.close(); } plink-1.07-src/fisher.h0000644000265600020320000000561111264127626014132 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// /* * R : A Computer Language for Statistical Data Analysis * Copyright (C) 1998-2001 Robert Gentleman, Ross Ihaka * and the R Development Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
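//////////////////////////////////////////////////////////////////
// Illustrative sketch only (not PLINK source): the Benjamini-Hochberg
// step-up adjustment computed in multcomp() above.  With p-values
// sorted ascending, p_adj[i] = min( p[i] * t/(i+1), 1 ) and the
// adjusted values are made monotone from the largest rank downwards.
// The helper name fdrBH is hypothetical, assumes <vector>, and expects
// a non-empty, ascending-sorted input.
static std::vector<double> fdrBH(const std::vector<double> & sortedP)
{
  int t = sortedP.size();
  std::vector<double> adj(t);
  adj[t-1] = sortedP[t-1];                     // largest rank unchanged
  for (int i = t-2; i >= 0; i--)
    {
      double x = sortedP[i] * (double)t / (double)(i+1);
      if ( x > 1 ) x = 1;
      adj[i] = adj[i+1] < x ? adj[i+1] : x;    // enforce monotonicity
    }
  return adj;
}
// End of sketch
//////////////////////////////////////////////////////////////////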
* * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * * Memory Allocation (garbage collected) --- INCLUDING S compatibility --- */ #ifndef __FISHER_H__ #define __FISHER_H__ #include "plink.h" #ifndef R_EXT_MEMORY_H_ #define R_EXT_MEMORY_H_ #ifdef __cplusplus extern "C" { #endif char * vmaxget(void); void vmaxset(char*); void R_gc(void); char * R_alloc(long, int); char * S_alloc(long, int); char * S_realloc(char*, long, long, int); #ifdef __cplusplus } #endif #endif /* R_EXT_MEMORY_H_ */ #ifndef R_EXT_BOOLEAN_H_ #define R_EXT_BOOLEAN_H_ #undef FALSE #undef TRUE #ifdef __cplusplus extern "C" { #endif typedef enum { FALSE = 0, TRUE /*, MAYBE */ } Rboolean; #ifdef __cplusplus } #endif #endif /* R_EXT_BOOLEAN_H_ */ #ifndef R_EXT_CONSTANTS_H_ #define R_EXT_CONSTANTS_H_ #ifndef M_PI #define M_PI 3.141592653589793238462643383279502884197169399375 #endif #define PI M_PI #define SINGLE_EPS FLT_EPSILON #define SINGLE_BASE FLT_RADIX #define SINGLE_XMIN FLT_MIN #define SINGLE_XMAX FLT_MAX #define DOUBLE_DIGITS DBL_MANT_DIG #define DOUBLE_EPS DBL_EPSILON #define DOUBLE_XMAX DBL_MAX #define DOUBLE_XMIN DBL_MIN #endif // Fisher's exact test void fexact(int *nrow, int *ncol, double *table, int *ldtabl, double *expect, double *percnt, double *emin, double *prt, double *pre, int *workspace); double fisher(table_t t); #endif plink-1.07-src/blox.cpp0000644000265600020320000003371311264127626014155 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "options.h" #include "helper.h" #include "plink.h" #include "phase.h" #include "stats.h" extern Plink * PP; /////////////////////////////////////////////////////////////////////// // // // Haplotype block code, adapted from code courtesy of Jeff Barrett, // // and HAPLOVIEW, following some very quick-and-dirty Java->C++... // // // /////////////////////////////////////////////////////////////////////// class LDPair { public: int s1; int s2; int dist; LDPair(int s1_, int s2_, int dist_) { s1 = s1_; s2 = s2_; dist = dist_; } friend ostream & operator<<(ostream & out, LDPair & v) { out << "[" << v.s1 << " " << v.s2 << " " << v.dist << "]"; return out; } }; struct Pair_cmp { bool operator()(const LDPair & a, const LDPair & b) const { if ( a.dist < b.dist ) return true; if ( a.dist > b.dist ) return false; if ( a.s1 < b.s1 ) return true; if ( a.s1 > b.s1 ) return false; return ( a.s2 < b.s2 ); } }; class DPrime { public: double dp; double dpl; double dpu; double lod; }; class PairwiseLinkage { public: PairwiseLinkage(int a_, int b_) { a=a_; b=b_; knownAA = knownAB = knownBA = knownBB = unknownDH = 0; } int a; int b; double dp, rsq; double dp_upper, dp_lower; double lod; void calculateCI(); void calculateLD(); int knownAA, knownAB, knownBA, knownBB, unknownDH; }; map > Plink::mkBlks(int null1, int null2 ) { // First SNP, vector of SNPs (inc. 
first) map< int, vector > blocks; // Some constants const double cutHighCI = 0.98; const double cutLowCI = 0.70; const double cutLowCIVar [5] = {0,0,0.80,0.50,0.50}; const double maxDist [5] = {0,0,20000,30000,1000000}; const double recHighCI = 0.90; const double informFrac = 0.95; const double fourGameteCutoff = 0.01; const double mafThresh = 0.05; // Set to skip SNPs with low MAFs // Uses genome-wide reference number: need to allocate for all SNPs here vector skipMarker(nl_all,false); for (int x = 0; x < nl_all; x++) skipMarker[x] = locus[x]->freq < mafThresh; // Consider each chromosome one at a time; skip X for now int startChromosome = locus[ 0 ]->chr; int finalChromosome = locus[ nl_all - 1 ]->chr; for (int chr = startChromosome ; chr <= finalChromosome; chr++) { if ( scaffold.find(chr) == scaffold.end() ) continue; int fromPosition = scaffold[chr].lstart; int toPosition = scaffold[chr].lstop; int nsnps = toPosition - fromPosition + 1; ///////////////////////////////////////////////////////////////////////// // Make a list of marker pairs in "strong LD", sorted by distance apart set strongPairs; map dpStore; int numStrong = 0; int numRec = 0; int numInGroup = 0; // Each pair of markers for (int x = fromPosition; x < toPosition; x++) { if ( ! par::silent ) { std::cerr << "Chromosome " << locus[x]->chr << ", position " << locus[x]->bp/1000000.0 << "Mb \r"; } for (int y = x+1; y <= toPosition; y++) { if ( locus[x]->chr != locus[y]->chr ) continue; if ( ( locus[y]->bp - locus[x]->bp ) > par::disp_r_window_kb ) { continue; } if ( locus[x]->freq == 0 || locus[y]->freq == 0 ) continue; PairwiseLinkage thisPair(x,y); thisPair.calculateLD(); thisPair.calculateCI(); double lod = thisPair.lod; double lowCI = thisPair.dp_lower; double highCI = thisPair.dp_upper; int2 t(x,y); DPrime d; d.dp = thisPair.dp; d.dpl = lowCI; d.dpu = highCI; d.lod = lod; dpStore.insert( make_pair( t,d ) ); // Is this pair in strong LD? if (lod < -90) continue; //missing data if (highCI < cutHighCI || lowCI < cutLowCI) continue; //must pass "strong LD" test // Store this pair LDPair p(x,y, abs( locus[x]->bp - locus[y]->bp ) ); strongPairs.insert( p ); } } // Now we have a list of SNPs in strong LD within this region // Now construct blocks based on this set used; // #blocks: vector > blockArray; int cnt = 0; for ( set::reverse_iterator i = strongPairs.rbegin(); i != strongPairs.rend(); ++i ) { int numStrong = 0; int numRec = 0; int numInGroup = 0; vector thisBlock; int first = i->s1; int last = i->s2; long sep = i->dist; // See if this block overlaps with another: if ( used.find(first) != used.end() || used.find(last) != used.end() ) { continue; } // Next, count the number of markers in the block. // (nb. 
assume all SNPs belong) for (int x = first; x <=last ; x++) { if( !skipMarker[x] ) numInGroup++; } // Skip it if it is too long in bases for it's size in markers if (numInGroup < 4 && sep > maxDist[numInGroup]) { continue; } // Add first SNP thisBlock.push_back( first ); // Test block: requires 95% of informative markers to be "strong" for (int y = first+1; y <= last; y++) { if (skipMarker[y]) { continue; } thisBlock.push_back(y); //loop over columns in row y for (int x = first; x < y; x++) { if (skipMarker[x]) continue; double lod; double lowCI; double highCI; map::iterator l = dpStore.find( int2(x,y) ); if ( l == dpStore.end() ) { // Recalculate PairwiseLinkage thisPair(x,y); thisPair.calculateLD(); thisPair.calculateCI(); lod = thisPair.lod; lowCI = thisPair.dp_lower; highCI = thisPair.dp_upper; } else { // Get the right bits lod = l->second.lod; lowCI = l->second.dpl; highCI = l->second.dpu; } // Monomorphic marker error if ( lod < -90) continue; // Skip bad markers if ( lod == 0 && lowCI == 0 && highCI == 0) continue; // For small blocks use different CI cutoffs if (numInGroup < 5) { if (lowCI > cutLowCIVar[numInGroup] && highCI >= cutHighCI) numStrong++; } else { if (lowCI > cutLowCI && highCI >= cutHighCI) numStrong++; //strong LD } if (highCI < recHighCI) numRec++; //recombination } } // Change the definition somewhat for small blocks if (numInGroup > 3) { if (numStrong + numRec < 6) { continue; } } else if (numInGroup > 2) { if (numStrong + numRec < 3) { continue; } } else { if (numStrong + numRec < 1) { continue; } } // If this qualifies as a block, add to the block list, but in // order by first marker number: if ( (double)numStrong/(double)(numStrong + numRec) > informFrac) { blocks.insert( make_pair( first , thisBlock )); // Track that these SNPs belong to a block for (int u = first; u <= last; u++) used.insert(u); } } // Next chromosome } if ( ! 
par::silent ) cerr << "\n"; map >::iterator j = blocks.begin(); printLOG(int2str( blocks.size() ) + " blocks called, writing list to [ " + par::output_file_name + ".blocks ]\n"); ofstream O1( (par::output_file_name+".blocks").c_str() , ios::out ); printLOG("Writing extra block details to [ " + par::output_file_name + ".blocks.det ]\n"); ofstream O2( (par::output_file_name+".blocks.det").c_str() , ios::out ); O2 << setw(4) << "CHR" << " " << setw(12) << "BP1" << " " << setw(12) << "BP2" << " " << setw(12) << "KB" << " " << setw(6) << "NSNPS" << " " << setw(4) << "SNPS" << "\n"; while ( j != blocks.end() ) { O1 << "*"; vector & b = j->second; for (int k=0; klocus[b[k]]->name; O1 << "\n"; O2 << setw(4) << PP->locus[b[0]]->chr << " " << setw(12) << PP->locus[b[0]]->bp << " " << setw(12) << PP->locus[b[b.size()-1]]->bp << " " << setw(12) << (double)(PP->locus[b[b.size()-1]]->bp - PP->locus[b[0]]->bp + 1)/1000.0 << " " << setw(6) << b.size() << " "; for (int k=0; k0 ) O2 << "|" << PP->locus[b[k]]->name; else O2 << PP->locus[b[k]]->name; } O2 << "\n"; ++j; } O1.close(); O2.close(); // List of blocks created here // (dummy; not used) map > blocks0; return blocks0; } void PairwiseLinkage::calculateCI() { // Get counts of observed, unambiguous haplotypes vector > t = two_locus_table(a,b); // Assume autosome knownAA = 2 * t[0][0] + t[0][1] + t[1][0]; knownAB = 2 * t[0][2] + t[0][1] + t[1][2]; knownBA = 2 * t[2][0] + t[1][0] + t[2][1]; knownBB = 2 * t[2][2] + t[2][1] + t[1][2]; unknownDH = t[1][1]; int total_chroms = knownAA + knownAB + knownBA + knownBB + 2*unknownDH; // From Haploview code: // Likelihood surface vector_t lsurface(101); // // Assumed // // denom = of D' // // 4 haplotype frequencies pA1, pA2, pB1, pB2 const double LN10 = log(10.0); string sA1 = PP->locus[a]->allele1 + PP->locus[b]->allele1; string sA2 = PP->locus[a]->allele1 + PP->locus[b]->allele2; string sB1 = PP->locus[a]->allele2 + PP->locus[b]->allele1; string sB2 = PP->locus[a]->allele2 + PP->locus[b]->allele2; double pA1,pA2,pB1,pB2; for ( int i = 0 ; i < 4 ; i++ ) { if ( PP->haplo->haplotypeName(i) == sA1 ) pA1 = PP->haplo->f[i]; else if ( PP->haplo->haplotypeName(i) == sA2 ) pA2 = PP->haplo->f[i]; else if ( PP->haplo->haplotypeName(i) == sB1 ) pB1 = PP->haplo->f[i]; else if ( PP->haplo->haplotypeName(i) == sB2 ) pB2 = PP->haplo->f[i]; } double pA = pA1 + pA2; double pB = 1 - pA; double p1 = pA1 + pB1; double p2 = 1 - p1; // Estimated haplotype counts double D = pA1 - (pA*p1); if (D < 0) { double tmp; /* flip matrix so we get the positive D' */ /* flip AA with AB and BA with BB */ tmp=pA1; pA1=pA2; pA2=tmp; tmp=pB2; pB2=pB1; pB1=tmp; /* flip frequency of second allele */ tmp=p1; p1=p2; p2=tmp; /* flip known array for likelihood computation */ int tmpi; tmpi=knownAA; knownAA=knownAB; knownAB=tmpi; tmpi=knownBB; knownBB=knownBA; knownBA=tmpi; } double dmax1 = pA * p2 ; double dmax2 = pB * p1 ; double denom = dmax1 < dmax2 ? 
dmax1 : dmax2; for (int i=0; i<=100; i++) { double dpr = (double)i*0.01; double tmpAA = dpr*denom + pA*p1; double tmpAB = pA-tmpAA; double tmpBA = p1-tmpAA; double tmpBB = pB-tmpBA; if (i==100) { /* one value will be 0 */ if (tmpAA < 1e-10) tmpAA=1e-10; if (tmpAB < 1e-10) tmpAB=1e-10; if (tmpBA < 1e-10) tmpBA=1e-10; if (tmpBB < 1e-10) tmpBB=1e-10; } lsurface[i] = ( knownAA * log( tmpAA ) + knownAB * log( tmpAB ) + knownBA * log( tmpBA ) + knownBB * log( tmpBB ) + unknownDH * log( tmpAA*tmpBB + tmpAB*tmpBA)) / LN10; } double loglike1 = unknownDH * log( pA1*pB2 + pB1*pA2 ); if ( pA1>0 ) loglike1 += knownAA * log( pA1 ); if ( pA2>0 ) loglike1 += knownAB * log( pA2 ); if ( pB1>0 ) loglike1 += knownBA * log( pB1 ); if ( pB2>0 ) loglike1 += knownBB * log( pB2 ); loglike1 /= LN10; double loglike0= (knownAA * log(pA*p1) + knownAB * log(pA*p2) + knownBA * log(pB*p1) + knownBB * log(pB*p2) + unknownDH * log(2*pA*pB*p1*p2))/LN10; lod = loglike1-loglike0; if ( lod < 0 ) lod = 0; double total_prob=0; double sum_prob=0; int high_i = 0; int low_i = 0; for (int i=0; i<=100; i++) { lsurface[i] -= loglike1; lsurface[i] = pow(10.0,lsurface[i]); total_prob += lsurface[i]; } for (int i=0; i<=100; i++) { sum_prob += lsurface[i]; if (sum_prob > 0.05*total_prob && sum_prob-lsurface[i] < 0.05*total_prob) { low_i = i-1; break; } } sum_prob=0.0; for (int i=100; i>=0; i--) { sum_prob += lsurface[i]; if (sum_prob > 0.05*total_prob && sum_prob-lsurface[i] < 0.05*total_prob) { high_i = i+1; break; } } if (high_i > 100){ high_i = 100; } dp_lower = (double)low_i/100.0; dp_upper = (double)high_i/100.0; if ( par::verbose ) { cout << PP->locus[ a ]->name << " " << PP->locus[ b ]->name << " : "; cout << "Rsq= " << PP->haplo->rsq(a,b) << " : "; cout << "D' = " << dp << " CI = " << dp_lower << " to " << dp_upper << "; lod = " << lod << " " << loglike1 << " vs. " << loglike0 << "\n"; } } void PairwiseLinkage::calculateLD() { dp = PP->haplo->dprime( a, b ); } plink-1.07-src/multi.cpp0000644000265600020320000003354211264127624014341 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include "options.h" #include "plink.h" #include "helper.h" #define EPS 0.00001 using namespace std; void Plink::preCalcSinglePoint() { m1.resize(0); m2.resize(0); pos.resize(0); // All single markers for (int i=par::run_start;i<=par::run_end;i++) { m1.push_back(i); m2.push_back(i); pos.push_back(0); } // Final analysis is genome-wide phenotype on IBD m1.push_back(-1); m2.push_back(-1); pos.push_back(0); } void Plink::preCalcMultiPoint() { //////////////////////////////////////////////////////////////////// // Multipoint marker map to span range pos1-fringe to pos(nl)+fringe // Uniform grid in cM space; Uniform grid in marker space (i.e. X // positions between each marker pair). // Reset map m1.resize(0); m2.resize(0); pos.resize(0); // For each position on the cM map, determine the two flanking // markers and the proportional distance between the two. // Uniform map in cM space if (par::inter_grid==0) { for (double p=locus[par::run_start]->pos - par::fringe; p <= locus[par::run_end]->pos + par::fringe; p += par::grid) { // find marker that comes before this position... 
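//////////////////////////////////////////////////////////////////
// Illustrative sketch only (not PLINK source): the standard D' point
// estimate underlying the pairwise LD calculations above.  With pA1
// the frequency of the A1/B1 haplotype and pA, p1 the two allele
// frequencies, D = pA1 - pA*p1 and D' = |D| / Dmax, where Dmax is
// min( pA*(1-p1), (1-pA)*p1 ) for positive D and
// min( pA*p1, (1-pA)*(1-p1) ) for negative D.  The function name
// dprimeFromFreqs is hypothetical and assumes <cmath>.
static double dprimeFromFreqs(double pA1, double pA, double p1)
{
  double D = pA1 - pA * p1;                    // coefficient of LD
  double dmax = D >= 0
    ? ( pA*(1-p1) < (1-pA)*p1     ? pA*(1-p1) : (1-pA)*p1 )
    : ( pA*p1     < (1-pA)*(1-p1) ? pA*p1     : (1-pA)*(1-p1) );
  return dmax > 0 ? fabs(D) / dmax : 0;        // |D| / Dmax, in [0,1]
}
// End of sketch
//////////////////////////////////////////////////////////////////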
if (p < locus[par::run_start]->pos) { m1.push_back(-1); m2.push_back(par::run_start); double p2 = (p- (locus[par::run_start]->pos - par::fringe)) / par::fringe; pos.push_back(p2); } // ... in the normal range... for (int i=par::run_start; i= locus[i]->pos && p < locus[i+1]->pos ) { double p2 = (p-locus[i]->pos) / (locus[i+1]->pos - locus[i]->pos); pos.push_back(p2); m1.push_back(i); m2.push_back(i+1); } } // ... or after the last marker if (p >= locus[par::run_end]->pos) { m1.push_back(par::run_end); m2.push_back(-1); double p2 = (p-locus[par::run_end]->pos) / par::fringe; pos.push_back(p2); } } } else { // Uniform map in marker-space // ...before the first for (int j=0;j Plink::calcMultiPoint(vector & IBD, Z IBDg, ofstream & MP) { // Hidden Markov Model to estimate IBD states given IBS states, // alleles frequencies and a genetic map bool is0zero = IBDg.z0 < EPS ? true : false; bool is1zero = IBDg.z1 < EPS ? true : false; bool is2zero = IBDg.z2 < EPS ? true : false; ///////////////////////////////////////// // Calculate two IBD probabilities // For haploid genome pair // 00 ( 1 - (m-1)*G ) + (m-1)*G*(1-(1/(2^(m-1)-1)) ) // 01 1 - 00 // 10 (m-1)*G // 11 1 - (m-1)*G // P(haploid genome is IBD) = x1 and x2 // x1 * x2 = z2 // (1-x1) * (1-x2) = z0 // x^2 - x + 0.25 = 0 // Solution to this quadratic equation gives: // ax^2 + bx + c = 0 // a = -1; b = 1-z0+z2; c = -z2 double mA, mB; if (is2zero) { mA = 1 - log(IBDg.z1) / log(2.0); mB=0; } else { double b = 1 - IBDg.z0 + IBDg.z2; // After 'nudging' IBD probabilies, this should always be // positive -- but allow for rounding errors with fabs double b2 = fabs( b*b - 4*IBDg.z2 ); double x = sqrt( b2 ); double t1 = (-b + x ) / -2; double t2 = (-b - x ) / -2; mA = 1 - log(t1) / log(2.0); mB = 1 - log(t2) / log(2.0); } // L = 1R Q1 T1 Q2 T2 .. T(M-1) Q(M) 1C // 1R = 1x3 vector // 1C = 3x1 vector // Return value: vector of pi-hats vector pihat; // Working matrices vector left(nl); vector right(nl); // Left conditional begins with first locus on diagonal left[0] = IBD[0]; // Scaling factor double S = 1.0/(left[0].z0 + left[0].z1 + left[0].z2); left[0].z0 *= S; left[0].z1 *= S; left[0].z2 *= S; // Right conditional initial point right[nl-1] = IBD[nl-1]; S = 1.0 / ( right[nl-1].z0 + right[nl-1].z1 + right[nl-1].z2 ); right[nl-1].z0 *= S; right[nl-1].z1 *= S; right[nl-1].z2 *= S; /////////////////////// // Left conditional int l=1; for (int l2=par::run_start+1; l2<=par::run_end; l2++) { // Build transition matrix // 1 x 3 . 3x3 . 3x3 . ... 
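/* Sketch of the update performed in the loop below (a restatement of the
   code, not a separate specification): this is the scaled forward ("left
   conditional") pass of the three-state IBD hidden Markov model. For each
   marker interval the transition terms T00..T22 are rebuilt from the genetic
   distance via buildT(), and then, for k = 0,1,2,

       left[l].z_k = ( sum_j left[l-1].z_j * T_jk ) * IBD[l].z_k

   followed by rescaling so that left[l].z0 + left[l].z1 + left[l].z2 = 1.
   The right conditional further down applies the same recursion backwards
   from the last marker. */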
Z prev; prev = left[l-1]; buildT(locus[l2]->pos - locus[l2-1]->pos, is2zero, mA,mB); // Tij = from state i to state j // // l l+1 // [ p0 p1 p2 ] [ 00 10 20 ] [ z0 0 0 ] -> [ l0 l1 l2 ] // [ 10 11 21 ] [ 0 z1 0 ] // [ 20 12 22 ] [ 0 0 z2 ] // left[l].z0 = ( ( prev.z0 * T00 + prev.z1 * T10 + prev.z2 * T20 ) * IBD[l].z0 ); left[l].z1 = ( ( prev.z0 * T01 + prev.z1 * T11 + prev.z2 * T21 ) * IBD[l].z1 ); left[l].z2 = ( ( prev.z0 * T02 + prev.z1 * T12 + prev.z2 * T22 ) * IBD[l].z2 ); // Scaling factor (sum to 1) double S = 1/(left[l].z0 + left[l].z1 + left[l].z2); left[l].z0 *= S; left[l].z1 *= S; left[l].z2 *= S; // Shift left l++; } // Next marker interval if (par::verbose) { cout << "SINGLEPOINT\n"; for (int i=0;i=par::run_start; l2--) { buildT(locus[l2+1]->pos - locus[l2]->pos, is2zero, mA,mB); // Right conditional [ R(n+1) * T * R(n) ] right[l].z0 = ( ( right[l+1].z0 * T00 + right[l+1].z1 * T10 + right[l+1].z2 * T20 ) * IBD[l].z0 ); right[l].z1 = ( ( right[l+1].z0 * T01 + right[l+1].z1 * T11 + right[l+1].z2 * T21 ) * IBD[l].z1 ); right[l].z2 = ( ( right[l+1].z0 * T02 + right[l+1].z1 * T12 + right[l+1].z2 * T22 ) * IBD[l].z2 ); // Scaling factor (sum to 1) double S = 1/(right[l].z0 + right[l].z1 + right[l].z2); right[l].z0 *= S; right[l].z1 *= S; right[l].z2 *= S; // Shift right l--; } // Next marker interval if (par::verbose) { cout << "RIGHT CONDITIONAL\n"; for (int i=0;ipos - par::fringe; else p1 = locus[ m1[ i ] ]->pos; if (m2[i]==-1) p2 = locus[par::run_end]->pos + par::fringe; else p2 = locus[m2[i]]->pos; double d1 = pos[i] * (p2-p1); double d2 = (1 - pos[i]) * (p2-p1); // P0 = L * TL * Q0 * TR * R; // P1 = L * TL * Q1 * TR * R; // P2 = L * TL * Q2 * TR * R; // L * TL // Left T matrix buildT(d1,is2zero,mA,mB); // Left & right conditional Z L; if (m1[i]==-1) { L.z0 = L.z1 = L.z2 = 1; } else L = left[m1[i] - par::run_start]; // * Q{0/1/2} [ Q 3x3 matrix -- just extracts elements ] Z M0; Z M1; Z M2; M0.z0 = ( L.z0 * T00 + L.z1 * T10 + L.z2 * T20 ) ; M1.z1 = ( L.z0 * T01 + L.z1 * T11 + L.z2 * T21 ) ; M2.z2 = ( L.z0 * T02 + L.z1 * T12 + L.z2 * T22 ) ; // * TR buildT(d2,is2zero,mA,mB); // Finally, 1x3 . 
3x1 = 1x1 Z R; if (m2[i]==-1) { R.z0 = R.z1 = R.z2 = 1; } else R = right[m2[i] - par::run_start]; double P0 = M0.z0 * (( T00*R.z0 + T10*R.z1 + T20*R.z2)); double P1 = M1.z1 * (( T01*R.z0 + T11*R.z1 + T21*R.z2)); double P2 = M2.z2 * (( T02*R.z0 + T12*R.z1 + T22*R.z2)); // Standardized P double S = 1.0/(P0+P1+P2); P0 *= S; P1 *= S; P2 *= S; if (par::verbose) cout << "M1: " << P0 << "\t" << P1 << "\t" << P2 << "\n"; ///////////////////////////////////// // Apply Bayes Theorem to obtain // P(Z|M) = P(M|Z)P(Z) / P(M) S = 1.0 / (P0*IBDg.z0 + P1*IBDg.z1 + P2*IBDg.z2 ); P0 = (P0*IBDg.z0) * S; P1 = (P1*IBDg.z1) * S; P2 = (P2*IBDg.z2) * S; if (par::verbose) cout << "M2: " << P0 << "\t" << P1 << "\t" << P2 << "\n"; if (par::multi_output) { double p1, p2; string n1, n2; if (m1[i]==-1) { p1 = locus[par::run_start]->pos - par::fringe; n1 = "fringe"; } else { p1 = locus[m1[i]]->pos; n1 = locus[m1[i]]->name; } if (m2[i]==-1) { p2 = locus[par::run_end]->pos + par::fringe; n2 = "fringe"; } else { p2 = locus[m2[i]]->pos; n2 = locus[m2[i]]->name; } MP << par::run_chr << " " << pairid << " " << p1 + pos[i] * (p2-p1) << " " << P0 << " " << P1 << " " << P2 << " " << (P1*0.5+P2) << " " << (IBDg.z1*0.5 + IBDg.z2) << "\n"; } // Record pi-hat estimate pihat.push_back( (P1*0.5+P2) ); } // Final analysis is genome-wide IBD if (!par::done_global_pihat) pihat_G.push_back( (IBDg.z1*0.5 + IBDg.z2) ); return pihat; } void Plink::buildT(double G, bool z2zero, double mA, double mB) { ///////////////////////////////////////// // Build 2x2 haploid transition matrices // m // ------------ // 0 1 // ------------ // // m-1 | 0 | X t // | 1 | 1-X 1-t // // where t = mA * G = recombination fraction theta // mA = number of meioses separating the haploid genomes // G = genetic distance in Morgans // // All distances must be positive and small on a Morgan scale -- // move this check up to the initial map file to save time, no need // to recalculate always G = G < 0 ? 0 : G; G = G > 1 ? 
1 : G; double A01 = (1- pow(1-G,mA-2) * (G*G+(1-G)*(1-G)))/(pow(2,mA-1)-1); double A00 = 1 - A01; double A11 = pow((1-G),(mA-2)) * (G*G+(1-G)*(1-G) ); double A10 = 1 - A11; double B00; double B01; double B10; double B11; if (z2zero) { B00 = B11 = 1; B01 = B10 = 0; } else { B01 = (1- pow(1-G,mB-2) * (G*G+(1-G)*(1-G)))/(pow(2,mB-1)-1); B00 = 1 - B01; B11 = pow((1-G),(mB-2)) * (G*G+(1-G)*(1-G) ); B10 = 1 - B11; } if (par::debug) { cout << "mA, mB, G = " << mA << " " << mB << " " << G << "\n"; cout << "A[i,j] = \n"; cout << "\t" << A00 << "\t" << A01 << "\n" << "\t" << A10 << "\t" << A11 << "\n\n"; cout << "B[i,j] = \n"; cout << "\t" << B00 << "\t" << B01 << "\n" << "\t" << B10 << "\t" << B11 << "\n\n"; } /////////////////////////// // Build transition matrix // Return transpose of transition matrix T00 = A00*B00; T10 = A00*B01 + A01*B00; T20 = A01*B01; T01 = A00*B10 + A10*B00; T11 = A00*B11 + A01*B10 + A10*B01 + A11*B00; T21 = A01*B11 + A11*B01; T01 /= 2; T11 /= 2; T21 /= 2; T02 = A10*B10; T12 = A10*B11 + A11*B10; T22 = A11*B11; // Or return non-transpose (we do not want this option) if (false) { T00 = A00*B00; T01 = A00*B01 + A01*B00; T02 = A01*B01; T10 = A00*B10 + A10*B00; T11 = A00*B11 + A01*B10 + A10*B01 + A11*B00; T12 = A01*B11 + A11*B01; T10 /= 2; T11 /= 2; T12 /= 2; T20 = A10*B10; T21 = A10*B11 + A11*B10; T22 = A11*B11; } if (par::debug) { cout << "cM = " << G << "\n"; cout << "transition matrix\n" << T00 << "\t" << T01 << "\t" << T02 << "\n" << T10 << "\t" << T11 << "\t" << T12 << "\n" << T20 << "\t" << T21 << "\t" << T22 << "\n"; } } plink-1.07-src/config.h0000644000265600020320000001057211264127626014121 0ustar tilleaadmin#ifdef WITH_R_PLUGINS /* src/config.h. Generated by configure. */ /* src/config.h.in. Generated from configure.ac by autoheader. */ /* If defined Rserve supports unix crypt password encryption. */ #define HAS_CRYPT 1 /* */ #define HAVE_CONNECT 1 /* Define to 1 if you have the header file. */ #define HAVE_CRYPT_H 1 /* Define to 1 if you have the `fork' function. */ #define HAVE_FORK 1 /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* Define to 1 if you have the `dl' library (-ldl). */ #define HAVE_LIBDL 1 /* Define to 1 if you have the `inet' library (-linet). */ /* #undef HAVE_LIBINET */ /* Define to 1 if you have the `nsl' library (-lnsl). */ /* #undef HAVE_LIBNSL */ /* Define to 1 if you have the `nsl_s' library (-lnsl_s). */ /* #undef HAVE_LIBNSL_S */ /* Define to 1 if you have the `socket' library (-lsocket). */ /* #undef HAVE_LIBSOCKET */ /* Define to 1 if your system has a GNU libc compatible `malloc' function, and to 0 otherwise. */ #define HAVE_MALLOC 1 /* Define to 1 if you have the header file. */ #define HAVE_MEMORY_H 1 /* Define to 1 if you have the `memset' function. */ #define HAVE_MEMSET 1 /* Define to 1 if you have the `mkdir' function. */ #define HAVE_MKDIR 1 /* Define to 1 if you have the header file. */ #define HAVE_NETINET_IN_H 1 /* Define to 1 if you have the header file. */ #define HAVE_NETINET_TCP_H 1 /* Define to 1 if you have the header file. */ #define HAVE_READLINE_HISTORY_H 1 /* Define to 1 if you have the header file. */ #define HAVE_READLINE_READLINE_H 1 /* Define to 1 if you have the `rmdir' function. */ #define HAVE_RMDIR 1 /* Define to 1 if you have the `select' function. */ #define HAVE_SELECT 1 /* Define to 1 if you have the `socket' function. */ #define HAVE_SOCKET 1 /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. 
*/ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_SOCKET_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TIME_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_UN_H 1 /* Define to 1 if you have that is POSIX.1 compatible. */ #define HAVE_SYS_WAIT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* Define to 1 if you have the `vfork' function. */ #define HAVE_VFORK 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_VFORK_H */ /* Define to 1 if `fork' works. */ #define HAVE_WORKING_FORK 1 /* Define to 1 if `vfork' works. */ #define HAVE_WORKING_VFORK 1 /* Define to the address where bug reports for this package should be sent. */ #define PACKAGE_BUGREPORT "Simon.Urbanek@r-project.org" /* Define to the full name of this package. */ #define PACKAGE_NAME "Rserve" /* Define to the full name and version of this package. */ #define PACKAGE_STRING "Rserve 0.4" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "rserve" /* Define to the version of this package. */ #define PACKAGE_VERSION "0.4" /* Defined if readline is supported */ /* #undef READLINE */ /* Define as the return type of signal handlers (`int' or `void'). */ #define RETSIGTYPE void /* Define to 1 if you have the ANSI C header files. */ #define STDC_HEADERS 1 /* Must be defined for platforms with bytesex inverse to intel style */ /* #undef SWAPEND */ /* Define to 1 if you can safely include both and . */ #define TIME_WITH_SYS_TIME 1 /* Define to empty if `const' does not conform to ANSI C. */ /* #undef const */ /* Define to rpl_malloc if the replacement function should be used. */ /* #undef malloc */ /* Define to `int' if does not define. */ /* #undef pid_t */ /* Define to `int' if neither nor define. */ /* #undef socklen_t */ /* Define as `fork' if `vfork' does not work. */ /* #undef vfork */ #endif plink-1.07-src/em.cpp0000644000265600020320000000720011264127625013601 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "genogroup.h" #include "phase.h" #include "haplowindow.h" extern ofstream LOG; using namespace std; //////////////////////////////////////////////// // Original, single window EM algorithm, without // genoGrouping -- this is now used for the meta-EM void HaploPhase::performEM_original() { vector_t uc(nh, 0); // unambigous counts vector_t ac(nh, 0); // ambigous counts // Count numbers of unambigous haplotypes // as these stay constant throughout EM for (int i=0; ifounder && include[i]) { if (!ambig[i]) { uc[hap1[i][0]]++; if ( ! 
(haploid || (X && P.sample[i]->sex))) uc[hap2[i][0]]++; } } } ////////////////// // Begin E-M double sampleLogLikelihood = 0; for (int j=0; j<= par::haplo_plem_meta_iter; j++) { /////////// // E-step for (int i=0; ifounder && include[i]) { if (ambig[i]) { double s=0; // Haploid phases... if (haploid || (X && P.sample[i]->sex)) { for (int z=0; zfounder && include[i]) if (ambig[i]) { if (haploid || (X && P.sample[i]->sex)) { for (int z=0; zfounder && include[i]) { double lk = 0; if (haploid || (X && P.sample[i]->sex)) { for (int z=0; z 0 && sampleLogLikelihood - lnl < par::haplo_plem_meta_tol ) { break; } sampleLogLikelihood = lnl; } } } plink-1.07-src/model.h0000644000265600020320000000662611264127626013761 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __MODEL_H__ #define __MODEL_H__ #include #include "plink.h" using namespace std; class Model { public: Model(); virtual ~Model() { }; virtual void setDependent() = 0; virtual void pruneY() = 0; virtual void fitLM() = 0; virtual vector_t getCoefs() = 0; virtual vector_t getVar() = 0; virtual vector_t getSE() = 0; virtual vector_t getPVals() = 0; virtual void displayResults(ofstream &, Locus *) = 0; virtual void fitUnivariateLM() = 0; void setMissing(); vector getMissing(); void setMissing(vector&); void yokeMissing(Model *); void setHaploid(); void setX(); void setDominant(); void setRecessive(); void hasSNPs(bool); void addAdditiveSNP(int); void addDominanceSNP(int); void addHaplotypeDosage(set&); void addSexEffect(); bool isSexInModel(); void addCovariate(int); void addInteraction(int,int); void buildDesignMatrix(); bool checkVIF(); vector validParameters(); bool isValid() { return all_valid; } double getStatistic(); // double getPValue(); double linearHypothesis(matrix_t &, vector_t &); int Ysize() { return nind; } int getNP() { return np; } void setValid() { all_valid = true; } vector label; vector order; vector type; int testParameter; void noCluster(); void setCluster(); virtual void HuberWhite() = 0; // Independent variables (can be directly manipulated...) vector > X; protected: Plink * P; // Missing flag vector miss; int nind; int np; // Main effects + interaction + intercept bool has_snps; vector xchr; vector haploid; bool sex_effect; vector valid; bool all_valid; vector_t coef; // beta matrix_t S; // Sigma // Term types enum terms { INTERCEPT, ADDITIVE, DOMDEV, HAPLOTYPE, SEX, COVARIATE, INTERACTION, QFAM }; double buildIntercept(); double buildAdditive(Individual *, int); double buildDominance(Individual *, int); double buildHaplotype(int, int); double buildSex(Individual *); double buildCovariate(Individual *, int); double buildInteraction(Individual *, int, vector_t &); double buildQFAM(Individual *); bool skip; // List of additive SNP effects // assuming SNP major mode vector additive; int mAA; int mAB; int mBB; double mA, mB; // List of dominance deviation SNP effects vector dominance; // List of covariates (clist) vector covariate; // List of pairwise interactions // ( indexing previously specified components, 1,2,..) 
vector interaction; // List of sets of haplotypes vector > haplotype; // Clustering information bool cluster; vector clst; int nc; }; #endif plink-1.07-src/gvar.h0000644000265600020320000000241311264127626013606 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __GVAR_H__ #define __GVAR_H__ #include #include #include "plink.h" using namespace std; class Variant : public Locus { public: int nallele; vector alleles; vector_t freqs; map acode; bool allelicVariation; bool copyNumberVariation; bool integerDosage; Variant() { nallele = 0; integerDosage = true; allelicVariation = copyNumberVariation = false; } }; class GVariant { public: GVariant() { missing = true; allele1 = allele2 = -1; dosage1 = dosage2 = 0; } bool missing; int allele1; int allele2; float dosage1; float dosage2; }; #endif plink-1.07-src/merge.cpp0000644000265600020320000007025511264127625014311 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" void Plink::mergeList() { // Perform multiple merge operations (first fileset is specified on // the command line, the rest in a merge-list). Simply perform // multiple calls to mergeData() // If a line contains 2 items, it suggests a PED/MAP merge // If a line contains 3 items, it suggests a BED/BIM/FAM merge printLOG( "Using merge mode " + int2str( par::merge_mode ) + " : "); if (par::merge_mode==1) printLOG("consensus call (default)\n"); else if (par::merge_mode==2) printLOG("overwrite if missing in original\n"); else if (par::merge_mode==3) printLOG("overwrite unless missing in new\n"); else if (par::merge_mode==4) printLOG("overwrite none\n"); else if (par::merge_mode==5) printLOG("overwrite all\n"); // Check that a merge-list exists checkFileExists(par::merge_list_filename); ifstream MLIST(par::merge_list_filename.c_str()); MLIST.clear(); // Iterate over all files to be merged int c=0; while(!MLIST.eof()) { char cline[5000] = ""; MLIST.getline(cline,5000,'\n'); // convert to string string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); // Is this a PED/MAP, or a BED/BIM/FAM merge? 
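/* Hedged example of the merge-list format parsed below (the file names are
   hypothetical). Each non-empty line is tokenised; two fields are taken as a
   text fileset and three fields as a binary fileset, e.g.

       study2.ped  study2.map
       study3.bed  study3.bim  study3.fam

   Any other field count falls through to the error() call below. */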
if (tokens.size() == 2) { par::merge_pedfile = tokens[0]; par::merge_mapfile = tokens[1]; if (par::merge_pedfile=="" || par::merge_mapfile=="") continue; par::merge_binary = false; } else if (tokens.size() == 3) { par::merge_bedfile = tokens[0]; par::merge_bimfile = tokens[1]; par::merge_famfile = tokens[2]; if (par::merge_bedfile=="" || par::merge_bimfile=="" || par::merge_famfile=="") continue; par::merge_binary = true; } else error("Problem with merge-list file line:\n should be either 2 (PED/MAP) or 3(BED/BIM/FAM) fields:\n"+sline); c++; if (!par::silent) cout << c << " files merged \r"; // Perform the actual merge if (!par::merge_binary) mergeData(); else mergeBinaryData(); // Reset number of individuals n = sample.size(); // Set number of pairs np = (int)((double)(n*(n-1))/(double)2); // Total number of all (test+background) loci nl_all = locus.size(); } if (!par::silent) cout << "\n"; MLIST.close(); // Report some stats now we've finished printLOG("Merging "+int2str(c)+" samples, final sample contains "+int2str( n ) + " individuals"); printLOG(" and " +int2str( nl_all ) + " markers\n"); } void Plink::mergeData() { if (!par::SNP_major) error("--merge can only apply in SNP-major mode currently\n"); // Function to merge a text file with an exisiting data set // either SNP-major or individual-major modes if (!par::merge_list) { printLOG( "Using merge mode " + int2str( par::merge_mode ) + " : "); if (par::merge_mode==1) printLOG("consensus call (default)\n"); else if (par::merge_mode==2) printLOG("overwrite if missing in original\n"); else if (par::merge_mode==3) printLOG("overwrite unless missing in new\n"); else if (par::merge_mode==4) printLOG("overwrite none\n"); else if (par::merge_mode==5) printLOG("overwrite all\n"); else if (par::merge_mode==6) printLOG("diff mode: all differences\n"); else if (par::merge_mode==7) printLOG("diff mode: non-missing differences\n"); } // We've already loaded in the first file // Do not overwrite any existing phenotype information checkFileExists(par::merge_pedfile); checkFileExists(par::merge_mapfile); // Make hash of original SNP names map mlocus; for (int l=0;lname,l)); map::iterator ilocus; // Reset counts diff_overlap = 0; diff_nonmissing_overlap = 0; diff_concordant_overlap = 0; // A temporary hash for the names of any markers that // do not match in terms of strand map misstrand; /////////////////////////////////////// // .map vector include(0); map exists; vector ordered(0); vector locus2(0); ifstream MAP(par::merge_mapfile.c_str()); MAP.clear(); int exist_cnt=0; int c=0; while(!MAP.eof()) { Locus * loc = new Locus; string chr; long int inc; char cline[256] = ""; MAP.getline(cline,256,'\n'); // convert to string string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if (tokens.size() == 0) continue; else if ( par::map3 && tokens.size() != 3 ) error("Problem with MAP file line:\n"+sline); else if ( (!par::map3) && tokens.size() != 4 ) error("Problem with MAP file line:\n"+sline); chr = tokens[0]; loc->name = tokens[1]; if ( par::map3 ) { loc->pos = 0; loc->bp = (long int)atoi(tokens[2].c_str()); } else { loc->pos = atof(tokens[2].c_str()); loc->bp = (long int)atoi(tokens[3].c_str()); } inc = loc->bp; // Check that cM/M specification looks correct, if // we want to perform a plink-based analysis if (par::plink && !par::cm_map && loc->pos > 50) error("Looks like you need to specify --cm ??"); // Convert cM to M map distances if (par::cm_map) loc->pos /= 100; // 
Convert chromosome code, taking species into account loc->chr = getChromosomeCode( chr ); // Use the frequency slot temporarily to // store order information loc->freq = c++; // Are we including this locus? if (loc->name!="") { if (inc<0) { include.push_back(false); } else { ilocus = mlocus.find(loc->name); // Check whether or not this Locus already exists? if (ilocus != mlocus.end()) { Locus * loc2 = locus[ilocus->second]; // Check same chromosome and positions, etc if ( loc2->chr != loc->chr ) { cerr << "Warning: different chromosome for " << loc->name << "\n"; loc->chr = loc2->chr; } // Check same chromosome and positions, etc if ( loc2->bp != loc->bp ) { cerr << "Warning: different physical position for " << loc->name << "\n"; loc->bp = loc2->bp; } if ( loc2->pos != loc->pos ) { cerr << "Warning: different genetic position for " << loc->name << "\n"; loc->pos = loc2->pos; } exists[loc->name] = true; exist_cnt++; include.push_back(true); locus2.push_back(loc2); // Keep the new file order (would have been in freq) int t = (int)loc->freq; // Clean up if we do not need another locus delete loc; // Replace loc = loc2; loc->freq = t; } else { // Locus does not exist -- add to locus list exists[loc->name] = false; include.push_back(true); locus2.push_back(loc); } } ordered.push_back(*loc); } } MAP.clear(); MAP.close(); if (!par::merge_list) { printLOG("\n" +int2str(locus2.size()) + " (of " + int2str(include.size()) + ") markers to be merged from [ " +par::merge_mapfile + " ]\n"); printLOG("Of these, "+int2str( locus2.size()-exist_cnt ) + " are new, " + int2str( exist_cnt ) + " already exist\n"); } /////////////////////////////////////////////// // Build ordered table, so that genotypes can be inserted // in correct order; then swap locus file over // Sorting a vector of pointers, so we need this special fix stable_sort(locus2.begin(),locus2.end(),less()); // Sorting a normal vector stable_sort(ordered.begin(),ordered.end()); c=0; for (int i=0; ifreq // p2 p3 p1 p5 p4 : genetic position // 0 1 2 3 4 : file order // 1 0 1 0 1 : include // sort by cM // p1 p2 p3 p4 p5 : genetic // 2 0 1 4 3 : file order // 1 1 0 1 0 : include // 0 1 2 : add genetic order: nonmissing... // // sort by file order again // p2 p3 p1 p5 p4 : genetic // 0 1 2 3 4 : file // 1 0 1 0 1 : include // 1 0 2 : position to put in locus[l] // Add new locus2() to end of locus() for (int l=0; lname)->second ) { Locus * loc = new Locus; loc = locus2[l]; locus.push_back(loc); } } /////////////////////////////////////////////// // .ped // Make new hash of Locus names mlocus.clear(); for (int l=0;lname,l)); } if (mlocus.size() != locus.size() ) { cerr << "Problem encountered merging files, with the following markers:\n"; mlocus.clear(); for (int l=0;lname) != mlocus.end()) cerr << locus[l]->name << "\n"; mlocus.insert(make_pair(locus[l]->name,l)); } cerr << "[ dump info: sizes = " << mlocus.size() << " and " << locus.size() << " ]\n"; error("Cannot merge files. 
Check your MAP files."); } // Make hash of existing individuals map msample; for (int i=0;ifid+"_"+sample[i]->iid,i)); map::iterator isample; // Resize all existing individuals // and set new elements to missing genotype (TF) if (par::SNP_major) { // Add space for new SNPs for (int i=0; ione.resize(n,true); newlocus->two.resize(n,false); SNP.push_back(newlocus); } } else { // If using individual-major mode for (int i=0; ione.resize(locus.size(),true); sample[i]->two.resize(locus.size(),false); } } // An output file for diff mode ofstream MERD; if (par::merge_mode >=6) { string f = par::output_file_name+".diff"; MERD.open(f.c_str(), ios::out); MERD << setw(20) << "SNP" << " " << setw(20) << "FID" << " " << setw(20) << "IID" << " " << setw(8) << "NEW" << " " << setw(8) << "OLD" << " " << "\n"; } int new_person = 0; int old_person = 0; /////////////////////////////////////// // Read in PED file to new merge file bool fatal_error = false; FILE * PED; PED = fopen(par::merge_pedfile.c_str(),"r"); c=0; while(! feof(PED) ) { Individual * person = new Individual; // Get first field int f=0; if (readString(PED,person->fid )) f++; // End of file? if ( person->fid=="" ) continue; // Check for reserved family ID code if ( person->fid=="FID" ) error("FID is a reserved ID... please select a different family ID"); // Is this line a comment? if (person->fid.substr(0,1)=="#") { // Ignore rest of line and advance to next line while (fgetc(PED) != '\n' && !feof(PED)) {} continue; } // First 6 obligatory fields if ( readString(PED,person->iid )) f++; if ( readString(PED, person->pat )) f++; if ( readString(PED, person->mat )) f++; if ( readString(PED, person->sexcode)) f++; string phenotype; if (readString(PED,phenotype)) f++; // Are we using 0/1 coding? if (par::coding01) { if ( phenotype == "1" ) phenotype = "2"; else if ( phenotype == "0" ) phenotype = "1"; else phenotype = "0"; } // Skip last empty line that gets read if (person->fid=="") break; if (person->sexcode=="1") person->sex = true; // male else if (person->sexcode=="2") person->sex = false; // female (default) // Have we already created this person? bool already_in = false; isample = msample.find(person->fid+"_"+person->iid); int indn = isample->second; if ( isample != msample.end() ) { already_in = true; delete person; person = sample[isample->second]; old_person++; } else new_person++; // Only look at phenotype if not already created if (!already_in) { ////////////////// // A non-founder? person->founder = (person->pat == "0" && person->mat == "0") ? true : false; ///////////////////////////////////////////////////// // Set missing status; test for quantitative traits? if (phenotype == par::missing_phenotype) person->missing = true; else { if ( ! 
from_string( person->phenotype, phenotype, std::dec )) person->missing = true; else if (phenotype != "0" && phenotype != "1" && phenotype != "2" ) { par::qt = true; par::bt = false; } } // if (!par::merge_list) // if (person->missing) // { // stringstream s2; // s2 << "Individual " << person->fid << " " // << person->iid << " has missing phenotype: " // << person->phenotype << " / " // << par::missing_phenotype << "\n"; // printLOG(s2.str()); // } } /////////////////////////////////////// // Add necessary space for a new person // Missing genotypes by default if (!already_in) { if (par::SNP_major) { // Add a new missing person to each SNP vector::iterator s = SNP.begin(); while ( s != SNP.end() ) { (*s)->one.push_back(true); (*s)->two.push_back(false); s++; } // And set the individual number indn = n + new_person - 1; } else { // Add all new SNPs to this person person->one.resize(locus.size(),true); person->two.resize(locus.size(),false); } } ///////////////////////// // For each locus int gn=0; int i=0; bool linedone = false; bool fatal = false; string fmsg; while ( ! linedone ) { string one=""; string two=""; while (1) { char ch = fgetc(PED); // Delimiter? if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\r' || feof(PED) ) { if (ch=='\n' || ch=='\r' || feof(PED)) linedone = true; // have we already seen something? if (one.length()>0) { gn++; break; } if (ch=='\n' || ch=='\r' || feof(PED)) break; } else { one += ch; } } // Second allele if (!linedone) while (1) { char ch = fgetc(PED); // Delimiter? if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\r' || feof(PED) ) { if (ch=='\n' || ch=='\r' || feof(PED)) linedone = true; // have we already seen something? if (two.length()>0) { gn++; break; } if (ch=='\n' || ch=='\r' || feof(PED)) break; } else { two += ch; } } if (linedone && one.length()==0 && two.length()==0 ) break; ///////////////////////////////////// // Only consider loci to be included if (include[i]) { int k0 = (int)ordered[i].freq; ilocus = mlocus.find(locus2[k0]->name); int k = ilocus->second; bool e = reconcileMerge(indn, k, one,two, already_in, exists.find(locus2[k0]->name)->second, MERD, misstrand ); if (e) fatal_error = true; } // Advance to next locus i++; if ( i > include.size()) { fmsg += "\nProblem with line "+int2str(c+1)+" in [ " +par::merge_pedfile+" ]\n"; fmsg += "Expecting 6 + 2 * " + int2str(include.size()) + " = " + int2str(6+2*include.size())+ " columns, but found more\n"; error(fmsg); } } // Next locus // check size of line length somewhere if ( gn != 2 * include.size() ) { fmsg += "\nProblem with line "+int2str(c+1)+" in [ " +par::merge_pedfile+" ]\n"; fmsg += "Expecting 6 + 2 * " + int2str(include.size()) + " = " + int2str(6+2*include.size())+ " columns, but found " + int2str(f+gn) + "\n"; fatal=true; } if (fatal) error(fmsg); // Increase person counter c++; // Add individual to list, if need be if (!already_in) sample.push_back(person); } /////////////////////////////////////////////////////// // Did we encounter any fatal errors from flipped SNPs? 
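/* Note on the check below: fatal_error is set by reconcileMerge() whenever a
   marker would need more than two allele codes; such markers are collected in
   misstrand and written to the .missnp file before PLINK stops, on the
   assumption that they reflect +/- strand mismatches. As the warning printed
   below states, flipped A/T and C/G SNPs cannot be detected this way. */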
if (fatal_error) { ofstream MSNP; string f = par::output_file_name+".missnp"; MSNP.open(f.c_str(), ios::out); map::iterator ilocus; for ( ilocus = misstrand.begin() ; ilocus != misstrand.end() ; ilocus++) { MSNP << locus[ilocus->second]->chr << "\t" << locus[ilocus->second]->name << "\n"; } MSNP.close(); cerr << "\nFound " << misstrand.size() << " SNPs that do not match in terms of allele codes\n"; cerr << "Might include strand flips, although flipped A/T and C/G SNPs will be undetected)\n"; cerr << "Writing problem SNPs to [ " << f << " ]\n"; error("Stopping due to mis-matching SNPs -- check +/- strand?"); } // If a binary trait, now make 0 missing also // i.e. if we never saw other than missing, 0, 1 or 2 if (par::bt) for (int i=0; iphenotype == 0 ) sample[i]->missing = true; // Close the PED file fclose(PED); if (!par::merge_list) { printLOG(int2str( c ) + " individuals merged from [ " + par::merge_pedfile + " ] \n"); printLOG("Of these, " + int2str(new_person) + " were new, " + int2str( old_person ) + " were already in current data\n\n"); } if (par::merge_mode >=6) { printLOG("Results from diff ( merge mode " + int2str(par::merge_mode) + " ) written to [ " + par::output_file_name + ".diff ]\n"); MERD.close(); printLOG("Of " + int2str(diff_overlap) + " overlapping SNPs, " + int2str( diff_nonmissing_overlap ) + " were both genotyped\nand " + int2str( diff_concordant_overlap ) + " were concordant\n"); printLOG("Concordance rate is " + dbl2str( (double)diff_concordant_overlap / (double)diff_nonmissing_overlap ) + "\n"); shutdown(); } // Phenotype statistics if (!par::merge_list) { int nm=0; for (int i=0;imissing) nm++; printLOG(int2str( nm ) + " individuals with nonmissing phenotypes\n"); if (par::bt) { printLOG("Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n"); if (par::missing_phenotype!="0") printLOG("Missing phenotype value is also " + par::missing_phenotype + "\n"); int ncase = 0; int ncontrol = 0; for (int i=0; iphenotype == 1 ) ncontrol++; else if ( sample[i]->phenotype == 2 ) ncase++; printLOG(int2str(ncase)+" cases and "+int2str(ncontrol)+" controls\n"); } else { printLOG("Assuming a quantitative trait\n"); printLOG("Missing phenotype value is " + par::missing_phenotype + "\n"); } } } bool Plink::reconcileMerge(int indn, int k, string one, string two, bool already_in, bool snp_exists, ofstream & MERD, map & misstrand) { // cout << "rec locus " << locus[k]->name << "\n"; // cout << "in = " << one << " " << two <<"\n"; // cout << "existing = " << locus[k]->allele1 // << " " << locus[k]->allele2 << "\n"; // Note -- this routine does not fully implemented individual-major // mergeing, which is why we currently only allow SNP-major in the // main mergeData() routine above bool fatal_error = false; Locus * loc = locus[k]; Individual * person = NULL; if (already_in) person = sample[indn]; ///////////////////////////////////////// // Add allele names to list, if needed // // For a merged-in binary file, we already will have performed // this step when reading in the BIM file if ( ! par::merge_binary ) { // If allele is not missing... if (one!=par::missing_genotype && two!=par::missing_genotype) { // ...and not already listed if (one!=loc->allele1 && one!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="" || loc->allele1==par::missing_genotype) loc->allele1=one; else if(loc->allele2=="" || loc->allele2==par::missing_genotype) loc->allele2=one; else { // .. 
or show an error if no empty slots misstrand.insert(make_pair(loc->name,k)); fatal_error = true; } } } ////////////////////////////////////////// // Repeat for second allele, if different if (two!=one) { // If allele is not missing... if (two!=par::missing_genotype) // ...and not already listed if (two!=loc->allele1 && two!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="" || loc->allele1==par::missing_genotype) loc->allele1=two; else if(loc->allele2=="" || loc->allele2==par::missing_genotype ) loc->allele2=two; else { misstrand.insert(make_pair(loc->name,k)); fatal_error = true; } } } } ///////////////////////// // Add specific genotypes bool write; if ( !already_in ) write = true; // Write if new person else if ( ! snp_exists ) write = true; // Write if new SNP else { // The genotype exists in both the original and the new files // Depending on the merge-mode, we need to determine whether // to overwrite, or report, these genotypes bool s1; bool s2; if (par::SNP_major) { s1 = SNP[k]->one[indn]; s2 = SNP[k]->two[indn]; } else { s1 = person->one[k]; s2 = person->two[k]; } // MODE 1: Consensus call if ( par::merge_mode == 1) { // If new genotype missing, never write if ( one==par::missing_genotype || two==par::missing_genotype) write = false; // If existing is missing, always write else if ( (s1) && (!s2) ) write = true; // Else if both called, check they match else { bool mismatch = false; if ( one==loc->allele1 && two==loc->allele1 ) // New == 11 { if ( ! ( (!s1) && (!s2) ) ) mismatch = true; } else if ( one==loc->allele1 && two==loc->allele2 ) // New == 12 { if ( ! ( (!s1) && s2 ) ) mismatch = true; } else if ( one==loc->allele2 && two==loc->allele1 ) // New == 12 { if ( ! ( (!s1) && s2 ) ) mismatch = true; } else if ( one==loc->allele2 && two==loc->allele2 ) // New == 22 { if ( ! ( (s1) && (s2) ) ) mismatch = true; } if (mismatch) { one = par::missing_genotype; two = par::missing_genotype; write = true; } } } // MODE 2: Overwrite if original missing else if ( par::merge_mode == 2) { if ( s1 && (!s2) ) write = true; else write = false; } // MODE 3: Overwrite unless missing in new else if ( par::merge_mode == 3) { if ( one==par::missing_genotype || two==par::missing_genotype) write = false; else write = true; } // MODE 4: Never overwrite if ( par::merge_mode == 4) write = false; // MODE 5: Overwrite all else if ( par::merge_mode == 5) write = true; // MODE 6,7 : Report diffs (if non-missing) else if ( par::merge_mode == 6 || par::merge_mode == 7 ) { bool mismatch = false; bool new_geno_missing = one==par::missing_genotype || two==par::missing_genotype ; if ( new_geno_missing ) // New == ?? { if ( ! ( s1 && (!s2) ) ) mismatch = true; } else if ( one==loc->allele1 && two==loc->allele1 ) // New == 11 { if ( ! ( (!s1) && (!s2) ) ) mismatch = true; } else if ( one==loc->allele1 && two==loc->allele2 ) // New == 12 { if ( ! ( (!s1) && s2 ) ) mismatch = true; } else if ( one==loc->allele2 && two==loc->allele1 ) // New == 12 { if ( ! ( (!s1) && s2 ) ) mismatch = true; } else if ( one==loc->allele2 && two==loc->allele2 ) // New == 22 { if ( ! ( (s1) && (s2) ) ) mismatch = true; } if (par::merge_mode==7) { if ( new_geno_missing || ( s1 && (!s2) ) ) mismatch=false; } // Summary stats ++diff_overlap; if ( ! ( new_geno_missing || ( s1 && (!s2) ) ) ) { ++diff_nonmissing_overlap; if ( ! 
mismatch ) ++diff_concordant_overlap; } if (mismatch) { MERD << setw(20) << loc->name << " " << setw(20) << person->fid << " " << setw(20) << person->iid << " " << setw(8) << (string)(one+"/"+two) << " "; if ((!s1) && (!s2)) MERD << setw(8) << (string)(loc->allele1+"/"+loc->allele1) << " "; if ((!s1) && s2) MERD << setw(8) << (string)(loc->allele1+"/"+loc->allele2) << " "; if ( s1 && s2 ) MERD << setw(8) << (string)(loc->allele2+"/"+loc->allele2) << " "; if ( s1 && (!s2)) MERD << setw(8) << (string)(par::missing_genotype+"/"+par::missing_genotype) << " "; MERD << "\n"; } } } if (write) { if (par::SNP_major) { // 00 hom if (one==loc->allele1 && two==loc->allele1 && one!=par::missing_genotype ) { SNP[k]->one[indn] = false; SNP[k]->two[indn] = false; } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { SNP[k]->one[indn] = false; SNP[k]->two[indn] = true; } // 11 hom else if (one==loc->allele2 && two==loc->allele2 && one!=par::missing_genotype) { SNP[k]->one[indn] = true; SNP[k]->two[indn] = true; } // 10 missing else { SNP[k]->one[indn] = true; SNP[k]->two[indn] = false; } } else { ///////////////////////// // Individual-major mode // 00 hom if (one==loc->allele1 && two==loc->allele1 && one!=par::missing_genotype) { person->one[k]=false; person->two[k]=false; } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { person->one[k]=false; person->two[k]=true; } // 11 hom else if (one==loc->allele2 && two==loc->allele2 && one!=par::missing_genotype) { person->one[k]=true; person->two[k]=true; } // 10 missing else { person->one[k]=true; person->two[k]=false; } } } return fatal_error; } plink-1.07-src/sharing.cpp0000644000265600020320000001565511264127624014647 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "plink.h" #include "perm.h" #include "options.h" #include "helper.h" void Plink::perm_sharingIBSTest(Perm & perm) { // This is a SNP-major test if (!par::SNP_major) Ind2SNP(); // Test statistic (set-based) vector delta(snpset.size()); // Empirical p-values perm.setTests(snpset.size()); //////////////////////////////// // Fast binary affection coding if (!par::qt) affCoding(*this); //////////////////////////////// // Set up permutation structure // (we need to perform this step // whether or not we also // subsequently permute) perm.setPermClusters(*this); perm.originalOrder(); ///////////////////// // Create original delta = sharingIBSTest(perm); ////////////////////// // Begin permutations bool finished = false; while(!finished) { perm.permuteInCluster(); vector pr = sharingIBSTest(perm); //////////////////////////////// // Standard permutation counting finished = perm.update(pr,delta); } // next permutation if (!par::silent) cout << "\n\n"; //////////////////// // Display results ofstream ASC; string f; if (par::adaptive_perm) f = par::output_file_name + ".sharing.perm"; else f = par::output_file_name + ".sharing.mperm"; ASC.open(f.c_str(),ios::out); ASC.precision(4); printLOG("Writing IBS sharing association results to [ " + f + " ] \n"); ASC << setw(20) << "SET" << " " << setw(12) << "EMP1" << " "; if (par::adaptive_perm) ASC << setw(12)<< "NP" << " "; else ASC << setw(12)<< "EMP2" << " "; ASC << "\n"; for (int l=0; l Plink::sharingIBSTest(Perm & perm) { // // number of rare (F) alleles shared // // FF FF -> // // FF FT -> // // FF TT -> -2 // // FT FF -> // // FT FT -> +1 // // FT TT -> // // TT FF -> -2 // // TT FT -> // // TT TT -> // // Number of sets // int ns = snpset.size(); // // Test statistics // vector delta(ns,0); // // Iterate over sets // for (int i=0; i::iterator a1 = (*loc)->one.begin(); // vector::iterator a2 = (*loc)->two.begin(); // vector::iterator gperson1 = sample.begin(); // int i1 = 0; // //////////////// // // Individual A // while ( i1 < n-1 ) // { // // Permuted self for first individual // Individual * pperson1 = (*gperson1)->pperson; // // U-? -- first individual unaffected // if ( ! ( sample[perm_pheno[i1]]->aff ) // { // // Consider each locus: FF x TT -> -2 // for (int l=0; lone[l]) && (!g1->two[l]) ) // { // // Consider all other individuals // for (int i2=i1+1; i2one[l] && // sample[perm_geno[i2]]->two[l] ) // { // // only count discordant pairs // if ( sample[perm_geno[i2]]->aff ) // s1[l]-=2; // } // } // } // else if ( (!g1->one[l]) && g1->two[l] ) // { // // ... if first member is FT // // Consider all other individuals // for (int i2=i1+1; i2 +1 // if ( (!sample[perm_geno[i2]]->one[l]) && // sample[perm_geno[i2]]->two[l] ) // { // // Discordant pair // if ( sample[perm_geno[i2]]->aff ) // s1[l]++; // } // } // } // else if ( g1->one[l] && g1->two[l] ) // { // // ... 
if first member is TT // // Consider all other individuals // for (int i2=i1+1; i2one[l]) && // (!sample[perm_geno[i2]]->two[l]) ) // { // // Discordant pair // if ( sample[perm_geno[i2]]->aff ) // s1[l]-=2; // } // } // } // } // next locus // } // else // { // // otherwise, we know first individul is affected // // and so we must now scan for AU and AA pairs (s1, s2) // // Consider each locus: FF x TT -> -2 // for (int l=0; lone[l]) && (!g1->two[l]) ) // { // // Consider all other individuals // for (int i2=i1+1; i2one[l] && // sample[perm_geno[i2]]->two[l] ) // { // // only count discordant pairs // if ( sample[perm_geno[i2]]->aff ) // s2[l]-=2; // conc aff // else // s1[l]-=2; // disc // } // } // } // else if ( (!g1->one[l]) && g1->two[l] ) // { // // ... if first member is FT // // Consider all other individuals // for (int i2=i1+1; i2 +1 // if ( (!sample[perm_geno[i2]]->one[l]) && // sample[perm_geno[i2]]->two[l] ) // { // if ( sample[perm_geno[i2]]->aff ) // s2[l]++; // else // s1[l]++; // } // } // } // else if ( g1->one[l] && g1->two[l] ) // { // // ... if first member is TT // // Consider all other individuals // for (int i2=i1+1; i2one[l]) && // (!sample[perm_geno[i2]]->two[l]) ) // { // if ( sample[perm_geno[i2]]->aff ) // s2[l]-=2; // else // s1[l]-=2; // } // } // } // } // next locus // } // } // next, first individual of pair vector t(1); return t; } plink-1.07-src/input.cpp0000644000265600020320000020350311264127624014342 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "nlist.h" #include "gvar.h" extern ofstream LOG; void Plink::readData() { ////////////////////// // Check files exist if ( ! par::ped_from_stdin) checkFileExists(par::pedfile); checkFileExists(par::mapfile); /////////////////////////////////////////////// // .map file vector include; vector include_pos(0); int nl_actual=0; readMapFile(par::mapfile, include, include_pos, nl_actual); /////////////////////////////////////////////// // .ped FILE * PED; if ( ! par::ped_from_stdin ) { PED = fopen64(par::pedfile.c_str(),"r"); if ( PED == NULL ) error("Problem opening PED file, errno = "+int2str(errno)); } vector ambiguous; int nmale = 0; int nfemale = 0; int nambig = 0; int c=0; // number of individuals string s2; while( 1 ) { // End of input stream? if ( par::ped_from_stdin ) { if ( cin.eof() ) break; } else { if ( feof(PED) ) break; } // Otherwise read in the next person Individual * person = new Individual; // Get first field int f=0; if ( par::ped_from_stdin) cin >> person->fid; else if (readString(PED,person->fid )) f++; // End of file? if ( person->fid=="" ) { delete person; continue; } if ( person->fid=="FID" ) error("FID is a reserved ID... please select a different family ID"); // Is this line a comment? if ( ! 
par::ped_from_stdin) { if (person->fid.substr(0,1)=="#") { // Ignore rest of line and advance to next line while (fgetc(PED) != '\n' && !feof(PED)) {} delete person; continue; } } // First 6 or 7 obligatory fields if ( par::ped_skip_fid ) person->iid = person->fid; else { if ( par::ped_from_stdin ) cin >> person->iid; else if ( readString(PED,person->iid )) f++; } if ( par::ped_skip_parents ) { person->mat = person->pat = "0"; } else { if ( par::ped_from_stdin ) { cin >> person->pat >> person->mat; } else { if ( readString(PED, person->pat )) f++; if ( readString(PED, person->mat )) f++; } } if ( par::ped_skip_sex ) person->sexcode = "0"; else { if ( par::ped_from_stdin ) cin >> person->sexcode; else if ( readString(PED, person->sexcode)) f++; } string phenotype; if ( par::ped_skip_pheno ) phenotype = par::missing_phenotype; else { if ( par::ped_from_stdin ) cin >> phenotype; else if (readString(PED,phenotype)) f++; } // Are we using 0/1 coding? if (par::coding01) { if ( phenotype == "1" ) phenotype = "2"; else if ( phenotype == "0" ) phenotype = "1"; else phenotype = "0"; } // Optional liability class if (par::liability) { string dummy; if ( par::ped_from_stdin ) cin >> dummy; else if (readString(PED,dummy)) f++; } // Skip last empty line that gets read if (person->fid=="") break; // Check sex if (person->sexcode=="1") { person->sex = true; // male nmale++; } else if (person->sexcode=="2") { person->sex = false; // female nfemale++; } else { ambiguous.push_back(person); nambig++; if (!par::ignore_missing_sex) person->missing = true; } ////////////////// // A non-founder? person->founder = (person->pat == "0" && person->mat == "0") ? true : false; ////////////////////////////// // Test for quantitative trait if (phenotype == par::missing_phenotype) person->missing = true; else { // Store in person->phenotype as number, checking for // conversion failure if ( ! from_string( person->phenotype, phenotype, std::dec ) ) person->missing = true; else { if (phenotype != "0" && phenotype != "1" && phenotype != "2" ) { par::qt = true; par::bt = false; } } } ///////////////////////////// // Add necessary locus space if (!par::SNP_major) { person->one.resize(nl_actual); person->two.resize(nl_actual); } ///////////////////// // Read genotypes now int gn=0; int i=0; bool linedone = false; bool fatal = false; string fmsg; while ( ! linedone ) { string one=""; string two=""; if ( par::ped_from_stdin ) cin >> one >> two; else { while (1) { char ch = fgetc(PED); // Delimiter? if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\r' || feof(PED) ) { if (ch=='\n' || ch=='\r' || feof(PED)) linedone = true; // have we already seen something? if (one.length()>0) { gn++; break; } if (ch=='\n' || ch=='\r' || feof(PED)) break; } else { one += ch; } } // Is this a compound genotype? if ( par::compound_genotype_code ) { // In this case, each allele must be exactly 1-character long if ( one.length() != 2 ) error("Problem with compound genotype [ " + one + " ] should be two characters long\n"); two = one[1]; one = one[0]; // Add second allele ++gn; } else { // Second allele if (!linedone) while (1) { char ch = fgetc(PED); // Delimiter? if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\r' || feof(PED) ) { if (ch=='\n' || ch=='\r' || feof(PED)) linedone = true; // have we already seen something? 
if (two.length()>0) { gn++; break; } if (ch=='\n' || ch=='\r' || feof(PED)) break; } else { two += ch; } } } if (linedone && one.length()==0 && two.length()==0 ) break; } ///////////////////////////////////// // Only consider loci to be included if (include[i]) { ////////////////////////////// // Look up genomic order, // insert in slot k in locus[] int k = include_pos[i]; Locus * loc = locus[k]; ///////////////////////////////////////// // Add allele names to list, if needed // If allele is not missing... if (one!=par::missing_genotype && two!=par::missing_genotype) { // ...and not already listed if (one!=loc->allele1 && one!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="") loc->allele1=one; else if(loc->allele2=="") loc->allele2=one; else { // .. or show an error if no empty slots if (!fatal) fmsg = "Locus " + loc->name + " has >2 alleles:\n individual " + person->fid + " " + person->iid + " has genotype [ " + one +" "+two+" ]\n" + " but we've already seen [ " + loc->allele1 + " ] and [ " + loc->allele2 + " ]\n"; fatal=true; } } } // Repeat for second allele, if different if (two!=one) { // If allele is not missing... if (one!=par::missing_genotype) // ...and not already listed if (two!=loc->allele1 && two!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="") loc->allele1=two; else if(loc->allele2=="") loc->allele2=two; else { if (!fatal) fmsg = "Locus " + loc->name + " has >2 alleles:\n individual " + person->fid + " " + person->iid + " has genotype [ " + one +" "+two+" ]\n" + " but we've already seen [ " + loc->allele1 + " ] and [ " + loc->allele2 + " ]\n"; fatal=true; } } } ///////////////////////////// // Add specific genotypes if (par::SNP_major) { // 00 hom if (one==loc->allele1 && two==loc->allele1) { SNP[k]->one.push_back(false); SNP[k]->two.push_back(false); } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { SNP[k]->one.push_back(false); SNP[k]->two.push_back(true); } // 11 hom else if (one==loc->allele2 && two==loc->allele2) { SNP[k]->one.push_back(true); SNP[k]->two.push_back(true); } // 10 missing else if (one==par::missing_genotype || two==par::missing_genotype) { SNP[k]->one.push_back(true); SNP[k]->two.push_back(false); } } else { // 00 hom if (one==loc->allele1 && two==loc->allele1) { person->one[k]=false; person->two[k]=false; } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { person->one[k]=false; person->two[k]=true; } // 11 hom else if (one==loc->allele2 && two==loc->allele2) { person->one[k]=true; person->two[k]=true; } // 10 missing else if (one==par::missing_genotype || two==par::missing_genotype) { person->one[k]=true; person->two[k]=false; } } } // Advance to next locus i++; if ( par::ped_from_stdin ) { if ( i == include.size() ) linedone = true; } else { if ( i > include.size()) { int ef = 1; if ( ! par::ped_skip_fid ) ef++; if ( ! par::ped_skip_parents ) ef+=2; if ( ! par::ped_skip_sex ) ef++; if ( ! par::ped_skip_pheno ) ef++; if ( par::liability ) ef++; fmsg += "\nProblem with line " +int2str(c+1)+" in [ "+par::pedfile+" ]\n"; fmsg += "Expecting " +int2str(ef) + " + 2 * " + int2str(include.size()) + " = " + int2str(ef+2*include.size())+ " columns, but found more\n"; error(fmsg); } } } // line done? // check size of line length somewhere if ( ! par::ped_from_stdin ) { if ( gn != 2 * include.size() ) { int ef = 1; if ( ! par::ped_skip_fid ) ef++; if ( ! par::ped_skip_parents ) ef+=2; if ( ! par::ped_skip_sex ) ef++; if ( ! 
par::ped_skip_pheno ) ef++; if ( par::liability ) ef++; fmsg += "\nA problem with line " +int2str(c+1)+" in [ "+par::pedfile+" ]\n"; fmsg += "Expecting "+int2str(ef) +" + 2 * " + int2str(include.size()) + " = " + int2str(ef+2*include.size())+ " columns, but found " + int2str(f+gn) + "\n"; fatal=true; } } if (fatal) error(fmsg); // Increase person counter c++; // Add individual to list sample.push_back(person); } // If a binary trait, now make 0 missing also // i.e. if we never saw other than missing, 0, 1 or 2 if (par::bt) for (int i=0; iphenotype == 0 ) sample[i]->missing = true; // Display list of ambiguously-sexed individuals? if (ambiguous.size()>0) { printLOG("Warning, found " +int2str(ambiguous.size()) +" individuals with ambiguous sex codes\n"); if (!par::ignore_missing_sex) printLOG("These individuals will be set to missing ( or use --allow-no-sex )\n"); string f = par::output_file_name + ".nosex"; printLOG("Writing list of these individuals to [ "+f+" ]\n"); ofstream AMB; AMB.open(f.c_str(), ifstream::out); for (int i=0; ifid << "\t" << ambiguous[i]->iid << "\n"; AMB.close(); ambiguous.clear(); } // Close PED file if ( ! par::ped_from_stdin ) fclose(PED); printLOG(int2str(c)+" individuals read from [ "+par::pedfile+" ] \n"); int nm=0; for (int i=0;imissing) nm++; printLOG(int2str(nm)+" individuals with nonmissing phenotypes\n"); if (par::bt) { if (par::coding01) printLOG("Assuming a disease phenotype (0=unaff, 1=aff, other=miss)\n"); else { printLOG("Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n"); if (par::missing_phenotype!="0") printLOG("Missing phenotype value is also " + par::missing_phenotype + "\n"); } int ncase = 0; int ncontrol = 0; int nmissing = 0; for (int i=0; imissing ) nmissing++; else if ( sample[i]->phenotype == 1 ) ncontrol++; else if ( sample[i]->phenotype == 2 ) ncase++; printLOG(int2str(ncase)+" cases, " +int2str(ncontrol)+" controls and " +int2str(nmissing)+" missing\n"); } else { printLOG("Assuming a quantitative trait\n"); printLOG("Missing phenotype value is " + par::missing_phenotype + "\n"); } // Display sex counts printLOG(int2str(nmale)+" males, "+int2str(nfemale) +" females, and "+int2str(nambig)+" of unspecified sex\n"); } void Plink::readSet() { bool firsttime = true; set subset; if ( par::use_subset ) { printLOG("Reading a list of subsets from [ " + par::subsetfile + " ]\n"); checkFileExists( par::subsetfile ); ifstream IN(par::subsetfile.c_str(), ios::in); while ( ! IN.eof() ) { string gname; IN >> gname; if ( gname=="" ) continue; subset.insert(gname); } printLOG("Read " + int2str( subset.size() ) + " sets to extract\n"); } ////////////////////// // Clear current sets // (i.e. after removing SNPs, the lookup numbers will be wrong so we // need to reload) if (snpset.size()>0) { firsttime = false; for (int i=0; i s; // First set name string name; SET >> name; // Make map of locus name with 'l' number map mlocus; for (int l=0;lname,l)); map::iterator ilocus; while(!SET.eof()) { string t; SET >> t; if (t=="END" || t=="end" || SET.fail() ) // End of SET { if ( SET.fail() ) printLOG("Warning: the set-file did not end with the END keyword\n"); if ( ( ! 
par::use_subset ) || subset.find(name) != subset.end() ) { // Save set snpset.push_back(s); // Save set name setname.push_back(name); } // Get next set name SET >> name; // Clear buffer s.resize(0); } else { // Lookup locus name ilocus = mlocus.find(t); if (ilocus != mlocus.end()) s.push_back(ilocus->second); } } if (firsttime) printLOG(int2str(snpset.size()) + " sets read from [ " + par::setfile + " ] \n"); } bool Plink::readClusterFile(bool verbose) { checkFileExists(par::include_cluster_filename); ifstream CLST(par::include_cluster_filename.c_str(), ios::in); // Make map of family/individual IDs. Originally, set to not be // permuted: i.e. if an individual does not appear in the --within // file, then they will not be permuted. Otherwise, all individuals // are permuted. map uid; map::iterator ii; map::iterator ik; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); sample[i]->sol = -1; } int cnt=0; int k=0; kname.resize(0); kmap.clear(); while (!CLST.eof()) { string pfid, piid; string cluster; char cline[par::MAX_LINE_LENGTH]; CLST.getline(cline,par::MAX_LINE_LENGTH,'\n'); // convert to string string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); // Trying to read past last column if (tokens.size() < 2+par::mult_clst) { if (! par::bmatch ) { for (int i0=0; i0second)->sol = ik->second; else // add to list { kmap.insert(make_pair(cluster,k)); (ii->second)->sol = k; kname.push_back(cluster); k++; } cnt++; } } CLST.close(); // Assign to Family-like groups of pointers (e.g. to enable DFAM routine) klist.clear(); for (int j=0;jsol > -1 ) klist[sample[i]->sol]->person.push_back(sample[i]); } if ( verbose ) printLOG(int2str( cnt ) + " of " + int2str(n) + " individuals assigned to " + int2str(k) + " cluster(s)\n"); nk = k; return true; } bool Plink::readPhenoFile() { // Assume binary trait unless we find out otherwise par::qt = false; par::bt = true; printLOG( "Reading alternate phenotype from [ " + par::pheno_filename + " ] \n"); checkFileExists(par::pheno_filename); ifstream PHE(par::pheno_filename.c_str(), ios::in); // Make map of family/individual IDs // and initially set all individuals as missing map uid; map::iterator ii; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); sample[i]->phenotype = -9; sample[i]->missing = true; } // Read in phenotype labels, if --pheno-name has been used if ( par::name_pheno != "" ) { string pfid, piid, ph; char cline[par::MAX_LINE_LENGTH]; PHE.getline(cline,par::MAX_LINE_LENGTH,'\n'); string sline = cline; if (sline!="") { string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if ( tokens[0] != "FID" ) error("First header field must be FID"); if ( tokens[1] != "IID" ) error("Second header field must be IID"); par::mult_pheno = -1; for ( int i=2; i tokens; while (ss >> buf) tokens.push_back(buf); if ( ccount < 0 ) ccount = tokens.size(); else if ( ccount != tokens.size() ) error("Wrong number of columns in file [ "+par::pheno_filename +" ] line :\n"+sline); if (tokens.size() < 2+par::mult_pheno) { if ( par::all_pheno ) { printLOG("Processed all phenotypes\n"); return false; } else error("Problem with [ "+par::pheno_filename +" ] -- not enough columns :\n"+sline); } pfid = tokens[0]; piid = tokens[1]; // Skip header, if --pheno-name not used but should have been // (shouldn't matter in any case, as no individual with FID // of "FID" should be allowed if ( pfid == "FID" || piid == "IID" ) { phenotype_name = tokens[1 + par::mult_pheno]; 
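// Illustrative (hypothetical) layout of an alternate phenotype file
// carrying a header row, as handled here: the requested column's label
// has just been stored in phenotype_name and the header line itself is
// skipped.
//
//   FID  IID  bmi    height
//   F1   P1   24.5   1.80
//   F1   P2   -9     1.65
//
// (example values only; a phenotype equal to par::missing_phenotype is
//  treated as missing)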
continue; } ph = tokens[1 + par::mult_pheno]; // Are we using 0/1 coding? if (par::coding01) { if ( ph == "1" ) ph = "2"; else if ( ph == "0" ) ph = "1"; else ph = "0"; } ii = uid.find(pfid+"_"+piid); if (ii != uid.end() ) { (ii->second)->missing = true; // Only connect for non-missing phenotypes if (ph != par::missing_phenotype) { (ii->second)->missing = false; } // Convert to double, checking for illegal values if ( ! from_string( (ii->second)->phenotype , ph , std::dec)) (ii->second)->missing = true; if (ph != par::missing_phenotype && ph != "0" && ph != "1" && ph != "2" ) { par::qt = true; par::bt = false; } } } PHE.close(); // If a binary trait, now make 0 missing also // i.e. if we never saw other than missing, 0, 1 or 2 if (par::bt) for (int i=0; iphenotype == 0 ) sample[i]->missing = true; int new_nmissing=0; for (int i=0; imissing ) new_nmissing++; printLOG(int2str(new_nmissing) + " individuals with non-missing alternate phenotype\n"); if (par::bt) { if (par::coding01) printLOG("Assuming a disease phenotype (0=unaff, 1=aff, other=miss)\n"); else { printLOG("Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n"); if (par::missing_phenotype!="0") printLOG("Missing phenotype value is also " + par::missing_phenotype + "\n"); } int ncase = 0; int ncontrol = 0; int nmissing = 0; for (int i=0; imissing ) nmissing++; else if ( sample[i]->phenotype == 1 ) ncontrol++; else if ( sample[i]->phenotype == 2 ) ncase++; } printLOG(int2str(ncase)+" cases, " +int2str(ncontrol)+" controls and " +int2str(nmissing)+" missing\n"); } else { printLOG("Assuming a quantitative trait\n"); printLOG("Missing phenotype value is " + par::missing_phenotype + "\n"); } return true; } void Plink::makePhenotype() { // Implies a binary trait par::qt = false; par::bt = true; printLOG("Constructing a binary phenotype from [ " +par::make_pheno_filename+" ]\n"); if ( par::make_pheno_present ) { map imap; printLOG("All individuals not present will be set as controls\n"); int cnt = 0; for (int i=0;ifid +"_" + sample[i]->iid , i )); sample[i]->phenotype = 1; sample[i]->missing = false; sample[i]->aff = false; } ifstream I1; while ( ! I1.eof() ) { vector tokens = tokenizeLine(I1); if ( tokens.size() < 2 ) continue; string fiid = tokens[0] + "_" + tokens[1]; map::iterator it = imap.find(fiid); if ( it != imap.end() ) { sample[ it->second ]->phenotype = 2; sample[ it->second ]->aff = true; ++cnt; } } I1.close(); printLOG( int2str( cnt ) + " individuals set as cases\n"); } else { printLOG("Test value is [ " + par::make_pheno_value + " ] and missing value is [ " + par::missing_phenotype + " ]\n"); // Swap filename as temporary cluster file string tmp_covar_file = par::include_cluster_filename; int tmp_mult_covar = par::mult_clst; par::include_cluster_filename = par::make_pheno_filename; par::mult_clst = 1; if (!readClusterFile()) error("Problem reading filter file [ " + par::make_pheno_filename + " ]\n"); // Put back the original covariate specificiation par::include_cluster_filename = tmp_covar_file; par::mult_clst = tmp_mult_covar; int setCase = 0; int setControl = 0; int setMissing = 0; int notFound = 0; map::iterator k = kmap.find( par::make_pheno_value ); int value = k != kmap.end() ? k->second : -1; map::iterator km = kmap.find( par::missing_phenotype ); int missingValue = km != kmap.end() ? 
km->second : -1; for (int i=0; imissing = true; for (int i=0; isol == -1 ) { person->missing = true; person->phenotype = 0; ++notFound; } else if ( person->sol == missingValue ) { person->missing = true; person->phenotype = 0; ++setMissing; } else if ( person->sol == value ) { person->missing = false;; person->phenotype = 2; ++setCase; } else { person->missing = false;; person->phenotype = 1; ++setControl; } } printLOG("Set "+int2str(setCase)+" cases and "+int2str(setControl)+" controls, "); printLOG(int2str(setMissing)+" missing, "+int2str(notFound)+" not found\n"); ////////////////////////////// // Clear cluster values now nk=1; kmap.clear(); kname.clear(); for (int i=0; isol = 0; } } bool Plink::readCovariateFile() { // This will set individuals as missing // if they have a missing value for the // covariate, or do not appear in the file checkFileExists(par::covar_filename); ifstream COV(par::covar_filename.c_str(), ios::in); map uid; map::iterator ii; set hasCovariate; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); } int nvalid=0; while (!COV.eof()) { string pfid, piid, cov; char cline[par::MAX_LINE_LENGTH]; COV.getline(cline,par::MAX_LINE_LENGTH,'\n'); // convert to string string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); // Trying to read past last column if (tokens.size() < 2+par::mult_covar) { if (! par::qmatch ) { for (int i0=0; i0second; // Set covariate value bool badValue = false; if ( ! from_string( person->covar, cov, std::dec ) ) badValue = true; // Note that we've seen a covariate for this individual hasCovariate.insert(person); // Was this missing? if (cov == par::missing_phenotype || badValue ) { person->missing = true; } else nvalid++; } } COV.close(); // Set to missing any individuals for who we did not see the covariate vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( hasCovariate.find( *person ) == hasCovariate.end() ) (*person)->missing = true; person++; } printLOG("Reading covariate from [ " + par::covar_filename + " ] with "); printLOG("nonmissing values for "+int2str(nvalid)+" individuals\n"); return true; } bool Plink::readCovListFile() { // This will set individuals as missing if they have a missing value // for the covariate, or do not appear in the file ifstream COV(par::clist_filename.c_str(), ios::in); map uid; map::iterator ii; set hasCovariate; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); } // If need (for later selection) keep explicit track of what is missing map > isMissing; map originalPersonMissingStatus; par::clist_number = -1; int nvalid=0; while (!COV.eof()) { string pfid, piid; vector_t clist; char cline[par::MAX_LINE_LENGTH]; COV.getline(cline,par::MAX_LINE_LENGTH,'\n'); // convert to string string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if ( par::clist_number < 0 ) { par::clist_number = tokens.size() - 2; // Assign default headers (can be overwritten) clistname.resize(par::clist_number); for (int c=0; csecond; // Track individual covariate missing status person->clistMissing.resize(par::clist_number); // Store original missingness status for this person originalPersonMissingStatus.insert(make_pair( person, person->missing )); vector missing_status; // Were any missing/bad values? 
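// A value that from_string() cannot parse as a number flags this
// individual as missing below; a value equal to par::missing_phenotype
// is recorded per-covariate in clistMissing.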
bool okay = true; // Add covariate values to clist person->clist.clear(); for (int c=2; c( t, tokens[c], std::dec ) ) okay = false; person->clist.push_back( t ); } // Note that we've seen a covariate for this individual hasCovariate.insert(person); for (int c=0; cclistMissing[c] = true; } else { missing_status.push_back(true); person->clistMissing[c] = false; } } if (!okay) person->missing = true; else nvalid++; // Record, if we will use this below if ( par::clist_selection ) isMissing.insert(make_pair( person, missing_status )); } else if ( pfid == "FID" && piid == "IID" ) { // This is a header row -- read in covariate names for (int c=0; c::iterator person = sample.begin(); vector dummy_missing_status(n,false); while ( person != sample.end() ) { if ( hasCovariate.find( *person ) == hasCovariate.end() ) { (*person)->missing = true; (*person)->clist.clear(); (*person)->clist.resize(par::clist_number, -9 ); (*person)->clistMissing.clear(); (*person)->clistMissing.resize(par::clist_number, true ); if ( par::clist_selection ) isMissing.insert(make_pair( (*person), dummy_missing_status )); } person++; } printLOG("Reading " + int2str(par::clist_number) + " covariates from [ " + par::clist_filename + " ] with "); printLOG("nonmissing values for " +int2str(nvalid) +" individuals\n"); ///////////////////////////////////////////////////////// // Do we actually want to keep all these covariates? if ( par::clist_selection_number || par::clist_selection_name ) { vector covlist; if ( par::clist_selection_number ) { NList nl(par::clist_number); covlist = nl.deparseNumberList(par::clist_selection_string); } else { map mapping; for (int c=0; cclist; vector tmpMissing = person->clistMissing; person->clist.clear(); person->clistMissing.clear(); // Reset per-person missing code person->missing = originalPersonMissingStatus.find( person )->second; vector missing_status = isMissing.find( person )->second; bool okay = true; for (int c=0; cclist.push_back( tmp[ covlist[c] ] ); person->clistMissing.push_back( tmpMissing[ covlist[c] ] ); if ( ! missing_status[covlist[c]] ) { person->missing = true; okay = false; } } if ( okay ) nvalid++; } // Reset sample-wide values (names, number) vector tmp = clistname; clistname.clear(); for (int c=0; c uid; map::iterator ii; set hasPhenotype; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); // If need (for later selection) keep explicit track of what is missing map > isMissing; map originalPersonMissingStatus; par::plist_number = -1; int nvalid=0; while (!PHEFILE.eof()) { string pfid, piid; vector_t clist; vector tokens = tokenizeLine( PHEFILE ); if ( par::plist_number < 0 ) { par::plist_number = tokens.size() - 2; // Assign default headers (can be overwritten) plistname.resize(par::plist_number); for (int c=0; csecond; // Track individual phenotype missing status person->plistMissing.resize(par::plist_number); // Store original missingness status for this person originalPersonMissingStatus.insert(make_pair( person, person->missing )); vector missing_status; // Were any missing/bad values? 
bool okay = true; // Add phenotype values to plist person->plist.clear(); for (int c=2; c( t, tokens[c], std::dec ) ) okay = false; person->plist.push_back( t ); } // Note that we've seen a covariate for this individual hasPhenotype.insert(person); for (int c=0; cplistMissing[c] = true; } else { missing_status.push_back(true); person->plistMissing[c] = false; } } if (!okay) person->missing = true; else nvalid++; // Record, if we will use this below if ( par::plist_selection ) isMissing.insert(make_pair( person, missing_status )); } else if ( pfid == "FID" && piid == "IID" ) { // This is a header row -- read in covariate names for (int c=0; c::iterator person = sample.begin(); vector dummy_missing_status(n,false); while ( person != sample.end() ) { if ( hasPhenotype.find( *person ) == hasPhenotype.end() ) { (*person)->missing = true; (*person)->plist.clear(); (*person)->plist.resize(par::plist_number, -9 ); (*person)->plistMissing.clear(); (*person)->plistMissing.resize(par::plist_number, true ); if ( par::plist_selection ) isMissing.insert(make_pair( (*person), dummy_missing_status )); } person++; } printLOG("Reading " + int2str(par::plist_number) + " phenotypes from [ " + par::multiple_phenotype_file + " ] with "); printLOG("nonmissing values for " +int2str(nvalid) +" individuals\n"); ///////////////////////////////////////////////////////// // Do we actually want to keep all these covariates? if ( par::plist_selection_number || par::plist_selection_name ) { vector phelist; if ( par::plist_selection_number ) { NList nl(par::plist_number); phelist = nl.deparseNumberList(par::plist_selection_string); } else { map mapping; for (int c=0; cplist; vector tmpMissing = person->plistMissing; person->plist.clear(); person->plistMissing.clear(); // Reset per-person missing code person->missing = originalPersonMissingStatus.find( person )->second; vector missing_status = isMissing.find( person )->second; bool okay = true; for (int c=0; cplist.push_back( tmp[ phelist[c] ] ); person->plistMissing.push_back( tmpMissing[ phelist[c] ] ); if ( ! 
missing_status[phelist[c]] ) { person->missing = true; okay = false; } } if ( okay ) nvalid++; } // Reset sample-wide values (names, number) vector tmp = plistname; plistname.clear(); for (int c=0; c uid; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); map mlocus; for (int l=0; lname,l)); map::iterator ii1; map::iterator ii2; map::iterator il1; map::iterator il2; int nseg=0; while (!SEG.eof()) { string pfid1, piid1, pfid2, piid2; string snp1, snp2; string dummy; Segment s; // Contains pointers to individuals (p1, p2) // and ints for start/stop (SNP coding) SEG >> pfid1 >> piid1 >> pfid2 >> piid2 >> dummy // phenotype >> dummy // CHR >> dummy // BP1 >> dummy // BP2 >> snp1 >> snp2 >> dummy // NSNP >> dummy; // KB bool okay = true; // Attached individuals ii1 = uid.find(pfid1+"_"+piid1); ii2 = uid.find(pfid2+"_"+piid2); if ( ii1 != uid.end() && ii2 != uid.end() ) { s.p1 = ii1->second; s.p2 = ii2->second; } else okay = false; // Attached bounding SNPs il1 = mlocus.find(snp1); il2 = mlocus.find(snp2); if ( il1 != mlocus.end() && il2 != mlocus.end() ) { s.start = il1->second; s.finish = il2->second; } else okay = false; if ( okay ) if ( locus[s.finish]->bp - locus[s.start]->bp < par::segment_length || s.finish - s.start + 1 < par::segment_snp ) okay = false; // Add to list if (okay) { segment.push_back(s); nseg++; } } printLOG("Read " + int2str(nseg) + " valid segments\n"); } void Plink::readSegmentFileMinimal(ifstream & SEG) { // Format: (no header) // {n1, n2} , {s1,s2}, {s1,s2} , {s1,s2} , {-1,-1} \n // n1 = int for individual 1 position in sample // n2 = int for individual 2 position in sample // s1 = segment start (first SNP int position) // s2 = segment finish // *** Health warning : the sample must be identical, or else markers // and individuals will be out... The verbose format avoids this problem, // but will generate much larger files *** int nseg=0; while (!SEG.eof()) { int p1, p2; Segment s; // Contains pointers to individuals (p1, p2) // and ints for start/stop (SNP coding) // For this pair; SEG >> p1 >> p2; if ( p1 < 0 || p2 > n ) error("Problem with segment file (minimal format)\n"); s.p1 = sample[p1]; s.p2 = sample[p2]; // Read segments while (true) { SEG >> s.start >> s.finish; // Move to next pair? if ( s.start == -1 ) break; // Segment long enough? 
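// Keep this segment only if it spans at least par::segment_length base
// pairs and covers at least par::segment_snp SNPs; shorter segments read
// from the minimal-format file are dropped.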
if ( locus[s.finish]->bp - locus[s.start]->bp >= par::segment_length && s.finish - s.start + 1 >= par::segment_snp ) { segment.push_back(s); nseg++; } // Read next segment for this pair } // Read next pair, while not EOF } printLOG("Read " + int2str(nseg) + " valid segments\n"); } void Plink::readConditioningList() { checkFileExists(par::conditioning_snps_file); ifstream COV(par::conditioning_snps_file.c_str(), ios::in); printLOG("Reading list of conditioning SNPs from [ " + par::conditioning_snps_file + " ]\n"); int c=0; int c2=0; while (!COV.eof()) { string snp; COV >> snp; if (snp=="") break; int x = getMarkerNumber( *this , snp ); if (x>=0) { conditioner.push_back( getMarkerNumber( *this , snp ) ); conditioner_mask.push_back( false ); c++; } c2++; } printLOG("Using " + int2str(c) + " of " +int2str(c2) +" specified conditioning SNPs\n"); COV.close(); } void Plink::readMapFile(string filename, vector & include, vector & include_pos, int & nl_actual ) { // chromosome code // SNP identifier // cM / M // Base Position (-ve implies exclude) vector ordered; ifstream MAP; MAP.open( filename.c_str()); MAP.clear(); int c=0; while(!MAP.eof()) { string chr; long int inc; char cline[256]; MAP.getline(cline,256,'\n'); // convert to string string sline = cline; if (sline=="") continue; string buf; stringstream ss(sline); vector tokens; while (ss >> buf) tokens.push_back(buf); if (tokens.size() == 0) continue; // Is this line a comment? if (tokens[0].substr(0,1)=="#") { continue; } if ( par::map3 && tokens.size() != 3 ) error("Problem with MAP file line:\n"+sline); else if ( (!par::map3) && tokens.size() != 4 ) error("Problem with MAP file line:\n"+sline); Locus * loc; if ( ! par::load_gvar ) loc = new Locus; else { Variant * vloc = new Variant; loc = (Locus*)vloc; } chr = tokens[0]; loc->name = tokens[1]; if ( par::map3 ) { loc->pos = 0; loc->bp = (long int)atoi(tokens[2].c_str()); } else { loc->pos = atof(tokens[2].c_str()); loc->bp = (long int)atoi(tokens[3].c_str()); } inc = loc->bp; // Check that cM/M specification looks correct, if // we want to perform a plink-based analysis if (par::plink && (!par::cm_map) && (loc->pos > 50) ) error("Looks like you need to specify --cm ??"); // Convert cM to M map distances if (par::cm_map) loc->pos /= 100; // Chromosome coding loc->chr = getChromosomeCode(chr); // Use the frequency slot temporarily to // store order information loc->freq = c++; // Are we including this locus? if (loc->name!="") { if (inc<0) { include.push_back(false); } else { include.push_back(true); locus.push_back(loc); } ordered.push_back(*loc); } } MAP.clear(); MAP.close(); printLOG(int2str(locus.size()) + " (of " + int2str(include.size()) + ") markers to be included from [ " + filename + " ]\n"); if ( par::load_gvar ) printLOG("Read in as generic variants, rather than SNPs\n"); if ( locus.size() == 0 ) shutdown(); /////////////////////////////////////////////// // Build ordered table, so that genotypes can // be inserted in correct order; then swap locus // file over // Sorting a vector of pointers, so we need this special fix stable_sort(locus.begin(),locus.end(),less()); // Sorting a normal vector stable_sort(ordered.begin(),ordered.end()); c=0; for (int i=0; iafreq // p2 p3 p1 p5 p4 : genetic position // 0 1 2 3 4 : file order // 1 0 1 0 1 : include // sort by cM // p1 p2 p3 p4 p5 : genetic // 2 0 1 4 3 : file order // 1 1 0 1 0 : include // 0 1 2 : add genetic order: nonmissing... 
// // sort by file order again // p2 p3 p1 p5 p4 : genetic // 0 1 2 3 4 : file // 1 0 1 0 1 : include // 1 0 2 : position to put in locus[l] /////////////////////////////////////////////// // Do we want to look at all the data? nl_actual = locus.size(); if ( (!par::plink) && (!par::run_chr==0) ) { // Get range setMarkerRange(); // And set to 'exclude' all markers outside of this range // (in physical distance terms) nl_actual = 0; for (int j=0; j par::run_end ) { include_pos.push_back(-1); include[j] = false; } else { include_pos.push_back(fp); nl_actual++; } } else // if already excluded { include_pos.push_back(-1); } } // 0 1 2 3 4 5 6 7 8 9 // We now have -1 -1 -1 3 4 5 6 -1 -1 -1 // but we want -1 -1 -1 0 1 2 3 -1 -1 -1 for (int j=0; j -1 ) include_pos[j] -= par::run_start ; } } else { // If we do want to look at all the data for (int j=0; j l0(0); for(int l=0; l < locus.size(); l++) { // If not in range if ( l < par::run_start || l > par::run_end ) { // Free memory for original element delete locus[l]; } else { l0.push_back(locus[l]); } } ///////////////// // And copy back locus.clear(); locus = l0; } /////////////////////////////////////////////////// // Add necessary locus space, if in SNP-major mode if (par::SNP_major && ! par::load_gvar ) { for (int i=0; i ambiguous; int nmale = 0; int nfemale = 0; int nambig = 0; int c=0; while(!PED.eof()) { Individual * person = new Individual; // First 6 obligatory fields string phenotype; PED >> person->fid >> person->iid >> person->pat >> person->mat >> person->sexcode >> phenotype; // Are we using 0/1 coding? if (par::coding01) { if ( phenotype == "1" ) phenotype = "2"; else if ( phenotype == "0" ) phenotype = "1"; else phenotype = "0"; } // Skip last empty line that gets read if (person->fid=="") { delete person; break; } // Check for reserved family ID code if ( person->fid=="FID" ) error("FID is a reserved ID... please select a different family ID"); // Check sex if (person->sexcode=="1") { person->sex = true; // male nmale++; } else if (person->sexcode=="2") { person->sex = false; // female (default) nfemale++; } else { ambiguous.push_back(person); nambig++; if (!par::ignore_missing_sex) person->missing = true; } /////////////// // A non-founder? person->founder = (person->pat == "0" && person->mat == "0") ? true : false; ////////////////////////////// // Test for quantitative trait if (phenotype == par::missing_phenotype) person->missing = true; else { if ( ! from_string( person->phenotype, phenotype, std::dec ) ) person->missing = true; else if (phenotype != "0" && phenotype != "1" && phenotype != "2" ) { par::qt = true; par::bt = false; } } // Increase person counter c++; // Add individual to list sample.push_back(person); } PED.clear(); PED.close(); // If a binary trait, now make 0 missing also // i.e. 
if we never saw other than missing, 0, 1 or 2 if (par::bt) for (int i=0; iphenotype == 0 ) sample[i]->missing = true; printLOG(int2str(c)+" individuals read from [ " + filename + " ] \n"); int nm=0; for (int i=0;imissing) nm++; printLOG(int2str(nm) + " individuals with nonmissing phenotypes\n"); if (par::bt) { if (par::coding01) printLOG("Assuming a disease phenotype (0=unaff, 1=aff, other=miss)\n"); else { printLOG("Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n"); if (par::missing_phenotype!="0") printLOG("Missing phenotype value is also " + par::missing_phenotype + "\n"); } int ncase = 0; int ncontrol = 0; int nmissing = 0; for (int i=0; imissing ) nmissing++; else if ( sample[i]->phenotype == 1 ) ncontrol++; else if ( sample[i]->phenotype == 2 ) ncase++; printLOG(int2str(ncase)+" cases, " +int2str(ncontrol)+" controls and " +int2str(nmissing)+" missing\n"); } else { printLOG("Assuming a quantitative trait\n"); printLOG("Missing phenotype value is " + par::missing_phenotype + "\n"); } // Display sex counts printLOG(int2str(nmale)+" males, "+int2str(nfemale) +" females, and "+int2str(nambig)+" of unspecified sex\n"); // Display list of ambiguously-sexed individuals? if (ambiguous.size()>0) { printLOG("Warning, found "+int2str(ambiguous.size()) +" individuals with ambiguous sex codes\n"); if (!par::ignore_missing_sex) printLOG("These individuals will be set to missing ( or use --allow-no-sex )\n"); string f = par::output_file_name + ".nosex"; printLOG("Writing list of these individuals to [ "+f+" ]\n"); ofstream AMB; AMB.open(f.c_str(), ifstream::out); for (int i=0; ifid << "\t" << ambiguous[i]->iid << "\n"; AMB.close(); ambiguous.clear(); } } void Plink::readHomozygSegmentFile(ifstream & SEG) { // No need to skip header line(s) as no individuals should be called // "FID". 
Because of this, we can just concatenate multiple .segment // files, and not worry about repeating the headers map uid; for (int i=0; ifid+"_"+sample[i]->iid,sample[i])); map mlocus; for (int l=0; lname,l)); map::iterator ii; map::iterator il1; map::iterator il2; int nseg=0; while (!SEG.eof()) { string pfid, piid; string snp1, snp2; string dummy; Segment s; // Contains pointers to individuals (p1, p2) // and ints for start/stop (SNP coding) SEG >> pfid >> piid >> dummy // phenotype >> dummy // CHR >> snp1 >> snp2 >> dummy // BP1 >> dummy // BP2 >> dummy // NSNP >> dummy // KB >> dummy // DENSITY >> dummy // PHOM >> dummy; // PHET bool okay = true; // Attached individuals ii = uid.find(pfid+"_"+piid); if ( ii != uid.end() ) s.p1 = s.p2 = ii->second; else okay = false; // Attached bounding SNPs il1 = mlocus.find(snp1); il2 = mlocus.find(snp2); if ( il1 != mlocus.end() && il2 != mlocus.end() ) { s.start = il1->second; s.finish = il2->second; } else okay = false; if ( okay ) { if ( locus[s.finish]->bp - locus[s.start]->bp < ( par::homo_run_length_kb * 1000) || s.finish - s.start + 1 < par::homo_run_length_snps ) okay = false; } // Add to list if (okay) { segment.push_back(s); nseg++; } } printLOG("Read " + int2str(nseg) + " valid segments\n"); } void Plink::readStdIn() { // Read a PED/MAP file from standard input } void Plink::updateMapFile() { map mlocus; for (int l=0; lname, l )); if ( par::update_cm ) printLOG("Reading new cM/M positions from [ " + par::update_mapfile + " ]\n"); else if ( par::update_chr ) printLOG("Reading new chromosome positions from [ " + par::update_mapfile + " ]\n"); else if ( par::update_name ) printLOG("Reading new SNP labels from [ " + par::update_mapfile + " ]\n"); else printLOG("Reading new physical positions from [ " + par::update_mapfile + " ]\n"); checkFileExists( par::update_mapfile ); ifstream MAPIN; MAPIN.open( par::update_mapfile.c_str(), ios::in ); int num_found = 0; int num_notfound = 0; set done; set names; bool nameWarning = false; while ( ! MAPIN.eof() ) { string snp; double value; string svalue; if ( par::update_chr || par::update_name ) MAPIN >> snp >> svalue; else MAPIN >> snp >> value; if ( snp == "" ) continue; map::iterator il = mlocus.find( snp ); set::iterator sl = done.find( snp ); if ( sl != done.end() ) { error(snp+" seen more than once in [ "+par::update_mapfile+" ]\n"); } if ( il != mlocus.end() ) { done.insert( snp ); if ( par::update_name ) { if ( names.find( svalue ) != names.end() ) nameWarning = true; names.insert( svalue ); } ++num_found; if ( par::update_cm ) locus[ il->second ]->pos = value; else if ( par::update_chr ) locus[ il->second ]->chr = getChromosomeCode(svalue); else if ( par::update_name ) locus[ il->second ]->name = svalue; else locus[ il->second ]->bp = (int)value; } else ++num_notfound; } printLOG( int2str( num_found ) + " SNP positions read and updated\n"); printLOG( int2str( nl_all - done.size() ) + " in data but not in [ " + par::update_mapfile + " ]\n"); if ( num_notfound > 0 ) printLOG(int2str(num_notfound) + " in [ " + par::update_mapfile + " ] but not in data\n"); if ( par::update_name && nameWarning ) printLOG("Warning -- duplicated SNP names found in update\n"); // Check if file needs re-ordering bool allInOrder = true; for (int l=1; lchr == locus[l-1]->chr ) { if ( par::update_cm ) { if ( locus[l]->pos < locus[l-1]->pos ) allInOrder = false; } else { if ( locus[l]->bp < locus[l-1]->bp ) allInOrder = false; } } else if ( locus[l]->chr < locus[l-1]->chr ) allInOrder = false; if ( ! 
allInOrder ) break; } if ( ! allInOrder ) printLOG("*** Implicit order changed from re-mapping ***\n"); } void Plink::updateAlleles() { map mlocus; for (int l=0; lname, l )); printLOG("Reading new allele codes from [ " + par::update_allele_file + " ]\n"); checkFileExists( par::update_allele_file ); ifstream MAPIN; MAPIN.open( par::update_allele_file.c_str(), ios::in ); int num_found = 0; int num_notfound = 0; int num_prob = 0; set done; set probs; while ( ! MAPIN.eof() ) { string snp; string old1, old2, new1, new2; MAPIN >> snp >> old1 >> old2 >> new1 >> new2; if ( snp == "" ) continue; map::iterator il = mlocus.find( snp ); set::iterator sl = done.find( snp ); if ( sl != done.end() ) { error(snp + " seen more than once in [ "+par::update_allele_file+" ]\n"); } bool success = false; if ( il != mlocus.end() ) { done.insert( snp ); ++num_found; if ( locus[ il->second ]->allele1 == old1 || locus[ il->second ]->allele2 == old2 ) { locus[ il->second ]->allele1 = new1; locus[ il->second ]->allele2 = new2; } else if ( locus[ il->second ]->allele1 == old2 || locus[ il->second ]->allele2 == old1 ) { locus[ il->second ]->allele1 = new2; locus[ il->second ]->allele2 = new1; } else { ++num_prob; probs.insert( il->second ); } } else ++num_notfound; } printLOG( int2str( num_found-num_prob ) + " SNPs found and allele codes updated\n"); if( nl_all - done.size() > 0 ) printLOG( int2str( nl_all - done.size() ) + " in data but not in [ " + par::update_allele_file + " ]\n"); if ( num_notfound > 0 ) printLOG(int2str(num_notfound) + " in [ " + par::update_allele_file + " ] but not in data\n"); if ( num_prob > 0 ) { printLOG(int2str(num_prob) + " SNPs with allele conflicts listed in [ " + par::output_file_name + ".allele.no.snp ]\n"); ofstream O( ( par::output_file_name + ".allele.no.snp" ).c_str() , ios::out); set::iterator i = probs.begin(); while ( i != probs.end() ) { O << locus[ *i ]->name << "\t" << locus[ *i ]->allele1 << "\t" << locus[ *i ]->allele2 << "\n"; ++i; } O.close(); } } void Plink::updateFamFile() { map mpeople; for (int i=0; ifid+"_"+sample[i]->iid , i )); ifstream FAM_ID, FAM_PAR, FAM_SEX, FAM_PHE; if ( par::update_ids ) { printLOG("Reading new FIDs and IIDs from [ " + par::update_ids_file + " ]\n"); checkFileExists( par::update_ids_file ); FAM_ID.open( par::update_ids_file.c_str(), ios::in ); } if ( par::update_ids ) { int num_found = 0; int not_found = 0; while (! FAM_ID.eof() ) { vector tokens = tokenizeLine( FAM_ID ); if ( tokens.size() == 0 ) continue; if ( tokens.size() != 4 ) error("Problem with line in --update-ids file: expects 4 columns per row"); string index = tokens[0] + "_" + tokens[1]; if ( mpeople.find( index ) != mpeople.end() ) { ++num_found; Individual * p = sample[ mpeople.find(index)->second ]; p->fid = tokens[2]; p->iid = tokens[3]; } else ++not_found; } printLOG( int2str(num_found) + " individuals found, " + int2str(not_found) + " not in sample\n"); FAM_ID.close(); } if ( par::update_sex ) { printLOG("Reading new sex codes from [ " + par::update_sex_file + " ]\n"); checkFileExists( par::update_sex_file ); FAM_SEX.open( par::update_sex_file.c_str(), ios::in ); } if ( par::update_sex ) { int num_found = 0; int not_found = 0; while (! 
FAM_SEX.eof() ) { vector tokens = tokenizeLine( FAM_SEX ); if ( tokens.size() == 0 ) continue; if ( tokens.size() != 3 ) error("Problem with line in --update-sex file: expects 3 columns per row"); string index = tokens[0] + "_" + tokens[1]; if ( mpeople.find( index ) != mpeople.end() ) { ++num_found; Individual * p = sample[ mpeople.find(index)->second ]; p->sexcode = tokens[2]; if (p->sexcode=="1") p->sex = true; // male else if (p->sexcode=="2") p->sex = false; // female else if (!par::ignore_missing_sex) p->missing = true; } else ++not_found; } printLOG( int2str(num_found) + " individuals found, " + int2str(not_found) + " not in sample\n"); FAM_SEX.close(); } if ( par::update_parents ) { printLOG("Reading new parental codes from [ " + par::update_parents_file + " ]\n"); checkFileExists( par::update_parents_file ); FAM_PAR.open( par::update_parents_file.c_str(), ios::in ); } if ( par::update_parents ) { int num_found = 0; int not_found = 0; while (! FAM_PAR.eof() ) { vector tokens = tokenizeLine( FAM_PAR ); if ( tokens.size() == 0 ) continue; if ( tokens.size() != 4 ) error("Problem with line in --update-parents file: expects 4 columns per row"); string index = tokens[0] + "_" + tokens[1]; if ( mpeople.find( index ) != mpeople.end() ) { ++num_found; Individual * p = sample[ mpeople.find(index)->second]; p->pat = tokens[2]; p->mat = tokens[3]; } else ++not_found; } printLOG( int2str(num_found) + " individuals found, " + int2str(not_found) + " not in sample\n"); FAM_PAR.close(); } if ( par::update_pheno ) { printLOG("Reading phenotypes to update from [ " + par::update_pheno_file + " ]\n"); checkFileExists( par::update_pheno_file ); FAM_PHE.open( par::update_pheno_file.c_str(), ios::in ); } if ( par::update_pheno ) { int num_found = 0; int not_found = 0; while (! FAM_PHE.eof() ) { vector tokens = tokenizeLine( FAM_PHE ); if ( tokens.size() == 0 ) continue; if ( tokens.size() != 3 ) error("Problem with line in --update-pheno file: expects 3 columns per row"); string index = tokens[0] + "_" + tokens[1]; if ( mpeople.find( index ) != mpeople.end() ) { ++num_found; Individual * p = sample[ mpeople.find(index)->second]; if (par::coding01) { if ( tokens[2] == "1" ) tokens[2] = "2"; else if ( tokens[2] == "0" ) tokens[2] = "1"; else tokens[2] = "0"; } if ( tokens[2] == par::missing_phenotype) p->missing = true; else { if ( ! from_string( p->phenotype, tokens[2], std::dec ) ) p->missing = true; else { if ( tokens[2] != "0" && tokens[2] != "1" && tokens[2] != "2" ) { par::qt = true; par::bt = false; } } } } else ++not_found; } printLOG( int2str(num_found) + " individuals found, " + int2str(not_found) + " not in sample\n"); FAM_PHE.close(); // Do we need to recode binary phenotypes? if (par::bt) affCoding(*this); } } plink-1.07-src/segment.cpp0000644000265600020320000016055011264127625014652 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" #include "fisher.h" #include "stats.h" void Plink::findSegments(int i1, int i2, vector_t & p, ofstream & SEG) { Individual * p1 = sample[i1]; Individual * p2 = sample[i2]; bool inseg = false; Segment s; s.p1 = p1; s.p2 = p2; int npos = p.size(); // Marker if minimal segment output file is selected bool already_seen_pair = false; for (int l=0; l= par::segment_threshold_start) { inseg = true; if (m1[l] != -1) s.start = m1[l]; else s.start = par::run_start; } /////////////////////////// // End of existing segment? if ( inseg && ( p[l] < par::segment_threshold_finish || l == npos-1 ) ) { inseg = false; if (m2[l-1] != -1) s.finish = m2[l-1]; else s.finish = par::run_end; // Do we like this segment? if ( locus[s.finish]->bp - locus[s.start]->bp >= par::segment_length && s.finish-s.start+1 >= par::segment_snp ) { // Add segment to list segment.push_back(s); // Display? if (par::segment_output) { if ( par::segment_minimal ) { if ( ! already_seen_pair ) { already_seen_pair = true; SEG << i1 << " " << i2 << " "; } SEG << s.start << " " << s.finish << " "; } else { // Long format SEG << setw(par::pp_maxfid) << s.p1->fid << " " << setw(par::pp_maxiid) << s.p1->iid << " " << setw(par::pp_maxfid) << s.p2->fid << " " << setw(par::pp_maxiid) << s.p2->iid << " "; if (par::bt) { if ( (!p1->aff) && (!p2->aff) ) SEG << setw(4) << "-1" << " "; else if ( p1->aff && p2->aff ) SEG << setw(4) << "1" << " "; else if ((!p1->aff) && p2->aff) SEG << setw(4) << "0" << " "; else if (p1->aff && !p2->aff) SEG << setw(4) << "0" << " "; else SEG << setw(4) << "NA" << " "; } else SEG << setw(4) << "NA" << " "; Locus * start = locus[s.start]; Locus * finish = locus[s.finish]; SEG << setw(4) << par::run_chr << " " << setw(10) << start->bp << " " << setw(10) << finish->bp << " " << setw(par::pp_maxsnp) << start->name << " " << setw(par::pp_maxsnp) << finish->name << " " << setw(6) << s.finish-s.start+1 << " " << setw(10) << (double)(finish->bp - start->bp)/(double)1000 << "\n"; SEG.flush(); } } } } } // If we have found segments for this pair, then // record a end-of-pair code if in minimal output mode if ( par::segment_minimal && already_seen_pair ) SEG << "-1 -1\n"; } void Plink::segmentIndividualTest(Perm & perm) { printLOG("Writing individual-based segment tests to [ " + par::output_file_name + ".segtest1 ]\n"); int total_cases = 0, total_controls = 0; for (int i=0; iaff ) total_cases++; else total_controls++; map ip; for (int i=0; ichr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << perm.pvalue(l) << " " << setw(12) << perm.max_pvalue(l) << "\n"; } SEGS.close(); } vector_t Plink::perm_segmentIndividualTest(Perm & perm, bool display, int total_cases, int total_controls, map & ip) { vector_t results; ofstream SEGS; if ( display ) { SEGS.open( (par::output_file_name+".segtest1").c_str() , ios::out ); SEGS.precision(4); SEGS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(5) << "TEST" << " " << setw(8) << "AFF" << " " << setw(8) << "UNAFF" << " " << setw(8) << "PHAT" << " " << setw(8) << "P0" << " " << setw(8) << "Z" << " " << setw(8) << "P" << "\n"; } // Consider each SNP position // Count : number of cases with at least one segment // number of controls with at least one segment // versus : equivalent counts of those w/out a 
segment // Calculate genome-wide means for cases and controls // Tests: // 1) Number of cases with at least one segment versus // number of contorls with at least one segment // 2) Number of case segments versus numbr of control segments // Options: // Make 1-sided (i.e. more case sharing expected) // Adjust for genome-wide sharing level // Use Fisher's exact versus use standard chi-square // Use permutation or not (permute individuals) // CHECK: okay to have floating point values for fisher's test? /////////////////////////////////////////////////////// // Calculate mean levels of sharing in cases and controls long int ncases = 0; long int ncontrols = 0; for (int l=0; l people; vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->start <= l && s->finish >= l ) { if ( people.find( s->p1 ) == people.end() ) { people.insert( s->p1 ); if ( s->p1->pperson->aff ) ncases++; else ncontrols++; } // Only consider second individual for IBD sharing test if ( ! par::homo_run ) { if ( people.find( s->p2 ) == people.end() ) { people.insert( s->p2 ); if ( s->p2->pperson->aff ) ncases++; else ncontrols++; } } } s++; } // next segment } double mean_case_sharing = (double)ncases / (double)(nl_all); double mean_control_sharing = (double)ncontrols / (double)(nl_all); ///////////////////////////////////////// // Consider this position for (int l=0; l people; /////////////////////////////////////////////////////// // Consider all segments that might span this position vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->start <= l && s->finish >= l ) { if ( people.find( s->p1 ) == people.end() ) { people.insert( s->p1 ); if ( s->p1->pperson->aff ) ncases++; else ncontrols++; } // Only consider second individual for IBD sharing test if ( ! par::homo_run ) { if ( people.find( s->p2 ) == people.end() ) { people.insert( s->p2 ); if ( s->p2->pperson->aff ) ncases++; else ncontrols++; } } } s++; } // next segment ////////////////////////////////////////////////// // Adjust table by the genome-wide means for case // sharing and control sharing // Scale adjustment to be proportional to total amount of // sharing at this particular locus // double expected_cases = (ncases+ncontrols) * // ( mean_case_sharing/(mean_case_sharing+mean_control_sharing) ); // double expected_controls = (ncases+ncontrols) * // ( mean_control_sharing/(mean_case_sharing+mean_control_sharing) ); // double pvalue = 1; // if ( ncases > expected_cases ) // { // double chi1 = ncases - expected_cases; // chi1 *= chi1; // chi1 /= expected_cases; // double chi2 = ncontrols - expected_controls; // chi2 *= chi2; // chi2 /= expected_controls; // double chisq = chi1 + chi2; // pvalue = chiprobP(chisq,1); // } // Scale ////////////////////////////////////////////////// // Fisher's exact test, as counts might be small, // or standard chi-sq // matrix_t t; // sizeMatrix(t,2,2); // t[0][0] = ncases_adj; // t[0][1] = ncontrols_adj; // t[1][0] = total_cases - ncases_adj; // t[1][1] = total_controls - ncontrols_adj; // cout << t[0][0] << "\t" // << t[0][1] << "\t" // << t[1][0] << "\t" // << t[1][1] << "\n"; ////////////////////////////////////////////////// // Either chi-square of Fisher's exact for a p-value // double pvalue = 1; // // A one sided t-test: // if ( ncases_adj > ncontrols_adj ) // && (double)ncases/(double)total_cases // > (double)ncontrols/(double)total_controls ) // { // pvalue = par::segment_test_fisher // ? 
// fisher(t) : // chiprobP(chi2x2(t),1); // pvalue = chiprobP(chi2x2(t),1); // } ////////////////////////////////////////////////////////////// // 1-sided test of proportions with the normal approximation // z = (p_hat - p0) / sqrt( ( p0*(1-p0) ) / N ) // where N is total number of individuals // N*p should be > 5, double p0 = mean_case_sharing/(mean_case_sharing+mean_control_sharing); double phat = (double)ncases/double(ncases+ncontrols); double z = (phat - p0) / sqrt( ( p0*(1-p0) ) / (ncases+ncontrols) ); double pvalue = z < 0 ? 1 : normdist(-z); // cout << "STAT: " << p0 << "\t" << phat << "\t" << z << "\n"; /////////////////// // Store results? if ( ! realnum(p0) ) { pvalue = 1; } results.push_back(1-pvalue); //////////////////// // Display results? if ( display ) { if ( ! realnum(p0) ) SEGS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(5) << "ALL" << " " << setw(8) << ncases << " " << setw(8) << ncontrols << " " << setw(8) << "NA" << " " << setw(8) << p0 << " " << setw(8) << 0 << " " << setw(8) << 1 << "\n"; else SEGS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(5) << "ALL" << " " << setw(8) << ncases << " " << setw(8) << ncontrols << " " << setw(8) << phat << " " << setw(8) << p0 << " " << setw(8) << z << " " << setw(8) << pvalue << "\n"; } //////////////////////////////////////////////////////////// // Consider allele-specific segments spanning this position // (groups of) // NEED TO ADJUST TEST STATISTIC BELOW, IF ABOVE FIX WORKS if ( false && par::segment_test_specific_segs ) { groupSegmentsSpanning(l); map groupCount; for ( int i = 0; i < n ; i++ ) { if ( indivSegmentGroup[i] == -1 ) continue; map::iterator gi = groupCount.find( indivSegmentGroup[i] ); if ( gi == groupCount.end() ) groupCount.insert(make_pair( indivSegmentGroup[i] , 1 ) ); else gi->second++; } // Consider all allelically-matching groups with more than a // set number of segments map::iterator gi = groupCount.begin(); while ( gi != groupCount.end() ) { if ( gi->second < 10 ) { gi++; continue; } int group = gi->first; int ncases = 0, ncontrols = 0; set people; // Consider all segments that might span this position vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->start <= l && s->finish >= l ) { if ( people.find( s->p1 ) == people.end() ) { if ( indivSegmentGroup[ ip.find( s->p1 )->second ] == group ) { people.insert( s->p1 ); if ( s->p1->pperson->aff ) ncases++; else ncontrols++; } } if ( ! par::homo_run ) { if ( people.find( s->p2 ) == people.end() ) { if ( indivSegmentGroup[ ip.find( s->p2 )->second ] == group ) { people.insert( s->p2 ); if ( s->p2->pperson->aff ) ncases++; else ncontrols++; } } } } s++; } // next segment // Fisher's exact test, as counts might be small table_t t; sizeTable(t,2,2); t[0][0] = ncases; t[0][1] = ncontrols; t[1][0] = total_cases - ncases; t[1][1] = total_controls - ncontrols; double pvalue = 1; if ( ncases > 0 && (double)ncases/(double)total_cases > (double)ncontrols/(double)total_controls ) { pvalue = par::segment_test_fisher ? 
fisher(t) : chiprobP(chi2x2(t),1); } if ( display ) { SEGS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(5) << group << " " << setw(12) << ncases << " " << setw(12) << ncontrols << " " << setw(12) << pvalue << "\n"; } gi++; } } //end of specific segment section } // Next SNP if (display) SEGS.close(); return results; } void Plink::segmentPermutationTest(Perm & perm, bool ibd, string f, vector & coverage_conc_aff, vector & coverage_disc, vector & coverage_conc_unaff ) { // IBD or IBS segment permutation test for case/control data // Calculate total number of concordant/discordant pairs int tot_conc_aff=0; int tot_not=0; for (int i1=0; i1aff && sample[i2]->aff ) tot_conc_aff++; else { if ( par::segment_test_ignore_discordant ) { if ( (!sample[i1]->aff) && (!sample[i2]->aff) ) tot_not++; } else tot_not++; } printLOG(int2str(tot_conc_aff)+" concordant affected pairs out of " +int2str(tot_not+tot_conc_aff)+" in total\n"); if ( par::segment_test_ignore_discordant ) printLOG("Comparing case/case to control/control pairs\n"); else printLOG("Comparing case/case to non-case/case pairs\n"); int nt = nl_all; // IBS mode if (ibd) { nt = nl; } perm.setTests(nt); perm.setPermClusters(*this); perm.originalOrder(); vector_t original(nt); vector_t origSC(nt); vector_t origSD(nt); // Get genome-wide means double SCg = 0; double SDg = 0; for (int l=0; l0 ? statistic / (SC+SD) : statistic; original[l] = statistic; origSC[l] = SC; origSD[l] = SD; } ////////////////////// // Begin permutations // Edit 'fringe' status if in IBD mode if (ibd) { vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->start == -1 ) s->start = par::run_start; if ( s->finish == -1 ) s->finish = par::run_end; // Edit to make positions relative to start of this chromosome s->start -= par::run_start; s->finish -= par::run_start; s++; } } bool finished = false; while(!finished) { // Store permuted results vector pr(nt); // Permute perm.permuteInCluster(); // Retest vector coverage_conc_aff(nt,0); vector coverage_conc_unaff(nt,0); vector coverage_disc(nt,0); vector::iterator s = segment.begin(); while ( s != segment.end() ) { if (s->p1->pperson->aff == s->p2->pperson->aff) { if ( s->p1->pperson->aff) for (int l = s->start ; l <= s->finish; l++) coverage_conc_aff[l]++; else for (int l = s->start ; l <= s->finish; l++) coverage_conc_unaff[l]++; } else for (int l = s->start ; l <= s->finish; l++) coverage_disc[l]++; s++; } double SCg = 0; double SDg = 0; for (int l=0; l 0 ? 
statistic / (SC+SD) : statistic; pr[l] = statistic; } //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } if (!par::silent) cout << "\n\n"; //////////////////////////// // Display permuted p-values f += ".mperm"; printLOG("Writing segment test permutation results to [ "+f+" ]\n"); ofstream SIBS; SIBS.open( f.c_str() , ios::out ); SIBS.precision(4); SIBS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "STAT" << " " << setw(10) << "CONA" << " " << setw(10) << "OTHER" << " " << setw(10) << "EMP1" << " " << setw(10) << "EMP2" << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; SIBS << setw(10) << original[l] << " " << setw(10) << origSC[l] << " " << setw(10) << origSD[l] << " " << setw(10) << perm.pvalue(l) << " " << setw(10) << perm.max_pvalue(l) << "\n"; } SIBS.close(); } void Plink::testGenomeIBDByCovariate(Perm & perm) { // Calculate case/case, control/control and case/control // Comparisons: // 1) Case/case versus all others // 2) Case/control versus all others // 3) Control/control versus all others // 4) Case/case versus case/control // 5) Control/Control versus case/control // 6) Case/case versus control/control // Test based on yes/no dichotomy for % pairs above certain PIHAT threshold int tot_conc_aff=0; int tot_conc_unaff=0; int tot_disc=0; for (int i1=0; i1aff ) { if ( sample[i2]->aff ) tot_conc_aff++; else tot_disc++; } else { if ( sample[i2]->aff ) tot_disc++; else tot_conc_unaff++; } } printLOG(int2str(tot_conc_aff)+" concordant affected pairs\n"); printLOG(int2str(tot_conc_unaff)+" concordant unaffected pairs\n"); printLOG(int2str(tot_disc)+" discordant pairs\n"); printLOG(int2str(tot_conc_aff+tot_conc_unaff+tot_disc)+" total pairs\n"); int nt = 6; // six tests (above) perm.setTests(nt); perm.setPermClusters(*this); perm.originalOrder(); /////////////////////////// // Original test statistics vector original(nt); // Iterate over all pairs double prop_11 = 0; double prop_01 = 0; double prop_00 = 0; int c=0; for (int i1=0; i1aff ) { if ( sample[i2]->aff ) prop_11++; else prop_01++; } else { if ( sample[i2]->aff ) prop_01++; else prop_00++; } } } // 1) Case/case versus all others // 2) Case/control versus all others // 3) Control/control versus all others // 4) Case/case versus case/control // 5) Control/Control versus case/control // 6) Case/case versus control/control original[0] = (double)prop_11/(double)tot_conc_aff - double(prop_01+prop_00)/(double)(tot_disc+tot_conc_unaff) ; original[1] = (double)prop_01/(double)tot_disc - double(prop_11+prop_00)/(double)(tot_conc_aff+tot_conc_unaff) ; original[2] = (double)prop_00/(double)tot_conc_unaff - double(prop_01+prop_11)/(double)(tot_disc+tot_conc_aff) ; original[3] = (double)prop_11/(double)tot_conc_aff - double(prop_01)/(double)(tot_disc) ; original[4] = (double)prop_00/(double)tot_conc_unaff - double(prop_01)/(double)(tot_disc) ; original[5] = (double)prop_11/(double)tot_conc_aff - double(prop_00)/(double)(tot_conc_unaff) ; //////////////////// // Begin permutations bool finished = false; while(!finished) { // Store permuted results vector pr(nt,0); // Permute perm.permuteInCluster(); // Retest double prop_11 = 0; double prop_01 = 0; double prop_00 = 0; for (int i1=0; i1pperson->aff ) { if ( sample[i2]->pperson->aff ) prop_11++; else prop_01++; } else { if ( sample[i2]->pperson->aff ) prop_01++; else prop_00++; } } } // 1) Case/case versus all others // 2) Case/control versus all others // 3) Control/control 
versus all others // 4) Case/case versus case/control // 5) Control/Control versus case/control // 6) Case/case versus control/control pr[0] = (double)prop_11/(double)tot_conc_aff - double(prop_01+prop_00)/(double)(tot_disc+tot_conc_unaff) ; pr[1] = (double)prop_01/(double)tot_disc - double(prop_11+prop_00)/(double)(tot_conc_aff+tot_conc_unaff) ; pr[2] = (double)prop_00/(double)tot_conc_unaff - double(prop_01+prop_11)/(double)(tot_disc+tot_conc_aff) ; pr[3] = (double)prop_11/(double)tot_conc_aff - double(prop_01)/(double)(tot_disc) ; pr[4] = (double)prop_00/(double)tot_conc_unaff - double(prop_01)/(double)(tot_disc) ; pr[5] = (double)prop_11/(double)tot_conc_aff - double(prop_00)/(double)(tot_conc_unaff) ; //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } if (!par::silent) cout << "\n\n"; //////////////////////////// // Display permuted p-values string f = par::output_file_name + ".genome.mperm"; printLOG("Writing permuted results for genome-wide IBD test to [ "+f+" ]\n"); ofstream SIBS; SIBS.open( f.c_str() , ios::out ); for (int l=0; l::iterator s = segment.begin(); while ( s != segment.end() ) { int c2 = 0; int c1 = 0; int c0 = 0; int cx = 0; for (int l = s->start; l <= s->finish; l++) { bool a1 = s->p1->one[l]; bool a2 = s->p1->two[l]; bool b1 = s->p2->one[l]; bool b2 = s->p2->two[l]; bool ibs0 = false; bool ibs1 = false; bool miss = false; if ( a1 == a2 && b1 == b2 && a1 != b1 ) ibs0 = true; else if ( a1 && !(a2) ) miss = true; else if ( b1 && !(b2) ) miss = true; else if ( a1 != b1 || a2 != b2 ) ibs1 = true; if (ibs0) c0++; else if (ibs1) c1++; else if (miss) cx++; else c2++; } SIBS << setw(10) << c0 << " " << setw(10) << c1 << " " << setw(10) << c2 << " " << setw(10) << cx << "\n"; // Next segment s++; } SIBS.close(); } ////////////////////////////////////////////// // General segmental permutation test routine void Plink::summaryIBSsegments(Perm & perm) { vector coverage_conc_aff(nl_all,0); vector coverage_conc_unaff(nl_all,0); vector coverage_disc(nl_all,0); vector::iterator s = segment.begin(); while ( s != segment.end() ) { if (s->p1->aff == s->p2->aff) { if ( s->p1->aff) for (int l = s->start ; l <= s->finish; l++) coverage_conc_aff[l]++; else for (int l = s->start ; l <= s->finish; l++) coverage_conc_unaff[l]++; } else for (int l = s->start ; l <= s->finish; l++) coverage_disc[l]++; s++; } // Optionally, if we allow 'wings' to increase span // of events (and so, each data point represents the // number of events with X kb of that position) if ( ( par::homo_run || par::cnv_list ) && par::seg_test_window ) { printLOG("Summarising segments within a window of " + int2str((int)(par::seg_test_window_bp/1000)) + " kb\n"); vector::iterator s = segment.begin(); while ( s != segment.end() ) { // Shift left from start int l = s->start; Locus * loc1 = locus[s->start]; while ( 1 ) { --l; if ( l < 0 ) break; Locus * loc2 = locus[l]; if ( loc2->chr != loc1->chr ) break; if ( loc1->bp - loc2->bp > par::seg_test_window_bp ) break; if ( s->p1->aff ) ++coverage_conc_aff[l]; else ++coverage_conc_unaff[l]; } // Shift right from start l = s->finish; loc1 = locus[s->finish]; while ( 1 ) { ++l; if ( l == nl_all ) break; Locus * loc2 = locus[l]; if ( loc2->chr != loc1->chr ) break; if ( loc2->bp - loc1->bp > par::seg_test_window_bp ) break; if ( s->p1->aff ) ++coverage_conc_aff[l]; else ++coverage_conc_unaff[l]; } // Next segment s++; } } string f = par::output_file_name; if ( par::segment_output ) f += ".segment.summary"; else if ( par::homo_run 
) f += ".hom.summary"; else if ( par::cnv_list ) f += ".cnv.summary"; else if ( par::ibs_2only ) f += ".ibs2.summary"; else f += ".ibs.summary"; ///////////////////////////////////// // Treat CNVs as homozygous segments // i.e. just a per-individual and not // a per-pair phenomenon if ( par::cnv_list ) par::homo_run = true; ofstream SIBS; SIBS.open( f.c_str() , ios::out ); ////////////////////////////// // And display if ( par::homo_run ) SIBS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(8) << "AFF" << " " << setw(8) << "UNAFF" << "\n"; else SIBS << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(8) << "CONA" << " " << setw(8) << "DISC" << " " << setw(8) << "CONU" << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(8) << coverage_conc_aff[l] << " " << setw(8) << coverage_conc_unaff[l] << "\n"; else SIBS << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(8) << coverage_conc_aff[l] << " " << setw(8) << coverage_disc[l] << " " << setw(8) << coverage_conc_unaff[l] << "\n"; } SIBS.close(); ////////////////////////////////// // Permutation test, if C/C given? if (!par::permute) return; if (par::homo_run) homozygousSegmentPermutationTest(perm,f,coverage_conc_aff,coverage_conc_unaff); else segmentPermutationTest(perm,false,f,coverage_conc_aff,coverage_disc,coverage_conc_unaff); return; } void Plink::summaryIBDsegments(Perm & perm) { // Number of map positions // (extra position for global IBD) int npos = m1.size() - 1; vector coverage_conc_aff(nl,0); vector coverage_conc_unaff(nl,0); vector coverage_disc(nl,0); // pihat[pair][position] // check if, e.g. pihat>0.2 (i.e. atleast 80% chance of IBD1 sharing) // level set in options.cpp for (int l=0;lpos - par::fringe; } else { p1 = locus[m1[l]]->pos; } if (m2[l]==-1) { p2 = locus[par::run_end]->pos + par::fringe; } else { p2 = locus[m2[l]]->pos; } if (m1[l]==-1 && m2[l]==-1) { p1 = p2 = 0; } double d1 = p1 + pos[l] * (p2-p1); vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( locus[s->start]->chr != par::run_chr ) { s++; continue; } // In segment? if (locus[s->start]->pos < d1 && locus[s->finish]->pos > d1 ) { if (s->p1->aff == s->p2->aff) { if ( s->p1->aff) coverage_conc_aff[l]++; else coverage_conc_unaff[l]++; } else coverage_disc[l]++; } s++; } } string f = par::output_file_name + ".segment.summary"; ofstream SSEG; if (par::segment_output_started) SSEG.open( f.c_str() , ios::app ); else SSEG.open( f.c_str() , ios::out ); par::segment_output_started = true; ////////////////////////////// // And display for (int l=0; lpos - par::fringe; n1 = "fringe"; } else { p1 = locus[m1[l]]->pos; n1 = locus[m1[l]]->name; fr = locus[m1[l]]->freq; } if (m2[l]==-1) { p2 = locus[par::run_end]->pos + par::fringe; n2 = "fringe"; } else { p2 = locus[m2[l]]->pos; n2 = locus[m2[l]]->name; } if (m1[l]==-1 && m2[l]==-1) { p1 = p2 = 0; n1 = "Genomewide"; n2 = "IBD"; } double d1 = p1 + pos[l] * (p2-p1); SSEG << par::run_chr << " " << n1 << " " << n2 << " " << d1 << " " << fr << " " << coverage_conc_unaff[l] << " " << coverage_disc[l] << " " << coverage_conc_aff[l] << "\n"; } SSEG.close(); //////////////////////// // C/C permutation test? 
if (!par::permute) return; segmentPermutationTest(perm,true,f,coverage_conc_aff,coverage_disc,coverage_conc_unaff); } void Plink::summaryIBD() { int npos = pihat[0].size(); int npair = pihat.size(); vector coverage(npos,0); vector coverage_conc_aff(npos,0); vector coverage_conc_unaff(npos,0); vector coverage_disc(npos,0); // pihat[pair][position] // check if, e.g. pihat>0.2 (i.e. atleast 80% chance of IBD1 sharing) // level set in options.cpp for (int i=0;i par::IBD_threshold ) { coverage[j]++; if (sample[pair1[i]]->phenotype != sample[pair2[i]]->phenotype) coverage_disc[j]++; if (sample[pair1[i]]->phenotype==1 && sample[pair2[i]]->phenotype==1) coverage_conc_unaff[j]++; if (sample[pair1[i]]->phenotype==2 && sample[pair2[i]]->phenotype==2) coverage_conc_aff[j]++; } } ////////////////////////////// // And display for (int l=0; lpos - par::fringe; n1 = "fringe"; } else { p1 = locus[m1[l]]->pos; n1 = locus[m1[l]]->name; fr = locus[m1[l]]->freq; } if (m2[l]==-1) { p2 = locus[par::run_end]->pos + par::fringe; n2 = "fringe"; } else { p2 = locus[m2[l]]->pos; n2 = locus[m2[l]]->name; } if (m1[l]==-1 && m2[l]==-1) { p1 = p2 = 0; n1 = "Genomewide"; n2 = "IBD"; } double d1 = p1 + pos[l] * (p2-p1); cout << "C " << par::run_chr << " " << n1 << " " << n2 << " " << d1 << " " << fr << " " << coverage[l] << " " << coverage_conc_unaff[l] << " " << coverage_disc[l] << " " << coverage_conc_aff[l] << " " << coverage[l]/(double)npair << "\n"; } } void Plink::displayGMULTI(Individual * p1, Individual * p2, int l, ofstream & GMULTI) { Locus * loc = locus[l]; bool a1 = p1->one[l]; bool a2 = p1->two[l]; bool b1 = p2->one[l]; bool b2 = p2->two[l]; bool miss = false; GMULTI << setw(par::pp_maxfid) << p1->fid << " " << setw(par::pp_maxiid) << p1->iid << " " << setw(par::pp_maxfid) << p2->fid << " " << setw(par::pp_maxiid) << p2->iid << " " << setw(par::pp_maxsnp) << loc->name << " "; // Minor allele and frequency GMULTI << setw(2) << loc->allele1 << " " << setw(8) << loc->freq << " "; // Genotypes if ((!a1) && (!a2)) GMULTI << setw(4) << loc->allele1+"/"+loc->allele1 << " "; else if ((!a1) && a2) GMULTI << setw(4) << loc->allele1+"/"+loc->allele2 << " "; else if (a1 && a2) GMULTI << setw(4) << loc->allele2+"/"+loc->allele2 << " "; else { GMULTI << setw(4) << "0/0" << " "; miss = true; } if ((!b1) && (!b2)) GMULTI << setw(4) << loc->allele1+"/"+loc->allele1; else if ((!b1) && b2) GMULTI << setw(4) << loc->allele1+"/"+loc->allele2; else if (b1 && b2) GMULTI << setw(4) << loc->allele2+"/"+loc->allele2; else { GMULTI << setw(4) << "0/0"; miss = true; } // IBS count if (miss) GMULTI << setw(3) << "NA" << " "; else { int ibs=0; if (a1==b1) ibs++; if (a2==b2) ibs++; GMULTI << setw(3) << ibs << " "; } GMULTI << "\n"; } void Plink::indivSegmentSummaryCalc(map & segmentCount, map & segmentLength, bool countCases, bool countControls ) { segmentCount.clear(); segmentLength.clear(); segmentCount2.clear(); if ( par::cnv_count_baseline ) segmentCount2Baseline.clear(); vector::iterator s = segment.begin(); while ( s != segment.end() ) { indivPair p; p.p1 = s->p1; p.p2 = s->p2; // For now, assume that case/control masking only applies // to homozygosity and CNV tests if ( ! countCases && s->p1->pperson->aff ) continue; if ( ( !countControls ) && ! 
s->p1->pperson->aff ) continue; // We have not yet seen this indiv/pair map::iterator ip = segmentCount.find(p); map::iterator il = segmentLength.find(p); map::iterator ic2 = segmentCount2.find(p); // KB length double l = (double)(locus[s->finish]->bp - locus[s->start]->bp)/(double)1000; if ( ip == segmentCount.end() ) { segmentCount.insert( make_pair( p, 1 ) ); segmentLength.insert( make_pair( p, l ) ); if ( par::cnv_weighted_gene_test ) { segmentCount2.insert( make_pair( p, s->weightedCount )); if ( par::cnv_count_baseline ) segmentCount2Baseline.insert( make_pair( p, s->weightedBaseline )); } else { segmentCount2.insert( make_pair( p, s->count )); if ( par::cnv_count_baseline ) segmentCount2Baseline.insert( make_pair( p, s->baseline )); } } else { (ip->second)++; (il->second) += l; if ( par::cnv_weighted_gene_test ) { (ic2->second) += s->weightedCount; if ( par::cnv_count_baseline ) { map::iterator ic2b = segmentCount2Baseline.find(p); (ic2b->second) += s->weightedBaseline; } } else { (ic2->second) += s->count; if ( par::cnv_count_baseline ) { map::iterator ic2b = segmentCount2Baseline.find(p); (ic2b->second) += s->baseline; } } } s++; } } void Plink::indivSegmentSummary() { // Again, this function works for both homozygous and shared // segments (individuals and pairs) // Now these are part of Plink class //map segmentCount; //map segmentLength; // Generate basic summary file for all people indivSegmentSummaryCalc(segmentCount, segmentLength, true, true); //////////// // Output string f = par::output_file_name; if ( par::segment_output ) f += ".segment.indiv"; else if ( par::cnv_list ) f += ".cnv.indiv"; else if ( par::homo_run ) f += ".hom.indiv"; ofstream HOM; HOM.open( f.c_str() , ios::out ); if ( par::homo_run ) HOM << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(4) << "PHE" << " " << setw(8) << "NSEG" << " " << setw(8) << "KB" << " " << setw(8) << "KBAVG" << "\n"; else if ( par::cnv_list ) { HOM << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(4) << "PHE" << " " << setw(8) << "NSEG" << " " << setw(8) << "KB" << " " << setw(8) << "KBAVG" << " "; if ( par::cnv_count ) HOM << setw(8) << "COUNT" << " "; HOM << "\n"; } else HOM << setw(par::pp_maxfid) << "FID1" << " " << setw(par::pp_maxiid) << "IID2" << " " << setw(par::pp_maxfid) << "FID1" << " " << setw(par::pp_maxiid) << "IID2" << " " << setw(4) << "PHE" << " " << setw(8) << "NSEG" << " " << setw(8) << "KB" << " " << setw(8) << "KBAVG" << "\n"; // Output all individuals in homozygous segment mode if ( par::homo_run || par::cnv_list ) { for ( int i = 0; i < n; i++) { indivPair t; t.p1 = t.p2 = sample[i]; map::iterator ic = segmentCount.find(t); map::iterator il = segmentLength.find(t); map::iterator ic2 = segmentCount2.find(t); indivPair p = ic->first; HOM << setw(par::pp_maxfid) << sample[i]->fid << " " << setw(par::pp_maxiid) << sample[i]->iid << " " << setw(4) << sample[i]->phenotype << " "; if ( ic != segmentCount.end() ) { HOM << setw(8) << ic->second << " " << setw(8) << il->second << " " << setw(8) << il->second / (double)ic->second << " "; if ( par::cnv_count ) HOM << setw(8) << ic2->second << " "; HOM << "\n"; } else { HOM << setw(8) << 0 << " " << setw(8) << 0 << " "; if ( par::cnv_count ) HOM << setw(8) << 0 << " "; HOM << setw(8) << 0 << "\n"; } } // Next individual } else { // For now, just output obsevred pairs in segment mode // (i.e. 
typically too many pairs) map::iterator ic = segmentCount.begin(); map::iterator il = segmentLength.begin(); while ( ic != segmentCount.end() ) { indivPair p = ic->first; HOM << setw(par::pp_maxfid) << p.p1->fid << " " << setw(par::pp_maxiid) << p.p1->iid << " "; if ( p.p1 != p.p2 ) { int pcode = 0; if ( p.p1->aff ) { if ( p.p2->aff ) pcode = 1; else pcode = 0; } else { if ( p.p2->aff ) pcode = 0; else pcode = -1; } HOM << setw(par::pp_maxfid) << p.p2->fid << " " << setw(par::pp_maxiid) << p.p2->iid << " " << setw(4) << pcode << " "; } else HOM << setw(4) << p.p1->phenotype << " "; HOM << setw(8) << ic->second << " " << setw(8) << il->second << " " << setw(8) << il->second / (double)ic->second << "\n"; ++ic; ++il; } } HOM.close(); } class SegmentSizeCmp { public: bool operator() (const Segment & s1, const Segment & s2) const { int len1 = s1.finish - s1.start; int len2 = s2.finish - s2.start; if ( len1 < len2 ) return true; if ( len1 > len2 ) return false; if ( s1.p1 < s2.p1 ) return true; if ( s1.p1 > s2.p1 ) return false; return ( s1.start < s2.start ); } }; void Plink::displaySegmentsLong() { printLOG("Writing long-format segment list to [ " + par::output_file_name + ".cnv.seglist ]\n"); ofstream SEG; string f = par::output_file_name + ".cnv.seglist"; SEG.open( f.c_str(), ios::out ); // Determine list of which chromosomes we need to report // on set chr; for (int l=0; lchr ); set::iterator ichr = chr.begin(); // Consider each chromosome, one at a time while ( ichr != chr.end() ) { SEG << "\nChromosome " << *ichr << "\n\n"; // Sort list of events in decreasing size // for this chromosome map smap; vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( locus[ s->start ]->chr == *ichr ) { smap.insert(make_pair(*s,0)); } ++s; } /////////////////////////////////////////////////////// // How many segments to consider on this chromosome? 
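// (Illustrative note on the row-stacking pass that follows: two segments
//  s1 and s2 drawn on the same chromosome are treated as overlapping when
//
//      s2->finish >= s1->start && s2->start <= s1->finish
//
//  and overlapping events are then placed on different display rows
//  ("heights"), while non-overlapping events may share a row.)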
int nseg = smap.size(); // For each segment, determine "height" map::iterator i1 = smap.begin(); while ( i1 != smap.end() ) { // Look at all smaller than this one map::iterator i2 = i1; while ( i2 != smap.end() ) { if ( i1 == i2 ) { ++i2; continue; } const Segment * s1 = &(i1->first); const Segment * s2 = &(i2->first); // cout << s1->start << " - " << s1->finish // << " to " << s2->start << " - " << s2->finish << "\n"; // cout << "overlap test\n"; // if ( ( s2->finish >= s1->start && s2->start <= s1->finish ) || // ( s1->finish >= s2->start && s1->start <= s2->finish ) ) if ( s2->finish >= s1->start && s2->start <= s1->finish ) { // Place this small segment one above the larger one (i2->second) = (i1->second) + 1; } ++i2; } ++i1; } // Take smallest segment; // if any larger on overlaps // for (int i=1; i=0; j--) // Use overlap <- function(x,y,k) //{ // k$BP2[y] >= k$BP1[x] && k$BP1[y] <= k$BP2[x] //} // if ( overlap(i,j,k) ) // ++h[i]; // Map into per-row logic // Find max height i1 = smap.begin(); int maxh = 0; while ( i1 != smap.end() ) { if ( i1->second > maxh ) maxh = i1->second; ++i1; } ++maxh; // cout << "Segs = \n"; // i1 = smap.begin(); // while ( i1 != smap.end() ) // { // const Segment * s = &(i1->first); // cout << locus[ s->start ]->bp << " to " << locus[ s->finish ]->bp << ", h = " << i1->second << "\n"; // ++i1; // } // cout << "-------\n"; vector switches(maxh,false); // Display int chrmin = 0; int chrmax = 0; int tmp = 0; for (int l=0; lchr == *ichr ) { chrmin = chrmax = l; tmp = l; break; } for (int l=tmp; lchr > *ichr ) break; chrmax = l; } int interval = 10 * 1000; // 10kb steps int fringe = 50 * 1000; // 50kb fringe bool done = false; int l = chrmin; int p = locus[ chrmin ]->bp - fringe; if ( p < 0 ) p = 0; // Track p (and move l along with it) bool firstPosition = true; while ( ! done ) { bool atLocus = false; if ( p == locus[l]->bp ) atLocus = true; vector starts(maxh,false); vector stops(maxh,false); vector thisSeg(maxh); if ( atLocus ) { i1 = smap.begin(); while ( i1 != smap.end() ) { const Segment * s1 = &(i1->first); if ( s1->start == l ) { starts[ i1->second ] = true; switches[ i1->second ] = true; thisSeg[ i1->second ] = s1; } else if ( s1->finish == l ) { stops[ i1->second ] = true; switches[ i1->second ] = false; thisSeg[ i1->second ] = s1; } ++i1; } } ////////////// // Output row bool display = atLocus; if ( ! atLocus ) { for (int j=0; jname << " "; else SEG << setw(par::pp_maxsnp) << "("+int2str(p)+")" << " "; // Symbols // Duplication + // Deletion - // Case A // Control U if ( par::cnv_write_freq ) { for (int j=0; jfreq > 9 ) SEG << seg->freq; else SEG << seg->freq << " "; } else if ( stops[j] ) { const Segment * seg = thisSeg[j]; if ( seg->freq > 9 ) SEG << seg->freq; else SEG << seg->freq << " "; } else if ( switches[j] ) { //const Segment * seg = thisSeg[j]; // if ( seg->freq > 9 ) // SEG << seg->freq; // else // SEG << seg->freq << " "; SEG << "| "; } else SEG << " "; } SEG << "\n"; } else { for (int j=0; jtype == 1 ) SEG << "+"; else SEG << "-"; } else if ( stops[j] ) { const Segment * seg = thisSeg[j]; if ( seg->p1->aff ) SEG << "A"; else SEG << "U"; } else if ( switches[j] ) SEG << "|"; else SEG << " "; } SEG << "\n"; } } // Advance pointer p += interval; // Are we done? if ( p > locus[chrmax]->bp + fringe ) break; // note -- we might have missed the first one... 
// Advance only as far as next SNP position if ( firstPosition && locus[l]->bp <= p ) { p = locus[l]->bp; firstPosition = false; } else if ( l < chrmax && locus[l+1]->bp <= p ) { ++l; firstPosition = false; p = locus[l]->bp; } } ++ichr; } // Next chromosome SEG.close(); } void Plink::displaySegmentsBED() { // BED format provides a flexible way to define the data lines that // are displayed in an annotation track. BED lines have three // required fields and nine additional optional fields. The number // of fields per line must be consistent throughout any single set // of data in an annotation track. The order of the optional fields // is binding: lower-numbered fields must always be populated if // higher-numbered fields are used. //The first three required BED fields are: // chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) // or scaffold (e.g. scaffold10671). // chromStart - The starting position of the feature in the // chromosome or scaffold. The first base in a chromosome is // numbered 0. // chromEnd - The ending position of the feature in the chromosome // or scaffold. The chromEnd base is not included in the display of // the feature. For example, the first 100 bases of a chromosome are // defined as chromStart=0, chromEnd=100, and span the bases // numbered 0-99. // The 9 additional optional BED fields are: // name - Defines the name of the BED line. This label is displayed // to the left of the BED line in the Genome Browser window when the // track is open to full display mode or directly to the left of the item in pack mode. // score - A score between 0 and 1000. If the track line useScore // attribute is set to 1 for this annotation data set, the score // value will determine the level of gray in which this feature is // displayed (higher numbers = darker gray). // strand - Defines the strand - either '+' or '-'. // thickStart - The starting position at which the feature is drawn // thickly (for example, the start codon in gene displays). // thickEnd - The ending position at which the feature is drawn // thickly (for example, the stop codon in gene displays). // itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the // track line itemRgb attribute is set to "On", this RBG value will // determine the display color of the data contained in this BED // line. NOTE: It is recommended that a simple color scheme (eight // colors or less) be used with this attribute to avoid overwhelming // the color resources of the Genome Browser and your Internet // browser. // blockCount - The number of blocks (exons) in the BED line. // blockSizes - A comma-separated list of the block sizes. The // number of items in this list should correspond to blockCount. // blockStarts - A comma-separated list of block starts. All of the // blockStart positions should be calculated relative to // chromStart. The number of items in this list should correspond to // blockCount. 
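// (Relating these fields to the writer below: each segment is emitted
//  with chrom = "chr" + chromosome name, chromStart = locus[s->start]->bp,
//  chromEnd = locus[s->finish]->bp + 1, name = FID_IID of the carrier,
//  score = the segment score, strand = ".", thickStart/thickEnd repeating
//  the coordinates, and an itemRgb colour. A hypothetical deletion in an
//  affected individual could therefore appear as
//
//      chr3 1000000 1250001 FAM1_IND1 500 . 1000000 1250001 255,0,0
//
//  where the IDs, positions and score are invented for illustration.)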
// Here's an example of an annotation track that uses a complete BED definition: // track name=pairedReads description="Clone Paired Reads" useScore=1 // chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512 // chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601 printLOG("Writing CNV information as BED track to [ " + par::output_file_name + ".cnv.bed ]\n"); ofstream MOUT; MOUT.open( ( par::output_file_name + ".cnv.bed").c_str() , ios::out ); MOUT << "track name=delCases description=\"Deletions, cases (PLINK CNV track)\" visibility=4 priority=1 itemRgb=\"On\"\n"; vector::iterator s = segment.begin(); while ( s != segment.end() ) { Individual * person = s->p1; if ( s->type == 1 && person->aff ) { int pos1 = locus[s->start]->bp; int pos2 = locus[s->finish]->bp+1; MOUT << "chr" << chromosomeName( locus[s->start]->chr ) << " " << pos1 << " " << pos2 << " " << (person->fid + "_" + person->iid ) << " " << s->score << " " << ". " << pos1 << " " << pos2 << " "; // Colour if ( par::cnv_col == 0 ) MOUT << "255,0,0\n"; else if ( par::cnv_col = 1 ) MOUT << "0,0,255\n"; else if ( par::cnv_col = 2 ) MOUT << "0,255,0\n"; else if ( par::cnv_col = 3 ) MOUT << "255,0,0\n"; } ++s; } MOUT << "track name=dupCases description=\"Duplications, cases (PLINK CNV track)\" visibility=4 priority=3 itemRgb=\"On\"\n"; s = segment.begin(); while ( s != segment.end() ) { Individual * person = s->p1; if ( s->type == 2 && person->aff ) { int pos1 = locus[s->start]->bp; int pos2 = locus[s->finish]->bp+1; MOUT << "chr" << chromosomeName( locus[s->start]->chr ) << " " << pos1 << " " << pos2 << " " << (person->fid + "_" + person->iid ) << " " << s->score << " " << ". " << pos1 << " " << pos2 << " "; // Colour if ( par::cnv_col == 0 ) MOUT << "0,0,255\n"; else if ( par::cnv_col = 1 ) MOUT << "0,0,255\n"; else if ( par::cnv_col = 2 ) MOUT << "0,255,0\n"; else if ( par::cnv_col = 3 ) MOUT << "255,0,0\n"; } ++s; } MOUT << "track name=delControls description=\"Deletions, controls (PLINK CNV track)\" visibility=4 priority=2 itemRgb=\"On\"\n"; s = segment.begin(); while ( s != segment.end() ) { Individual * person = s->p1; if ( s->type == 1 && ! person->aff ) { int pos1 = locus[s->start]->bp; int pos2 = locus[s->finish]->bp+1; MOUT << "chr" << chromosomeName( locus[s->start]->chr ) << " " << pos1 << " " << pos2 << " " << (person->fid + "_" + person->iid ) << " " << s->score << " " << ". " << pos1 << " " << pos2 << " "; // Colour if ( par::cnv_col == 0 ) MOUT << "128,0,0\n"; else if ( par::cnv_col = 1 ) MOUT << "0,0,255\n"; else if ( par::cnv_col = 2 ) MOUT << "0,255,0\n"; else if ( par::cnv_col = 3 ) MOUT << "255,0,0\n"; } ++s; } MOUT << "track name=dupControls description=\"Duplications, controls (PLINK CNV track)\" visibility=4 priority=4 itemRgb=\"On\"\n"; s = segment.begin(); while ( s != segment.end() ) { Individual * person = s->p1; if ( s->type == 2 && ! person->aff ) { int pos1 = locus[s->start]->bp; int pos2 = locus[s->finish]->bp+1; MOUT << "chr" << chromosomeName( locus[s->start]->chr ) << " " << pos1 << " " << pos2 << " " << (person->fid + "_" + person->iid ) << " " << s->score << " " << ". 
" << pos1 << " " << pos2 << " "; // Colour if ( par::cnv_col == 0 ) MOUT << "0,0,128\n"; else if ( par::cnv_col = 1 ) MOUT << "0,0,255\n"; else if ( par::cnv_col = 2 ) MOUT << "0,255,0\n"; else if ( par::cnv_col = 3 ) MOUT << "255,0,0\n"; } ++s; } MOUT.close(); } plink-1.07-src/profile.cpp0000644000265600020320000005376011264127625014654 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "model.h" #include "sets.h" extern ofstream LOG; extern Plink * PP; using namespace std; void scoreRanges(int, vector &, map > &, map &, ofstream &); void Plink::scoreIndividuals() { map mlocus; for(int l=0; lname,l)); string suffix = ""; map qscore; vector qthresh; vector qlabel; if ( par::score_risk_on_qrange ) { checkFileExists( par::score_qfile ); checkFileExists( par::score_qrange_file ); printLOG("Reading quantitative scores from [ " + par::score_qfile + " ]\n"); printLOG("Reading score ranges from [ " + par::score_qrange_file + " ]\n"); ifstream Q1( par::score_qfile.c_str() , ios::in ); while ( ! Q1.eof() ) { string snp; string str_score; double score; Q1 >> snp >> str_score; if ( ! from_string( score , str_score , std::dec ) ) continue; if ( snp == "" ) continue; map::iterator i1 = mlocus.find( snp ); if ( i1 != mlocus.end() ) qscore.insert( make_pair( i1->second , score ) ); } Q1.close(); printLOG("Read q-scores for " + int2str( qscore.size() ) + " SNPs\n"); Q1.open( par::score_qrange_file.c_str() , ios::in ); while ( ! Q1.eof() ) { // Expect: name, lower, upper string label; double lower, upper; Q1 >> label >> lower >> upper; if ( label == "" ) continue; double2 d2(lower,upper); qthresh.push_back( d2 ); qlabel.push_back( label ); } Q1.close(); printLOG("Read " + int2str( qthresh.size() ) + " thresholds to apply\n"); } printLOG("Reading set of predictors from [ " + par::score_risk_file + " ]\n"); checkFileExists(par::score_risk_file); // Main loop in which we consider either multiple takes from the score file, or we // just run through once int qcnt = 0; while(1) { string suffix = ""; double2 th; if ( par::score_risk_on_qrange ) { suffix = "." + qlabel[qcnt]; th = qthresh[ qcnt ]; printLOG("Thresholding group " + qlabel[qcnt] + " on ( " + dbl2str(th.p1) + " -- " + dbl2str(th.p2) + " )\n"); } ifstream PROFIN; PROFIN.open( par::score_risk_file.c_str(), ios::in ); string problems; map scores; map allele1; int cnt1 = 0, cnt2 = 0, cnt2b = 0, cnt3 = 0; while ( ! PROFIN.eof() ) { // Format assumed: SNP allele score string snp, allele, sscore; double score; PROFIN >> snp >> allele >> sscore; if ( sscore=="" ) continue; if ( ! from_string( score, sscore , std::dec)) { problems += "BADVAL\t" + snp + "\n"; continue; } ++cnt1; map::iterator ilocus = mlocus.find(snp); // SNP not found if ( ilocus == mlocus.end() ) { problems += "NOSNP\t" + snp + "\n"; continue; } int l = ilocus->second; ++cnt2; // Purposely not include this SNP base on Q-range? // Are we only looking at subsets of SNPs? if ( par::score_risk_on_qrange ) { map::iterator i1 = qscore.find( l ); if ( i1 == qscore.end() ) continue; double sc = i1->second; if ( sc < th.p1 || sc > th.p2 ) continue; } ++cnt2b; // Allele found? 
if ( allele == locus[l]->allele1 ) { scores.insert(make_pair(l,score)); allele1.insert(make_pair(l,false)); ++cnt3; } else if ( allele == locus[l]->allele2 ) { scores.insert(make_pair(l,score)); allele1.insert(make_pair(l,true)); ++cnt3; } else problems += "NOALLELE\t" + snp + " " + allele + " vs " + locus[l]->allele1 + " " + locus[l]->allele2 + "\n"; } PROFIN.close(); if ( par::score_risk_on_qrange ) printLOG("Read " + int2str(cnt1) + " predictors; " + int2str(cnt2) + " mapped to SNPs; " + int2str(cnt2b) + " selected; " + int2str(cnt3) + " to alleles\n"); else printLOG("Read " + int2str(cnt1) + " predictors; " + int2str(cnt2) + " mapped to SNPs; " + int2str(cnt3) + " to alleles\n"); if ( problems != "" ) { printLOG("Writing problem SNPs in predictor to [ " + par::output_file_name + suffix + ".nopred ]\n"); ofstream O1; O1.open( (par::output_file_name + suffix + ".nopred").c_str() , ios::out ); O1 << problems ; O1.close(); problems = ""; } //////////////////////////////// // Calculate for each individual printLOG("Writing profiles to [ " + par::output_file_name + suffix + ".profile ]\n"); //////////////////////////////////////////// // First, perform this for all SNPs vector_t profile; matrix_t set_profile; if ( par::profile_sets ) pS->initialiseSetMapping(); vector cnt; vector acount; calculateProfile(scores,allele1,profile,set_profile,cnt,acount); /////////////////////////////// // Report for all individuals ofstream PROFOUT; string f = par::output_file_name + suffix + ".profile"; PROFOUT.open( f.c_str(), ios::out ); PROFOUT << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(6) << "PHENO" << " " << setw(6) << "CNT" << " " << setw(6) << "CNT2" << " " << setw(8) << "SCORE" << "\n"; for ( int i=0; ifid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(6) << person->phenotype << " " << setw(6) << cnt[i] << " " << setw(6) << acount[i] << " " << setw(8) << profile[i] << "\n"; } PROFOUT.close(); /////////////////////////////////////////////// // Test association with score and phenotype if ( par::score_test ) { vector_t results; /////////////////////////////////////////////////// // Set up association model bool OLD_assoc_glm_without_main_snp = par::assoc_glm_without_main_snp; bool OLD_clist = par::clist; par::assoc_glm_without_main_snp = true; par::clist = true; par::clist_number = 0; int totalTerms = 1; par::clist_number += totalTerms; for (int i=0; iclist.resize( par::clist_number ); // Fill in label forms int testTerm = par::clist_number - 1; clistname.resize( par::clist_number ); clistname[ testTerm ] = "SCORE"; //////////////////////////////////////// // Put relevant variables in clist slots for ( int i = 0; i < n; i++) sample[i]->clist[ testTerm ] = profile[i]; /////////////////////////////////////////// // Perform association glmAssoc(false,*pperm); /////////////////////////////////////////// // Report results ofstream OUTF; bool valid = model->isValid(); vector_t b = model->getCoefs(); vector_t chisq(1,model->getStatistic()); vector_t pval = model->getPVals(); // NOTE: b includes intercept; pval doesn't double statistic = valid ? 
model->getStatistic() : 0; double pvalue = pval[ pval.size()-1 ]; double beta = b[ b.size()-1 ]; delete model; string f = par::output_file_name + suffix + ".profile.test"; PROFOUT.open( f.c_str(), ios::out ); PROFOUT << setw(12) << "TEST" << " " << setw(12) << "BETA" << " " << setw(12) << "P" << "\n"; for (int c = 0; c < par::clist_number; c++) { PROFOUT << setw(12) << clistname[c] << " " << setw(12) << b[c+1] << " " << setw(12) << pval[c] << "\n"; } PROFOUT.close(); } //////////////////////////////////////////////////////// // // Recalculate scores for specified sets only if ( par::profile_sets ) { ofstream PROFOUT; string f = par::output_file_name + suffix + ".profile.sets"; PROFOUT.open( f.c_str(), ios::out ); PROFOUT << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(6) << "PHENO" << " "; for (int s=0; sfid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(6) << person->phenotype << " "; // Consider each pathway for (int s=0; s & scores, map & allele1, vector_t & profile, matrix_t & set_profile, vector & count, vector & acount ) { // Generate a vector of scores, one for each individual, given then // scoring set (and allele direction) for a set of SNPs profile.resize(n,0); if ( par::profile_sets ) sizeMatrix( set_profile , n , snpset.size() ); count.resize(n,0); acount.resize(n,0); /////////////////////////////////////// // Do we want to score for genes also? map > ranges; vector rangeLabels; map > snp2range; map rangeCount; ////////////////////////////////////////// // Consider each individual and calculate // the score for (int i=0; i::iterator i1 = scores.begin(); map::iterator i2 = allele1.begin(); double score = 0; vector_t set_score( snpset.size() , 0 ); int cnt = 0; int cntActual = 0; int cntNamedAllele = 0; vector flaggedSNPs; while ( i1 != scores.end() ) { int l = i1->first; bool a1 = i2->second; bool s1 = par::SNP_major ? SNP[l]->one[i] : person->one[l]; bool s2 = par::SNP_major ? SNP[l]->two[i] : person->two[l]; bool missingGenotype = false; double thisScore = 0; ///////////////////////////////////////////// // Individual is missing this genotype // We with either skip, or impute mean if ( s1 && ! s2 ) { if ( ! par::score_impute_expected ) { ++i1; ++i2; continue; } missingGenotype = true; if ( i2->second ) thisScore = ( 1 - locus[l]->freq ) * i1->second; else thisScore = locus[l]->freq * i1->second; if ( par::chr_haploid[ locus[l]->chr ] || ( par::chr_sex[ locus[l]->chr ] && person->sex ) ) ++cnt; else { cnt += 2; thisScore *= 2; } } // Currently, just an allelic scoring: we could extend this // to genotypes, dominant/recessive models, bool sawNamedAllele = false; if ( ! missingGenotype ) { if ( par::chr_haploid[ locus[l]->chr ] || ( par::chr_sex[ locus[l]->chr ] && person->sex ) ) { // A single copy if ( i2->second ) { if ( s1 ) { thisScore = i1->second; sawNamedAllele = true; ++cntNamedAllele; } } else { if ( !s1 ) { thisScore = i1->second; sawNamedAllele = true; ++cntNamedAllele; } } ++cnt; ++cntActual; } else // .. 
autosomal { if ( i2->second ) { if ( s1 ) { thisScore = i1->second; sawNamedAllele = true; ++cntNamedAllele; } if ( s2 ) { thisScore += i1->second; sawNamedAllele = true; ++cntNamedAllele; } } else { if ( !s1 ) { thisScore = i1->second; sawNamedAllele = true; ++cntNamedAllele; } if ( !s2 ) { thisScore += i1->second; sawNamedAllele = true; ++cntNamedAllele; } } cnt += 2; cntActual +=2; } } ////////////////////////////////////////// // Accumulate score score += thisScore; ////////////////////////////////////////// // Score in pathways also? if ( par::profile_sets ) { map >::iterator si = pS->setMapping.find(l); set::iterator i2 = si->second.begin(); while ( i2 != si->second.end() ) { set_score[ *i2 ] += thisScore; ++i2; } } ++i1; ++i2; } // Get average per seen loci if ( cnt>0 ) score /= (double)cnt; if ( par::profile_sets ) { for (int j=0; j & f, map > & snp2range, map & rangeCount, ofstream & ROUT) { Individual * person = PP->sample[i]; ROUT << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(6) << person->phenotype << " "; // Get list of ranges that will be flagged set mappedRanges; for (int l=0; l >::iterator ri = snp2range.find( f[l] ); if ( ri == snp2range.end() ) continue; set::iterator si = ri->second.begin(); while ( si != ri->second.end() ) { mappedRanges.insert( *si ); ++si; } } // We now have populated the set mappedRanges map::iterator r = rangeCount.begin(); while ( r != rangeCount.end() ) { if ( mappedRanges.find(r->first) != mappedRanges.end() ) { ROUT << "1 "; ++rangeCount[ r->first ]; } else ROUT << "0 "; ++r; ROUT << "\n"; } } // OLD VERSION WITH SCORE RANGES CODE IN PLACE // void Plink::calculateProfile(map & scores, // map & allele1, // vector_t & profile, // matrix_t & set_profile, // vector & count, // vector & acount ) // { // // Generate a vector of scores, one for each individual, given then // // scoring set (and allele direction) for a set of SNPs // profile.resize(n,0); // if ( par::profile_sets ) // sizeMatrix( set_profile , n , snpset.size() ); // count.resize(n,0); // acount.resize(n,0); // /////////////////////////////////////// // // Do we want to score for genes also? // map > ranges; // vector rangeLabels; // map > snp2range; // map rangeCount; // ofstream ROUT; // if ( par::score_risk_ranges ) // { // printLOG("Writing range scores to [ " // + par::output_file_name // + ".profile.ranges ]\n"); // // Helper function to map ranges to SNPs // mapRangesToSNPs( par::score_risk_ranges_file, // ranges, // snp2range ); // map >::iterator r = ranges.begin(); // while ( r != ranges.end() ) // { // set * theseRanges = &( r->second ); // set::iterator thisRangeSet = theseRanges->begin(); // while ( thisRangeSet != theseRanges->end() ) // { // rangeLabels.push_back( thisRangeSet->name ); // ++thisRangeSet; // } // ++r; // } // ROUT.open( ( par::output_file_name+".profile.ranges").c_str(), ios::out); // // Header // ROUT << setw(par::pp_maxfid) << "FID" << " " // << setw(par::pp_maxiid) << "IID" << " " // << setw(6) << "PHENO" << " "; // for (int r=0; r::iterator i1 = scores.begin(); // map::iterator i2 = allele1.begin(); // double score = 0; // vector_t set_score( snpset.size() , 0 ); // int cnt = 0; // int cntActual = 0; // int cntNamedAllele = 0; // vector flaggedSNPs; // while ( i1 != scores.end() ) // { // int l = i1->first; // bool a1 = i2->second; // bool s1 = par::SNP_major ? SNP[l]->one[i] : person->one[l]; // bool s2 = par::SNP_major ? 
SNP[l]->two[i] : person->two[l]; // bool missingGenotype = false; // double thisScore = 0; // ///////////////////////////////////////////// // // Individual is missing this genotype // // We with either skip, or impute mean // if ( s1 && ! s2 ) // { // if ( ! par::score_impute_expected ) // { // ++i1; // ++i2; // continue; // } // missingGenotype = true; // if ( i2->second ) // thisScore = ( 1 - locus[l]->freq ) * i1->second; // else // thisScore = locus[l]->freq * i1->second; // if ( par::chr_haploid[ locus[l]->chr ] || // ( par::chr_sex[ locus[l]->chr ] && person->sex ) ) // ++cnt; // else // { // cnt += 2; // thisScore *= 2; // } // } // // Currently, just an allelic scoring: we could extend this // // to genotypes, dominant/recessive models, // bool sawNamedAllele = false; // if ( ! missingGenotype ) // { // if ( par::chr_haploid[ locus[l]->chr ] || // ( par::chr_sex[ locus[l]->chr ] && person->sex ) ) // { // // A single copy // if ( i2->second ) // { // if ( s1 ) // { // thisScore = i1->second; // sawNamedAllele = true; // ++cntNamedAllele; // } // } // else // { // if ( !s1 ) // { // thisScore = i1->second; // sawNamedAllele = true; // ++cntNamedAllele; // } // } // ++cnt; // ++cntActual; // } // else // .. autosomal // { // if ( i2->second ) // { // if ( s1 ) // { // thisScore = i1->second; // sawNamedAllele = true; // ++cntNamedAllele; // } // if ( s2 ) // { // thisScore += i1->second; // sawNamedAllele = true; // ++cntNamedAllele; // } // } // else // { // if ( !s1 ) // { // thisScore = i1->second; // sawNamedAllele = true; // ++cntNamedAllele; // } // if ( !s2 ) // { // thisScore += i1->second; // sawNamedAllele = true; // ++cntNamedAllele; // } // } // cnt += 2; // cntActual +=2; // } // } // ////////////////////////////////////////// // // Accumulate score // score += thisScore; // ////////////////////////////////////////// // // Pathway-specific scores? // if ( par::profile_sets ) // { // for ( int j = 0 ; j < snpset.size(); j++ ) // { // // Is this SNP in this pathway? If so, // // add to pathway-specific score // if ( 1 ) // set_score[j] += thisScore; // } // } // ////////////////////////////////////////// // // Do we want to score a "yes/no" for a // // gene? // if ( par::score_risk_ranges && ! missingGenotype ) // { // // Did we see at least one risk-increasing allele? // if ( ( sawNamedAllele && i1->second > 0 ) || // ( (!sawNamedAllele) && i1->second < 0 ) ) // { // flaggedSNPs.push_back(l); // } // } // ++i1; // ++i2; // } // // Get average per seen loci // if ( cnt>0 ) // score /= (double)cnt; // // Save for this individual (actual number) // profile[i] = score; // set_profile[i] = set_score; // count[i] = cntActual; // acount[i] = cntNamedAllele; // // Score for ranges? 
// if ( par::score_risk_ranges ) // { // scoreRanges(i,flaggedSNPs,snp2range,rangeCount,ROUT); // } // } // Next individual // if ( par::score_risk_ranges ) // { // ROUT.close(); // // Also ouput how many times each range was seen // printLOG("Writing range summary counts to [ " // + par::output_file_name // + ".profile.ranges.summary ]\n"); // ofstream ROUT2; // ROUT2.open( ( par::output_file_name+".profile.ranges.summary").c_str(), ios::out); // // Header // ROUT2 << setw(18) << "RANGE" << " " // << setw(8) << "CNT" << "\n"; // map::iterator r = rangeCount.begin(); // while ( r != rangeCount.end() ) // { // ROUT2 << setw(18) << r->first->name << " " // << setw(8) << r->second << "\n"; // ++r; // } // ROUT2.close(); // } // return; //} plink-1.07-src/clumpld.h0000644000265600020320000000373611264127626014320 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef CLUMP_LD_H_ #define CLUMP_LD_H_ #include #include #include #include "plink.h" #include "helper.h" using namespace std; class ResultTrio { public: double p; // p-value string annot; // Annotation int f; // which results file? string s; // SNP name bool operator< (const ResultTrio & p2) const { return ( p < p2.p ); } }; class ClumpPair { public: string snp; int f; bool operator< (const ClumpPair & p2) const { if ( snp == p2.snp ) return ( f < p2.f ); return ( snp < p2.snp ); } }; class ClumpResults { public: double p; string annot; bool operator< (const ClumpResults & p2) const { return ( p < p2.p ); } }; class clump_LD { public: Plink * P; HaploPhase * hp; //will be user defined double pval_cutoff; double ld_distance; double second_pval_cutoff; float r2_cutoff; map clumped; vector snps; vector pvals; map assoc_results; vector filename; // constructer clump_LD(Plink*,HaploPhase*, double, double, double, float); // accessors void set_pval( double ); void set_second_pval( double ); void set_ld( double ); void set_r2( double ); // methods vector read_assoc_file(string); void clump(); string allelePairs(int,int); }; #endif /*CLUMP_LD_H_*/ plink-1.07-src/setscreen.cpp0000644000265600020320000002025711264127626015203 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include "options.h" #include "plink.h" #include "helper.h" #include "sets.h" #include "stats.h" extern Plink * PP; class SortedResult { public: double chisq; double p; int l; bool operator< (const SortedResult & s2) const { return (p < s2.p); } }; void Plink::setAssocSummary() { // For each set, // 1) get largest unselected test statistic * Ne // 2) attenuate all other Ne factor of r^2 to SNP selected in 1 // 3) repeat to (1) until all SNPs selected // 4) calculate sum test statistic * Ne ///////////////////////////////////// // Output file ofstream SSUM; SSUM.open( (par::output_file_name + ".set.summary").c_str() , ios::out); SSUM.precision(4); //////////////////////////////////// // Open a single results file ifstream RESIN; checkFileExists( par::set_screen_resultfile ); RESIN.open( par::set_screen_resultfile.c_str() , ios::in ); // Read first (header) row vector tokens = tokenizeLine( RESIN ); int chisq_column = -1; int snp_column = -1; int p_column = -1; int cols = tokens.size(); for (int i=0; i mlocus; makeLocusMap(*this,mlocus); set slist; bool convert = chisq_column == -1 || p_column == -1 ; bool from_p = chisq_column == -1; while ( !RESIN.eof() ) { vector tokens = tokenizeLine( RESIN ); if ( tokens.size() != cols ) continue; string snp = tokens[snp_column]; map::iterator i1 = mlocus.find( snp ); if ( i1 == mlocus.end() ) continue; double x2, pv; if ( ! convert ) { if ( ! from_string( x2, tokens[chisq_column] , std::dec)) continue; if ( ! from_string( pv, tokens[p_column] , std::dec)) continue; } else { if ( from_p ) { if ( ! from_string( pv, tokens[p_column] , std::dec)) continue; x2 = inverse_chiprob( pv, 1 ); } else { if ( ! 
from_string( x2, tokens[chisq_column] , std::dec)) continue; pv = chiprobP(x2,1); } } // Bounds on possible p-values if ( pv == 1 ) pv = 1-1e-10; if ( pv == 0 ) pv = 1e-10; stat[ i1->second ] = x2; p[ i1->second ] = pv; slist.insert( i1->second ); } RESIN.close(); printLOG("Read results for " + int2str( slist.size() ) + " SNPs\n"); printLOG("Writing set summary statistics to [ " + par::output_file_name + ".set.summary ]\n"); ////////////////////////////////// // Header row SSUM << setw(22) << "SET" << " " << setw(6) << "NSNP" << " " << setw(32) << "POS" << " " << setw(8) << "KB" << " " << setw(12) << "P1" << " " << setw(12) << "P2" << "\n"; ofstream SVERB; if ( par::verbose ) SVERB.open( (par::output_file_name+".set.summary.verbose").c_str(), ios::out ); // Substract from all other SNPs this value times the r^2 with the other SNP ///////////////////////////////////// // Score each Set for ( int j = 0; j < pS->snpset.size(); j++ ) { // Gather and sort test statistics (chi-sq) vector t; for (int i=0; i < snpset[j].size(); i++) { if ( p[snpset[j][i]] > 0 && p[snpset[j][i]] <= 1 ) { SortedResult s; s.chisq = stat[snpset[j][i]]; s.l = snpset[j][i]; s.p = p[snpset[j][i]]; t.push_back(s); } } int ns = t.size(); if ( ns == 0 ) { SSUM << setw(22) << setname[j] << " " << setw(6) << 0 << " " << setw(32) << "NA" << " " << setw(8) << "NA" << " " << setw(12) << "NA" << " " << setw(12) << "NA" << "\n"; continue; } // Sort statistics (in descrending order) sort(t.begin(),t.end()); ////////////////////////////////////////////// // // // Makambi (2003), Delongchamp et al (2006) // // // ////////////////////////////////////////////// // Makambi; // assume uniform weighting vector_t w(ns, 1.0/(double)ns); double var = 0; for (int l=0; lchr; int bp = locus[l1]->bp; if ( ! par::silent ) cout << "Set " << j << ", SNP " << l1 << " of " << ns << " \r"; for (int l2=l1+1; l2chr != chr ) continue; if ( abs( bp - locus[l2]->bp ) > 1000000 ) continue; double r = correlation2SNP( t[l1].l , t[l2].l, false, false); if ( ! realnum(r) ) r = 0; r = abs(r); // Makambi var += 2 * w[l1] * w[l2] * ( 3.25 * r + 0.75 * r*r ) ; // Delongchamp denom += 2 * r; } } // Makambi double df = 8.0 / var; if ( df < 2 ) df = 2; double score = 0; for (int l=0; lchr; int minBP = locus[ t[0].l ]->bp; int maxBP = minBP; bool diffchr = false; for (int l=1;lchr != chr ) { diffchr = true; break; } if ( locus[ t[l].l ]->bp > maxBP ) maxBP = locus[ t[l].l ]->bp; else if ( locus[ t[l].l ]->bp < minBP ) minBP = locus[ t[l].l ]->bp; } if ( diffchr ) { SSUM << setw(32) << "NA" << " " << setw(8) << "NA" << " "; } else { string pstr = "chr" + int2str(chr) + ":" + int2str( minBP) + ".." + int2str( maxBP ); SSUM << setw(32) << pstr << " " << setw(8) << (maxBP-minBP)/1000.0 << " "; } SSUM << setw(12) << pv << " " << setw(12) << pv2 << "\n"; } if ( ! par::silent ) cout << " \n"; SSUM.close(); if ( par::verbose ) SVERB.close(); } plink-1.07-src/mh.cpp0000644000265600020320000007301411264127625013612 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "plink.h" #include "perm.h" #include "options.h" #include "helper.h" #include "stats.h" void Plink::calcMH() { /////////////////////////////////// // Basic 2 x 2 x K CMH test // i.e. Disease x allele x strata // is taken care of in assoc.cpp // (i.e. allows for permutation, sets, etc) if (!par::SNP_major) Ind2SNP(); ////////////////////////////////// // Any individual not assigned to a cluster, // making missing phenotype vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->sol < 0 ) (*person)->missing = true; person++; } /////////////////////////////////// // Generalized I x J x K CMH test // Either ordinal or normal // i.e. test strata X SNP controlling for disease if (par::CMH_test_2 || par::CMH_test_ORD ) { if (par::CMH_test_ORD && !par::bt) error("--mh-ord specified but the phenotype is only binary: use --mh"); if (nk==1) error("No clusters defined for --mh2 test, i.e. K=1"); string f = par::output_file_name + ".cmh2"; if (par::CMH_test_ORD) f = par::output_file_name + ".cmh.ord"; ofstream MHOUT; MHOUT.open(f.c_str(),ios::out); MHOUT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "CHISQ" << " " << setw(10) << "P" << "\n"; MHOUT.precision(4); if (par::CMH_test_ORD) { printLOG("Cochran-Mantel-Haenszel IxJxK ordinal test, K = " + int2str(nk) + "\n"); printLOG("Testing SNP x ORDINAL DISEASE | STRATUM (option --mh-ord)\n"); } else { printLOG("Cochran-Mantel-Haenszel IxJxK test, K = " + int2str(nk) + "\n"); printLOG("Testing SNP x STRATUM | DISEASE (option --mh2)\n"); } printLOG("Writing results to [ " + f + " ]\n"); vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { ///////////////////////// // Autosomal or haploid? 
bool Xchr=false, haploid=false; if (par::chr_sex[locus[l]->chr]) Xchr=true; else if (par::chr_haploid[locus[l]->chr]) haploid=true; if (haploid || Xchr ) error("--mh2 / --mh-ord cannot handle X/Y markers currently..."); vector X(0); // SNP vector Y(0); // Cluster vector Z(0); // Phenotype vector::iterator person = sample.begin(); vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); while ( person != sample.end() ) { if ((*person)->missing) { // Next person person++; i1++; i2++; continue; } // Only consider individuals who have been assigned to a cluster if ( (*person)->sol >= 0 ) { if ( (!(*i1)) && (!(*i2)) ) { X.push_back(1); X.push_back(1); } else if ( (!(*i1)) && *i2 ) { X.push_back(1); X.push_back(2); } else if ( *i1 && *i2 ) { X.push_back(2); X.push_back(2); } else { // Next person person++; i1++; i2++; continue; } Y.push_back((*person)->sol); Y.push_back((*person)->sol); if (par::CMH_test_ORD) Z.push_back( (int)(*person)->phenotype ); else { if ((*person)->phenotype==2) { Z.push_back(2); Z.push_back(2); } else { Z.push_back(1); Z.push_back(1); } } } // Next person person++; i1++; i2++; } vector res; if ( par::CMH_test_ORD ) res = calcMantelHaenszel_ORD(X,Z,Y); else res = calcMantelHaenszel_IxJxK(X,Y,Z); MHOUT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << res[0] << " " << setw(10) << chiprobP(res[0],res[1]) << "\n"; // Next SNP s++; l++; } MHOUT.close(); } } vector Plink::calcMantelHaenszel_2x2xK(Perm & perm, bool original) { // Should we perform BD test (K>1) if (nk<2) par::breslowday = false; ofstream MHOUT; if ( original ) { ////////////////////////////////// // Any individual not assigned to a cluster, making missing // phenotype (only need to do this once, for original) vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->sol < 0 ) (*person)->missing = true; person++; } string f = par::output_file_name + ".cmh"; MHOUT.open(f.c_str(),ios::out); MHOUT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "BP" << " " << setw(4) << "A1" << " " << setw(8) << "MAF" << " " << setw(4) << "A2" << " " << setw(10) << "CHISQ" << " " << setw(10) << "P" << " " << setw(10) << "OR" << " " << setw(10) << "SE" << " " << setw(10) << string("L"+dbl2str(par::ci_level*100)) << " " << setw(10) << string("U"+dbl2str(par::ci_level*100)) << " "; if (par::breslowday) MHOUT << setw(10) << "CHISQ_BD" << " " << setw(10) << "P_BD" << " "; MHOUT << "\n"; MHOUT.precision(4); printLOG("Cochran-Mantel-Haenszel 2x2xK test, K = " + int2str( nk) + "\n"); if (par::breslowday) printLOG("Performing Breslow-Day test of homogeneous odds ratios\n"); printLOG("Writing results to [ " + f + " ]\n"); // Warnings, if (par::breslowday && nk>10) printLOG("** Warning ** Breslow-Day statistics require large N per cluster ** \n"); } double zt = ltqnorm( 1 - (1 - par::ci_level) / 2 ) ; // Cochran-Mantel-Haenszel 2x2xK test vector results(nl_all); vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // Skip possibly if (par::adaptive_perm && !perm.snp_test[l]) { s++; l++; continue; } // Disease X allele X strata // Calculate mean of 11 cell for each strata vector mean_11(nk,0); vector var_11(nk,0); // Calculate statistic vector n_11(nk,0); vector n_12(nk,0); vector n_21(nk,0); vector n_22(nk,0); // Disease marginals vector n_1X(nk,0); // disease vector n_2X(nk,0); // no disease vector n_X1(nk,0); // F allele vector n_X2(nk,0); // T allele vector n_TT(nk,0); // Total 
allele count ///////////////////////// // Autosomal or haploid? bool X=false, haploid=false; if (par::chr_sex[locus[l]->chr]) X=true; else if (par::chr_haploid[locus[l]->chr]) haploid=true; //////////////////////// // Consider each person vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { Individual * pperson = (*gperson)->pperson; bool s1 = *i1; bool s2 = *i2; // Affected individuals if ( pperson->aff && !pperson->missing ) { // Haploid? if ( haploid || ( X && (*gperson)->sex ) ) { // Allelic marginal if ( ! s1 ) { // FF hom n_11[ pperson->sol ] ++ ; n_X1[ pperson->sol ] ++ ; } else { if ( ! s2 ) // FT { gperson++; i1++; i2++; continue; // skip missing genotypes } else // TT { n_12[ pperson->sol ] ++ ; n_X2[ pperson->sol ] ++ ; } } // Disease marginal n_1X[ pperson->sol ] ++; n_TT[ pperson->sol ] ++; } else // autosomal { // Allelic marginal if ( ! s1 ) { if ( ! s2 ) // FF hom { n_11[ pperson->sol ] +=2 ; n_X1[ pperson->sol ] +=2 ; } else { n_11[ pperson->sol ]++ ; // FT het n_12[ pperson->sol ]++ ; n_X1[ pperson->sol ]++ ; n_X2[ pperson->sol ]++ ; } } else { if ( ! s2 ) // FT { gperson++; i1++; i2++; continue; // skip missing genotypes } else // TT { n_12[ pperson->sol ] +=2 ; n_X2[ pperson->sol ] +=2 ; } } // Disease marginal n_1X[ pperson->sol ] += 2; n_TT[ pperson->sol ] += 2; } // end autosomal } else if ( ! pperson->missing ) // Unaffecteds { // Haploid? if ( haploid || ( X && (*gperson)->sex ) ) { // Allelic marginal if ( ! s1 ) { // FF hom n_21[ pperson->sol ] ++ ; n_X1[ pperson->sol ] ++ ; } else { if ( ! s2 ) // FT { gperson++; i1++; i2++; continue; // skip missing genotypes } else // TT { n_22[ pperson->sol ] ++ ; n_X2[ pperson->sol ] ++ ; } } // Disease marginal n_2X[ pperson->sol ] ++; n_TT[ pperson->sol ] ++; } else // autosomal { // Allelic marginal if ( ! s1 ) { if ( ! s2 ) // FF { n_X1[ pperson->sol ] +=2 ; n_21[ pperson->sol ] +=2 ; } else { n_X1[ pperson->sol ] ++ ; n_X2[ pperson->sol ] ++ ; n_21[ pperson->sol ] ++ ; n_22[ pperson->sol ] ++ ; } } else { if ( ! 
s2 ) // FT { gperson++; i1++; i2++; continue; // skip missing genotypes } else // TT { n_X2[ pperson->sol ] +=2 ; n_22[ pperson->sol ] +=2 ; } } // disease marginal n_2X[ pperson->sol ] += 2; n_TT[ pperson->sol ] += 2; } // end autosomal } // end unaffected gperson++; i1++; i2++; } // count next individual // Finished iterating over individuals: cluster needs at least 2 // nonmissing individuals vector validK(nk,false); for (int k=0; k=2) validK[k]=true; for (int k=0; k r2(nk); vector s2(nk); for (int k=0; k par::pfvalue || pvalue < 0 ) ) goto skip_p_cmh; MHOUT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " " << setw(8) << locus[l]->freq << " " << setw(4) << locus[l]->allele2 << " "; if (realnum(CMH)) MHOUT << setw(10) << CMH << " " << setw(10) << chiprobP(CMH,1) << " "; else MHOUT << setw(10) << "NA" << " " << setw(10) << "NA" << " "; if (realnum(OR)) MHOUT << setw(10) << OR << " "; else MHOUT << setw(10) << "NA" << " "; if (realnum(SE)) MHOUT << setw(10) << SE << " "; else MHOUT << setw(10) << "NA" << " "; if (realnum(OR_lower)) MHOUT << setw(10) << OR_lower << " "; else MHOUT << setw(10) << "NA" << " "; if (realnum(OR_upper)) MHOUT << setw(10) << OR_upper << " "; else MHOUT << setw(10) << "NA" << " "; // Optional Breslow-Day test of homogeneity of odds ratios if (par::breslowday) { double amax; double bb; double determ; double as_plus; double as_minus; double Astar; double Bstar; double Cstar; double Dstar; double Var; double BDX2 = 0; int df = 0; for (int k=0; k= 0 ? as_minus : as_plus ; Bstar = n_1X[k] - Astar; Cstar = n_X1[k] - Astar; Dstar = n_2X[k] - n_X1[k] + Astar; Var = 1/(1/Astar + 1/Bstar + 1/Cstar + 1/Dstar); BDX2 += ( (n_11[k] - Astar) * ( n_11[k] - Astar ) ) / Var ; } } double BDp = chiprobP( BDX2 , df-1 ); if ( BDp > -1 ) MHOUT << setw(10) << BDX2 << " " << setw(10) << BDp << " "; else MHOUT << setw(10) << "NA" << " " << setw(10) << "NA" << " "; } MHOUT << "\n"; } skip_p_cmh: // Store for permutation procedure, based 2x2xK CMH result results[l] = CMH; // Next SNP s++; l++; } if (original) MHOUT.close(); return results; } vector Plink::calcMantelHaenszel_IxJxK(vector & X, vector & Y, vector & Z) { if (X.size() != Y.size() || Y.size() != Z.size() || X.size() != Z.size() ) error("Internal problem:\n problem in calcMantelHaenszel_IxJxK()...uneven input columns"); // Determine unique elements int nx=0, ny=0, nz=0; map mx; map my; map mz; for (unsigned int i=0; i > N(nz); // observed counts vector< vector > U(nz); // expected vector< vector< vector > > V(nz); // variance matrix vector > Tx(nz); // marginal totals vector > Ty(nz); // .. 
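// (Worked sketch of the statistic these arrays feed, assuming the
//  standard generalized Cochran-Mantel-Haenszel form: with n the vector
//  of observed cell counts summed over the K strata (N), E[n] the summed
//  expected counts (U) and V the summed covariance matrix, the code
//  further below computes
//
//      Q = (n - E[n])' V^{-1} (n - E[n])
//
//  and refers Q to a chi-squared distribution on (I-1)*(J-1) degrees of
//  freedom, i.e. result[0] and result[1] returned by this function.)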
vector T(nz); // totals (per K) for (int k=0; ksecond; int vy = my.find(Y[i])->second; int vz = mz.find(Z[i])->second; // exclude nx + ny (upper limits) if (vx validK(nk,false); for (int k=0; k=2) validK[k]=true; // Calculate expecteds for (int k=0; k > V0((nx-1)*(ny-1)); for (int k2=0; k2<(nx-1)*(ny-1); k2++) V0[k2].resize((nx-1)*(ny-1)); vector N0((nx-1)*(ny-1)); vector U0((nx-1)*(ny-1)); // Sum N, U and V over K for (int k=0; k tmp1((nx-1)*(ny-1),0); vector tmp2((nx-1)*(ny-1),0); V0 = svd_inverse(V0,flag); for (int i=0; i<(nx-1)*(ny-1); i++) tmp1[i] = N0[i] - U0[i]; // Matrix mult -- rows by columns for (int i=0; i<(nx-1)*(ny-1); i++) for (int j=0; j<(nx-1)*(ny-1); j++) tmp2[j] += tmp1[i] * V0[i][j]; vector result(2); // CMH Chi-square result[0]=0; for (int i=0; i<(nx-1)*(ny-1); i++) result[0] += tmp2[i] * tmp1[i]; // DF result[1] = (nx-1)*(ny-1); return result; } void Plink::calcHomog() { if (!par::SNP_major) Ind2SNP(); string f = par::output_file_name + ".homog"; ofstream MHOUT; MHOUT.open(f.c_str(),ios::out); MHOUT.precision(4); if (nk==0) error("No clusters (K=0)... cannot perform CMH tests"); printLOG("Homogeneity of odds ratio test, K = " + int2str(nk) + "\n"); if (nk<2) { printLOG("** Warning ** less then 2 clusters specified... \n"); printLOG(" cannot compute between-cluster effects ** \n"); return; } if (nk>10) printLOG("** Warning ** statistics can be unreliable if strata have small N ** \n"); printLOG("Writing results to [ " + f + " ]\n"); MHOUT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(8) << "F_A" << " " << setw(8) << "F_U" << " " << setw(8) << "N_A" << " " << setw(8) << "N_U" << " " << setw(8) << "TEST" << " " << setw(10) << "CHISQ" << " " << setw(4) << "DF" << " " << setw(10) << "P" << " " << setw(10) << "OR" << "\n"; /////////////////////////////////// // Create boolean affection coding affCoding(*this); ////////////////////////////////// // Any individual not assigned to a cluster, // making missing phenotype vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->sol < 0 ) (*person)->missing = true; person++; } /////////////////////////////// // Iterate over SNPs vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { // Uncomment this if we allow permutation for the CMH // tests // In adaptive mode, possibly skip this test // if (par::adaptive_perm && (!perm.snp_test[l])) // { // s++; // l++; // continue; // } // Calculate statistic vector n_11(nk,0); vector n_12(nk,0); vector n_21(nk,0); vector n_22(nk,0); vector lnOR(nk,0); vector SEsq(nk,0); ///////////////// // Autosomal or haploid? bool X=false, haploid=false; if (par::chr_sex[locus[l]->chr]) X=true; else if (par::chr_haploid[locus[l]->chr]) haploid=true; ///////////////////////////// // Iterate over individuals vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); vector::iterator gperson = sample.begin(); while ( gperson != sample.end() ) { // Phenotype for this person (i.e. 
might be permuted) Individual * pperson = (*gperson)->pperson; // SNP alleles bool s1 = *i1; bool s2 = *i2; int hom = 2; if ( haploid || ( X && (*gperson)->sex ) ) hom = 1; // Affected individuals if ( pperson->aff && !pperson->missing ) { // Allelic marginal if ( !s1 ) { if ( !s2 ) // FF hom { n_11[ pperson->sol ] += hom ; } else { n_11[ pperson->sol ]++ ; // FT het n_12[ pperson->sol ]++ ; } } else { if ( !s2 ) // FT { gperson++; i1++; i2++; continue; // skip missing genotypes } else // TT { n_12[ pperson->sol ] += hom ; } } } else if ( ! pperson->missing ) // Unaffecteds { // Allelic marginal if ( ! s1 ) { if ( ! s2 ) // FF { n_21[ pperson->sol ] += hom ; } else { n_21[ pperson->sol ] ++ ; n_22[ pperson->sol ] ++ ; } } else { if ( ! s2 ) // FT { gperson++; i1++; i2++; continue; // skip missing genotypes } else // TT { n_22[ pperson->sol ] += hom ; } } } // Next individual gperson++; i1++; i2++; } // Calculate log(OR) and SE(ln(OR)) for eacsh strata double X_total = 0; double X_assoc1 = 0; double X_assoc2 = 0; vector X_indiv(nk,0); for (int k=0; kchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(6) << "TOTAL" << " " << setw(10) << X_total << " " << setw(4) << nk << " " << setw(10) << chiprobP(X_total,nk) << " " << setw(10) << "NA" << "\n"; MHOUT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(6) << "ASSOC" << " " << setw(10) << X_assoc << " " << setw(4) << 1 << " " << setw(10) << chiprobP(X_assoc,1) << " " << setw(10) << "NA" << "\n"; MHOUT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(6) << "HOMOG" << " " << setw(10) << X_homog << " " << setw(4) << nk-1 << " " << setw(10) << chiprobP(X_homog,nk-1) << " " << setw(10) << "NA" << "\n"; for (int k=0; kchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << n_11[k] + n_12[k] - 1 << " " << setw(8) << n_21[k] + n_22[k] - 1 << " " << setw(6) << kname[k] << " " << setw(10) << "NA" << " " << setw(4) << "NA" << " " << setw(10) << "NA" << " " << setw(10) << "NA" << "\n"; } else { MHOUT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << n_11[k]/double(n_11[k]+n_12[k]) << " " << setw(8) << n_21[k]/double(n_21[k]+n_22[k]) << " " << setw(8) << n_11[k] + n_12[k] - 1 << " " << setw(8) << n_21[k] + n_22[k] - 1 << " " << setw(6) << kname[k] << " " << setw(10) << X_indiv[k] << " " << setw(4) << 1 << " " << setw(10) << chiprobP(X_indiv[k],1) << " "; double odr = ( n_11[k] * n_22[k] ) / ( n_12[k] * n_21[k] ); if ( realnum(odr) ) MHOUT << setw(10) << odr << "\n"; else MHOUT << setw(10) << "NA" << "\n"; } } // Next locus s++; l++; } MHOUT.close(); } vector Plink::calcMantelHaenszel_ORD(vector & X, vector & Y, vector & Z) { // X is SNP coding // Y is 
phenotype (assumed to be ordinal, integers) // Z is cluster if (X.size() != Y.size() || Y.size() != Z.size() || X.size() != Z.size() ) error("Internal problem:\n problem in calcMantelHaenszel_ORD()...uneven input columns"); // Determine unique elements int nx=0, ny=0, nz=0; map mx; map my; map mz; for (unsigned int i=0; i > N(nz); // observed counts vector< vector > U(nz); // expected vector< vector< vector > > V(nz); // variance matrix vector > Tx(nz); // marginal totals vector > Ty(nz); // .. vector T(nz); // totals (per K) for (int k=0; ksecond; int vy = my.find(Y[i])->second; int vz = mz.find(Z[i])->second; // exclude nx + ny (upper limits) if (vx validK(nk,false); for (int k=0; k=2) validK[k]=true; // Calculate expecteds for (int k=0; k #include #include #include "linear.h" #include "logistic.h" #include "helper.h" #include "plink.h" #include "options.h" #include "crandom.h" #include "sets.h" #include "perm.h" #include "phase.h" #include "whap.h" #include "stats.h" // Usage of Model // Fit model LinearModel lm; // Give pointer to PLINK lm.setPlink(this); // Set missing data lm.setMissing(); // Set dependent (adds intercept) lm.setDependent(Y); // Addive effects, labels lm.addAdditiveSNP(CSNP*); // lm.label.push_back("ADD"); // lm.addDominanceSNP(CSNP*); // Sex effect? lm.addSexEffect(); // lm.label.push_back("SEX"); // Covariates? lm.addCovariate(int); // lm.label.push_back("COV"+int2str(c+1)); // Interactions? lm.addInteraction(int,int); // Build design matrix lm.buildDesignMatrix(); // Fit logistic model lm.fitLM(); // Fit okay? lm.validParameters(); // Display results lm.displayResults(); // Get statistic lm.getStatistic(); vector_t Plink::glmAssoc(bool print_results, Perm & perm) { // The model.cpp functions require a SNP-major structure, if SNP // data are being used. There are some exceptions to this however, // listed below if ( par::SNP_major && ! ( par::epi_genebased || par::set_score || par::set_step || par::proxy_glm || par::dosage_assoc || par::cnv_enrichment_test || par::cnv_glm || par::score_test || par::rare_test || par::gvar ) ) SNP2Ind(); // Test all SNPs 1 at a time automatically, or is this // a tailored single test? int ntests = par::assoc_glm_without_main_snp ? 1 : nl_all; vector results(ntests); if ( print_results && par::qt && par::multtest ) tcnt.resize(ntests); ofstream ASC; if (print_results) { string f = par::output_file_name; if ( par::bt) { f += ".assoc.logistic"; printLOG("Writing logistic model association results to [ " + f + " ] \n"); } else { f += ".assoc.linear"; printLOG("Writing linear model association results to [ " + f + " ] \n"); } ASC.open(f.c_str(),ios::out); ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "BP" << " " << setw(4) << "A1" << " " << setw(10) << "TEST" << " " << setw(8) << "NMISS" << " "; if ( par::bt && ! 
par::return_beta ) ASC << setw(10) << "OR" << " "; else ASC << setw(10) << "BETA" << " "; if (par::display_ci) ASC << setw(8) << "SE" << " " << setw(8) << string("L"+dbl2str(par::ci_level*100)) << " " << setw(8) << string("U"+dbl2str(par::ci_level*100)) << " "; ASC << setw(12) << "STAT" << " " << setw(12) << "P" << " " << "\n"; ASC.precision(4); } ///////////////////////////// // Determine sex distribution int nmales = 0, nfemales = 0; for (int i=0; imissing ) { if ( sample[i]->sex ) nmales++; else nfemales++; } bool variationInSex = nmales > 0 && nfemales > 0; ////////////////////////////////////////// // Iterate over each locus, or just once for (int l=0; lchr] || par::chr_haploid[locus[l]->chr] ) continue; } else if (par::chr_sex[locus[l]->chr]) X=true; } ////////////////////////////////////////////////////////// // A new GLM Model * lm; ////////////////////////////////////////////////////////// // Linear or logistic? if (par::bt) { LogisticModel * m = new LogisticModel(this); lm = m; } else { LinearModel * m = new LinearModel(this); lm = m; } ////////////////////////////////////////////////////////// // A temporary fix if ( par::dosage_assoc || par::cnv_enrichment_test || par::cnv_glm || par::score_test || par::set_score || par::proxy_glm || par::gvar || par::rare_test ) lm->hasSNPs(false); ////////////////////////////////////////////////////////// // Set missing data lm->setMissing(); ////////////////////////////////////////////////////////// // Set genetic model if ( par::glm_dominant ) lm->setDominant(); else if ( par::glm_recessive || par::twoDFmodel_hethom ) lm->setRecessive(); string mainEffect = ""; bool genotypic = false; ///////////////////////////////////////////////// // Main SNP if ( ! par::assoc_glm_without_main_snp ) { genotypic = par::chr_haploid[locus[l]->chr] ? false : par::twoDFmodel ; // Models // AA AB BB // Additive 0 1 2 // Dominant 0 1 1 // Recessive 0 0 1 // Genotypic(1) // Additive 0 1 2 // Dom Dev. 0 1 0 // Genotypic(2) // Homozygote 0 0 1 // Heterozygote 0 1 0 //////////////////////////////////////////////////////////// // An additive effect? (or single coded effect) of main SNP if ( par::glm_recessive ) mainEffect = "REC"; else if ( par::glm_dominant ) mainEffect = "DOM"; else if ( par::twoDFmodel_hethom ) mainEffect = "HOM"; else mainEffect = "ADD"; lm->addAdditiveSNP(l); lm->label.push_back(mainEffect); ////////////////////////////////////////////////////////// // Or a 2-df additive + dominance model? if ( genotypic ) { lm->addDominanceSNP(l); if ( par::twoDFmodel_hethom ) lm->label.push_back("HET"); else lm->label.push_back("DOMDEV"); } } ////////////////////////////////////////////////////////// // Haplotypes: WHAP test (grouped?) if ( par::chap_test ) { // Use whap->group (a list of sets) to specify these, from // the current model (either alternate or null) // Start from second category (i.e. 
first is reference) for (int h=1; h < whap->current->group.size(); h++) { lm->addHaplotypeDosage( whap->current->group[h] ); lm->label.push_back( "WHAP"+int2str(h+1) ); } } ////////////////////////////////////////////////////////// // Haplotypes: proxy test if ( par::proxy_glm ) { // Unlike WHAP tests, we now will only ever have two // categories; and a single tested coefficient set t1 = haplo->makeSetFromMap(haplo->testSet); lm->addHaplotypeDosage( t1 ); lm->label.push_back( "PROXY" ); } if ( par::test_hap_GLM ) { // Assume model specified in haplotype sets // Either 1 versus all others, or H-1 versus // terms for omnibus set::iterator i = haplo->sets.begin(); while ( i != haplo->sets.end() ) { set t; t.insert(*i); lm->addHaplotypeDosage( t ); lm->label.push_back( haplo->haplotypeName( *i ) ); ++i; } } ////////////////////////////////////////////////////////// // Conditioning SNPs? // (might be X or autosomal, dealth with automatically) if (par::conditioning_snps) { if ( par::chap_test ) { for (int c=0; ccurrent->masked_conditioning_snps[c] ) { lm->addAdditiveSNP(conditioner[c]); lm->label.push_back(locus[conditioner[c]]->name); } } } else { for (int c=0; caddAdditiveSNP(conditioner[c]); lm->label.push_back(locus[conditioner[c]]->name); } } } ////////////////////////////////////////////////////////// // Sex-covariate (necessary for X chromosome models, unless // explicitly told otherwise) if ( ( par::glm_sex_effect || ( X && !par::glm_no_auto_sex_effect ) ) && variationInSex ) { automaticSex = true; lm->addSexEffect(); lm->label.push_back("SEX"); } ////////////////////////////////////////////////////////// // Covariates? if (par::clist) { for (int c=0; caddCovariate(c); lm->label.push_back(clistname[c]); } } ////////////////////////////////////////////////////////// // Interactions // addInteraction() takes parameter numbers // i.e. not covariate codes // 0 intercept // 1 {A} // {D} // {conditioning SNPs} // {sex efffect} // {covariates} // Allow for interactions between conditioning SNPs, sex, covariates, etc //////////////////////////////////////// // Basic SNP x covariate interaction? // Currently -- do not allow interactions if no main effect // SNP -- i.e. we need a recoding of things here. if ( par::simple_interaction && ! par::assoc_glm_without_main_snp ) { // A, D and haplotypes by conditioning SNPs, sex, covariates int cindex = 2; if ( genotypic ) cindex = 3; for (int c=0; caddInteraction(1,cindex); lm->label.push_back(mainEffect+"xCSNP"+int2str(c+1)); if ( genotypic ) { lm->addInteraction(2,cindex); if ( par::twoDFmodel_hethom ) lm->label.push_back("HETxCSNP"+int2str(c+1)); else lm->label.push_back("DOMDEVxCSNP"+int2str(c+1)); } cindex++; } if ( automaticSex ) { lm->addInteraction(1,cindex); lm->label.push_back(mainEffect+"xSEX"); if ( genotypic ) { lm->addInteraction(2,cindex); if ( par::twoDFmodel_hethom ) lm->label.push_back("HETxSEX"); else lm->label.push_back("DOMDEVxSEX"); } cindex++; } for (int c=0; caddInteraction(1,cindex); lm->label.push_back(mainEffect+"x"+clistname[c]); if ( genotypic ) { lm->addInteraction(2,cindex); if ( par::twoDFmodel_hethom ) lm->label.push_back("HETx"+clistname[c]); else lm->label.push_back("DOMDEVx"+clistname[c]); } cindex++; } } ////////////////////////////// // Fancy X chromosome models if ( X && automaticSex && par::xchr_model > 2 ) { // Interaction between allelic term and sex (i.e. 
// allow scale of male effect to vary) int sindex = 2; if ( genotypic ) sindex++; sindex += conditioner.size(); lm->addInteraction(2,sindex); lm->label.push_back("XxSEX"); // xchr model 3 : test ADD + XxSEX // xchr model 4 : test ADD + DOM + XxSEX } ////////////////////////////// // Build design matrix lm->buildDesignMatrix(); ////////////////////////////// // Clusters specified? if ( par::include_cluster ) { lm->setCluster(); } ////////////////////////////////////////////////// // Fit linear or logistic model (Newton-Raphson) lm->fitLM(); //////////////////////////////////////// // Check for multi-collinearity lm->validParameters(); //////////////////////////////////////// // Obtain estimates and statistic if (print_results) lm->displayResults(ASC,locus[l]); //////////////////////////////////////////////// // Test linear hypothesis (multiple parameters) // Perform if: // automatic 2df genotypic test ( --genotypic ) // OR // sex-tests ( --xchr-model ) // OR // test of everything ( --test-all ) // OR // user has specified user-defined test ( --tests ) if ( ( genotypic && ! par::glm_user_parameters ) || par::glm_user_test || par::test_full_model ) { vector_t h; // dim = number of fixes (to =0) matrix_t H; // row = number of fixes; cols = np int df; string testname; //////////////////////////////////////////////// // Joint test of all parameters if (par::test_full_model) { df = lm->getNP() - 1; h.resize(df,0); testname = "FULL_"+int2str(df)+"DF"; sizeMatrix(H,df,lm->getNP()); for (int i=0; igetNP()); for (int i=0; igetNP() ) H[i][par::test_list[i]] = 1; } //////////////////////////////////////////////// // Joint test of additive and dominant models else if ( genotypic ) { testname = "GENO_2DF"; df = 2; h.resize(2,0); sizeMatrix(H,2,lm->getNP()); H[0][1] = H[1][2] = 1; } else if ( X && par::xchr_model == 3 ) { testname = "XMOD_2DF"; } //////////////////////////////////////////////// // Joint test of all parameters double chisq = lm->isValid() ? lm->linearHypothesis(H,h) : 0; double pvalue = chiprobP(chisq,df); // If filtering p-values if ( (!par::pfilter) || pvalue <= par::pfvalue ) { ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(10) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " " << setw(10) << testname << " " << setw(8) << lm->Ysize() << " " << setw(10) << "NA" << " "; if (par::display_ci) ASC << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; if (lm->isValid() && realnum(chisq) ) ASC << setw(12) << chisq << " " << setw(12) << pvalue << "\n"; else ASC << setw(12) << "NA" << " " << setw(12) << "NA" << "\n"; } } //////////////////////////////////////// // Store statistic (1 df chisq), and p-value // if need be ( based on value of testParameter ) if ( ! 
par::assoc_glm_without_main_snp ) results[l] = lm->getStatistic(); if ( par::qt && print_results && par::multtest ) tcnt[l] = lm->Ysize() - lm->getNP(); ////////////////////////////////////////////// // Clear up linear model, if no longer needed if ( par::chap_test || par::test_hap_GLM || par::set_step || par::set_score || par::proxy_glm || par::dosage_assoc || par::cnv_enrichment_test || par::cnv_glm || par::score_test || par::gvar || par::rare_test ) { // Responsibility to clear up in parent routine model = lm; } else { delete lm; } // Flush output buffer ASC.flush(); // Next SNP } if (print_results) ASC.close(); return results; } plink-1.07-src/dfam.cpp0000644000265600020320000003656011264127625014122 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "crandom.h" #include "sets.h" #include "perm.h" #include "stats.h" vector Plink::testSibTDT(bool print_results, bool permute, Perm & perm, vector & flipA, vector & flipP) { /////////////////////////// // Vector to store results vector res(nl_all); ofstream TDT; if (print_results) { string f = par::output_file_name + ".dfam"; TDT.open(f.c_str(),ios::out); printLOG("Writing DFAM results (asymptotic) to [ " + f + " ] \n"); TDT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(8) << "OBS" << " " << setw(8) << "EXP" << " " << setw(12) << "CHISQ" << " " << setw(12) << "P" << " "; TDT << "\n"; } /////////////////////////////////// // Verbose display of pedigrees if ( par::dumpped && ! par::permute ) { string str = par::output_file_name + ".pdump"; printLOG("Dumping pedigree information to [ " + str + " ]\n"); ofstream PD(str.c_str(),ios::out); // Of course, in practice, due to missing genotypes, the exact // family configurations may shift. for (int f=0; fparents && fam->TDT && par::dfam_tdt ) { PD << "W/PARENTS\t" << fam->pat->fid << " : "; PD << fam->pat->iid << " x " << fam->mat->iid << " -> "; for ( int k=0; kkid.size(); k++) PD << fam->kid[k]->iid << " "; PD << "\n"; } // ...and those sibling without 2 parents else if ( fam->sibship && par::dfam_sibs && fam->kid.size() > 1 ) { PD << "SIBSHIP \t" << fam->kid[0]->fid << " : "; for ( int k=0; kkid.size(); k++) PD << fam->kid[k]->iid << " "; PD << "\n"; } } // And unrelated clusters if ( par::dfam_unrelateds ) for ( int k=0; kperson.size(); c++) { Individual * person = klist[k]->person[c]; if ( ! 
( person->family->sibship || ( person->family->parents ) ) ) PD << "CLUSTER " << k << "\t" << klist[k]->person[c]->fid << " : " << klist[k]->person[c]->iid << "\n"; } } PD.close(); } /////////////////////////////////// // Perform analysis for each locus for (int l=0; lchr ] || par::chr_haploid[ locus[l]->chr ] ) { continue; } // Allele T counts double numerator = 0; double denom = 0; // Total counts double totalCount = 0; double totalExpected = 0; int totalInformative = 0; ///////////////////////// // Count over families for (int f=0; f AA 0.5 2 // AB 0.5 1 // E = 1.5 * K // V = 0.25 * K // BB AB -> AB 0.5 1 // BB 0.5 0 // E = 0.5 * K // V = 0.25 * K // AB AB -> AA 0.25 2 // AB 0.50 1 // BB 0.25 0 // E = 0 // V = 0.5 * K // 4. For each type (b) family, also obtain the count A // among the affected children and its expected value and // variance (under H0) given the genotypes of all the // children. These are given by the hypergeometric // distribution, e.g. if the sibship contain n A alleles m B // alleles and there are N affected members, then the // expected value of the count of A among the affected // members would be 2N n/(n+m), and its variance is 2N // nm/(n+m)^2 Note the factor of 2 is because each person // has 2 alleles. // n "A" alleles, m "B" alleles // S = n | k affected siblings // E = 2 k n/(n+m) // V = 2 k nm/(n+m)^2 // replacement issue? // 5. An overall test statistic is (Sum of the Counts of A - // Sum of expected counts )^2 / (Sum of variances) Family * fam = family[f]; bool informative = false; bool parents = true; // Type A: two genotyped parents and at least 1 affected individual if ( fam->parents && fam->TDT && par::dfam_tdt ) { Individual * pat = family[f]->pat; Individual * mat = family[f]->mat; bool pat1 = pat->one[l]; bool pat2 = pat->two[l]; bool mat1 = mat->one[l]; bool mat2 = mat->two[l]; // We need two genotyped parents, with // at least one het if ( pat1 && (!pat2) || mat1 && (!mat2) ) { parents = false; goto jump_to_sibships; } int heteroParents = 0; bool homozygParent; if ( pat1 != pat2 ) heteroParents++; else homozygParent = pat1; if ( mat1 != mat2 ) heteroParents++; else homozygParent = mat1; if ( heteroParents == 0 ) { parents = false; goto jump_to_sibships; } // Consider all offspring in nuclear family double alleleCount = 0; double childCount = 0; for (int c=0; ckid.size(); c++) { // Only consider affected children: based on true // (not permuted) phenotype here: permutation works // by flipping transmissions if ( ! family[f]->kid[c]->aff ) continue; bool kid1 = family[f]->kid[c]->one[l]; bool kid2 = family[f]->kid[c]->two[l]; // Skip if offspring has missing genotype if ( kid1 && !kid2 ) continue; // We've now established: no missing genotypes // and at least one heterozygous parent if ( permute ) { if ( heteroParents == 1 ) { if ( ! homozygParent ) alleleCount++; if ( flipA[f] ) alleleCount++; } else // ...two heterozygous parents { if ( flipA[f] ) alleleCount+=2; } } else { // No permtutation: standard counting if ( ! kid1 ) alleleCount++; if ( ! kid2 ) alleleCount++; } childCount++; } // next offspring in family double expected = childCount; if ( heteroParents == 1 ) { if ( ! 
homozygParent ) expected *= 1.5; else expected *= 0.5; } double variance = heteroParents * 0.25 * childCount; numerator += (alleleCount - expected); denom += variance; totalCount += alleleCount; totalExpected += expected; } jump_to_sibships: // Sibships, considering genotypes using the // multivariate hypergeometric distribution if ( ( fam->sibship || ( fam->parents && !parents ) ) && par::dfam_sibs ) { // Let the numbers of offspring with genotypes AA, AB // and BB. // N=NAA+NAB+NBB // Let the numbers of affected offsrping with genotypes // AA, AB and BB. be // D=DAA+DAB+DBB // Then the observed count of A is DA = 2*DAA+DAB The // expected value of DA = 2E(DAA) + E D(AB) = 2 NAA * // (D/N) + NAB * (D/N). The variance of DA = 4 Var (DAA) // + Var(DAB) + 4Cov(AA,DAB) // The variances and covariances from the multivariate // hypergeometric distribution. double childCount = 0; double affectedCount = 0; double genotype1Count = 0; // Aa double affectedGenotype1Count = 0; // Aa double genotype2Count = 0; // AA double affectedGenotype2Count = 0; // AA for (int c=0; ckid.size(); c++) { bool kid1 = family[f]->kid[c]->one[l]; bool kid2 = family[f]->kid[c]->two[l]; // Skip if offspring has missing genotype if ( kid1 && !kid2 ) continue; // Only consider affected children (possibly allowing // for permutation) if ( family[f]->kid[c]->pperson->aff ) { if ( ! kid1 ) { if ( ! kid2 ) affectedGenotype2Count++; else affectedGenotype1Count++; } affectedCount++; } if ( !kid1 ) { if ( !kid2 ) genotype2Count++; else genotype1Count++; } childCount++; } // next offspring in family if ( affectedCount > 0 && affectedCount != childCount ) { double expectedGenotype2 = affectedCount * ( genotype2Count / childCount ) ; double expectedGenotype1 = affectedCount * ( genotype1Count / childCount ) ; double varianceGenotype2 = affectedCount * ( genotype2Count / childCount ) * ( 1 - genotype2Count / childCount ) * ( (childCount - affectedCount ) / ( childCount - 1 ) ) ; double varianceGenotype1 = affectedCount * ( genotype1Count / childCount ) * ( 1 - genotype1Count / childCount ) * ( (childCount - affectedCount ) / ( childCount - 1 ) ) ; double covarianceGenotype = - ( affectedCount * ( ( genotype2Count * genotype1Count ) / ( childCount * childCount ) ) * ( ( (childCount - affectedCount ) / ( childCount - 1 ) ) ) ); double affectedAlleleCount = 2 * affectedGenotype2Count + affectedGenotype1Count; double expected = 2 * expectedGenotype2 + expectedGenotype1; double variance = 4 * varianceGenotype2 + varianceGenotype1 + 4 * covarianceGenotype; numerator += ( affectedAlleleCount - expected ); denom += variance; totalCount += affectedAlleleCount; totalExpected += expected; // cout << "SIB " // << childCount << " " // << genotype1Count << " " // << genotype2Count << " " // << affectedCount << " VAR1,2,3= " // << varianceGenotype2 << " " // << varianceGenotype1 << " " // << covarianceGenotype << " " // << affectedAlleleCount << " [ " // << expected << " & " // << variance << "] \t" // << numerator << " / " // << denom << " , " // << totalCount << " " // << totalExpected << "\n"; } } } // Next nuclear family /////////////////////////////////////// // Now consider clusters of unrelateds // As sibling test, except allelic rather than genotypic // variance estimate (i.e. 
univariate rather than // multivariate hypergeometic distribution, so equivalent // to CMH test if ( par::dfam_unrelateds ) for ( int k=0; kperson.size(); c++) { Individual * person = klist[k]->person[c]; // Skip any individuals who we've already // analysed as a family if ( person->family->sibship || person->family->parents ) continue; bool s1 = person->one[l]; bool s2 = person->two[l]; // Skip if offspring has missing genotype if ( s1 && !s2 ) continue; // Are we seeing any genotypic discordance? // Only consider families where we do (i.e. // ignore (het, het) pairs, for example if ( c>0 && ( s1 != klist[k]->person[c-1]->one[l] || s2 != klist[k]->person[c-1]->two[l] ) ) same = false; // Only consider affected children if ( person->pperson->aff ) { if ( ! s1 ) affectedAlleleCount++; if ( ! s2 ) affectedAlleleCount++; affectedCount++; } if ( ! s1 ) alleleCount++; if ( ! s2 ) alleleCount++; childCount++; } // next individual in cluster // S = n | A affected individuals // E = 2 k n/(n+m) // V = 2 k nm/(n+m)^2 if ( (!same) && childCount > 1 ) { double D = alleleCount; double N = 2 * childCount; double A = 2 * affectedCount; double expected = A * ( D / N ) ; double variance = A *(D/N)*(1-D/N)*((N-A)/(N-1)); numerator += ( affectedAlleleCount - expected ); denom += variance; totalCount += affectedAlleleCount; totalExpected += expected; } } // Next cluster of unrelateds ////////////////////////////// // Calculate DFAM statistic double chisq = numerator*numerator; chisq /= denom; ////////////////////////////// // Display asymptotic results if (print_results) { double pvalue = chiprobP(chisq,1); // Skip?, if filtering p-values if ( par::pfilter && pvalue > par::pfvalue ) continue; TDT.precision(4); TDT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(8) << totalCount << " " << setw(8) << totalExpected << " "; if ( realnum(chisq) ) { TDT << setw(12) << chisq << " " << setw(12) << pvalue << " "; } else TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; TDT << "\n"; } ///////////////////////////////// // Save statistic for permutation res[l] = chisq; } // next locus ////////////////////////////// // Close output file, if open if (print_results) TDT.close(); /////////////////////////////////////////// // Return chosen statistic for permutation return res; } plink-1.07-src/nlist.h0000644000265600020320000000304511264127626014002 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __NLIST_H__ #define __NLIST_H__ #include #include #include #include "helper.h" #include "options.h" using namespace std; class NList{ vector tokenize(string); vector expandNumberList(vector &); int maxcat; bool negmode; // exclude list from 1..maxcat string firstWord; string lastWord; char range_char; string range_string; char delimit_char; public: NList(int n, bool nmode = true ) { range_char = par::range_delimiter[0]; range_string = par::range_delimiter; delimit_char = ','; maxcat = n; negmode = ! 
nmode; } void setRangeChar(string s) { range_char = s[0]; range_string = s; } void setDelimiter(string s) { delimit_char = s[0]; } vector deparseNumberList(string); vector deparseStringList(string, map *); vector deparseStringList(string); }; #endif plink-1.07-src/annot.cpp0000644000265600020320000002552611264127626014333 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "options.h" #include "plink.h" #include "helper.h" #include "zed.h" extern Plink * PP; map > filterRanges(map > & ranges, string filename); void Plink::annotateFile() { // Simply read in any generic results file and list of SNPs by // ranges (which may be subsetted). Input could be compressed checkFileExists( par::annot_filename ); ZInput zin( par::annot_filename , compressed( par::annot_filename ) ); // If input is compressed, also make output compressed string f = par::output_file_name + ".annot"; if ( compressed( par::annot_filename ) ) f += ".gz"; ZOutput zout( f , compressed( par::annot_filename ) ); printLOG("Reading input from [ " + par::annot_filename + " ]\n"); printLOG("Writing annotated file to [ " + f + " ]\n"); // Range information map > ranges; // SNP attribute information map > attrib; // Only output rows that correspond to these filters set snp_filter; map > range_filter; // Read list of ranges OptionSet * annot_opt = par::opt.getOptions("ANNOT"); if ( annot_opt->isSet("ranges") ) { ranges = readRange( annot_opt->getValue("ranges") ); // Filter to a subset of attribs? if ( annot_opt->isSet("subset") ) ranges = filterRanges( ranges, annot_opt->getValue("subset") ); } // Read list of SNP annotations, which can be compressed // Just simple format: rs-number , 1 + space-delimited annotations if ( annot_opt->isSet("attrib") ) { string fname = annot_opt->getValue("attrib"); checkFileExists( fname ); ZInput ZIN1( fname , compressed(fname) ); while ( ! ZIN1.endOfFile() ) { vector tok = ZIN1.tokenizeLine(); for (int j=1; j >::iterator i = attrib.find( tok[0] ); if ( i == attrib.end() ) { set t; t.insert( tok[j]) ; attrib.insert(make_pair( tok[0] , t ) ); } else i->second.insert( tok[j] ); } } ZIN1.close(); printLOG("Read attributes for " + int2str(attrib.size()) + " SNPs\n"); } // Filters? if ( annot_opt->isSet("filter") ) range_filter = readRange( annot_opt->getValue("filter") ); if ( annot_opt->isSet("snps") ) { checkFileExists( annot_opt->getValue("snps") ); ifstream IN1(annot_opt->getValue("snps").c_str() , ios::in ); while ( ! IN1.eof() ) { string s; IN1 >> s; if ( s == "" ) continue; snp_filter.insert(s); } IN1.close(); } bool hasRanges = ranges.size() > 0; bool hasSNPs = attrib.size() > 0; bool filterRanges = range_filter.size() > 0; bool filterSNPs = snp_filter.size() > 0; bool needPosition = hasRanges || filterRanges; // Default is to output all rows; however, if the // 'annot-only' option is set, then only output a // row that has at least some annotation bool output_all = annot_opt->isSet("prune") ? false : true ; if ( ! 
( hasRanges || hasSNPs || filterRanges || filterSNPs ) ) error("Nothing to do -- stopping"); // Open a single results file // Read first (header) row string header = zin.readLine(); vector tokens = tokenizeLine( header ); // Find appropriate columns to filter int chr_column = -1; int bp_column = -1; int pval_column = -1; int snp_column = -1; for (int i=0; iisSet("distance"); // Minimal range output (i.e. not (distkb) bool minimal = annot_opt->isSet("minimal"); // Use "NA" or "." for missing fields string missingValue = annot_opt->isSet("NA") ? "NA" : "."; // block0/1 reporting // find all possible annotations, then write fields on 0/1s for each variant bool block01 = annot_opt->isSet("block"); // Determine all unique annotations set uniqFields; if ( block01 ) { map >::iterator j = attrib.begin(); while ( j != attrib.end() ) { set::iterator k = j->second.begin(); while ( k != j->second.end() ) { if ( uniqFields.find( *k ) == uniqFields.end() ) uniqFields.insert( *k ); ++k; } ++j; } map >::iterator i = ranges.begin(); while ( i != ranges.end() ) { if ( uniqFields.find( i->first ) == uniqFields.end() ) uniqFields.insert( i->first ); ++i; } printLOG("Found " + int2str( uniqFields.size() ) + " unique annotations\n"); } // Write header back out, with additional field if ( block01 ) { zout << header; set::iterator l = uniqFields.begin(); while ( l != uniqFields.end() ) { zout << " " << *l; ++l; } zout << "\n"; } else { if ( track_distance ) zout << header << sw("DIST",12) << sw("SGN",12) << " ANNOT\n"; else zout << header << " ANNOT\n"; } int cnt = 0, cnt2 = 0; while ( ! zin.endOfFile() ) { // Get line of output string input = zin.readLine(); vector tokens = tokenizeLine(input); if ( tokens.size() == 0 ) continue; if ( needPosition ) { if ( tokens.size() <= chr_column || tokens.size() <= bp_column ) continue; } // Using a p-value-filtering field? double pvalue = 0; if ( pval_column != -1 ) { if ( tokens.size() <= pval_column ) continue; if ( ! from_string( pvalue, tokens[pval_column] , std::dec)) continue; if ( par::pfilter && pvalue > par::pfvalue ) continue; } // Filtering on pre-specified SNP names? if ( filterSNPs ) { if ( tokens.size() <= snp_column ) continue; if ( snp_filter.find( tokens[snp_column] ) == snp_filter.end() ) continue; } int thisChr = -1; int thisBP = -1; if ( needPosition ) { if ( ! from_string( thisChr, tokens[chr_column] , std::dec)) continue; if ( ! from_string( thisBP, tokens[bp_column] , std::dec)) continue; } Range r1(thisChr,thisBP,thisBP,"dummy"); // Filtering on a set of ranges? if ( filterRanges ) { bool include = false; set implicated = rangeIntersect(r1,range_filter); if ( implicated.size() == 0 ) continue; } // Annotation to build up, if any string annotation = ""; // If we need to track what we see (for block01 output) set x; // 1) Ranges int min_distance = 999999999; int sign = 0; // Do we need to store this? i.e. what ranges is it actually in? // This information is in snp2range // Does this point overlap with any ranges of interest? 
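// (Illustrative sketch of what the block below produces; the gene
//  names and distances here are hypothetical, for comment purposes
//  only: a SNP falling inside a range named GENE1 and 3.2kb beyond
//  the end of a range named GENE2 would gain the annotation
//  GENE1(0)|GENE2(+3.2kb), while a SNP 12.5kb before the start of
//  GENE3 would gain GENE3(-12.5kb).  With the 'minimal' modifier
//  only the range names are written, and with 'block' a 0/1 column
//  per unique annotation is written instead.)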
if ( hasRanges ) { set implicated = rangeIntersect(r1,ranges); set::iterator ri = implicated.begin(); while ( ri != implicated.end() ) { string distance = "0"; if( thisBP < (*ri)->start + par::make_set_border ) { distance = "-" + dbl2str(( ( (*ri)->start + par::make_set_border ) - thisBP ) / 1000.00 , 4 ) + "kb" ; if ( track_distance ) if ( ( (*ri)->start + par::make_set_border ) - thisBP < min_distance ) { min_distance = ( (*ri)->start + par::make_set_border ) - thisBP; sign = -1; } } else if ( thisBP > (*ri)->stop - par::make_set_border ) { distance = "+" + dbl2str( ( thisBP - ( (*ri)->stop - par::make_set_border ) ) / 1000.00 , 4 ) + "kb" ; if ( track_distance ) if ( thisBP - ( (*ri)->stop - par::make_set_border ) < min_distance ) { min_distance = thisBP - ( (*ri)->stop - par::make_set_border ); sign = 1; } } else { min_distance = 0; sign = 0; } if ( annotation == "" ) annotation += (*ri)->name; else annotation += "|" + (*ri)->name; if ( ! minimal ) annotation += "(" + distance + ")"; // Do we need to track this? if ( block01 ) x.insert( (*ri)->name ); ++ri; } } // 2) Attributes if ( hasSNPs ) { map >::iterator i = attrib.find(tokens[snp_column]); if ( i != attrib.end() ) { set::iterator j = i->second.begin(); while ( j != i->second.end() ) { if (annotation=="" ) annotation += *j; else annotation += "|" + *j; // Do we need to track this? if ( block01 ) x.insert( *j ); ++j; } } } // Output this row (or possibly not) if ( block01 ) { zout << input << " "; set::iterator l = uniqFields.begin(); while ( l != uniqFields.end() ) { if ( x.find( *l ) != x.end() ) zout << " 1"; else zout << " 0"; ++l; } zout << "\n"; } else if ( annotation != "" ) { ++cnt2; if ( track_distance ) { zout << input << sw( min_distance / 1000.0 , 12 ); if ( sign == -1 ) zout << sw("-",4); else if ( sign == 1 ) zout << sw("+",4); else if ( sign == 0 ) zout << sw(missingValue,4); zout << " " << annotation << "\n"; } else zout << input << " " << annotation << "\n"; } else if ( output_all ) { if ( track_distance ) zout << input << sw("NA",12) << sw("NA",4) << " " << missingValue << "\n"; else zout << input << " " << missingValue << "\n"; } ++cnt; // Read next line of results } printLOG("Processed " + int2str(cnt) + " rows"); if ( !block01 ) printLOG(", " + int2str(cnt2) + " of which were annotated"); printLOG("\n"); zin.close(); zout.close(); shutdown(); } plink-1.07-src/README.txt0000645000265600020320000000240710652403213014165 0ustar tilleaadminPROGRAM: PLINK DESCRIPTION: Whole-genome association analysis toolset AUTHOR: Shaun Purcell CONTACT: plink@chgr.mgh.harvard.edu YEAR: 2006, 2007 LICENSE: Released under GNU General Public License, v2 (see COPYING.txt) DOCUMENTATION: http://pngu.mgh.harvard.edu/purcell/plink/ INSTALLATION: If you have download a zip or gzipped archive with an executable binary, no installation is necessary (except perhaps you might want to place the executable in your path, see documentation for details). Otherwise, see notes on compilation below. COMPILATION: You will need a standard C/C++ compiler such as GNU gcc (version 3). This is likely available on all Linux/Unix platforms. For MS-DOS, DJGPP or MinGW are appropriate choices. 
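For example, on a Linux/Unix system with GNU g++ installed the whole process might look like the following (a sketch only; "Makefile.<platform>" is a placeholder, so pick the Makefile appropriate for your system as described in the documentation): cd plink-1.07-src ; make -f Makefile.<platform> ; ./plink --file test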
To help compiling, see documentation (basically, just be sure to select the correct Makefile and type make -f Makefile.*) USAGE: Type "plink" or "./plink" from the command line followed by the options of choice (see documentation) EXAMPLE DATA: Two example files test.ped and test.map are included in the distribution; for example, once PLINK is installed try running: plink --file test plink --file test --freq plink --file test --assoc plink --file test --make-bed plink --bfile test --assoc etc... SMP, Aug 2006 plink-1.07-src/perm.h0000644000265600020320000000660511264127626013621 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __PERM_H__ #define __PERM_H__ #include "options.h" using namespace std; class Perm { int t; // Number of tests long int replicates; // Basic number of replicates long int performed; // Number of replicates actually performed bool count; // Output counts, not p-values ofstream PDUMP; // Verbose permutation dump bool dump_best; // Record just best permutation result bool dump_all; // Record all permutation results vector order; // For --rank, record order of original vector reorder; // For --rank, record order of original, reverse mapping ///////////////////////////// // Gene-dropping permutation bool genedrop; map idmap; /////////////////////////////////////////// // Standard phenotype-swapping permutation // Parameters for adaptive permutation bool adaptive; int min; // Minimum number of permutations double zt; // SD CI range (based on par::adaptive_alpha) int interval; // Prune tests every (I+N*I2) permutations // Main storage vector R; // number of successes vector maxR; // number of genome-wide successes vector N; // number of trials (adaptive) // Cluster information vector< vector > s; int ns; // number of clusters Plink & P; // reference to Plink class public: Perm(Plink &); void closeDUMP() { if (dump_all || dump_best) PDUMP.close(); } // For basic permutation vector pheno; // label-swapped phenotype vector geno; // label-swapped phenotype vector test; // whether to stop with this test vector snp_test; // whether to skip these SNPs in // a set-based test void setTests(int x); void setAdaptiveSetSNPs(int x); void originalOrder(); void setOriginalRanking(vector_t&); bool finished(); bool update(vector&, vector&); bool updateSNP(double,double,int); void nextSNP(); vector & report(); int current_reps() { return performed; } int reps_done(int); double pvalue(int); double max_pvalue(int); int rank(int); void permuteInCluster(); void setPermClusters(Plink &); void preGeneDrop(); void geneDrop(); void dropAlleles(Plink &, Individual*, int, int, vector&, vector&, vector&, map &); }; #endif plink-1.07-src/lookup2.cpp0000644000265600020320000002054111264127626014577 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "sockets.h" #include "nlist.h" using namespace std; #define PORT_NUM 80 #define IP_ADDR "152.19.78.148" void convertPosition(string pquery, int & chr, int & bp1, int & bp2, bool useKb, bool useMb) { size_t p1 = pquery.find(":"); size_t p2 = pquery.find("-"); if ( p1 == string::npos || p2 == string::npos || p1 >= p2 ) error("Badly formed positional query: " + pquery ); string ccode = pquery.substr(0,p1); if ( ccode.size() < 4 ) error("Badly formed positional query: " + pquery ); if ( ccode.substr(0,3) != "chr" ) error("Badly formed positional query: " + pquery ); if ( ! from_string( chr , ccode.substr(3) , std::dec ) ) error("Badly formed positional query: " + pquery ); string p1code = pquery.substr(p1+1,p2-p1-1); double pp1; if ( ! from_string( pp1 , p1code , std::dec ) ) error("Badly formed positional query: " + pquery ); string p2code = pquery.substr(p2+1); double pp2; if ( ! from_string( pp2 , p2code , std::dec ) ) error("Badly formed positional query: " + pquery ); if ( useKb ) { bp1 = int(pp1 * 1000); bp2 = int(pp2 * 1000); } else if ( useMb ) { bp1 = int(pp1 * 1000000); bp2 = int(pp2 * 1000000); } else { bp1 = int(pp1); bp2 = int(pp2); } // Possible overflow if ( bp1 < 0 ) bp1 = 0; if ( bp2 < 0 ) bp2 = 0; } void Plink::lookup2() { // In general, for these lookups, we do not want to treat the // minus/hyphen character as a range delimiter in the normal // sense. It will be specially handled for chr1:1-100 values; also, // it might appear in gene names, e.g. HLA-A. By setting the range // character to a space, we essentially ensure that we will never // encounter it. Because we shutdown() after performing this // analysis, we do not need to worry about other operations getting // messed up. 
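  // (Orientation note: positional queries such as "chr18:30.2-30.4"
  //  are handled by convertPosition() above.  As a worked example,
  //  with the [mb] modifier that string parses to chr=18,
  //  bp1=30200000, bp2=30400000; with [kb] it gives 30200/30400;
  //  with neither modifier the values are simply truncated to
  //  integers, i.e. 30/30.)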
par::range_delimiter = " "; #ifdef SKIP printLOG("Web-lookup not implemented on this system...\n"); return; #else // printLOG("PLINK-SNP (WGAS SNP annotation courtesy of Patrick Sullivan)\n"); // http://sullivanlab.unc.edu/plink/snp.php?rsid=rs12345,rs67890&gene=DISC1,CACNA1C&exon=1&cnv=0 // http://sullivanlab.unc.edu/plink/genelist.php?&searchtype=gn&gene=COMT,GTF2H2,FURBERG,AMY1,AMY2B // http://sullivanlab.unc.edu/plink/genelist.php?searchtype=gp&chr=6&start=30000000&end=30500000 // { rsid = rs12345 OR rsid = rs67890 } // AND { exon = true } // AND { cnv = false } // AND { gene = DISC1 OR gene = CACNA1C } string GET_STRING; string command; // Lookup genes // # 'pos' key -- default is BP // --lookup-gene [pos,mb] chr18:30.2-30.2 // --lookup-gene [pos,kb] chr18:30200-302809 // --lookup-gene [pos] chr18:30200000-30280900 // # list // --lookup-gene [file] mygenes.lst // --lookup-gene [list] mygenes.lst // # name // --lookup-gene CACNA1C // --lookup-gene [name] CACNA1C // # query // --lookup-gene [query] gset=GO_363526,pos=chr6:30200000-30280900,list=mygenes.lst,gene=CACNA1C // --lookup-gene [pos] chr18: // Agreed keywords OptionSet * lookup2_opt = par::opt.getOptions("LOOKUP"); bool useMb = lookup2_opt->isSet("mb") || lookup2_opt->isSet("Mb") || lookup2_opt->isSet("MB"); bool useKb = lookup2_opt->isSet("kb") || lookup2_opt->isSet("Kb") || lookup2_opt->isSet("KB"); if ( useMb && useKb ) error("Cannot specify both Mb and Kb positional queries"); // A query if ( lookup2_opt->isSet("query") ) { NList nl(0); NList tlist(0); vector ids = tlist.deparseStringList( par::lookup2_cmd ); for (int i = 0 ; i < ids.size(); i++) { string pquery = ids[i]; // Expect format : X=Y size_t p1 = pquery.find("="); string key,val; if (p1==string::npos) { key = pquery; val = "=1"; } else { key = pquery.substr(0,p1+1); val = pquery.substr(p1+1); if ( key=="pos=" ) { int chr, bp1, bp2; convertPosition(val,chr,bp1,bp2,useKb,useMb); val = int2str(chr) + "," + int2str( (int)bp1 ) + "," + int2str( (int)bp2 ); } } if ( i == 0 ) command += key + val; else command += "&"+ key + val; } } else if ( lookup2_opt->isSet("pos")) { // Positional query // Assume a comma delimited list in format: // chr2:737-3993 NList nl(0); NList tlist(0); vector ids = tlist.deparseStringList( par::lookup2_cmd ); for (int i = 0 ; i < ids.size(); i++) { // Always requires format X:Y-Z // X should start "chrXX" // Y should be a number // Z should also be a number string pquery = ids[i]; int chr, bp1, bp2; convertPosition(pquery,chr,bp1,bp2,useKb,useMb); pquery = "pos=" + int2str(chr) + "," + int2str( (int)bp1 ) + "," + int2str( (int)bp2 ); if ( i == 0 ) command += pquery; else command += "&"+pquery; } } else if ( lookup2_opt->isSet("list") || lookup2_opt->isSet("file") ) { checkFileExists(par::lookup2_cmd ); ifstream IN1( par::lookup2_cmd.c_str() , ios::in ); command = par::lookup_gene ? "gene=" : "rsid="; bool doneFirst = false; while ( ! IN1.eof() ) { string name; IN1 >> name; if ( name == "" ) continue; if ( ! doneFirst ) { command += name; doneFirst = true; } else command += "," + name; } IN1.close(); } else if ( lookup2_opt->isSet("qfile") || lookup2_opt->isSet("qlist") ) { checkFileExists(par::lookup2_cmd); ifstream IN1( par::lookup2_cmd.c_str() , ios::in ); bool doneFirst = false; while ( ! IN1.eof() ) { vector tok = tokenizeLine(IN1); if ( tok.size() == 0 ) continue; if ( tok.size() != 2 ) error("Problem with query file: not 2 columns"); string key = tok[0]; string val = tok[1]; if ( ! 
doneFirst ) { command += key + "=" + val; doneFirst = true; } else command += "&" + key + "=" + val; } IN1.close(); } else // assume simple gene-name query { command = par::lookup_gene ? "gene=" : "rsid="; NList nl(0); NList tlist(0); vector ids = tlist.deparseStringList( par::lookup2_cmd ); for (int i = 0 ; i < ids.size(); i++) { if ( ids[i].find("=") != string::npos ) error("Badly formed gene name, with equals sign '=' in it -- did you mean to add [query]?"); if ( i == 0 ) command += ids[i]; else command += ","+ids[i]; } } if ( par::lookup_gene ) GET_STRING = "GET /plink/genelist.php?"; else GET_STRING = "GET /plink/snplist.php?"; GET_STRING += command; // GET_STRING += "searchtype=gp&chr=6&start=30000000&end=30500000"; cout << "Proposed command = \n"; cout << GET_STRING << "\n"; GET_STRING += "\nHTTP/1.0\nContent Length: 10000\nHost: 152.19.78.148\nConnection: close\n\n"; cout << "GET_STRING:\n\n" << GET_STRING << "\n"; //////////////////////////////////////////////// // Make database call vector tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING ); //////////////////////////////////////////////// // Relay output cout << "Output = \n"; for (int t = 0 ; t < tokens.size() ; t++) { cout << "token[" << t << "] = [ " << tokens[t] << "]\n"; } cout << "\n"; cout << "-------------------------------\n"; #endif } plink-1.07-src/sockets.cpp0000644000265600020320000000757011264127625014665 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include "plink.h" #include "helper.h" #include "options.h" // Requires wsock32.lib WINDOWS // -lsocket -nsl BSD //#define UNIX // WIN/UNIX/SKIP #ifndef SKIP #include #include #include #include #ifdef WIN #include #endif #ifdef UNIX #include // Needed for system defined identifiers. #include // Needed for internet address structure. #include // Needed for socket(), bind(), etc... #include // Needed for inet_ntoa() #include #include #include #endif using namespace std; extern string PVERSION; #endif // of SKIP vector socketConnection( Plink * P, string ip_addr , int port , string message ) { int BUF_SIZE = 4096; #ifndef SKIP P->printLOG("Connecting to web... 
"); vector tokens(0); #ifdef WIN WORD wVersionRequested = MAKEWORD(1,1); // Stuff for WSA functions WSADATA wsaData; // Stuff for WSA functions #endif unsigned int server_s; // Server socket descriptor struct sockaddr_in server_addr; // Server Internet address char out_buf[BUF_SIZE+1]; // Output buffer for GET request char in_buf[BUF_SIZE+1]; // Input buffer for response unsigned int retcode; // Return code unsigned int i; // Loop counter #ifdef WIN WSAStartup(wVersionRequested, &wsaData); #endif // Create a socket server_s = socket(AF_INET, SOCK_STREAM, 0); // Fill-in the Web server socket's address information server_addr.sin_family = AF_INET; // Address family to use server_addr.sin_port = htons(port); // Port num to use server_addr.sin_addr.s_addr = inet_addr(ip_addr.c_str()); // IP address to use //server_addr.sin_addr = *((struct in_addr *)he->h_addr); // Do a connect (connect() blocks) retcode = connect(server_s, (struct sockaddr *)&server_addr, sizeof(server_addr)); if (retcode != 0) { P->printLOG(" failed connection\n\n"); #ifdef WIN WSACleanup(); #endif return tokens; } // Send a message to the server message += '\0'; send(server_s, message.c_str(), message.length(), 0); // Receive from the Web server int echoStringLen = 100; string all_string = ""; char echoBuffer[BUF_SIZE + 1]; // Buffer for echo string + \0 // Receive the same string back from the server while ( 1 ) { int retcode = recv(server_s, echoBuffer, BUF_SIZE, 0); // Give up if we encounter any problems if ( retcode < 0 ) { P->printLOG("Problem reading from SNPServer\n"); return tokens; } echoBuffer[retcode] = '\0'; // Terminate the string! all_string += echoBuffer; // Is this the end of the input? if ( echoBuffer[retcode-1] == '\0' ) break; } string buf; stringstream ss(all_string); while (ss >> buf) tokens.push_back(buf); return tokens; // Close all open sockets #ifdef WIN closesocket(server_s); #endif #ifdef UNIX close(server_s); #endif #ifdef WIN WSACleanup(); #endif #endif } plink-1.07-src/prephap.cpp0000644000265600020320000003646311264127624014653 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "phase.h" #include "helper.h" #include "nlist.h" #include "stats.h" extern ofstream LOG; using namespace std; // Format for haplotype file: format 1: // SNP_ID CHR CM BP A1 A2 HAP SNP1 SNP2 ... // i.e. length of Pred_allele indicate how many SNPs to expect: // rs10001 5 0 10203 A G TTC rs001 rs002 rs003 // Alternatively: using wild cards: format 2: // will name haplotype "H1_TTC_", "H1_CTT_", "H2_AA_", etc // * rs001 rs002 rs003 // * rs002 rs004 // Alternatively: using wild cards: format 3: // will name haplotype "MYHAP1_1_TTC_", "MYHAP1_1_CTT_", "GENEB_2_AA_", etc // ** MYHAP1 rs001 rs002 rs003 // ** GENEB rs002 rs004 // Alternatively: using --whap weighted multimarker tests // rs10001 5 0 10203 A G rs001 rs002 rs003 / TTC / CCT 0.9 / TCT 0.1 // i.e. 
"/" separator used, if number omitted, then assume PP=1 void HaploPhase::readTagFile() { P.printLOG("\n"); /////////////////////////////////////////////// // Haplotype inference is a SNP-major function if (!par::SNP_major) P.Ind2SNP(); //////////////////////// // Lookup table for SNPs map mlocus; map::iterator ilocus; for (int l=0;lname,l)); /////////////////// // Affection coding if (par::bt) affCoding(P); //////////////////////////////// // Read list of tags/haplotypes checkFileExists(par::tagfile); ifstream TAG(par::tagfile.c_str(), ios::in); TAG.clear(); string f2 = par::output_file_name + ".mishap"; ofstream MISHAP(f2.c_str(), ios::out); bool all_okay = true; // Count of new haplotypes we want to infer int hc=1; while(!TAG.eof()) { char c[500000]; TAG.getline(c,500000,'\n'); string l = c; // Catch blank lines or DOS carriage-returns if (l=="" || l=="\r") continue; // Tokenize line string buf; stringstream ss(l); vector tokens; while (ss >> buf) tokens.push_back(buf); // whitepsace line? if (tokens.size() == 0) continue; // tokens[0] predicted SNP rs# // tokens[1] predicted SNP chromosome // tokens[2] predicted SNP Morgan position // tokens[3] predicted SNP base pair position // tokens[4] predicted SNP allele 1 // tokens[5] predicted SNP allele 2 // tokens[6] tag allele (-> allele 1) // tokens[7+] predictor rs#(s) // If we see one or more wildcard specifications, then // automatically set phase_all_haps to be true; if ( tokens[0] == "*" ) { // Wildcard format if ( tokens.size() == 1 ) error("Problem with " + par::tagfile + " line\n" + l + "\n: must have atleast one SNP list\n"); par::phase_hap_all = true; } else if ( tokens[0] == "**") { // Wildcard2 format if (tokens.size() == 2 ) error("Problem with " + par::tagfile + " line\n" + l + "\n: must have atleast one SNP list\n"); par::phase_hap_all = true; } else if ( ! par::weighted_mm ) { // Standard format if (tokens.size() < 8 ) { string e = "Problem with " + par::tagfile + " line\n" + l + "\n (expecting at least 8 items, or to start with */** wildcard)\n"; error(e); } } else { // Weighted multi-marker format if (tokens.size() < 9 ) { string e = "Problem with " + par::tagfile + " line\n" + l + "\n (expecting at least 9 items for --whap format file)\n"; error(e); } } if (tokens[0].substr(tokens[0].size()-1) == "_" ) error("Cannot use '_' in tag/haplotype name: reserved for wildcards\n"); int len; // length of haplotype vector locusList; // list of predictor #s // Is this particular line a wildcard? okay? bool wildcard = tokens[0] == "*" || tokens[0] == "**" ? true : false ; string wildname = "H"; // Take the name from the second position? (** wildcard?) if ( tokens[0] == "**" ) { wildname = tokens[1]+"_"; tokens.erase(tokens.begin()+1); } bool okay = true; ///////////////////////////////// // Fully-specified haplotype if (!wildcard) { int offset = 7; if ( par::weighted_mm ) offset = 6; if ( par::weighted_mm ) { // Find first "/" separator int sep = 6; while ( tokens[++sep] != "/" ) { } len = sep - 6; } else { len = tokens[6].length(); if (len != tokens.size() - offset ) { string e = "Problem with " + par::tagfile + " line\n" + l + "\n"; error(e); } } ////////////////////// // Lookup locus name for (int i=0; isecond); } else { MISHAP << "NOSNP\t" << tokens[0] << "\t" << tokens[i+offset] << "\n"; okay = false; } } ///////////////////////////////// // Check specified alleles exist if (okay) { // Just one haplotype to check for non-weighted test version if ( ! 
par::weighted_mm ) { for (int s=0;sallele1 == tokens[6].substr(s,1) || P.locus[locusList[s]]->allele2 == tokens[6].substr(s,1) ) ) { MISHAP << "NOALLELE\t" << tokens[0] << "\t" << P.locus[locusList[s]]->name << "\t" << tokens[6] << "\n"; okay = false; } } // Otherwise, we must parse through and check each else { int allele = 5 + len + 2; for ( int i = allele ; i < tokens.size() ; i++ ) { // Is this a haplotype? if ( i == allele || tokens[i-1] == "/" ) { for (int s=0;sallele1 == tokens[i].substr(s,1) || P.locus[locusList[s]]->allele2 == tokens[i].substr(s,1) ) ) { MISHAP << "NOALLELE\t" << tokens[0] << "\t" << P.locus[locusList[s]]->name << "\t" << tokens[i] << "\n"; okay = false; } } } } } } else { ///////////////////////////////// // Wildcard selection len = tokens.size()-1; // Lookup locus name for (int i=0; isecond); } else { MISHAP << "NOSNP\t" << tokens[i+1] << "\n"; okay = false; } } } ////////////////////////////////////// // Should we try to add this haploype if (!okay) { all_okay = false; continue; } //////////////////////////////////////////////////////// // Check that all predictors are on the same chromosome if (!wildcard) { for (int ck=0; ckchr != atoi(tokens[1].c_str())) { MISHAP << "DIFF_CHR\t" << P.locus[locusList[ck]]->name << "\t" << tokens[0] << "\n"; okay = false; } } else { for (int ck=0; ckchr != P.locus[locusList[ck+1]]->chr) { MISHAP << "DIFF_CHR\t" << P.locus[locusList[ck]]->name << "\t" << P.locus[locusList[ck+1]]->name << "\n"; okay = false; } } if (!okay) { all_okay = false; continue; } /////////////////////////////////////// // Standard approach -- only one entry if ( ( !wildcard) && (!par::weighted_mm) ) { // Add numbers for predictors to list new_pred_locus.push_back(locusList); // Add which allele to look for (corresponding to allele1) new_pred_allele.push_back(tokens[6]); // Make new entry in MAP file Locus * loc = new Locus; loc->name = tokens[0]; loc->chr = getChromosomeCode( tokens[1] ); loc->pos = atof(tokens[2].c_str()); loc->bp = atoi(tokens[3].c_str()); loc->allele1 = tokens[4]; loc->allele2 = tokens[5]; // Add this new locus to the list new_map.push_back(loc); } else if ( ( !wildcard ) && par::weighted_mm ) { /////////////////////////////////////// // Weighted MM test -- only one entry // Add numbers for predictors to list new_pred_locus.push_back(locusList); // Add which allele(s) to look for (corresponding to allele 1) // and then the corresponding weights map whap; int index = 5 + len + 2; while ( index < tokens.size() ) { // Read haplotype, then optionally a weight if ( tokens[index].size() != len ) error("Problem with " + par::tagfile + " line\n" + l + "\n"); // Read weight, or advance to next haplotype? 
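	      // (Worked example of this loop, using the --whap format
	      //  sketched at the top of this file: for the line
	      //    rs10001 5 0 10203 A G rs001 rs002 rs003 / TTC / CCT 0.9 / TCT 0.1
	      //  len is 3 and parsing starts at the token after the
	      //  first "/", so the resulting weight map is
	      //  { TTC -> 1 , CCT -> 0.9 , TCT -> 0.1 }; a haplotype
	      //  with no trailing number defaults to weight 1.)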
if ( index == tokens.size() - 1 ) { whap.insert(make_pair( tokens[index] , 1 ) ); break; } else if ( tokens[index+1] == "/" ) { whap.insert(make_pair( tokens[index] , 1 ) ); index += 2; } else { double w = atof(tokens[index+1].c_str()); if ( ( !realnum(w) ) || w < 0 || w > 1 ) error("Problem with specified weight in line:\n"+l+"\n"); whap.insert(make_pair( tokens[index] , w ) ); index += 3; } } new_pred_weighted_allele.push_back(whap); // Make new entry in MAP file Locus * loc = new Locus; loc->name = tokens[0]; loc->chr = getChromosomeCode( tokens[1] ); loc->pos = atof(tokens[2].c_str()); loc->bp = atoi(tokens[3].c_str()); loc->allele1 = tokens[4]; loc->allele2 = tokens[5]; // Add this new locus to the list new_map.push_back(loc); } else { //////////////////////////////////////////// // Wildcard approach -- just a single entry // Add numbers for predictors to list new_pred_locus.push_back(locusList); // Put in a dummy allele code (we ignore this...) string hstr=""; for (int s=0;sallele1; new_pred_allele.push_back(hstr); // Make new entry in MAP file Locus * loc = new Locus; loc->name = wildname +int2str(hc)+"_DUMMY_"+hstr+"_"; loc->chr = P.locus[locusList[0]]->chr; loc->pos = 0; loc->bp = hc; loc->allele1 = "1"; loc->allele2 = "2"; // Add this new locus to the list new_map.push_back(loc); } // Increment new haplotype count hc++; // Read next in TAG file } TAG.close(); MISHAP.close(); if (!all_okay) P.printLOG("Warning: misspecified haplotypes found: listed in [ " + f2 + " ]\n"); // End of reading haplotype list -- did we encounter any problems? P.printLOG("Read " + int2str(new_map.size()) + " haplotypes from [ " + par::tagfile + " ]\n"); } void HaploPhase::makeSlidingWindow(string winspec) { P.printLOG("\n"); /////////////////////////////////////////////// // Haplotype inference is a SNP-major function if (!par::SNP_major) P.Ind2SNP(); ///////////////////////////////////// // NList nl(0); vector tok = nl.deparseStringList( winspec ); vector spec; // size vector spec2; // step for (int i=0; i( t, tok[i], std::dec ) ) error("Problem with specification of haplotype sliding window"); spec.push_back(t); spec2.push_back(1); } else { string u1 = tok[i].substr(0,tok[i].find("+")); string u2 = tok[i].substr(tok[i].find("+")+1); int t; if ( ! from_string( t, u1, std::dec ) ) error("Problem with specification of haplotype sliding window"); spec.push_back(t); if ( ! from_string( t, u2 , std::dec ) ) error("Problem with specification of haplotype sliding window"); spec2.push_back(t); } } int w=1; for (int i=0; i= P.nl_all ) break; // Make a window, as large as possible vector snps; // Make sure it is restricted to one chromosome int chr = P.locus[start]->chr; bool fail = false; bool newChromosome = false; int actualStop = start; // Add SNPs to window for (int s = start; s < start + winsize; s++) { // No more SNPs left if ( s == P.nl_all ) { fail = true; break; } // Next chromosome? if ( P.locus[s]->chr != chr ) { newChromosome = true; actualStop = s; break; } snps.push_back(s); actualStop = s; } // Have we come up to the end of this chromosome in an exact number? if ( actualStop == P.nl_all-1 ) { fail = true; } else if ( ( ! newChromosome ) && P.locus[actualStop+1]->chr != chr ) { newChromosome = true; actualStop++; } // Finished constructing this particular window: // do we have anything to add? 
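      // (Illustrative note on the window specification parsed above:
      //  "5" gives 5-SNP windows that advance 1 SNP at a time, and
      //  "5+2" gives 5-SNP windows that advance 2 SNPs at a time.
      //  Windows never span a chromosome boundary, so windows near
      //  the end of a chromosome may contain fewer than winsize SNPs.)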
if ( snps.size() == 0 ) { if ( newChromosome ) start = actualStop; else start += winstep; continue; } Locus * tmploc = new Locus; tmploc->name = "WIN"+int2str(w++); tmploc->chr = P.locus[start]->chr; tmploc->pos = P.locus[start]->pos; tmploc->bp = P.locus[start]->bp; tmploc->allele1 = "1"; tmploc->allele2 = "2"; new_pred_locus.push_back(snps); new_map.push_back(tmploc); new_pred_allele.push_back(""); // Advance window if ( fail ) break; if ( newChromosome ) start = actualStop; else start += winstep; } } P.printLOG("Created " + int2str(w-1) + " sliding windows\n"); } void HaploPhase::setSpecificSNPs(string snps) { map mapping; for (int l=0; lname,l)); NList nl(P.nl_all,true); vector snplist = nl.deparseStringList(snps,&mapping); if (snplist.size() == 0 ) return; int start = snplist[0]; new_pred_locus.push_back(snplist); Locus * tmploc = new Locus; tmploc->name = "WIN1"; tmploc->chr = P.locus[start]->chr; tmploc->pos = P.locus[start]->pos; tmploc->bp = P.locus[start]->bp; tmploc->allele1 = "1"; tmploc->allele2 = "2"; new_map.push_back(tmploc); new_pred_allele.push_back(""); return; } plink-1.07-src/sets.h0000644000265600020320000000426211264127626013631 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __SETS_H__ #define __SETS_H__ class SetSortedSNP { public: double chisq; int l; bool operator< (const SetSortedSNP & s2) const { return (chisq < s2.chisq); } }; class Set { public: vector s_min; vector s_max; Set(vector > &); vector > & snpset; map > setMapping; // Sum-statistics based set-based scores void empiricalSetPValues(); void cumulativeSetSum_WITHOUTLABELS(vector&,int); void cumulativeSetSum_WITHLABELS(Plink &, vector&); // Profile-score based set-based tests void initialiseSetMapping(); void profileTestSNPInformation(int,double); vector_t profileTestScore(); void profileTestInitialise(); vector > profileSNPs; vector profileScore; // Stepwise regression models vector_t fitStepwiseModel(); // New, LD-aware single-statistic set test vector_t fitLDSetTest(vector_t&,bool); vector< vector > > ldSet; vector numSig; vector > selectedSNPs; // Helper functions void sizeSets(); void pruneSets(Plink&); void pruneMC(Plink &,bool,double); void dropNotSet(Plink &); void makeLDSets(); // get better name ?? vector > setsort; // All statistics vector > > stat_set; // Empirical set-based p-values (p0, p1 and p2) vector > > pv_set; vector > pv_maxG_set; vector > pv_maxE_set; // Include or drop this SNP (multi-collinearity) vector > cur; }; #endif plink-1.07-src/options.h0000644000265600020320000007271211264127626014353 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distibuted under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __OPTIONS_H__ #define __OPTIONS_H__ #include #include #include "plink.h" using namespace std; /* static Options lookup2_options; */ // static Options idhelp_replace_options; // static Options idhelp_match_options; // static Options annot_options; // static Options dosage_options; // class OptionSet { public: map > val; bool isSet(string s) { return val.find(s) != val.end(); } vector getValues(string s) { vector sv; map >::iterator i = val.find(s); if ( i == val.end() ) return sv; return i->second; } string getValue(string s) { map >::iterator i = val.find(s); if ( i == val.end() ) return ""; if ( i->second.size() > 0 ) return i->second[0]; else return ""; } void display() { map >::iterator i = val.begin(); while ( i != val.end() ) { cout << i->first; if ( i->second.size() > 0 ) { cout << " : "; for ( int k = 0 ; k < i->second.size(); k++) { cout << " " << i->second[k] ; } } cout << "\n"; ++i; } } }; class Options { map opt; public: OptionSet * addOption(string s) { map::iterator i = opt.find(s); if ( i != opt.end() ) return i->second; OptionSet * o = new OptionSet; opt.insert(make_pair(s,o)); return o; } ~Options() { map::iterator i = opt.begin(); while (i != opt.end() ) { delete i->second; ++i; } } OptionSet * getOptions(string s) { map::iterator i = opt.find(s); if ( i != opt.end() ) return i->second; else { OptionSet * o = new OptionSet; return o; } } }; class par { public: static bool myfunction; static Options opt; static bool verbose; static bool flag; static bool dumpped; static bool debug; static bool dummy; static int dummy_nind; static int dummy_nsnp; static bool web_check; static bool tucc; static bool do_not_load_snps; static const double epsilon; static long unsigned int random_seed; static int simul_ncases; static int simul_ncontrols; static string simul_label; static double simul_prevalence; static bool simul; static string simul_file; static bool simul_tags; static bool simul_haps; static bool simul_qt; static double simul_qt_var; static bool lookup; static bool lookup_single_snp; static bool lookup_to_file; static string lookup_snp; static string lookup_gene_name; static bool lookup_gene; static bool lookup_multiple_genes; static int lookup_gene_kb_window; static int lookup_snp_kb_window; static bool lookup2; static string lookup2_cmd; static bool idhelp; static string idhelp_output_delimit; static string idhelp_dictionary; static bool idhelp_dump_from_dict; static string idhelp_dump_from_dict_cmd; static bool idhelp_auto_alias; static bool idhelp_lookup; static string idhelp_lookup_string; static bool idhelp_subset; static string idhelp_subset_string; static bool idhelp_replace; static string idhelp_replace_string; static bool idhelp_match; static vector idhelp_match_string; static bool idhelp_no_dict; static bool idhelp_list_aliases; static bool idhelp_alias_update; static string idhelp_command; static string idhelp_input; static bool run_R_script; static bool run_R_write_script; static string R_script; static bool run_R_chisq; static bool run_R_z; static int run_R_nsnps; static int R_port; static bool recode; static bool recode_transpose; static bool recode_long; static bool recode_long_ref; static bool recode_mutlist; static bool recode_12; static bool recode_AD; static bool recode_AD_Aonly; static bool recode_AD_fixed; static bool recode_allele_coding; static string recode_allele_coding_file; static bool recode_1234; static bool 
recode_ACGT; static bool set_reference_allele; static string set_reference_allele_file; static bool lfile_allele_count; static string recode_delimit; static string recode_indelimit; static bool recode_HV; static bool recode_whap; static bool recode_fastphase; static bool recode_structure; static bool recode_bimbam; static bool preserve_all_genotypes; static bool preserve_mendel_errors; static bool zero_cluster; static string zero_cluster_filename; static bool oblig_missing; static string oblig_missing_filename; static string oblig_clusters_filename; static bool loop_over; static string loop_over_label; static int loop_counter; static string loop_over_filename; static bool list_by_allele; static bool list_twolocus; static string twolocus_snp1; static string twolocus_snp2; static bool indiv_report; static string indiv_report_fid; static string indiv_report_iid; static bool plist; static string plist_fid1; static string plist_iid1; static string plist_fid2; static string plist_iid2; static bool merge_data; static bool merge_force_strand; static int merge_mode; static bool merge_binary; static bool merge_list; static string merge_list_filename; static string merge_pedfile; static string merge_mapfile; static string merge_bedfile; static string merge_bimfile; static string merge_famfile; static bool write_snplist; static bool update_map; static bool update_cm; static bool update_chr; static bool update_name; static bool update_ids; static string update_ids_file; static bool update_sex; static string update_sex_file; static bool update_parents; static string update_parents_file; static bool update_pheno; static string update_pheno_file; static string update_mapfile; static string range_delimiter; static bool update_alleles; static string update_allele_file; static bool compound_genotype_code; static string tpedfile; static string tfamfile; static bool tfile_input; static string lpedfile; static string lfamfile; static bool lfile_input; static bool ref_file; static string ref_file_name; static bool gvar; static bool gvar_write; static bool gvar_to_standard; static bool load_gvar; static bool gvar_verbose_association; static string gmapfile; static string gfamfile; static string gvarfile; static bool gvar_include_all_variants; static bool gvar_full_report; static bool flip_strand; static string flip_file; static bool flip_subset; static string flip_subset_file; static bool read_bitfile; static bool write_bitfile; static bool fast_binary; static string bitfilename; static string famfile; static string bitfilename_map; static bool SNP_major; static bool out_SNP_major; static bool compress_file; static bool uncompress_file; static string compress_filename; static bool read_ped; static string pedfile; static string mapfile; static bool ped_from_stdin; static string fileroot; static bool map3; static bool liability; static bool ped_skip_sex; static bool ped_skip_parents; static bool ped_skip_fid; static bool ped_skip_pheno; static string output_file_name; static bool silent; static bool gplink; static bool cli; static string missing_genotype; static string out_missing_genotype; static string missing_phenotype; static string out_missing_phenotype; static bool missing_genotype_explicit; static bool missing_phenotype_explicit; static bool ignore_missing_sex; static bool pheno_file; static bool covar_file; static bool clist; static bool no_show_covar; static bool dump_covar; static bool dump_covar_with_phenotype; static bool dump_covar_dummy_coding; static bool filter_on_covar; static int clist_number; 
static int plist_number; static bool snp_attrib_filter; static string snp_attrib_value; static string snp_attrib_file; static bool ind_attrib_filter; static string ind_attrib_value; static string ind_attrib_file; static bool multiple_phenotypes; static string multiple_phenotype_file; static string make_pheno_filename; static string make_pheno_value; static bool make_pheno; static bool make_pheno_present; static bool dump_clst; static bool clist_selection; static bool clist_selection_name; static bool clist_selection_number; static string clist_selection_string; static bool plist_selection; static bool plist_selection_name; static bool plist_selection_number; static string plist_selection_string; static int mult_pheno; static string name_pheno; static bool all_pheno; static int mult_covar; static int mult_clst; static int mult_filter; static string filter_value; static string number_list_string; static bool number_list_positive; static string pheno_filename; static string covar_filename; static string clist_filename; static string filter_filename; static bool cm_map; static double grid; static double fringe; static bool singlepoint; static int inter_grid; static bool done_global_pihat; static bool sol_family; static bool summ_nonfounders; static bool make_founders; static bool has_nonfounders; static bool make_missing_parents; static bool score_risk; static string score_risk_file; static bool score_risk_ranges; static string score_risk_ranges_file; static int score_risk_ranges_min; static bool score_impute_expected; static bool score_risk_on_qrange; static string score_qrange_file; static string score_qfile; static bool score_test; static bool profile_sets; static bool report_missing; static bool test_missing; static bool mishap_test; static int mishap_window; static bool calcFst; static bool proxy_assoc; static bool proxy_glm; static bool proxy_all; static bool proxy_full_report; static bool proxy_error; static bool proxy_impute; static bool proxy_impute_replace; static bool proxy_impute_preserve_genotyped; static bool proxy_record_dosage; static bool proxy_impute_genotypic_concordance; static double proxy_impute_threshold; static double proxy_info_threshold; static bool impute_verbose; static bool proxy_exclude; static string proxy_exclude_list; static bool proxy_exclude_from_file; static bool proxy_reference_only; static bool proxy_leave_out; static bool proxy_include_reference; static bool proxy_CC; static bool proxy_TDT; static string proxy_assoc_snp; static int proxy_window; static bool proxy_list; static string proxy_list_file; static bool proxy_all_list; static string proxy_all_list_file; static double proxy_kb; static double proxy_r2; static double proxy_maf; static double proxy_mhf; static double proxy_geno; static bool proxy_list_proxies; static int proxy_maxhap; static bool proxy_r2_filter; static double proxy_r2_filter_A; static double proxy_r2_filter_B; static double proxy_r2_filter_C; static int proxy_snp_filter; static double proxy_kb_planA; static int proxy_window_planA; static int proxy_snp_filter_planA; static double proxy_r2_filter_A_planA; static double proxy_r2_filter_B_planA; static double proxy_r2_filter_C_planA; static double proxy_planB_threshold; static double proxy_kb_planB; static int proxy_window_planB; static int proxy_snp_filter_planB; static double proxy_r2_filter_A_planB; static double proxy_r2_filter_B_planB; static double proxy_r2_filter_C_planB; static bool greport; static string greport_results; static string greport_gene_list; static bool 
greport_subset; static string greport_subset_file; static bool greport_display_empty; static bool annot_file; static string annot_filename; static bool meta_analysis; static vector meta_files; static bool set_screen; static string set_screen_resultfile; static bool gettag_mode; static bool gettag_mode1; static bool gettag_mode2; static string gettag_file; static double gettag_r2; static int gettag_kb; static bool gettag_listall; static bool clumpld; static bool clumpld_best; static string clumpld_results; static string clumpld_column; static bool clumpld_verbose; static bool clumpld_indep; static int clumpld_kb; static double clumpld_r2; static double clumpld_p1; static double clumpld_p2; static bool clumpld_index1; static bool clumpld_only_show_replications; static bool clumpld_only_show_replications_list; static bool clumpld_annot; static string clumpld_annot_fields; static string clumpld_range_file; static bool clumpld_range_annotate; static int clumpld_min; static double min_af; static double max_af; static bool make_minor_allele; static double min_hf; static double max_hf; static int min_geno_cell; static double rarer_maf_threshold; static double rarer_dist_threshold; static int rarer_interval; static bool rare_test; static bool rare_test_weight1; static bool rare_test_print_details; static string rare_test_print_details_snp; static bool elf_pcmode; static bool elf_pcmode_2sided; static bool elf_baseline; static bool rare_test_score_range; static double rare_test_score_range_threshold; static string rare_test_score_results_file; static string rare_test_score_range_file; static bool rare_test_summary_controls; static vector chr_haploid; static vector chr_sex; static vector chr_Y; static vector chr_code; static map chr_map; static bool species_dog; static bool species_cow; static bool species_sheep; static bool species_horse; static bool species_rice; static bool species_mouse; static int run_start; static int run_end; static int run_chr; static string m1; static string m2; static double window; static bool position_window; static int from_window; static int to_window; static bool qt; static bool bt; static bool coding01; static bool ignore_phenotypes; static bool filter_cases; static bool filter_controls; static bool filter_males; static bool filter_females; static bool filter_founders; static bool filter_nonfounders; static bool SD; static bool CP; static bool affpair; static bool remove_unaffected_pairs; static bool fix_prev; static double fixed_prev; static string tagfile; static string mapfile_impute; static bool make_tags; static bool impute_tags; static bool sliding_window; static string sliding_window_size; static bool make_blocks; static bool meta_large_phase; static bool phase_snps; static bool phase_hap_all; static double hap_post_prob; static double hap_missing_geno; static double hap_min_phase_prob; static int hap_max_nf_phases; static bool display_hap_freqs; static int haplo_plem_window; static int haplo_plem_overlap; static int haplo_plem_original_overlap; static int haplo_plem_iter; static bool haplo_plem_verbose; static bool haplo_plem_follow; static int haplo_plem_follow_ind; static string haplo_plem_follow_fid; static string haplo_plem_follow_iid; static int haplo_plem_likelihood_iter; static double haplo_plem_window_prune_phase; static double haplo_plem_window_tol; static double haplo_plem_zero_threshold; static bool haplo_plem_nonzero_threshold; static int haplo_plem_meta_window; static double haplo_plem_meta_prune_haplotype; static double 
haplo_plem_meta_prune_phase; static int haplo_plem_meta_iter; static int haplo_plem_meta_likelihood_iter; static double haplo_plem_meta_tol; static bool test_hap_CC; static bool test_hap_TDT; static bool test_hap_QTL; static bool test_hap_only; static bool test_hap_GLM; static bool test_hap_GLM_omnibus; static bool display_phase_probs; static bool display_phase_probs_wide; static bool weighted_mm; static bool chap_test; static bool chap_sole_variant; static bool chap_sole_variant_specific_alleles; static string chap_sole_variant_specific_allele_list; static bool chap_independent_effect; static bool chap_haplotype_specific; static string chap_entity; static bool chap_specified_groups; static bool chap_specified_snps; static string chap_model1; static string chap_model0; static bool chap_drop_snps; static string chap_drop_snps_list; static bool chap_add_grp_specifics; static bool assoc_test; static bool assoc_counts; static bool assoc_glm; static bool standard_beta; static bool assoc_glm_without_main_snp; static bool assoc_test_alt_perm; static bool full_model_assoc; static bool trend_only; static bool fisher_test; static bool return_beta; static bool hap_specific_snps; static string hap_specific_snps_list; static bool output_pheno_perm; static bool qt_means; static bool conditioning_snp_single; static string conditioning_snp_name; static bool conditioning_snps; static string conditioning_snps_file; static int xchr_model; static bool glm_sex_effect; static bool glm_no_auto_sex_effect; static bool glm_dominant; static bool glm_recessive; static double vif_threshold; static bool twoDFmodel; static bool twoDFmodel_hethom; static bool test_full_model; static bool simple_interaction; static vector parameter_list; static vector test_list; static bool glm_user_test; static bool glm_user_parameters; static bool qt_with_covariates; static bool model_perm_best; static bool model_perm_gen; static bool model_perm_dom; static bool model_perm_rec; static bool model_perm_trend; static bool assoc_gxe; static bool QTDT_test; static bool QFAM_total; static bool QFAM_between; static bool QFAM_within1; static bool QFAM_within2; static bool QFAM_adaptive; static bool TDT_test; static bool sibTDT_test; static bool mating_tests; static bool dfam_tdt; static bool dfam_sibs; static bool dfam_unrelateds; static bool perm_TDT_basic; static bool perm_TDT_parent; static bool discordant_parents; static bool parent_of_origin; static bool perm_POO_poo; static bool perm_POO_pat; static bool perm_POO_mat; static bool perm_POO_best; static bool built_families; static bool MENDEL_test; static bool MENDEL_report; static double MENDEL_snp; static double MENDEL_ind; static bool HWD_test; static bool HWD_report; static double HWD_limit; static bool HWD_standard; static bool HWD_filter_on_all; static bool CMH_test_1; static bool CMH_test_2; static bool CMH_test_ORD; static bool breslowday; static bool OR_homog_test; static double ci_level; static double ci_zt; static bool display_ci; static bool pfilter; static double pfvalue; static bool multtest; static bool use_GC; static bool fix_lambda; static double lambda; static bool qq_plot; static bool logscale; static bool ibs_sharing_test; static bool extract_set; static bool exclude_set; static bool snp_range_list; static bool thin_snps; static double thin_param; static bool make_set; static string make_set_file; static int make_set_border; static bool make_set_collapse; static bool make_set_ignore_group; static string make_set_collapse_label; static bool make_set_complement; static 
bool write_set; static bool read_set; static string exclude_file; static string extract_file; static string keep_file; static string remove_file; static bool read_snp_qual; static string snp_qual_file; static double snp_qual_min; static double snp_qual_max; static bool read_geno_qual; static string geno_qual_file; static double geno_qual_min; static double geno_qual_max; static bool snp_include_from_cl; static string snp_include_range; static bool dump_gene; static string dump_genename; static bool hotel; static bool set_test; static bool set_p2; static int set_min; static int set_max; static bool set_r2; static double set_r2_val; static bool set_r2_phase; static double set_chisq_threshold; static bool set_r2_write; static bool set_r2_read; static string set_r2_read_file; static string subsetfile; static bool use_subset; static string setfile; static bool set_score; static double set_score_p; static double set_step_in; static bool set_step; static bool set_table; static bool permute_within_sol; static bool boot; static bool disp_r1; static bool disp_r2; static bool disp_r_window; static int disp_r_window_snp; static int disp_r_window_kb; static double disp_r_window_r2; static bool ld_anchor; static bool ld_anchor_list; static bool flip_scan; static double flip_scan_threshold; static bool flip_scan_verbose; static bool prune_ld; static bool prune_ld_pairwise; static bool prune_ld_pairwise_maf; static double prune_ld_vif; static double prune_ld_r2; static int prune_ld_win; static int prune_ld_step; static bool prune_r2_prefer; static string prune_r2_prefer_list; static bool prune_r2_fixed; static string prune_r2_fixed_list; static bool calc_SNPSNP_LD; static string ld_SNP1; static string ld_SNP1_file; static string ld_SNP2; static bool epistasis; static bool fast_epistasis; static bool epi_caseonly; static double epi_caseonly_kb_gap; static bool epi_filter; static double epi_alpha1; static double epi_alpha2; static bool set_by_set; static bool epi_genebased; static bool epi_quickscan; static bool drop_sets; static bool inbreeding; static bool check_sex; static bool impute_sex; static double sex_threshold_male; static double sex_threshold_female; static bool homo_run; static bool homo_run_consensus_match; static bool homo_run_kb; static bool homo_run_snps; static double homo_run_density; static int homo_run_gap; static bool homo_miss_as_hom; static int homo_windowSize; static int homo_windowKB; static int homo_windowAllowedHet; static int homo_windowAllowedMissing; static double homo_threshold; static int homo_run_length_kb; static int homo_run_length_snps; static int homo_run_het; static bool homo_summary_allelic_match; static double fuzzy_homo; static bool homozyg_verbose; static int pool_size_min; static bool ibs_run; static int ibs_run_length_snps; static int ibs_run_length_kb; static double ibs_run_density; static int ibs_inner_run_length_kb; static int ibs_inner_run_length_snp; static int ibs_join_kb; static int ibs_join_snp; static int ibs_run_missing; static int ibs_run_0; static int ibs_inter_snp_distance; static bool ibs_2only; static bool miss_run; static int miss_run_length; static bool miss_run_length_kb; static double miss_run_level; static bool segment_haplotrack; static string segment_haplotrack_fid1; static string segment_haplotrack_iid1; static string segment_haplotrack_fid2; static string segment_haplotrack_iid2; static bool mk_datfile; static bool segment_output; static bool segment_minimal; static bool segment_silently_return_groups; static int segment_current_focal_snp; 
static bool segment_overlap; static bool segment_verbose; static bool segment_validate; static bool segment_test_individual; static bool segment_test_specific_segs; static bool segment_test_fisher; static bool segment_test_1sided; static bool segment_test_force_1sided; static bool segment_test_ignore_discordant; static int segment_snp1; static int segment_snp2; static string segment_m1; static string segment_m2; static bool force_span; static int segment_length; static int segment_snp; static bool segment_output_started; static bool read_segment_file; static string read_segment_filename; static int segment_inter_snp_distance; static bool multi_output; static bool gmulti_output; static bool pihat_filter; static bool genome_output; static bool compress_genome; static bool genome_only_check_rels; static bool genome_output_minimal; static bool genome_output_full; static bool genome_2sets; static string genome_setlist1; static string genome_setlist2; static bool genome_test; static double genome_test_threshold; static int genome_test_min_snp; static bool ibs_test; static int ibs_test_min_snp; static bool ibs_test_method2; static bool summary_ibd_output; static double IBD_threshold; static double segment_threshold_start; static double segment_threshold_finish; static bool nudge; static bool bound; static bool show_impossible_IBD; static bool IBD_within; static bool permute; static int replicates; static bool perm_count; static bool mperm_save_best; static bool mperm_save_all; static bool mperm_rank; static bool adaptive_perm; static int adaptive_min; static int adaptive_max; static int adaptive_interval; static double adaptive_interval2; static double adaptive_alpha; static double adaptive_ci; static bool perm_genedrop; static bool perm_genedrop_and_swap; static bool perm_genedrop_unrel; static bool perm_genedrop_parents; static bool perm_genedrop_sibships; static bool FIXED; static bool FIXED_p; static Z FIX_IBD; static double FIX_p; static bool matrix; static bool distance_matrix; static bool cluster; static bool cluster_euclidean; static bool cluster_group_avg; static bool cluster_plot; static bool force_initial_cluster; static int cluster_mds_dim; static bool mds_by_individual; static bool genome_groups; static bool cluster_ibm_constraint; static double cluster_ibm_constraint_value; static bool cluster_missing; static bool cluster_selcon; static string cluster_selcon_file; static int max_cluster_N; static double merge_p; static int ibstest_gap; static int max_cluster_size; static int max_cluster_case; static int max_cluster_control; static bool include_cluster; static bool include_cluster_from_file; static string include_cluster_filename; static int analyse_cluster; static bool cluster_on_phenotype; static bool cluster_on_mcc; static int min_neighbour; static int max_neighbour; static bool outlier_detection; static bool bmatch; static bool bmatch_usertype; static bool qmatch; static string bmatch_filename; static string bmatch_direction_filename; static string qmatch_filename; static string qmatch_threshold_filename; static bool include_all_pairs; static double include_all_z1; static double MIN_PIHAT; static double MAX_PIHAT; static double MAX_CORR_PIHAT_PIHAT_G; static double MAX_GENO_MISSING; static double MAX_IND_MISSING; static int MAX_LINE_LENGTH; static bool remove_indiv; static string remove_indiv_list; static string keep_indiv_list; static bool keep_indiv; static bool extract_before_exclude; static bool remove_before_keep; static bool locked; static bool af_read; static bool 
af_write; static bool ibd_read; static string ibd_file; static bool ibd_read_minimal; static bool ibd_read_list; static string ibd_file_list; static string af_file; static bool af_count; static bool inc_write; static bool inc_read; static string inc_file; static int pp_maxsnp; static int pp_maxfid; static int pp_maxiid; static int BATCH_SIZE; static bool plink; static bool display_segment_long; static bool display_cnv_track; static int cnv_col; static bool cnv_makemap; static bool cnv_writelist; static bool cnv_list; static string cnv_listname; static int cnv_min_kb; static double cnv_min_score; static int cnv_min_sites; static int cnv_max_kb; static double cnv_max_score; static int cnv_max_sites; static bool cnv_del_only; static bool cnv_dup_only; static int cnv_type; static bool cnv_intersect; static bool cnv_exclude; static string cnv_intersect_file; static bool cnv_intersect_subset; static string cnv_intersect_subset_file; static bool cnv_count; static double cnv_overlap; static bool cnv_defined_overlap; static bool cnv_indiv_perm; static bool cnv_pos_perm; static bool cnv_drop_no_segment; static bool cnv_freq_method2; static double cnv_freq_method2_threshold; static bool cnv_write_freq; static bool cnv_freq_include; static bool cnv_freq_include_below; static bool cnv_freq_include_exact; static bool cnv_freq_include_exact_exclude; static int cnv_freq_include_cnt; static bool cnv_unique; static bool cnv_intersect_writeback; static bool cnv_intersect_writeback_verbose; static bool cnv_disrupt; static int cnv_region_border; static bool cnv_union_overlap; static bool cnv_region_overlap; static bool cnv_check_overlap; static bool cnv_count_baseline; static string cnv_count_baseline_file; static bool cnv_weighted_gene_test; static bool cnv_enrichment_test; static int cnv_en_model; static bool cnv_glm; static bool seg_test_window; static double seg_test_window_bp; static bool seg_test_region; static bool dosage_assoc; static string dosage_file; static bool dosage_hard_call; static double dosage_hard_call_thresh; static int dosage_hard_call_thresh2; static bool dosage_hasMap; static bool write_dosage; }; void setOptions(CArgs &); void getOutputFilename(CArgs &); #endif plink-1.07-src/haploTDT.cpp0000644000265600020320000001241011264127625014656 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "phase.h" #include "helper.h" #include "stats.h" ////////////////////////////// // Unweighted tests void HaploPhase::haplotypicTDT(map & tests, int nt, bool display) { // No implementation of TDT omnibus test yet if ( nt != 2 ) { result = -9; pvalue = -9; odds = -9; return; } // This might be a haplotype-specific test (i.e. of a single // haplotype) or of a group of haplotypes versus the rest // When rescoring the T and U counts, we will have supplied a 'downcoding' // map, which is the same as the 'tests' map, i.e. mapping each haplotype // onto a 0/1 space (i.e. nt==2) // Find test haplotype(s), if we are in display mode (i.e. 
here we // know it is not a group of haplotypes, but a specific haplotype, and // we want the name of it, --hap-tdt; for all other instances, we // can not assume that we will be testing a specific haplotype; so // we do not bother about the name (it might be a group). We // can assume a binary test though (nt==2), so always set hh to 0. int hh=0; double tr = 0; double un = 0; // This test is always of 1 haplotype/group versus all others -- // i.e. the downcoding will have been performed previously, with the // transmissions being appropriately rescored before hand (and so // trans[]/untrans[] will already be downcoded. if ( display ) { map::iterator i1 = tests.begin(); while ( i1 != tests.end() ) { if ( i1->second == 0 ) { hh = i1->first; break; } i1++; } } tr += trans[hh]; un += untrans[hh]; /////////////////////////////////////////////////////// // 'result' visible outside this class, // Either use McNemar's chi-square (b-c)^2/(b+c) // or normal approximation for test of transmission // ratio equals 0.5 (with the empirical variance added // here) odds = tr/un; // if ( true || useEmpiricalVariance ) // { result = tr - un; result *= result; result /= tr + un; pvalue = chiprobP(result,1); case_freq = tr; control_freq = un; // } // else // { // // Calculate empirical variance of transmissions: the // // relevant quantities will have been stored during the // // transmission scoring routine // double transmissionProportion = tr / ( tr + un ); // double transmissionCount = tr + un; // // double eHH = transmissionX2[hh] / ( transmissionTotal-1); // // double eH = transmissionX[hh] / ( transmissionTotal-1); // double eHH = transmissionX2[hh] / ( transmissionCount ); // double eH = transmissionX[hh] / ( transmissionCount ); // empiricalVariance = eHH - ( eH * eH ); // empiricalVariance /= transmissionCount - 1 ; // double Z = ( transmissionProportion - 0.5 ) // / ( sqrt( empiricalVariance * ( 1 / transmissionCount ) ) ); // result = Z * Z; // pvalue = chiprobP( result , 1 ); // } if ( display ) HTEST << setw(10) << hname << " " << setw(12) << haplotypeName(hh) << " " << setw(10) << trans[hh] << " " << setw(10) << untrans[hh] << " "; if ( display ) { if ( realnum(result) ) { HTEST << setw(10) << result << " " << setw(10) << pvalue << " "; } else { HTEST << setw(10) << "NA" << " " << setw(10) << "NA" << " "; } for (int snps=0; snpsname << "|"; HTEST << P.locus[S[ns-1]]->name << "\n"; } return; } void HaploPhase::haplotypicWeightedTDT() { vector_t weights; for (int i=0; i::iterator whap = new_pred_weighted_allele[current].find( haplotypeName(i) ); if ( whap != new_pred_weighted_allele[current].end() ) { weights.push_back( whap->second ); } else { weights.push_back( 0 ); } } double T = 0; double U = 0; for (int h=0; hallele1 << " " << setw(10) << T << " " << setw(10) << U << " "; if ( realnum(chisq) ) { HTEST << setw(10) << chisq << " " << setw(10) << chiprobP(chisq,1) << " "; } else { HTEST << setw(10) << "NA" << " " << setw(10) << "NA" << " "; } for (int snps=0; snpsname << "|"; HTEST << P.locus[S[ns-1]]->name << "\n"; return; } plink-1.07-src/sisocks.h0000644000265600020320000001372311264127626014333 0ustar tilleaadmin/* system independent sockets (basically for unix and Win) (C)2000,1 Simon Urbanek conditional defines: MAIN should be defined in just one file that will contain the fn definitions and variables USE_SNPRINTF emulate snprintf on Win platforms (you will lose the security which is provided under unix of course) SOCK_ERRORS include error code handling and checking functions */ #ifndef 
__SISOCKS_H__ #define __SISOCKS_H__ #if defined __GNUC__ && !defined unix && !defined Win32 /* MacOS X hack (gcc on any platform should behave as unix - except for Win32, where we need to keep using winsock) */ #define unix #endif #if defined SOCK_ERRORS || defined USE_SNPRINTF #include #endif #include #ifdef unix #include #include #include #include #include #include #include #define sockerrno errno #define SOCKET int #define INVALID_SOCKET (-1) #define closesocket(A) close(A) #else #define windows #include #include #include #include #define inet_aton(A,B) (0, B.s_addr=inet_addr(A)) #define sockerrno WSAGetLastError() #define ECONNREFUSED WSAECONNREFUSED #define EADDRINUSE WSAEADDRINUSE #define ENOTSOCK WSAENOTSOCK #define EISCONN WSAEISCONN #define ETIMEDOUT WSAETIMEDOUT #define ENETUNREACH WSAENETUNREACH #define EINPROGRESS WSAEINPROGRESS #define EALREADY WSAEALREADY #define EAFNOSUPPORT WSAEAFNOSUPPORT #define EBADF WSAEBADF #define EINVAL WSAEINVAL #define EOPNOTSUPP WSAEOPNOTSUPP #define EFAULT WSAEFAULT #define EWOULDBLOCK WSAEWOULDBLOCK #define EACCES WSAEACCES #ifdef USE_SNPRINTF #ifdef MAIN int snprintf(char *buf, int len, char *fmt, ...) { va_list argptr; int cnt; va_start(argptr, fmt); cnt = vsprintf(buf, fmt, argptr); va_end(argptr); return(cnt); } #else extern int snprintf(char *buf, int len, char *fmt, ...); #endif #endif #endif #define SA struct sockaddr #define SAIN struct sockaddr_in #ifdef windows #ifdef MAIN int initsocks(void) { WSADATA dt; /* initialize WinSock 1.1 */ return (WSAStartup(0x0101,&dt))?-1:0; } #else extern int initsocks(void); #endif #define donesocks() WSACleanup() #else /* no stupid stuff necessary for unix */ #define initsocks() #define donesocks() #endif #ifdef SOCK_ERRORS #ifdef MAIN int suppmode=0; int socklasterr; FILE *sockerrlog=0; /* copy error description to buf or set *buf=0 if none */ int sockerrorchecks(char *buf, int blen, int res) { *buf=0; if (res==-1) { switch(sockerrno) { case EBADF: strncpy(buf,"bad descriptor",blen); break; case EINVAL: strncpy(buf,"already in use",blen); break; case EACCES: strncpy(buf,"access denied",blen); break; case ENOTSOCK: strncpy(buf,"descriptor is not a socket",blen); break; case EOPNOTSUPP: strncpy(buf,"operation not supported",blen); break; case EFAULT: strncpy(buf,"fault",blen); break; case EWOULDBLOCK: strncpy(buf,"operation would block",blen); break; case EISCONN: strncpy(buf,"is already connected",blen); break; case ECONNREFUSED: strncpy(buf,"connection refused",blen); break; case ETIMEDOUT: strncpy(buf,"operation timed out",blen); break; case ENETUNREACH: strncpy(buf,"network is unreachable",blen); break; case EADDRINUSE: strncpy(buf,"address already in use",blen); break; case EINPROGRESS: strncpy(buf,"in progress",blen); break; case EALREADY: strncpy(buf,"previous connect request not completed yet",blen); break; #ifdef unix default: snprintf(buf,blen,"unknown socket error %d",sockerrno); #else default: sprintf(buf,"unknown socket error %d",sockerrno); #endif } } return res; } /* check socket error and add to log file if necessary */ int sockerrorcheck(char *sn, int rtb, int res) { if (!sockerrlog) sockerrlog=stderr; if ((signed int)res==-1) { if (socklasterr==sockerrno) { suppmode++; } else { if (suppmode>0) { fprintf(sockerrlog,"##> REP: (last error has been repeated %d times.)\n",suppmode); suppmode=0; } fprintf(sockerrlog,"##> SOCK_ERROR: %s error #%d",sn,sockerrno); switch(sockerrno) { case EBADF: fprintf(sockerrlog,"(bad descriptor)"); break; case EINVAL: fprintf(sockerrlog,"(already in use)"); 
break; case EACCES: fprintf(sockerrlog,"(access denied)"); break; case ENOTSOCK: fprintf(sockerrlog,"(descriptor is not a socket)"); break; case EOPNOTSUPP: fprintf(sockerrlog,"(operation not supported)"); break; case EFAULT: fprintf(sockerrlog,"(fault)"); break; case EWOULDBLOCK: fprintf(sockerrlog,"(operation would block)"); break; case EISCONN: fprintf(sockerrlog,"(is already connected)"); break; case ECONNREFUSED: fprintf(sockerrlog,"(connection refused)"); break; case ETIMEDOUT: fprintf(sockerrlog,"(operation timed out)"); break; case ENETUNREACH: fprintf(sockerrlog,"(network is unreachable)"); break; case EADDRINUSE: fprintf(sockerrlog,"(address already in use)"); break; case EINPROGRESS: fprintf(sockerrlog,"(in progress)"); break; case EALREADY: fprintf(sockerrlog,"(previous connect request not completed yet)"); break; default: fprintf(sockerrlog,"(?)"); } fprintf(sockerrlog,"\n"); fflush(sockerrlog); socklasterr=sockerrno; } if (rtb) exit(1); } return res; } #else extern int suppmode=0; extern int socklasterr; extern FILE *sockerrlog=0; int sockerrorchecks(char *buf, int blen, int res); int sockerrorcheck(char *sn, int rtb, int res); #endif #define FCF(X,F) sockerrorcheck(X,1,F) #define CF(X,F) sockerrorcheck(X,0,F) #endif #ifdef MAIN struct sockaddr *build_sin(struct sockaddr_in *sa,char *ip,int port) { memset(sa,0,sizeof(struct sockaddr_in)); sa->sin_family=AF_INET; sa->sin_port=htons(port); sa->sin_addr.s_addr=(ip)?inet_addr(ip):htonl(INADDR_ANY); return (struct sockaddr*)sa; } #else struct sockaddr *build_sin(struct sockaddr_in *sa,char *ip,int port); #endif #endif /* __SISOCKS_H__ */ plink-1.07-src/genepi.cpp0000644000265600020320000010074311264127624014454 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "sets.h" #include "helper.h" #include "stats.h" #include "crandom.h" #include "linear.h" #include "logistic.h" typedef vector vector_tld; // Function that implements Pillai's (1964) approximation to upper // distribution of the largest canonical correlatio // Function (bico) to estimate combinations choose(n,k) double factln(int n) { double gammln(double xx); static double a[101]; if (n < 0) error("Negative factorial in routine factln"); if (n <= 1) return 0.0; if (n <= 100 ) return a[n] ? a[n] : (a[n]=gammln(n+1.0)); else return gammln(n+1.0); } double bico(int n, int k) { double factln(int n); return floor(0.5+exp(factln(n)-factln(k)-factln(n-k))); } // Beta function double betaln(double z, double w) { double gammln(double xx); return gammln(z)+gammln(w)-gammln(z+w); } // Pillai's helper double C(int s, double m, double n) { // // Components 1 and 2 // double c1,c2,temp; // c1=c2=temp=0; // for (int i=1; i q ? 
p : q; // int s = p2; // double m = 0.5 * (q2-p2-1); // double n = 0.5 * (N-p2-q2-2) ; // double Csmn = C(s,m,n); // double Cs_1mn = C(s-1,m,n); // // Calculating log( V ) // // We need 1 V for each of the 's-1' k's needed in the formula for P // vector_tld V(s-1); // for (int i=1; i1 // else // { // double a=1; // for (int j=1; j 1) // last = 1; // cdf = last + sum_v; // } // return 1-cdf; } // Bartlet's test for all canonical correlations long double bartlett(int N, int p, int q, vector_t eigen) { int p2 = p <= q ? p : q; // Number of canonical correlations double prod_eigen=1; for (int i=0; i &, int, int, bool, Plink *, vector &, vector > &, vector&, vector &, vector &); void CCA_logit(bool perm, vector > & blperm, Set & S, Plink & P); void CCA_caseonly(bool perm, vector > & blperm, Set & S, Plink & P); void Plink::driverSCREEPI() { /////////////////////////////// // Gene-based epistasis ////////////////////////////////////////// // Case-control samples only affCoding(*this); ////////////////////////////////////////// // SNP-major mode analysis if (!par::SNP_major) Ind2SNP(); ////////////////////////////////////////// // Requires that sets have been speciefied if (par::set_test) readSet(); else error("Need to specify genes with --set {filename} when using --genepi\n"); ////////////////// // SET statistics Set S(snpset); ////////////////////////////////////////////// // Prune SET (0-sized sets, MAF==0 SNPs, etc) S.pruneSets(*this); int ns = snpset.size(); if (ns < 2) error("Need to specify at least two fully valid sets\n"); int n = 0; int ncase = 0; ///////////////////////////////////////////////////////// // Prune based on VIF string original_outfile = par::output_file_name; // Case-control? Prune cases and controls together... if (!par::epi_caseonly) { printLOG("\nConsidering cases and controls: "); setFlags(false); vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( ! (*person)->missing ) { (*person)->flag = true; n++; } person++; } par::output_file_name += ".all"; S.pruneMC(*this,false,par::vif_threshold); //S.pruneMC(*this,false,1000); } // Case-only? Prune cases only... else { printLOG("\nConsidering cases: "); setFlags(false); vector::iterator person = sample.begin(); while ( person != sample.end() ) { if ( (*person)->aff && ! (*person)->missing ) { (*person)->flag = true; ncase++; } person++; n++; } par::output_file_name += ".case"; S.pruneMC(*this,false,par::vif_threshold); //S.pruneMC(*this,false,1000); } par::output_file_name = original_outfile; // Write finalized set ofstream SET1, SET2; string f = par::output_file_name + ".all.set.in"; printLOG("Writing combined pruned-in set file to [ " + f + " ]\n"); SET1.open(f.c_str(),ios::out); f = par::output_file_name + ".all.set.out"; printLOG("Writing combined pruned-out set file to [ " + f + " ]\n"); SET2.open(f.c_str(),ios::out); for (int s=0; sname << "\n"; else SET2 << locus[snpset[s][j]]->name << "\n"; } SET1 << "END\n\n"; SET2 << "END\n\n"; } SET1.close(); SET2.close(); // Prune empty sets once more: S.pruneSets(*this); ns = snpset.size(); if (ns < 2) error("Need to specify at least two fully valid sets\n"); //////////////////////////////// // Set up permutation structure // Specialized (i.e. 
cannot use Perm class) as this // requires a block-locus permutation // First block is fixed vector > blperm(ns); vector > blperm_case(ns); vector > blperm_control(ns); for (int i=0; imissing ) blperm[i].push_back(j); // A slot for each individual per locus for (int j=0; jmissing && sample[j]->aff ) blperm_case[i].push_back(j); // A slot for each individual per locus for (int j=0; jmissing && !sample[j]->aff ) blperm_control[i].push_back(j); } //////////////////////////////////////////// // Open file and print header for results ofstream EPI(f.c_str(), ios::out); EPI.open(f.c_str(), ios::out); EPI.precision(4); //////////////////////////////////////// // Analysis (calls genepi functions) if (!par::epi_caseonly) CCA_logit(false,blperm,S,*this); else CCA_caseonly(false,blperm_case,S,*this); if (!par::permute) return; if (!par::silent) cout << "\n"; } // End of screepi /////////////////////////// // CCA functions /////////////////////////////////////////////////////////// // First CCA function: use for case-control logit analysis void CCA_logit(bool perm, vector > & blperm, Set & S, Plink & P) { /////////////// // Output results ofstream EPI; if (!perm) { string f = par::output_file_name+".genepi"; P.printLOG("\nWriting gene-based epistasis tests to [ " + f + " ]\n"); EPI.open(f.c_str(), ios::out); EPI.precision(4); EPI << setw(12) << "NIND" << " " << setw(12) << "GENE1" << " " << setw(12) << "GENE2" << " " << setw(12) << "NSNP1" << " " << setw(12) << "NSNP2" << " " << setw(12) << "P" << " " << "\n"; } ////////////////////////////////// // Canonical correlation analysis int ns = P.snpset.size(); // Consider each pair of genes for (int s1=0; s1 < ns-1; s1++) { for (int s2 = s1+1; s2 < ns; s2++) { //////////////////////////////////////////////////////// // Step 1. Construct covariance matrix (cases and controls together) // And partition covariance matrix: // S_11 S_21 // S_12 S_22 int n1=0, n2=0; vector > sigma(0); vector mean(0); vector pSNP(0); ///////////////////////////// // List of SNPs for both loci for (int l=0; l::iterator person = P.sample.begin(); while ( person != P.sample.end() ) { (*person)->flag = true; person++; } int nind = calcGENEPIMeanVariance(pSNP, n1,n2, false, &P, mean, sigma, P.sample , blperm[s1], blperm[s2] ); /////////////////////////// // Partition covariance matrix vector > I11; vector > I11b; vector > I12; vector > I21; vector > I22; vector > I22b; sizeMatrix( I11, n1, n1); sizeMatrix( I11b, n1, n1); sizeMatrix( I12, n1, n2); sizeMatrix( I21, n2, n1); sizeMatrix( I22, n2, n2); sizeMatrix( I22b, n2, n2); // For step 4b (eigenvectors for gene2) for (int i=0; i sorted_eigenvalues_gene1 = gene1_eigen.d; sort(sorted_eigenvalues_gene1.begin(),sorted_eigenvalues_gene1.end(),greater()); // Position of the largest canonical correlation that is < // max_cancor in the sorted vector of eigenvalues. This will be // needed to use the right gene1 and gene2 coefficients to build // the appropriate canonical variates. double cancor1=0; int cancor1_pos; for (int i=0; i cancor1 && sqrt(sorted_eigenvalues_gene1[i]) < max_cancor ) { cancor1 = sqrt(sorted_eigenvalues_gene1[i]); cancor1_pos = i; break; } } // Display largest canonical correlation and its position // cout << "Largest canonical correlation [position]\n" // << cancor1 << " [" << cancor1_pos << "]" << "\n\n" ; // Sort evectors. 
Rows must be ordered according to cancor value (highest first) matrix_t sorted_eigenvectors_gene1 = gene1_eigen.z; vector order_eigenvalues_gene1(n1); for (int i=0; i sorted_eigenvalues_gene2 = gene2_eigen.d; sort(sorted_eigenvalues_gene2.begin(),sorted_eigenvalues_gene2.end(),greater()); // Sort eigenvectors for gene2 matrix_t sorted_eigenvectors_gene2 = gene2_eigen.z; vector order_eigenvalues_gene2(n2); for (int i=0; i gene1(nind); for (int j=0; jone[i]; bool a2 = ps->two[i]; if ( a1 ) { if ( a2 ) // 11 homozygote { gene1[i] += (1 - mean[j]) * coeff_gene1[order_eigenvalues_gene1[cancor1_pos]][j]; } else // 12 { gene1[i] += (0 - mean[j]) * coeff_gene1[order_eigenvalues_gene1[cancor1_pos]][j]; } } else { if ( a2 ) // 21 { gene1[i] += (0 - mean[j]) * coeff_gene1[order_eigenvalues_gene1[cancor1_pos]][j]; } else // 22 homozygote { gene1[i] += (-1 - mean[j]) * coeff_gene1[order_eigenvalues_gene1[cancor1_pos]][j]; } } } // Next individual } // Next SNP in gene1 ///////////////////////////////// // Consider each SNP in gene2 vector gene2(P.n); int cur_snp = -1; for (int j=n1; jone[i]; bool a2 = ps->two[i]; if ( a1 ) { if ( a2 ) // 11 homozygote { gene2[i] += (1 - mean[j]) * coeff_gene2[order_eigenvalues_gene2[cancor1_pos]][cur_snp]; } else // 12 { gene2[i] += (0 - mean[j]) * coeff_gene2[order_eigenvalues_gene2[cancor1_pos]][cur_snp]; } } else { if ( a2 ) // 21 { gene2[i] += (0 - mean[j]) * coeff_gene2[order_eigenvalues_gene2[cancor1_pos]][cur_snp]; } else // 22 homozygote { gene2[i] += (-1 - mean[j]) * coeff_gene2[order_eigenvalues_gene2[cancor1_pos]][cur_snp]; } } } // Next individual } // Next SNP in gene2 // Store gene1.variate and gene2.variate in the multiple_covariates field of P.sample // TO DO: NEED TO CHECK IF FIELDS ARE EMPTY FIRST! for (int i=0; iclist.resize(2); P.sample[i]->clist[0] = gene1[i]; P.sample[i]->clist[1] = gene2[i]; } /////////////////////////////////////////////// // STEP 7 - Logistic or linear regression epistasis test // Model * lm; if (par::bt) { LogisticModel * m = new LogisticModel(& P); lm = m; } else { LinearModel * m = new LinearModel(& P); lm = m; } // No SNPs used lm->hasSNPs(false); // Set missing data lm->setMissing(); // Main effect of GENE1 1. Assumes that the variable is in position 0 of the clist vector lm->addCovariate(0); lm->label.push_back("GENE1"); // Main effect of GENE 2. Assumes that the variable is in position 1 of the clist vector lm->addCovariate(1); lm->label.push_back("GENE2"); // Epistasis lm->addInteraction(1,2); lm->label.push_back("EPI"); // Build design matrix lm->buildDesignMatrix(); // Prune out any remaining missing individuals // No longer needed (check) // lm->pruneY(); // Fit linear model lm->fitLM(); // Did model fit okay? 
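	  // (Editorial sketch, not part of the original code.)  The model
	  // just fitted is, in effect,
	  //
	  //     g(E[Y]) = b0 + b1*GENE1 + b2*GENE2 + b3*(GENE1 x GENE2)
	  //
	  // where GENE1 and GENE2 are the canonical variates stored in
	  // clist[0] and clist[1], and g() is the logit link for binary
	  // traits (LogisticModel) or the identity for quantitative traits
	  // (LinearModel).  The epistasis test reported below is a 1-df
	  // test of H0: b3 = 0, with a Wald-type statistic of roughly
	  // (b3 / se(b3))^2 referred to a chi-square(1) distribution via
	  // chiprobP().  With purely illustrative numbers, b3 = 0.30 and
	  // se(b3) = 0.12 would give chisq = 6.25 and p ~= 0.012.  The call
	  // below first checks that the fitted parameters are valid before
	  // the interaction statistic is extracted.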
lm->validParameters(); // Obtain estimates and statistic lm->testParameter = 3; // interaction vector_t b = lm->getCoefs(); double chisq = lm->getStatistic(); double logit_pvalue = chiprobP(chisq,1); // Clean up delete lm; ///////////////////////////// // OUTPUT EPI << setw(12) << nind << " " << setw(12) << P.setname[s1] << " " << setw(12) << P.setname[s2] << " " << setw(12) << n1 << " " << setw(12) << n2 << " " << setw(12) << logit_pvalue << " " << "\n"; } // End of loop over genes2 } // End of loop over genes1 EPI.close(); } // End of CCA_logit() /////////////////////////////////////////////////////////// // Second CCA function: use for case-control only void CCA_caseonly(bool perm, vector > & blperm_case, Set & S, Plink & P) { /////////////// // Output file ofstream EPI; if (!perm) { string f = par::output_file_name+".genepi"; P.printLOG("\nWriting gene-based epistasis tests to [ " + f + " ]\n"); EPI.open(f.c_str(), ios::out); EPI.precision(4); EPI << setw(12) << "NIND" << " " << setw(12) << "GENE1" << " " << setw(12) << "GENE2" << " " << setw(12) << "NSNP1" << " " << setw(12) << "NSNP2" << " " << setw(12) << "CC1" << " " // << setw(12) << "PILLAI" << " " << setw(12) << "BART" << " " << "\n"; } ////////////////////////////////// // Canonical correlation analysis // Number of genes int ns = P.snpset.size(); // Consider each pair of genes for (int s1=0; s1 < ns-1; s1++) { for (int s2 = s1+1; s2 < ns; s2++) { //////////////////////////////////////////////////////// // Step 1. Construct covariance matrix (cases only) // And partition covariance matrix: // S_11 S_21 // S_12 S_22 int n1=0, n2=0; vector > sigma(0); vector mean(0); vector pSNP(0); ///////////////////////////// // List of SNPs for both loci for (int l=0; l::iterator person = P.sample.begin(); int ncase=0; while ( person != P.sample.end() ) { if ( (*person)->aff && !(*person)->missing ) { (*person)->flag = true; ncase++; } person++; } int nind = calcGENEPIMeanVariance(pSNP, n1,n2, false, &P, mean, sigma, P.sample , blperm_case[s1], blperm_case[s2] ); /////////////////////////// // Partition covariance matrix vector > I11; vector > I11b; vector > I12; vector > I21; vector > I22; vector > I22b; sizeMatrix( I11, n1, n1); sizeMatrix( I11b, n1, n1); sizeMatrix( I12, n1, n2); sizeMatrix( I21, n2, n1); sizeMatrix( I22, n2, n2); sizeMatrix( I22b, n2, n2); // For step 4b (eigenvectors for gene2) for (int i=0; i sorted_eigen = eigen; sort(sorted_eigen.begin(),sorted_eigen.end(),greater()); // P-value // long double pillai_pvalue = pillai(ncase,n1,n2,sorted_eigen[0]); long double bartlett_pvalue = bartlett(ncase,n1,n2,sorted_eigen); ///////////////////////////////////////////////////////////////////// // OUTPUT EPI << setw(12) << ncase << " " << setw(12) << P.setname[s1] << " " << setw(12) << P.setname[s2] << " " << setw(12) << n1 << " " << setw(12) << n2 << " " << setw(12) << sqrt(sorted_eigen[0]) << " " // << setw(12) << pillai_pvalue << " " << setw(12) << bartlett_pvalue << " " << "\n"; } // End of loop over genes2 } // End of loop over genes1 EPI.close(); } // End of CCA_caseonly ////////////////////////////////// // Helper functions int calcGENEPIMeanVariance(vector & pSNP, int n1, int n2, bool perm, Plink * P, vector & mean, vector > & variance, vector & sample, vector & gp1, vector & gp2 ) { // Return number of individuals that the mean and variance matrix // are based on bool casewise_deletion = false; // Calculate mean and variance for n1+n2 x n1+n2 matrix // Individual order in n1 , n2 deteremined by g1, g2 // (i.e. 
block-based permutation) // Under permutations, mean and variances won't change // Store means only for now int nss = pSNP.size(); // Original calculation? if (!perm) mean.resize(nss,0); vector cnt(nss,0); variance.resize(nss); for (int j=0; jn; i++) { // Only need to look at one perm set bool a1 = ps->one[gp1[i]]; bool a2 = ps->two[gp2[i]]; if ( a1 ) { if ( a2 ) // 11 homozygote { mean[j]++; cnt[j]++; } } else { cnt[j]++; if ( ! a2 ) // 00 homozygote mean[j]--; } } // Next individual } // Next SNP in set for (int j=0; jn; i++) { bool a1, a2; if (j1one[gp1[i]]; a2 = ps1->two[gp1[i]]; } else { a1 = ps1->one[gp2[i]]; a2 = ps1->two[gp2[i]]; } bool b1, b2; if (j1one[gp1[i]]; b2 = ps2->two[gp1[i]]; } else { b1 = ps2->one[gp2[i]]; b2 = ps2->two[gp2[i]]; } // Mean substitution double v1=mean[j1], v2=mean[j2]; // First SNP if ( a1 ) { if ( a2 ) // 11 homozygote { v1 = 1; } } else { if ( ! a2 ) // 00 homozygote { v1 = -1; } else v1 = 0; // 01 heterozygote } // Second SNP if ( b1 ) { if ( b2 ) // 11 homozygote { v2 = 1; } } else { if ( ! b2 ) // 00 homozygote { v2 = -1; } else v2 = 0; // 01 heterozygote } // Contribution to covariance term variance[j1][j2] += ( v1 - mean[j1] ) * ( v2 - mean[j2] ); } // Next individual } // Second SNP } // First SNP // Make symmetric covariance matrix for (int i=0; in); variance[j][i] = variance[i][j]; } // Mean-imputation uses everybody return P->n; } plink-1.07-src/whap.h0000644000265600020320000000303111264127626013603 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __WHAP_H__ #define __WHAP_H__ #include #include #include #include #include "plink.h" class Plink; class HaploPhase; using namespace std; class ChapModel { public: string model; map haploGroup; vector masked_conditioning_snps; vector_t coef; vector_t se; vector_t p; vector label; vector< set > group; double lnLk; int df; double rsq; void buildFromSNPs(string); void buildFromGroups(string); bool haplotypesInSameGroup(int,int); }; class Chap { public: ChapModel * alternate; ChapModel * null; ChapModel * current; Plink * P; HaploPhase * H; Chap(Plink * p_, HaploPhase * h_) { P = p_; H = h_; } void determineTestType(); void setModels(ChapModel &, ChapModel &); void build(ChapModel &); void setSNPList(vector &, ChapModel &); bool isNested(); }; #endif plink-1.07-src/gvar.cpp0000644000265600020320000006265711264127625014160 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include "gvar.h" #include "helper.h" #include "options.h" #include "plink.h" #include "model.h" #include "logistic.h" #include "linear.h" #include "stats.h" #include #include #include #include #include #include // A couple of helper functions Model * analyseModel(Plink *, Variant *, int, bool, bool); double compareModels(Model *, Model *) ; class fullGenotype{ public: fullGenotype(int2 a, int2 b) { a1 = a; a2 = b; if ( a1 < a2 ) { int2 t = a1; a1 = a2; a2 = t; } } int2 a1; int2 a2; bool operator< (const fullGenotype & b) const { return (a1 < b.a1 || (a1 == b.a1 && a2 < b.a2) ); } }; void Plink::readGenericVariantData() { /////////////////////////////////////////////// // Load data (if par::gvar is true) -- otherwise, // we are just string filename = par::gvarfile; if ( par::gvar ) checkFileExists( filename ); // If we are not reading in any generic variants, // but we have elected to write a file, then // we must set this flag so that the data are // carried over if ( par::gvar_write && ! par::gvar ) par::gvar_include_all_variants = true; /////////////////////////////////////////////// // Input mode // // 1) Either we start from scratch, read in gMAP file // and then data (par::load_gvar is true) // 2) Or, we have first read in the standard file, and we // just place these on top, copying basic SNPs over first // (par::load_gvar is false) bool preexisting = locus.size() > 0; // We need either basic SNPs or generic variants here if ( ! ( preexisting || par::gvar ) ) return; if ( preexisting ) { nl_all = locus.size(); n = sample.size(); // prettyPrintLengths(); } /////////////////////////////////////////////// // .map file vector include; vector include_pos(0); if ( par::gvar && !preexisting ) { checkFileExists( par::gmapfile ); ngvar = 0; readMapFile(par::gmapfile, include, include_pos, ngvar); gvar.clear(); for (int l=0; lacode.clear(); } locus.clear(); } /////////////////////////////////////////////// // Otherwise figure out which markers to keep // i.e. assume a small subset of all set observedGVARs; map copyover; if ( preexisting ) { // Either take all SNPs in memory, or // select out just those already specified in // the GVAR file if ( par::gvar_include_all_variants ) { // Copy all SNPs over for (int l=0; lname ); ngvar = observedGVARs.size(); } else if ( par::gvar ) { FILE * GV; GV = fopen64( filename.c_str(),"r"); if ( GV == NULL ) error("Problem opening GVAR file, errno = "+int2str(errno)); while ( ! 
feof(GV) ) { string dummy; string gvarname; int f=0; if ( readString( GV , dummy ) ) ++f; if ( dummy == "" ) continue; if ( readString( GV , dummy ) ) ++f; if ( readString( GV , gvarname ) ) observedGVARs.insert( gvarname ); while (fgetc(GV) != '\n' && !feof(GV)) {} } fclose(GV); ngvar = observedGVARs.size(); printLOG("Found "+int2str( ngvar )+" variant in [ " + filename + " ]\n"); } ////////////////////////////////// // Copy MAP into appropriate space gvar.clear(); map im; for (int l=0; lname, l)); set::iterator is = observedGVARs.begin(); while ( is != observedGVARs.end() ) { map::iterator imi = im.find( *is ); if ( imi != im.end() ) { Variant * gloc = new Variant; Locus * loc = locus[imi->second]; gloc->name = loc->name; gloc->chr = loc->chr; gloc->bp = loc->bp; gloc->pos = loc->pos; gloc->acode.clear(); gloc->alleles.clear(); gvar.push_back( gloc ); int l = gvar.size() - 1; copyover.insert(make_pair(l,imi->second)); } else error("Generic variant " + *is + " found that was not in original SNP file\n"); ++is; } } /////////////////////////////////////////////// // .fam if ( par::gvar && ! preexisting ) { checkFileExists( par::gfamfile ); readFamFile(par::gfamfile); n = sample.size(); // Read an alternate phenotype file? if (par::pheno_file) readPhenoFile(); if ( par::bt ) affCoding(*this); } /////////////////////////////////////////////// // Allocate space for (int i=0; igvar.resize( ngvar ); for (int g=0; ggvar[g] = new GVariant; } /////////////////////////////////////////////// // Copy over any existing SNP data if ( preexisting ) { for (int g=0; g::iterator li = copyover.find(g); int l = li->second; Locus * loc = locus[ li->second ]; int acode1 = -1; int acode2 = -1; if ( loc->allele1 != "" && loc->allele1 != par::missing_genotype ) { if ( gloc->acode.find( loc->allele1 ) == gloc->acode.end() ) { gloc->acode.insert( make_pair( loc->allele1, gloc->nallele )); gloc->alleles.push_back( loc->allele1 ); acode1 = gloc->nallele; ++(gloc->nallele); } } if ( loc->allele2 != "" && loc->allele2 != par::missing_genotype ) { if ( gloc->acode.find( loc->allele2 ) == gloc->acode.end() ) { gloc->acode.insert( make_pair( loc->allele2, gloc->nallele )); gloc->alleles.push_back( loc->allele2 ); acode2 = gloc->nallele; ++(gloc->nallele); } } // Now copy over actual genotypes for this particular SNP // allele for (int i=0; igvar[g]; gv->allele1 = gv->allele2 = -1; bool s1 = par::SNP_major ? SNP[l]->one[i] : sample[i]->one[l]; bool s2 = par::SNP_major ? SNP[l]->two[i] : sample[i]->two[l]; // Missing genotype if ( s1 && ! s2 ) { gv->missing = true; continue; } gv->missing = false; if ( s1 ) { gv->allele1 = gv->allele2 = acode2; } else { if ( s2 ) { // if ( sample[i]->sex ) // cout << "het male??? " << sample[i]->fid << "\n"; gv->allele1 = acode1; gv->allele2 = acode2; } else { gv->allele1 = acode1; gv->allele2 = acode1; } } ///////////////////////////////////////////// // Autosomal or haploid? Should have already // blanked out hemizygous haploid calls if ( par::chr_haploid[gloc->chr] || ( par::chr_sex[gloc->chr] && sample[i]->sex ) ) { gv->dosage1 = 1; gv->dosage2 = 0; } else { gv->dosage1 = 1; gv->dosage2 = 1; } } } } /////////////////////////////////////////////// // .gvar FILE * GV; if ( par::gvar ) { GV = fopen64( filename.c_str(),"r"); if ( GV == NULL ) error("Problem opening GVAR file, errno = "+int2str(errno)); // We can now read any number of individual/genotype lines, in any // order; we also do not assume that all genotypes are given -- // these will be missing by default map imap; if ( ! 
preexisting ) { for (int i=0; iname , k ) ); } } } else { include.clear(); include.resize( ngvar, true ); include_pos.clear(); for (int i=0; iname , i ) ); } } map iperson; for (int i=0; ifid + "_" + sample[i]->iid , i ) ); } // Whether or not we want to look at a locus is in the include[] vector // The genomic position of locus i is k=include_pos[i] -> locus[k] bool fatal = false; string fmsg = ""; while( ! feof(GV) ) { string fid = ""; string iid = ""; string gvarname = ""; string one = ""; string dose1 = ""; string two = ""; string dose2 = ""; int f = 0; if ( readString( GV , fid ) ) f++; if ( fid == "" ) continue; if ( readString( GV , iid ) ) f++; if ( readString( GV , gvarname ) ) f++; if ( readString( GV , one ) ) f++; if ( readString( GV , dose1 ) ) f++; if ( readString( GV , two ) ) f++; if ( readString( GV , dose2 ) ) f++; map::iterator peri = iperson.find( fid+"_"+iid ); Individual * person = peri != iperson.end() ? sample[peri->second] : NULL ; map::iterator im = imap.find( gvarname ); int k = im != imap.end() ? im->second : -1; // Ignore this genotype? if ( ( ! person ) || k < 0 ) continue; double d1, d2; if ( ( ! from_string( d1, dose1, std::dec) ) || ( ! from_string( d2, dose2, std::dec) ) ) error("Problem in format of file [ " + filename + " ]\n\n" +fid+" "+iid+" "+gvarname+" " +one+" "+dose1+" " +two+" "+dose2+"\n"); int ip = peri->second; Variant * gloc = gvar[k]; GVariant * g = person->gvar[k]; ///////////////////////////////////////// // Add allele names to list, if needed if ( ( one == par::missing_genotype && d1 > 0 ) || ( two == par::missing_genotype && d2 > 0 ) ) { g->missing = true; continue; } g->missing = false; map::iterator ia = gloc->acode.find( one ); if ( d1 > 0 ) { if ( ia == gloc->acode.end() ) { gloc->acode.insert( make_pair( one, gloc->nallele )); gloc->alleles.push_back( one ); g->allele1 = gloc->nallele; ++(gloc->nallele); } else g->allele1 = ia->second; } else g->allele1 = -1; // Store second allele ia = gloc->acode.find( two ); if ( d2 > 0 ) { if ( ia == gloc->acode.end() ) { gloc->acode.insert( make_pair( two, gloc->nallele )); gloc->alleles.push_back( two ); g->allele2 = gloc->nallele; ++(gloc->nallele); } else g->allele2 = ia->second; } else g->allele2 = -1; // Store dosage information g->dosage1 = d1; g->dosage2 = d2; // Have we seen any non-integer dosages? if ( d1 - int(d1) > 1e-6 || d2 - int(d2 ) > 1e-6 ) gloc->integerDosage = false; } fclose(GV); } ///////////////////////////// // Clear up any old SNP data if ( preexisting ) { for (int l=0; lone.clear(); sample[i]->two.clear(); } SNP.clear(); locus.clear(); } } void Plink::processGVAR() { // Use the Model class to test generic variants, // but not entering any main SNP effects par::assoc_glm_without_main_snp = true; printLOG("Processing data for " + int2str( ngvar ) + " generic variants\n"); ofstream GOUT; ofstream GVERB; if ( true ) { printLOG("Writing frequency & genotyping informtion to [ " + par::output_file_name + ".gvar.summary ]\n"); GOUT.open( ( par::output_file_name + ".gvar.summary").c_str(), ios::out ); GOUT.precision(4); GOUT << setw(16) << "NAME" << " " << setw(12) << "FIELD" << " " << setw(12) << "VALUE" << "\n"; } if ( par::gvar_verbose_association ) { GVERB.open((par::output_file_name+".assoc.gvar").c_str(),ios::out); printLOG("Writing verbose GVAR association results to [ " + par::output_file_name + ".assoc.gvar ]\n"); } ///////////////////////////////////// // Is there any phenotypic variation? 
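// The loop that follows is the gatekeeper for the per-variant association
// tests in processGVAR(): it scans the sample and sets phenotypicVariation
// as soon as two non-missing phenotype values disagree.  If the phenotype is
// constant, only the frequency/genotyping summary is written and the
// GLM-based tests further down are skipped.
//
// A minimal self-contained sketch of the same kind of check (hypothetical
// helper, not part of PLINK; unlike the loop below it compares each
// non-missing phenotype against the first one seen rather than against the
// previous individual):
//
//   bool anyPhenotypicVariation(const vector<Individual*> & s)
//   {
//     bool seen = false;
//     double first = 0;
//     for (size_t i = 0; i < s.size(); ++i)
//       {
//         if ( s[i]->missing ) continue;          // skip missing phenotypes
//         if ( ! seen ) { first = s[i]->phenotype; seen = true; }
//         else if ( s[i]->phenotype != first ) return true;
//       }
//     return false;                               // constant (or empty) phenotype
//   }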
bool phenotypicVariation = false; for (int i=1; imissing ) continue; if ( sample[i]->missing ) continue; if ( sample[i-1]->phenotype != sample[i]->phenotype ) { phenotypicVariation = true; break; } } for (int g=0; gname << " " << setw(12) << "CHR" << " " << setw(12) << v->chr << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "BP" << " " << setw(12) << v->bp << "\n"; vector_t f( v->nallele ); map cnvCount; int sampleTotalDose = 0; int sampleTotalInd = 0; map typeCount; map::iterator tit; for (int i=0; imissing ) continue; GVariant * gv = person->gvar[g]; if ( gv->missing ) { int2 p(-1,1); fullGenotype fg(p,p); tit = typeCount.find(fg); int2 q(0,0); if ( person->aff ) q.p1=1; else q.p2=1; if ( tit == typeCount.end() ) typeCount.insert(make_pair(fg,q)); else { if ( person->aff ) ++(tit->second.p1); else ++(tit->second.p2); } continue; } ///////////////////////// // Calculate frequencies if ( gv->allele1 >= 0 ) f[ gv->allele1 ] += gv->dosage1; if ( gv->allele2 >= 0 ) f[ gv->allele2 ] += gv->dosage2; int totalDose = (int)(gv->dosage1 + gv->dosage2); cnvCount[totalDose]++; sampleTotalDose += totalDose; // Keep track of specific allele/CNV counts // by case/control status if ( person->aff || par::qt ) { int2 p(gv->allele1,(int)gv->dosage1); int2 q(gv->allele2,(int)gv->dosage2); fullGenotype fg(p,q); tit = typeCount.find(fg); if ( tit == typeCount.end() ) typeCount.insert(make_pair(fg,int2(1,0))); else ++(tit->second.p1); } else { int2 p(gv->allele1,(int)gv->dosage1); int2 q(gv->allele2,(int)gv->dosage2); fullGenotype fg(p,q); tit = typeCount.find(fg); if ( tit == typeCount.end() ) typeCount.insert(make_pair(fg,int2(0,1))); else ++(tit->second.p2); } ++sampleTotalInd; } // Look at next individual v->allelicVariation = false; int commonAlleles = 0; for ( int x=0; x< v->nallele; x++) { f[x] /= sampleTotalDose; if ( f[x] >= par::min_af ) ++commonAlleles; } if ( commonAlleles>1 ) v->allelicVariation = true; v->copyNumberVariation = false; int commonCNV = 0; map::iterator ia = cnvCount.begin(); while ( ia != cnvCount.end() ) { if ( (double)ia->second / (double)sampleTotalInd >= par::min_af ) ++commonCNV; ++ia; } if ( commonCNV>1 ) v->copyNumberVariation = true; ///////////////////////////////// // Report to summary file GOUT << setw(16) << v->name << " " << setw(12) << "CNV" << " "; if ( v->copyNumberVariation ) GOUT << setw(12) << "yes" << "\n"; else GOUT << setw(12) << "no" << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "ALLELIC" << " "; if ( v->allelicVariation ) GOUT << setw(12) << "yes" << "\n"; else GOUT << setw(12) << "no" << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "GCOUNT" << " " << setw(12) << sampleTotalInd << "\n"; if ( v->integerDosage ) GOUT << setw(16) << v->name << " " << setw(12) << "INTEGER" << " " << setw(12) << "Y" << "\n"; else GOUT << setw(16) << v->name << " " << setw(12) << "INTEGER" << " " << setw(12) << "N" << "\n"; for (int a = 0; a < v->nallele; a++) { GOUT << setw(16) << v->name << " " << setw(12) << v->alleles[a] << " " << setw(12) << f[a] << "\n"; } if ( v->integerDosage ) { ia = cnvCount.begin(); while ( ia != cnvCount.end() ) { GOUT << setw(16) << v->name << " " << setw(12) << "["+int2str(ia->first)+"]" << " " << setw(12) << (double)ia->second / (double)sampleTotalInd << "\n"; ++ia; } //////////////////////////////////////////// // Display full Genotype counts (cases/all) tit = typeCount.begin(); while ( tit != typeCount.end() ) { string aname = ""; if ( (tit->first.a1.p2 == 0 ) ) aname += "null"; for (int z=0; zfirst.a1.p2; z++) { if 
( tit->first.a1.p1 == -1 ) aname += par::out_missing_genotype; else aname += v->alleles[ tit->first.a1.p1 ]; } aname += "/"; if ( (tit->first.a2.p2 == 0 ) ) aname += "null"; for (int z=0; zfirst.a2.p2; z++) { if ( tit->first.a2.p1 == -1 ) aname += par::out_missing_genotype; else aname += v->alleles[ tit->first.a2.p1 ]; } GOUT << setw(16) << v->name << " " << setw(12) << aname << " "; if ( par::bt ) GOUT << setw(12) << int2str(tit->second.p1)+":"+int2str(tit->second.p2) << "\n"; else GOUT << setw(12) << tit->second.p1 << "\n"; ++tit; } } /////////////////////////////// // Test for assocaition if ( phenotypicVariation && ( v->allelicVariation || v->copyNumberVariation ) ) { // Place allelic variant and/or dosage information in // covariate fields par::clist = true; Model * mJoint = NULL; Model * mCNV = NULL; Model * mAllelic = NULL; if ( v->allelicVariation && v->copyNumberVariation ) mJoint = analyseModel(this,v,g,true,true); if ( v->allelicVariation ) mAllelic = analyseModel(this,v,g,true,false); if ( v->copyNumberVariation ) mCNV = analyseModel(this,v,g,false,true); /////////////////////////////////////////////////////////////// // Extract results; for now we assume everything is biallelic if ( mAllelic && mAllelic->isValid() ) { vector_t b = mAllelic->getCoefs(); vector_t var = mAllelic->getVar(); vector_t pval = mAllelic->getPVals(); int term1 = par::clist_number + 1; // double chisq = mAllelic->getStatistic(); // double pvalue = chiprobP(chisq,1); GOUT << setw(16) << v->name << " " << setw(12) << "B(SNP)" << " " << setw(12) << b[ term1 ] << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "P(SNP)" << " " << setw(12) << pval[ term1 - 1 ] << "\n"; } if ( mCNV && mCNV->isValid() ) { vector_t b = mCNV->getCoefs(); vector_t var = mCNV->getVar(); vector_t pval = mCNV->getPVals(); int term1 = par::clist_number + 1; // double chisq = mCNV->getStatistic(); // double pvalue = chiprobP(chisq,1); GOUT << setw(16) << v->name << " " << setw(12) << "B(CNP)" << " " << setw(12) << b[ term1 ] << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "P(CNP)" << " " << setw(12) << pval[ term1 - 1 ] << "\n"; } if ( mJoint && mJoint->isValid() ) { int term1 = par::clist_number + 1; int term2 = par::clist_number + 2; vector_t b = mJoint->getCoefs(); vector_t var = mJoint->getVar(); vector_t p = mJoint->getPVals(); // A 2df test (joint test of two parameters) vector_t h(2,0); matrix_t H; // row = number of fixes; cols = np sizeMatrix(H,2,mJoint->getNP()); H[0][term1] = H[1][term2] = 1; double chisq = mJoint->isValid() ? mJoint->linearHypothesis(H,h) : 0; double pvalue = chiprobP(chisq,2); GOUT << setw(16) << v->name << " " << setw(12) << "B(CNP|SNP)" << " " << setw(12) << b[term1] << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "P(CNP|SNP)" << " " << setw(12) << p[ term1 - 1 ] << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "B(SNP|CNP)" << " " << setw(12) << b[ term2 ] << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "P(SNP|CNP)" << " " << setw(12) << p[ term2 - 1 ] << "\n"; GOUT << setw(16) << v->name << " " << setw(12) << "P(SNP&CNP)" << " " << setw(12) << pvalue << "\n"; } /////////////////////////////////////////// // No valid model if ( ! 
( mJoint || mAllelic || mCNV ) ) { GOUT << setw(16) << v->name << " " << setw(12) << "SNP/CNP" << " " << setw(12) << "NA" << "\n"; } ////////////////////////////////// // Clean-up if ( mJoint ) delete mJoint; if ( mAllelic ) delete mAllelic; if ( mCNV ) delete mCNV; } // End of association testing } // Next generic variant if ( true ) GOUT.close(); if ( par::gvar_verbose_association ) GVERB.close(); } Model * analyseModel(Plink * P, Variant * v, int g, bool allelic, bool cnv) { // Keep track of original numbr of covariates int addedTerms = 0; if ( allelic && cnv ) { addedTerms = 2; P->clistname.push_back("SNP"); P->clistname.push_back("CNP"); } else if ( allelic ) { addedTerms = 1; P->clistname.push_back("SNP"); } else if ( cnv ) { addedTerms = 1; P->clistname.push_back("CNP"); } par::clist_number += addedTerms; for (int i=0; in; i++) { Individual * person = P->sample[i]; GVariant * gv = person->gvar[g]; // Assume biallelic for now // Y ~ m + b1.(A+B) + b2.(A-B) double d0=0, d1=0; if ( gv->missing ) { // This flag means that the Model // class will ignore this person person->missing2 = true; } else { person->missing2 = false; if ( gv->allele1 == 0 ) d0 += gv->dosage1; if ( gv->allele1 == 1 ) d1 += gv->dosage1; if ( gv->allele2 == 0 ) d0 += gv->dosage2; if ( gv->allele2 == 1 ) d1 += gv->dosage2; } if ( allelic && cnv ) { person->clist.push_back( d0+d1 ); person->clist.push_back( d0-d1 ); } else if ( allelic ) { person->clist.push_back( d0-d1 ); } else if ( cnv ) { person->clist.push_back( d0+d1 ); } } // Fit linear model, and return a pointer to it P->glmAssoc(false, *(P->pperm) ); // Return covariate list to normal status par::clist_number -= addedTerms; P->clistname.resize( par::clist_number ); for (int i=0; in; i++) P->sample[i]->clist.resize( par::clist_number ); return P->model; } double compareModels(Model * alternate, Model * null) { if ( par::bt ) { return chiprobP( ((LogisticModel*)null)->getLnLk() - ((LogisticModel*)alternate)->getLnLk() , alternate->getNP() - null->getNP() ); } else { double F = ((LinearModel*)alternate)->calculateFTest((LinearModel*)null); if ( F < 0 ) F = 0; return pF( F, alternate->getNP() - null->getNP(), alternate->Ysize() - alternate->getNP() - 1 ); } return -1; } void Plink::convertGenericVariantData() { error("Not yet implemented"); } void Plink::outputGenericVariantFile() { //////////////////////////////////////////////////////////// // Assume a fully populated generic variant dataset exists //////////////////////////////// // Write in variant-major order string f = par::output_file_name + ".gvar"; printLOG("Writing generic variant file to [ " + f + " ]\n"); ofstream GOUT(f.c_str(), ios::out); for (int g=0; ggvar[g]; GOUT << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(par::pp_maxsnp) << v->name << " "; ///////////////////// // First allele if ( gv->allele1 >= 0 ) GOUT << setw(4) << v->alleles[ gv->allele1 ] << " "; else GOUT << setw(4) << par::out_missing_genotype << " "; GOUT << setw(4) << gv->dosage1 << " "; ///////////////////// // Second allele if ( gv->allele2 >= 0 ) GOUT << setw(4) << v->alleles[ gv->allele2 ] << " "; else GOUT << setw(4) << par::out_missing_genotype << " "; GOUT << setw(4) << gv->dosage2 << " "; GOUT << "\n"; } // Next individual } // Next variant GOUT.close(); } plink-1.07-src/gxe.cpp0000644000265600020320000001632111264127625013767 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This 
file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include "plink.h" #include "helper.h" #include "stats.h" #include "perm.h" void Plink::perm_testGXE2(Perm & perm) { // Assumes SNP-major mode if (!par::SNP_major) Ind2SNP(); // This procedure is only for continuous traits if (par::bt) error("Can only use --gxe option with continuous phenotypes"); // GxE test statistics vector original; // Empirical p-valuess perm.setTests(nl_all); // Construct a binary covariate for GxE // Individuals who are missing for the // covariate will already have been set // to missing for the phenotype -- also // allow for 0 to equal missing here // (i.e. use affection status coding) for (int i=0; icovar == 0) sample[i]->missing = true; else if (sample[i]->covar == 2) sample[i]->bcovar = false; else sample[i]->bcovar = true; } //////////////////////////////// // Set up permutation structure perm.setPermClusters(*this); perm.originalOrder(); //////////////////////////////////// // If we do perform permutation, // check the permutation procedure here: // i.e. pperson->bcovar or gperson->bcovar //////////////////////////////////// // Quantitative trait regression original = testQAssocGXE2(true,perm); //////////////////////////////////// // No permutations for now shutdown(); } ///////////////////////////////////////////// // Simple quantitative trait association test // Assumes SNP-major mode vector Plink::testQAssocGXE2(bool print_results , Perm & perm ) { vector results(nl_all); ofstream ASC; if (print_results) { string f = par::output_file_name + ".qassoc.gxe"; printLOG("Writing QT GxE association results to [ " + f + " ] \n"); ASC.open(f.c_str(),ios::out); ASC << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(8) << "NMISS1" << " " << setw(10) << "BETA1" << " " << setw(10) << "SE1" << " " << setw(8) << "NMISS2" << " " << setw(10) << "BETA2" << " " << setw(10) << "SE2" << " " << setw(8) << "Z_GXE" << " " << setw(12) << "P_GXE" << " " << "\n"; ASC.precision(4); } // Iterate over each locus vector::iterator s = SNP.begin(); int l = 0; while ( s != SNP.end() ) { // Skip possibly if (par::adaptive_perm && !perm.snp_test[l]) { // advance to next SNP s++; l++; continue; } double g_mean1=0; double g_var1=0; double qt_mean1=0; double qt_var1=0; double qt_g_covar1=0; int nanal1 = 0; double g_mean2=0; double g_var2=0; double qt_mean2=0; double qt_var2=0; double qt_g_covar2=0; int nanal2=0; /////////////////////////////// // Iterate over each individual vector::iterator person = sample.begin(); vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); while ( person != sample.end() ) { // Permuted self Individual * pperson = (*person)->pperson; // Genotype bool s1 = *i1; bool s2 = *i2; if (!pperson->missing) { if ( ! 
( s1 && !s2) ) // 10 = missing { if (pperson->bcovar) qt_mean1 += pperson->phenotype; else qt_mean2 += pperson->phenotype; if ( (!s1) && (!s2) ) // 00 = hom(11) { if (pperson->bcovar) g_mean1+=2; else g_mean2+=2; } else if ( (!s1) && s2) // 01 = het(12) { if (pperson->bcovar) g_mean1++; else g_mean2++; } if (pperson->bcovar) nanal1++; else nanal2++; } } // Next person i1++; i2++; person++; } // Calculate mean qt_mean1 /= (double)nanal1; g_mean1 /= (double)nanal1; qt_mean2 /= (double)nanal2; g_mean2 /= (double)nanal2; // Iterate over individuals again person = sample.begin(); i1 = (*s)->one.begin(); i2 = (*s)->two.begin(); while ( person != sample.end() ) { // Permuted self Individual * pperson = (*person)->pperson; // Genotype bool s1 = *i1; bool s2 = *i2; if (!pperson->missing) { if ( ! ( s1 && !s2) ) // 10 = missing { if (pperson->bcovar) qt_var1 += (pperson->phenotype-qt_mean1) * ( pperson->phenotype-qt_mean1 ) ; else qt_var2 += (pperson->phenotype-qt_mean2) * ( pperson->phenotype-qt_mean2 ) ; double g = 0; if ( (!s1) && (!s2) ) // 00 = hom(11) g=2; else if ( (!s1) && s2 ) // 01 = het(12) g=1; if (pperson->bcovar) { g_var1 += (g-g_mean1) * ( g-g_mean1 ) ; qt_g_covar1 += ( pperson->phenotype - qt_mean1 ) * ( g - g_mean1 ) ; } else { g_var2 += (g-g_mean2) * ( g-g_mean2 ) ; qt_g_covar2 += ( pperson->phenotype - qt_mean2 ) * ( g - g_mean2 ) ; } } } // Next individual i1++; i2++; person++; } qt_var1 /= (double)nanal1 - 1; g_var1 /= (double)nanal1 - 1; qt_g_covar1 /= (double)nanal1 - 1; qt_var2 /= (double)nanal2 - 1; g_var2 /= (double)nanal2 - 1; qt_g_covar2 /= (double)nanal2 - 1; double beta1 = qt_g_covar1 / g_var1; double vbeta1 = (qt_var1/g_var1 - (qt_g_covar1*qt_g_covar1)/(g_var1*g_var1) ) / (nanal1-2); double beta2 = qt_g_covar2 / g_var2; double vbeta2 = (qt_var2/g_var2 - (qt_g_covar2*qt_g_covar2)/(g_var2*g_var2) ) / (nanal2-2); double Z = (beta1-beta2) / sqrt( vbeta1 + vbeta2 ) ; if (print_results) { ASC << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; if (realnum(Z)) { ASC << setw(8) << nanal1 << " " << setw(10) << beta1 << " " << setw(10) << sqrt(vbeta1) << " " << setw(8) << nanal2 << " " << setw(10) << beta2 << " " << setw(10) << sqrt(vbeta2) << " " << setw(8) << Z << " " << setw(12) << chiprobP(Z*Z,1) << "\n"; } else { ASC << setw(8) << "NA" << " " << setw(10) << "NA" << " " << setw(10) << "NA" << " " << setw(8) << "NA" << " " << setw(10) << "NA" << " " << setw(10) << "NA" << " " << setw(8) << "NA" << " " << setw(12) << "NA" << "\n"; } } results[l] = Z; // Advance to next SNP s++; l++; } if (print_results) ASC.close(); return results; } plink-1.07-src/step.cpp0000644000265600020320000000145011264127625014154 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" #include "model.h" #include "linear.h" #include "logistic.h" #include "stats.h" plink-1.07-src/Rconnection.h0000644000265600020320000002775411264127626015147 0ustar tilleaadmin/* * C++ Interface to Rserve * Copyright (C) 2004-8 Simon Urbanek, All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; version 2.1 of the License * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Leser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Although this code is licensed under LGPL v2.1, we strongly encourage * everyone modifying this software to contribute back any improvements and * bugfixes to the project for the benefit all other users. Thank you. * * $Id: Rconnection.h 245 2008-11-25 15:16:27Z urbanek $ */ /* external defines: SWAPEND - needs to be defined for platforms with inverse endianess related to Intel MAIN - should be defined in just one file that will contain the fn definitions and variables (this is inherited from Rsrv.h and sisocks.h) */ #ifndef __RCONNECTION_H__ #define __RCONNECTION_H__ #if defined __GNUC__ && !defined unix && !defined Win32 && !defined WIN32 #define unix #endif #include #include #include "sisocks.h" #include "Rsrv.h" typedef unsigned int Rsize_t; //=== Rconnection error codes #define CERR_connect_failed -1 #define CERR_handshake_failed -2 #define CERR_invalid_id -3 #define CERR_protocol_not_supp -4 #define CERR_not_connected -5 #define CERR_peer_closed -7 #define CERR_malformed_packet -8 #define CERR_send_error -9 #define CERR_out_of_mem -10 #define CERR_not_supported -11 #define CERR_io_error -12 // this one is custom - authentication method required by // the server is not supported in this client #define CERR_auth_unsupported -20 #define A_required 0x001 #define A_crypt 0x002 #define A_plain 0x004 //===================================== Rmessage ---- QAP1 storage class Rmessage { public: struct phdr head; char *data; Rsize_t len; int complete; // the following is avaliable only for parsed messages (max 16 pars) int pars; unsigned int *par[16]; Rmessage(); Rmessage(int cmd); // 0 data Rmessage(int cmd, const char *txt); // DT_STRING data Rmessage(int cmd, int i); // DT_INT data (1 entry) Rmessage(int cmd, const void *buf, int len, int raw_data=0); // raw data or DT_BYTESTREAM virtual ~Rmessage(); int command() { return complete?head.cmd:-1; } Rsize_t length() { return complete?head.len:-1; } int is_complete() { return complete; } int read(int s); void parse(); int send(int s); }; //===================================== Rexp --- basis for all SEXPs class Rexp { public: Rmessage *msg; unsigned int *pos; Rsize_t len; Rexp *attr; int type; /* memory manegement for data/len: - content is in a message and this Rexp is the master of that message: master=0; msg=; - content is in a message, but this Rexp is not the master master=; msg=0 - content is all self-allocated with no message associated master=this; msg=0 */ char *data, *next; protected: // the next two are only cached if requested, no direct access allowed int attribs; char **attrnames; Rexp *master; // if this is set then this Rexp allocated the memory for us, so we are not supposed to free anything; if this is set to "this" then the content is self-allocated, including any data int rcount; // reference count - only for a master - it counts how many children still exist 
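// A note on the members above: rcount and master appear to implement a
// simple shared-buffer scheme -- an Rexp parsed out of an Rmessage points
// back (via set_master()) at the Rexp that owns the message, and that owner
// is expected to outlive all of its children before the buffer is released.
// This reading is based only on the declarations and the data/len comment
// above; the actual behaviour is defined in the accompanying implementation.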
public: Rexp(Rmessage *msg); Rexp(unsigned int *pos, Rmessage *msg=0); Rexp(int type, const char *data=0, int len=0, Rexp *attr=0); virtual ~Rexp(); void set_master(Rexp *m); char *parse(unsigned int *pos); virtual Rsize_t storageSize() { return len+((len>0x7fffff)?8:4); } virtual void store(char *buf); Rexp *attribute(const char *name); char **attributeNames(); virtual Rsize_t length() { return len; } friend std::ostream& operator<< (std::ostream& os, const Rexp& exp) { return ((Rexp&)exp).os_print(os); } friend std::ostream& operator<< (std::ostream& os, const Rexp* exp) { return ((Rexp*)exp)->os_print(os); } virtual std::ostream& os_print(std::ostream& os) { return os << "Rexp[type=" << type << ",len=" << len <<"]"; } }; //===================================== Rint --- XT_INT/XT_ARRAY_INT class Rinteger : public Rexp { public: Rinteger(Rmessage *msg) : Rexp(msg) { fix_content(); } Rinteger(unsigned int *ipos, Rmessage *imsg) : Rexp(ipos, imsg) { fix_content(); } Rinteger(int *array, int count) : Rexp(XT_ARRAY_INT, (char*)array, count*sizeof(int)) { fix_content(); } int *intArray() { return (int*) data; } int intAt(int pos) { return (pos>=0 && (unsigned)pos=0 && (unsigned)pos=nel)?0:cont[i]; } char *string() { return stringAt(0); } unsigned int count() { return nel; } int indexOfString(const char *str); virtual std::ostream& os_print (std::ostream& os) { return os << "char*[" << nel <<"]\"" << string() <<"\".."; } private: void decode() { char *c = (char*) data; int i = 0; nel = 0; while (i < len) { if (!c[i]) nel++; i++; } if (nel) { i = 0; cont = (char**) malloc(sizeof(char*)*nel); while (i < nel) { cont[i] = strdup(c); while (*c) c++; c++; i++; } } else cont = 0; } }; //===================================== Rstring --- XT_STR class Rstring : public Rexp { public: Rstring(Rmessage *msg) : Rexp(msg) {} Rstring(unsigned int *ipos, Rmessage *imsg) : Rexp(ipos, imsg) {} Rstring(const char *str) : Rexp(XT_STR, str, strlen(str)+1) {} char *string() { return (char*) data; } virtual std::ostream& os_print (std::ostream& os) { return os << "\"" << string() <<"\""; } }; //===================================== Rlist --- XT_LIST (CONS lists) class Rlist : public Rexp { public: Rexp *head, *tag; Rlist *tail; Rlist(Rmessage *msg) : Rexp(msg) { head=tag=0; tail=0; fix_content(); } Rlist(unsigned int *ipos, Rmessage *imsg) : Rexp(ipos, imsg) { head=tag=0; tail=0; fix_content(); } /* this is a sort of special constructor that allows to create a Rlist based solely on its content. 
This is necessary since 0.5 because each LISTSXP is no longer represented by its own encoded SEXP but they are packed in one content list instead */ Rlist(int type, Rexp *head, Rexp *tag, char *next, Rmessage *imsg) : Rexp(type, 0, 0, 0) { this->head = head; this->tag = tag; tail = 0; this->next = next; this->msg = imsg; master = 0; } virtual ~Rlist(); Rexp *entryByTagName(const char *tagName) { if (tag && (tag->type==XT_SYM || tag->type==XT_SYMNAME) && !strcmp(((Rsymbol*)tag)->symbolName(),tagName)) return head; if (tail) return tail->entryByTagName(tagName); return 0; } virtual std::ostream& os_print (std::ostream& os) { os << "Rlist[tag="; if (tag) os << *tag; else os << ""; os << ",head="; if (head) os << *head; else os << ""; if (tail) os << ",tail=" << *tail; return os << "]"; } private: void fix_content(); }; //===================================== Rvecotr --- XT_VECTOR (general lists) class Rvector : public Rexp { protected: Rexp **cont; int count; // cached char **strs; public: Rvector(Rmessage *msg) : Rexp(msg) { cont=0; count=0; strs=0; fix_content(); } Rvector(unsigned int *ipos, Rmessage *imsg) : Rexp(ipos, imsg) { cont=0; count=0; strs=0; fix_content(); } virtual ~Rvector(); char **strings(); int indexOf(Rexp *exp); int indexOfString(const char *str); char *stringAt(int i) { if (i<0 || i>count || !cont[i] || cont[i]->type!=XT_STR) return 0; return ((Rstring*)cont[i])->string(); } Rexp* byName(const char *name); virtual std::ostream& os_print (std::ostream& os) { os << "Rvector[count=" << count << ":"; int i=0; while (i #include #include "plink.h" #include "sets.h" #include "options.h" #include "helper.h" #include "model.h" #include "stats.h" #include "phase.h" extern Plink * PP; Set::Set(vector > & ss) : snpset(ss) { sizeSets(); } void Set::sizeSets() { cur.resize(snpset.size()); for(int s=0;s snpset[i].size() ) s_min[i] = snpset[i].size(); else s_min[i] = par::set_min-1; if (par::set_max==-1 || par::set_max > snpset[i].size() ) s_max[i] = snpset[i].size(); else s_max[i] = par::set_max; if (s_min>s_max) s_min[i]=s_max[i]; int s = (s_max[i] - s_min[i]); stat_set[i].resize(s); pv_set[i].resize(s); pv_maxG_set[i].resize(s); pv_maxE_set[i].resize(s); if ( ! par::set_score ) { for (int j=0; j mean; // Sample mean vector > var; // Covariance matrix vector nSNP(0); for (int j=0; j p = vif_prune(var,VIF_threshold,nSNP); for (int i=0; iname << "\n"; else SET2 << P.locus[snpset[s][j]]->name << "\n"; } SET1 << "END\n\n"; SET2 << "END\n\n"; } SET1.close(); SET2.close(); } } ////////////////////////////////////////////////////// // // // Remove SNPs not in any set // // // ////////////////////////////////////////////////////// void Set::dropNotSet(Plink & P) { ///////////////////////////////////////////// // Drop any SNPs that do not belong in a set vector drop(P.nl_all,true); for (int i=0;i nmap; int cnt = 0; for (int l=0; lsecond; } } ////////////////////////////////////////////////////// // // // Create LD map within each set // // // ////////////////////////////////////////////////////// void Set::makeLDSets() { ldSet.clear(); ldSet.resize( snpset.size() ); ////////////////////////////////////////////////////// // If pre-calculated, we can read a .ldset file if ( par::set_r2_read ) { // checkFileExists(par::set_r2_read_file); // PP->printLOG("Read LD set information from [ " + par::set_r2_read_file + " ]\n"); // map mlocus; // makeLocusMap(*PP,mlocus); // ifstream SIN; // SIN.open( par::set_r2_read_file.c_str() , ios::in ); // while ( ! 
SIN.eof() ) // { // vector l = tokenizeLine( SIN ); // if ( SIN.eof() ) // break; // if ( l.size() < 2 ) // continue; // // SET ISNP PROXIES... // int nprox = l.size() - 2; // // Lookup SNP names // int isnp = // for ( int j = 0; j < nprox; j++) // { // int l1 = snpset[i][j]; // int l2 = snpset[i][k]; // double rsq = -1; // if ( par::set_r2_phase ) // rsq = PP->haplo->rsq(l1,l2); // else // rsq = PP->correlation2SNP(l1,l2,true,false); // if ( rsq >= par::set_r2_val ) // { // ldSet[i][j].insert(k); // ldSet[i][k].insert(j); // } // } // return; } ////////////////////////////////////////////////////// // Otherwise, calculate LD based on raw genotype data for (int i=0;ilocus[l1]->chr == PP->locus[l2]->chr ) { if ( par::set_r2_phase ) rsq = PP->haplo->rsq(l1,l2); else rsq = PP->correlation2SNP(l1,l2,true,false); } if ( rsq >= par::set_r2_val ) { ldSet[i][j].insert(k); ldSet[i][k].insert(j); } } } } // Output LD sets? if ( par::set_r2_write ) { PP->printLOG("Writing LD sets to [ " + par::output_file_name + ".ldset ]\n"); ofstream SOUT; SOUT.open( ( par::output_file_name + ".ldset").c_str() , ios::out); for (int i=0;ilocus[ snpset[i][j] ]; set & lset = ldSet[i][j]; if ( lset.size() > 0 ) { SOUT << PP->setname[i] << " "; SOUT << loc->name << " "; //SOUT << lset.size() << " "; set::iterator k = lset.begin(); while ( k != lset.end() ) { int l = snpset[i][*k]; SOUT << PP->locus[l]->name << " "; ++k; } SOUT << "\n"; } } } SOUT.close(); } } ////////////////////////////////////////////////////// // // // Create map of SNP number of set codes // // // ////////////////////////////////////////////////////// void Set::initialiseSetMapping() { setMapping.clear(); for (int i=0;i >::iterator si = setMapping.find(l); // Either we haven't yet seen the SNP... if ( si == setMapping.end() ) { set t; t.insert(i); setMapping.insert(make_pair(l,t)); } else { // ... or we have si->second.insert(i); } // Next SNP } } ////////////////////////////////////////////////////// // // // Sum-statistic scoring (original) // // // ////////////////////////////////////////////////////// void Set::cumulativeSetSum_WITHLABELS(Plink & P, vector & original) { // // Consider each set // for (int i=0;i t; // // Gather set of all chi-sqs (map sorts them automatically) // for (int j=0; j < snpset[i].size(); j++) // { // SetSortedSNP s; // s.chisq = original[snpset[i][j]]; // s.name = P.locus[snpset[i][j]]->name; // s.locus = snpset[i][j]; // t.push_back(s); // } // // Sort t // sort(t.begin(),t.end()); // // Store results for s_min through s_max // double s=0; // int j=0; // vector t2; // for( vector::reverse_iterator p = t.rbegin(); p!=t.rend(); p++) // { // // //////////////////////////////// // // // Using an r-sq threshold also? // // double max_r2 = 0; // // if ( par::set_r2 ) // // { // // int l0 = p->locus; // // for (int l=0; l< inSet.size(); l++) // // { // // double r = PP->haplo->rsq( l0, inSet[l] ); // // if ( r > max_r2 ) // // max_r2 = r; // // } // // } // //////////////////////////////// // // Add this SNP to the set? 
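// Note on the disabled code in this function: the original sum-statistic set
// score sorted the per-SNP chi-square statistics within a set in decreasing
// order and accumulated a running sum over the top j statistics for every
// allowed set size j between s_min and s_max, roughly
//
//   score_j(S) = sum of the j largest single-SNP chi-squares in set S
//
// The (also disabled) r-squared logic was meant to skip a SNP whose maximum
// LD with the SNPs already counted exceeds par::set_r2_val, so that the score
// is built from approximately independent signals.  The active implementation
// of that idea now lives in fitLDSetTest() further down in this file.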
// // if ( (!par::set_r2) || // // max_r2 <= par::set_r2_val ) // // { // s += p->chisq; // if (j>=s_min[i] && jname); // // inSet.push_back(p->locus); // } // j++; // // } // } // // And save // setsort.push_back(t2); // } } ////////////////////////////////////////////////////// // // // Sum-statistic scoring (permutation) // // // ////////////////////////////////////////////////////// void Set::cumulativeSetSum_WITHOUTLABELS(vector & perm, int p) { // vector t; // // Consider each set // for (int i=0;i inSet; // double s=0; // for (int j=0;jlocus; // // for (int l=0; l< inSet.size(); l++) // // { // // double r = PP->haplo->rsq( l0, inSet[l] ); // // if ( r > max_r2 ) // // max_r2 = r; // // } // // } // //////////////////////////////// // // Add this SNP to the set? // // if ( (!par::set_r2) || // // max_r2 <= par::set_r2_val ) // // { // s += t[t.size()-1-j]; // if (j>=s_min[i] && j= stat_set[i][j][p0] ) pv_set[i][j][p0]++; // Find best p-values per rep (overall, per set) for (int p=0;p<=R;p++) { double maxE_set = 1; vector maxG_set(pv_set.size(),1); // Consider each score for (int i=0;i >::iterator si = setMapping.find(l); if ( si == setMapping.end() ) { return; } set::iterator li = si->second.begin(); while ( li != si->second.end() ) { profileSNPs[ *li ].push_back( l ); profileScore[ *li ].push_back( odds ); ++li; } } vector_t Set::profileTestScore() { /////////////////////////////////////////////////// // For each set, calculate per-individual scores, then // regress this on the phenotype, then save a Wald // test statistic vector_t results; for (int i=0; i count; vector acount; map scores; map allele1; for (int j=0; jcalculateProfile(scores, allele1, profile, dummy , count, acount); /////////////////////////////////////////////// // Save as the covariate, the mean score (i.e. // average by number of seen SNPs) for (int k=0; k < PP->n; k++) { Individual * person = PP->sample[k]; if ( count[k] == 0 || person->flag ) person->missing = true; else { person->clist[0] = profile[k] / (double)count[k]; person->missing = false; } } //////////////////////////////// // Regress phenotype on profil PP->glmAssoc(false,*PP->pperm); ////////////////////////////////////////////// // Reset original missing status vector::iterator i = PP->sample.begin(); while ( i != PP->sample.end() ) { (*i)->missing = (*i)->flag; ++i; } //////////////////////////////////////////////// // Save test statistic for permutation purposes double statistic = PP->model->getStatistic(); PP->model->validParameters(); if ( ! 
PP->model->isValid() ) statistic = -1; results.push_back( statistic ); //////////////////////////////////////////////// // Clear up GLM model delete PP->model; } // Finally, important to clear the profile scores now, // so that the next permutation starts from scratch profileSNPs.clear(); profileScore.clear(); profileSNPs.resize( snpset.size() ); profileScore.resize( snpset.size() ); return results; } void Set::profileTestInitialise() { PP->printLOG("Initalising profile-based set test\n"); // Set up the mapping to determine which set(s) // a given SNP is in initialiseSetMapping(); // Clear the scores profileSNPs.clear(); profileScore.clear(); profileSNPs.resize( snpset.size() ); profileScore.resize( snpset.size() ); /////////////////////////////////////////////////// // Set-up for use of the Linear or Logistic Models par::assoc_glm_without_main_snp = true; if ( PP->clistname.size() > 0 ) error("Cannot specify covariates with --set-score"); ////////////////////////////////////////////// // Use flag to store original missing status vector::iterator i = PP->sample.begin(); while ( i != PP->sample.end() ) { (*i)->flag = (*i)->missing; ++i; } ///////////////////////////////// // Pretend we have covariates par::clist = true; par::clist_number = 1; PP->clistname.resize(1); PP->clistname[0] = "PROFILE"; for (int i=0; i< PP->n; i++) { Individual * person = PP->sample[i]; person->clist.resize(1); } } vector_t Set::fitLDSetTest( vector_t & singleSNP, bool save ) { int ns = snpset.size(); vector_t results(ns,0); if ( save ) { numSig.resize(ns,0); selectedSNPs.resize(ns); } /////////////////////////////////////////// // Down-weight under true model? if ( save && par::fix_lambda ) { PP->printLOG("Downweighting observed statistics in set-test by a factor of " + dbl2str( par::lambda ) + "\n"); vector_t::iterator i = singleSNP.begin(); while ( i != singleSNP.end() ) { *i = (*i) / par::lambda; ++i; } } /////////////////////////////////////////// // Consider each set for (int i=0; i t(nss); for (int j=0;j selected(0); // Step through SNPs sequentially, adding to score for( vector::reverse_iterator p = t.rbegin(); p!=t.rend(); p++) { // Is this score already too large? if ( inSet == par::set_max ) { break; } // Get SET-centric SNP code int j = p->l; // Are there any SNPs worth adding? if ( p->chisq < par::set_chisq_threshold ) { break; } // Record this SNP as significant if ( save ) ++isSig; // Is this SNP correlated to a SNP already in the list? set & ls = ldSet[i][j]; bool hasProxy = false; for (int k=0; k::iterator d = ls.find(selected[k]); if ( d != ls.end() ) { hasProxy = true; break; } } // Advance to next potential SNP if ( hasProxy ) continue; // Otherwise, add this SNP to the score score += p->chisq; ++inSet; selected.push_back( j ); } /////////////////////////////////////////// // Do we want to save anything here? if ( save ) { numSig[i] = isSig; selectedSNPs[i] = selected; } /////////////////////////////////////////// // Statistic is the mean test statistic per // selected SNP results[i] = inSet>0 ? 
score/(double)inSet : 0 ; } return results; } vector_t Set::fitStepwiseModel() { if ( par::SNP_major ) PP->SNP2Ind(); par::assoc_glm_without_main_snp = true; // We are using the conditioning SNPs list to swap // in and out the effects par::conditioning_snps = true; // Put a set of SNPs into the model // Allow // Fixed covariates, as usual // Handle all SNPs as conditioning SNPs // but have a boolean vector that allows some to be fixed // vector_t results; for (int i=0; isetname[i] << "\n"; int ns = snpset[i].size(); vector fixed(ns,false); vector inModel(ns,false); // Scan all SNPs not in the model, and add the best if above // threshold bool done = false; Model * bestModel = NULL; while ( ! done ) { int bestSNP = -1; double lowestP = 1; for (int j=0; jconditioner.clear(); // And now add this SNP PP->conditioner.push_back( snpset[i][j] ); for (int j2=0; j2conditioner.push_back( snpset[i][j2] ); } // cout << "Testing model: "; // for (int j2=0; j2conditioner.size(); j2++) // cout << PP->locus[ PP->conditioner[j2] ]->name << " "; PP->glmAssoc(false,*PP->pperm); // Conditioning test SNP will always be the first // This function skips the intercept vector_t pv = PP->model->getPVals(); double pval = pv[0]; if ( pval < lowestP && realnum(pval) ) { // cout << "Selecting this marker..." << pval << "\n"; // But do we really want to accept this based // on the absolute threshold? if ( pval < par::set_step_in ) { if ( bestModel != NULL ) delete bestModel; bestModel = PP->model; } else delete PP->model; lowestP = pval; bestSNP = j; } else { delete PP->model; } } if ( lowestP < par::set_step_in ) { inModel[bestSNP] = true; } else { done = true; } // Do we need this still? if ( bestSNP == -1 ) done = true; } // Conintue the stepwise procedure? // The final model is stored in bestModel // Or perhaps we did not find a model? if ( bestModel == NULL ) continue; // Note: skips intercept vector_t pval = bestModel->getPVals(); // But, annoyingly..., this includes intercept... // hmm, should sort this out vector_t coef = bestModel->getCoefs(); // Skip intercept for (int t = 1; t < bestModel->getNP(); t++) { cout << "fcoef " << bestModel->label[t] << " " << coef[t] << "\t" << pval[t-1] << "\n"; } cout << "--------\n"; // for (int j2=0; j2locus[ snpset[i][j2] ]->name << " (selected) \n"; // if ( fixed[j2] ) // cout << PP->locus[ snpset[i][j2] ]->name << " (fixed) \n"; // } // cout << "\n"; // cout << "-----------\n"; // Obtain full model p-valie // results.push_back( bestModel->getStatistic() ); // PLACE ALL THIS IN CONTEXT OF PERMTATION ALSO... if ( bestModel != NULL ) delete bestModel; // Next set } return results; } plink-1.07-src/metaanal.cpp0000644000265600020320000003676711264127626015007 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "stats.h" #include "options.h" #include "plink.h" #include "helper.h" #include "zed.h" #include "nlist.h" using namespace std; extern Plink * PP; class SInfo { public: SInfo(double d, double se) : d(d), se(se) { } double d; double se; }; class Alleles { public: string snp; string a1; string a2; int chr; int bp; Alleles(string name) : snp(name) { a1=a2=""; chr = 0; bp = 0; } Alleles(string name, int chr, int bp) : snp(name), chr(chr), bp(bp) { a1=a2=""; } Alleles(string name, int chr, int bp, string a1, string a2) : snp(name), chr(chr), bp(bp), a1(a1), a2(a2) { } bool matches(string b1, string b2) const { return ( a1 == b1 && a2 == b2 ) || ( a1 == b2 && a2 == b1 ) ; } bool swap(string b1) const { return a1 != b1; } bool operator< (const Alleles & b) const { // Base first on position, then name if ( chr < b.chr ) return true; if ( chr > b.chr ) return false; if ( bp < b.bp ) return true; if ( bp > b.bp ) return false; return snp < b.snp; } bool operator== (const Alleles & b) const { return (chr == b.chr && bp == b.bp && snp == b.snp ); } }; typedef map > mymap_t; void Plink::metaAnalysis() { // Read in 2+ files; match SNPs on IDs // Find OR,SE and perform meta-analysis // Information indexed by SNP: each SNP // has 1+ files containing that SNP // and the needed info map > store; printLOG("Performing meta-analysis of " + int2str( par::meta_files.size() ) + " files\n"); OptionSet * opt = par::opt.getOptions("META"); bool outputStudyEffects = opt->isSet("study"); bool usePositions = ! opt->isSet("no-map"); bool quantTrait = opt->isSet("qt"); bool allelicInfo = ! opt->isSet("no-allele"); //bool sensitivity = opt->isSet("drop1"); bool reportAll = opt->isSet("report-all"); bool logOR = opt->isSet("logscale"); // Are sample sizes given? bool sampleN = opt->isSet("n"); vector n1; vector n2; // Use sample N as weights? bool nWeights = opt->isSet("n-weight"); bool uWeights = opt->isSet("weight"); if ( nWeights && ! sampleN ) error("Must give sample N's with n-weight option"); if ( nWeights && uWeights ) error("Cannot specify both n-weight and weight"); // By default, look for SE field string weightField = "SE"; if ( uWeights ) { weightField = opt->getValue("weight"); printLOG("Setting user-defined weight field to " + weightField + "\n"); } if ( sampleN ) { // Get sample-N information vector ns = opt->getValues("n"); // n=200,300,100,300 // OR n=200/200,100/150,403,405 etc if ( ns.size() != par::meta_files.size() ) error( int2str( ns.size()) + " sample Ns given, but " + int2str( par::meta_files.size()) + " files listed\n"); n1.resize(ns.size()); n2.resize(ns.size()); for (int f=0; f( n1[f] , ns[f] , std::dec ) ) error("Problem with sample N = " + ns[f] ); } else { NList l2(0); l2.setRangeChar(" "); l2.setDelimiter("/"); vector tok2 = l2.deparseStringList( ns[f] ); if ( tok2.size() != 2 ) error("Problem with sample N = " + display(tok2) + " Expecting A/U format" ); if ( ! from_string( n2[f] , tok2[0] , std::dec ) ) error("Problem with sample N = " + tok2[0] ); if ( ! from_string( n1[f] , tok2[1] , std::dec ) ) error("Problem with sample N = " + tok2[1] ); } } double totN1 = 0, totN2 = 0; for (int f=0; f 0 ) printLOG("Processing only chromosome " + int2str(par::run_chr) + "\n"); if ( par::run_chr > 0 && ! 
usePositions ) par::run_chr == -1; set mset; if ( par::extract_set ) { printLOG("Processing only SNPs in [ " + par::extract_file + " ]\n"); ZInput z2( par::extract_file , compressed(par::extract_file) ); while( ! z2.endOfFile() ) { vector tok = z2.tokenizeLine(); for (int i=0; i tokens = tokenizeLine( header ); // Find appropriate columns to filter int snp_column = -1; int pval_column = -1; int d_column = -1; int se_column = -1; int chr_column = -1; int bp_column = -1; int a1_column = -1; int a2_column = -1; for (int i=0; i tokens = zin.tokenizeLine( ); if ( tokens.size() != fsize ) continue; string snp = tokens[ snp_column ]; // Ignore this SNP? if ( par::extract_set && mset.find(snp) == mset.end() ) continue; double d, se; bool okay = true; // A potential SNP to add Alleles a(snp); if ( usePositions ) { a.chr = getChromosomeCode( tokens[ chr_column ] ); if ( a.chr == 0 ) { rdet += par::meta_files[f] + "\t" + snp + "\tBAD_CHR\n"; okay = false; } if ( par::run_chr > 0 && par::run_chr != a.chr ) continue; if ( ! from_string( a.bp , tokens[ bp_column ] , std::dec ) ) { rdet += par::meta_files[f] + "\t" + snp + "\tBAD_BP\n"; okay = false; } } if ( allelicInfo ) { a.a1 = tokens[ a1_column ]; if ( a2_column != -1 ) a.a2 = tokens[ a2_column ]; // We can only include polymorphic alleles if ( a.a1 == par::missing_genotype ) { rdet += par::meta_files[f] + "\t" + snp + "\tMISSING_A1\n"; okay = false; } if ( a2_column != -1 && a.a2 == par::missing_genotype ) { rdet += par::meta_files[f] + "\t" + snp + "\tMISSING_A2\n"; okay = false; } } if ( ! from_string( d , tokens[ d_column ] , std::dec ) ) { rdet += par::meta_files[f] + "\t" + snp + "\tBAD_ES\n"; okay = false; } if ( ! from_string( se , tokens[ se_column ] , std::dec ) ) { rdet += par::meta_files[f] + "\t" + snp + "\tBAD_SE\n"; okay = false; } // Check alleles? if ( allelicInfo ) { mymap_t::iterator k = store.find( a ); if ( k != store.end() ) { if ( k->first.matches(a.a1,a.a2) ) { if ( k->first.swap(a.a1) ) { // Need to swap effect direction if ( quantTrait || logOR ) d = -d; else d = 1/d; } } else { rdet += par::meta_files[f] + "\t" + snp + "\tALLELE_MISMATCH\n"; okay = false; } } } if ( ! okay ) { ++rejected; continue; } // If OR, take log unless told it is already as log if ( ! 
( quantTrait || logOR ) ) d = log(d); SInfo s(d,se); mymap_t::iterator i = store.find( a ); if ( i == store.end() ) { map t; t.insert(make_pair(f,s)); store.insert(make_pair(a,t)); } else { if ( i->second.size() == 1 ) ++twoOrMore; i->second.insert(make_pair(f,s)); } ++snps; } zin.close(); printLOG( " with " + int2str( snps ) + " read\n"); } printLOG(int2str(store.size()) + " unique SNPs, " + int2str(twoOrMore) + " in two or more files\n"); if ( rejected > 0 ) { printLOG("Rejected " + int2str( rejected ) + " SNPs, writing details to [ " + par::output_file_name + ".prob ]\n"); ZOutput z2( par::output_file_name + ".prob" , false ); z2.write(rdet); z2.close(); } // NOTES // No weights // Assume OR,SE, so LOG taken -- add BETA, etc // Do not check for allelic discrepancy // Perform meta-analysis printLOG("Writing meta-analysis results to [ " + par::output_file_name + ".meta ]\n"); ZOutput zout( par::output_file_name + ".meta" , false ); if ( usePositions ) zout << sw("CHR",4) << sw("BP",12) ; zout << sw( "SNP" , 15 ); if ( allelicInfo ) { zout << sw( "A1" , 4 ) << sw( "A2" , 4 ); } zout << sw( "N" , 4 ) << sw( "P" , 12 ) << sw( "P(R)" , 12 ) << sw( "OR" , 8 ) << sw( "OR(R)" , 8 ) << sw( "Q" , 8 ) << sw( "I" , 8 ); if( outputStudyEffects ) for (int f=0; fsecond.size(); if ( n > 1 ) { // CHR/BP positions? if ( usePositions ) zout << sw( i->first.chr, 4) << sw( i->first.bp, 12) ; // SNP name zout << sw( i->first.snp , 15 ); // Allele codes? if ( allelicInfo ) { zout << sw( i->first.a1 , 4 ); if ( i->first.a2 == "" ) zout << sw( "?" , 4 ); else zout << sw( i->first.a2 , 4 ); } // Number of studies it appears in zout << sw( n , 4 ); // vector of OR, SE vector_t d; vector_t se; map::iterator j = i->second.begin(); while ( j != i->second.end() ) { d.push_back( j->second.d ); se.push_back( j->second.se ); ++j; } // Caclculate weights double denom = 0, denom2 =0, numer = 0; vector_t w(n); vector_t w_random(n); vector_t vars(n); for (int k=0; k 100 ) I= 100; ///////////////////////////////////////// // Output if ( ! quantTrait ) { summ = exp(summ); summ_random = exp(summ_random); } // Convert -9 to NaN, so sw() handles printing double z = 0; if ( p1 < 0 ) p1 = 1/z; if ( pR < 0 ) pR = 1/z; if ( pQ < 0 ) pQ = 1/z; zout << sw( p1 , -4, 12 ) << sw( pR , -4, 12 ) << sw( summ , 4, 8 ) << sw( summ_random , 4, 8 ) << sw( pQ , 4, 8 ) << sw( I ,2 , 8 ); if( outputStudyEffects ) for (int f=0; f::iterator k = i->second.find(f); if ( k != i->second.end() ) zout << sw( exp(k->second.d) , 4, 8 ); else zout << sw( "NA" , 8 ); } zout << "\n"; } else if ( reportAll ) { // Display an entry for even single study/zero study // SNPs if ( usePositions ) zout << sw( i->first.chr, 4) << sw( i->first.bp, 12) ; // SNP name zout << sw( i->first.snp , 15 ); if ( allelicInfo ) { zout << sw( i->first.a1 , 4 ); if ( i->first.a2 == "" ) zout << sw( "?" 
, 4 ); else zout << sw( i->first.a2 , 4 ); } // Number of studies it appears in zout << sw( n , 4 ); zout << sw( "NA" , 12 ) << sw( "NA" , 12 ) << sw( "NA" , 8 ) << sw( "NA" , 8 ) << sw( "NA" , 8 ) << sw( "NA" , 8 ); if( outputStudyEffects ) for (int f=0; f::iterator k = i->second.find(f); if ( k != i->second.end() ) zout << sw( exp(k->second.d) , 4, 8 ); else zout << sw( "NA" , 8 ); } zout << "\n"; } // Next SNP ++i; } zout.close(); shutdown(); } plink-1.07-src/flip.cpp0000644000265600020320000002701611264127625014141 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "stats.h" #include "helper.h" #include "options.h" using namespace std; void Plink::flipStrand() { //////////////////////////// // Look-up table by SNP name map mlocus; map::iterator ilocus; map mnlocus; map::iterator inlocus; vector::iterator loc = locus.begin(); while ( loc != locus.end() ) { mlocus.insert(make_pair( (*loc)->name, *loc) ); loc++; } //////////////////////////// // Look-up table by SNP name map mpeople; ////////////////////////////////////////////////////////// // Performing this for all individuals, or just a subset? set pflip; if ( par::flip_subset ) { if ( ! par::SNP_major ) Ind2SNP(); for (int i=0; ifid + "_" + person->iid; mpeople.insert( make_pair( id , person ) ); } for (int l=0; lname , l ) ); } checkFileExists(par::flip_subset_file); printLOG("Reading individuals to flip strand for [ " + par::flip_subset_file + " ] \n"); ifstream IN2(par::flip_subset_file.c_str(),ios::in); int pcount1 = 0; int pcount2 = 0; while ( ! IN2.eof() ) { string fid, iid; IN2 >> fid >> iid; string id = fid + "_" + iid; if ( fid == "" ) continue; ++pcount1; map::iterator ipeople = mpeople.find( id ); if ( ipeople == mpeople.end() ) continue; pflip.insert( ipeople->second ); ++pcount2; } printLOG("Read " + int2str(pcount1) + " individuals, of whom " + int2str(pcount2) + " were found, to flip\n"); IN2.close(); } /////////////////////// // Read in SNPs to flip checkFileExists(par::flip_file); printLOG("Reading SNPs to flip strand from [ " + par::flip_file + " ] \n"); ifstream INFILE(par::flip_file.c_str(),ios::in); INFILE.clear(); int counter = 0; while (!INFILE.eof()) { string m; INFILE >> m; if (m=="") continue; if ( par::flip_subset ) { inlocus = mnlocus.find(m); if (inlocus != mnlocus.end() ) { ++counter; for (int i=0; isecond; if ( SNP[ l ]->one[ i ] == SNP[ l ]->two[ i ] ) { SNP[ l ]->one[ i ] = ! SNP[ l ]->one[ i ]; SNP[ l ]->two[ i ] = ! 
SNP[ l ]->two[ i ]; } } } } } else { ilocus = mlocus.find(m); if (ilocus != mlocus.end()) { counter++; // Flip strand Locus * loc = ilocus->second; if ( loc->allele1 == "A" ) loc->allele1 = "T"; else if ( loc->allele1 == "C" ) loc->allele1 = "G"; else if ( loc->allele1 == "G" ) loc->allele1 = "C"; else if ( loc->allele1 == "T" ) loc->allele1 = "A"; else if ( loc->allele1 == "1" ) loc->allele1 = "4"; else if ( loc->allele1 == "2" ) loc->allele1 = "3"; else if ( loc->allele1 == "3" ) loc->allele1 = "2"; else if ( loc->allele1 == "4" ) loc->allele1 = "1"; if ( loc->allele2 == "A" ) loc->allele2 = "T"; else if ( loc->allele2 == "C" ) loc->allele2 = "G"; else if ( loc->allele2 == "G" ) loc->allele2 = "C"; else if ( loc->allele2 == "T" ) loc->allele2 = "A"; else if ( loc->allele2 == "1" ) loc->allele2 = "4"; else if ( loc->allele2 == "2" ) loc->allele2 = "3"; else if ( loc->allele2 == "3" ) loc->allele2 = "2"; else if ( loc->allele2 == "4" ) loc->allele2 = "1"; } } // Next SNP } INFILE.close(); printLOG("Flipped strand of " + int2str(counter) + " SNPs\n"); } void Plink::calcFlipScan() { /////////////////////////////////////////// // Screen for SNPs that appear to have been // flipped in one dataset versus another, // based on patterns of LD. Just use a simple // sliding window, keeping track of the number // and magnitude of concordant versus discordant // LD pairs ofstream OUT1; string f = par::output_file_name + ".flipscan"; OUT1.open(f.c_str(),ios::out); OUT1.precision(3); printLOG("Writing FS statistics to [ " + f + " ] \n"); OUT1 << setw(6) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(4) << "A1" << " " << setw(4) << "A2" << " " << setw(8) << "F" << " " << setw(6) << "POS" << " " << setw(8) << "R_POS" << " " << setw(6) << "NEG" << " " << setw(8) << "R_NEG" << " " << "NEGSNPS\n"; ofstream OUT1V; if ( par::flip_scan_verbose ) { string f = par::output_file_name + ".flipscan.verbose"; OUT1V.open(f.c_str(),ios::out); OUT1V.precision(3); printLOG("Writing FS verbose output to [ " + f + " ] \n"); OUT1V << setw(6) << "CHR_INDX" << " " << setw(par::pp_maxsnp) << "SNP_INDX" << " " << setw(12) << "BP_INDX" << " " << setw(4) << "A1_INDX" << " " << setw(par::pp_maxsnp) << "SNP_PAIR" << " " << setw(12) << "BP_PAIR" << " " << setw(4) << "A1_PAIR" << " " << setw(8) << "R_A" << " " << setw(8) << "R_U" << "\n"; } /////////////////////////// // Index locus for (int l1=0; l1= par::disp_r_window_snp ) continue; if ( l1 - l2 >= par::disp_r_window_snp ) continue; if ( locus[l2]->chr != locus[l1]->chr ) continue; if ( locus[l2]->bp - locus[l1]->bp > par::disp_r_window_kb ) continue; if ( locus[l1]->bp - locus[l2]->bp > par::disp_r_window_kb ) continue; ////////////////////////////////////// // Calculate correlation (un-squared) // in cases and controls separately setFlagToCase(); double rCase = correlation2SNP(l1,l2,false,false,true); setFlagToControl(); double rControl = correlation2SNP(l1,l2,false,false,true); // Keep track of score bool sameDirection = true; if ( ( rCase > 0 && rControl < 0 ) || ( rControl > 0 && rCase < 0 ) ) sameDirection = false; bool aboveThreshold = true; if ( ( rCase > - par::flip_scan_threshold && rCase < par::flip_scan_threshold ) && ( rControl > - par::flip_scan_threshold && rControl < par::flip_scan_threshold ) ) aboveThreshold = false; if ( ! realnum( rCase ) ) aboveThreshold = false; if ( ! 
realnum( rControl ) ) aboveThreshold = false; if ( aboveThreshold ) { if ( sameDirection ) { ++cntPlus; if ( rCase > 0 ) scorePlus += ( rCase + rControl ) / 2.0; else scorePlus += ( -rCase - rControl ) / 2.0; } else { ++cntNeg; if ( rCase > 0 ) scoreNeg += ( rCase - rControl ) / 2.0; else scoreNeg += ( -rCase + rControl ) / 2.0; if ( snplist == "" ) snplist = locus[l2]->name; else snplist += "|" + locus[l2]->name; } if ( par::flip_scan_verbose ) { verbose_buffer << setw(6) << chromosomeName( locus[l1]->chr ) << " " << setw(par::pp_maxsnp) << locus[l1]->name << " " << setw(12) << locus[l1]->bp << " " << setw(4) << locus[l1]->allele1 << " " << setw(par::pp_maxsnp) << locus[l2]->name << " " << setw(12) << locus[l2]->bp << " " << setw(4) << locus[l2]->allele1 << " " << setw(8) << rCase << " " << setw(8) << rControl << "\n"; } } // Consider the paired SNP } // Calculate FS score for this index SNP // Display OUT1 << setw(6) << locus[l1]->chr << " " << setw(par::pp_maxsnp) << locus[l1]->name << " " << setw(12) << locus[l1]->bp << " " << setw(4) << locus[l1]->allele1 << " " << setw(4) << locus[l1]->allele2 << " " << setw(8) << locus[l1]->freq << " " << setw(6) << cntPlus << " "; if ( cntPlus == 0 ) OUT1 << setw(8) << "NA" << " "; else OUT1 << setw(8) << scorePlus/(double)cntPlus << " "; OUT1 << setw(6) << cntNeg << " "; if ( cntNeg == 0 ) OUT1 << setw(8) << "NA" << " "; else OUT1 << setw(8) << scoreNeg/(double)cntNeg << " "; OUT1 << snplist << "\n"; if ( par::flip_scan_verbose ) { if ( cntNeg > 0 ) OUT1V << verbose_buffer.str(); } } // Consider the next index SNP OUT1.close(); if ( par::flip_scan_verbose ) OUT1V.close(); } void Plink::setReferenceAllele() { //////////////////////////// // Look-up table by SNP name map mnlocus; map::iterator inlocus; for (int l=0; lname, l) ); //////////////// // Read in SNPs checkFileExists( par::set_reference_allele_file ); printLOG("Reading SNPs to set reference allele [ " + par::set_reference_allele_file + " ] \n"); ifstream INFILE(par::set_reference_allele_file.c_str(),ios::in); INFILE.clear(); int counter_changed = 0; int counter_problem = 0; int counter_unchanged = 0; while (!INFILE.eof()) { string m, ref; INFILE >> m >> ref; if (m=="") continue; inlocus = mnlocus.find(m); if (inlocus != mnlocus.end() ) { int l = inlocus->second; // Check alleles if ( ref == locus[l]->allele1 ) { ++counter_unchanged; continue; } if ( ref != locus[l]->allele2 ) { ++counter_problem; continue; } ++counter_changed; // Swap alleles, recode genotypes internally string tmp = locus[l]->allele2; locus[l]->allele2 = locus[l]->allele1; locus[l]->allele1 = tmp; // cout << "changed " << m << " ref= " << ref << " now = " // << locus[l]->allele1 << " " << locus[l]->allele2 << "\n"; for (int i=0; ione[ i ] == SNP[ l ]->two[ i ] ) { SNP[ l ]->one[ i ] = ! SNP[ l ]->one[ i ]; SNP[ l ]->two[ i ] = ! SNP[ l ]->two[ i ]; } } } // Next SNP } INFILE.close(); printLOG("Set reference alleles for " + int2str(counter_changed+counter_unchanged) + " SNPs, "); printLOG(int2str(counter_changed) + " different from minor allele\n"); if (counter_problem>0) printLOG("Also, " + int2str(counter_problem) + " couldn't be changed due to bad allele codes\n"); } plink-1.07-src/mishap.cpp0000644000265600020320000002573211264127625014473 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
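// --- Editorial sketch (not part of PLINK): the scoring rule used by the
// --- flip-scan above.  For each index SNP, the (unsquared) correlation r
// --- with nearby SNPs is computed separately in cases and in controls; a
// --- pair is informative if |r| reaches the threshold in either group, and
// --- informative pairs are tallied as "positive" (same sign of r) or
// --- "negative" (opposite sign, suggesting one of the merged datasets is
// --- on the wrong strand).  The running scores are mean |r| within each
// --- class.  FlipScanTally below is a hypothetical helper type.
#include <cmath>

struct FlipScanTally
{
  int    cntPlus, cntNeg;
  double scorePlus, scoreNeg;      // sums of mean |r|; divide by the counts

  FlipScanTally() : cntPlus(0), cntNeg(0), scorePlus(0), scoreNeg(0) { }

  void add(double rCase, double rControl, double threshold)
  {
    if ( ! ( std::isfinite(rCase) && std::isfinite(rControl) ) ) return;

    bool informative = std::fabs(rCase)    >= threshold ||
                       std::fabs(rControl) >= threshold;
    if ( ! informative ) return;

    bool opposite = ( rCase > 0 && rControl < 0 ) ||
                    ( rCase < 0 && rControl > 0 );
    double m = ( std::fabs(rCase) + std::fabs(rControl) ) / 2.0;

    if ( opposite ) { ++cntNeg;  scoreNeg  += m; }
    else            { ++cntPlus; scorePlus += m; }
  }
};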
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "options.h" #include "helper.h" #include "plink.h" #include "phase.h" #include "stats.h" extern Plink * PP; void Plink::performMisHapTests() { printLOG("\nPerforming haplotype-based tests for non-random missingness\n"); if (!par::SNP_major) Ind2SNP(); // Consider 1 SNP at a time // Form haplotypes based on the surrounding SNPs // Form phenotype based on the patterns of missingness for test SNP // Is there any association? bool old_silent = par::silent; string f = par::output_file_name + ".missing.hap"; haplo->HTEST.open(f.c_str(),ios::out); haplo->HTEST.precision(3); printLOG("Writing haplotype-based missingness results to [ " + f + " ] \n"); haplo->HTEST << setw(par::pp_maxsnp) << "SNP" << " " << setw(10) << "HAPLOTYPE" << " " << setw(8) << "F_0" << " " << setw(8) << "F_1" << " " << setw(20) << "M_H1" << " " << setw(20) << "M_H2" << " " << setw(8) << "CHISQ" << " " << setw(8) << "P" << " " << "FLANKING" << "\n"; ////////////////////////////////// // Set up a single haplotype entry haplo->new_pred_locus.resize(1); // Make new entry in MAP file Locus * loc = new Locus; loc->name = ""; loc->chr = 0; loc->pos = 1; loc->bp = 1; loc->allele1 = "1"; loc->allele2 = "2"; // Add this single dummy locus to the list haplo->new_map.clear(); haplo->new_map.push_back(loc); vector::iterator s = SNP.begin(); int l=0; while ( s != SNP.end() ) { if (!par::silent) cout << l+1 << " of " << nl_all << " SNPs tested \r"; ///////////////////////////////// // Currently, skipped haplod SNPs if (par::chr_sex[locus[l]->chr] || par::chr_haploid[locus[l]->chr] ) { s++; l++; continue; } //////////////////////////////////////////// // Form phenotype (missingness for this SNP) vector::iterator person = sample.begin(); vector::iterator i1 = (*s)->one.begin(); vector::iterator i2 = (*s)->two.begin(); int nmiss=0; while ( person != sample.end() ) { // Missing at test SNP? if ( *i1 && ! *i2 ) { nmiss++; (*person)->aff = true; } else (*person)->aff = false; // Include all individuals in this analysis (*person)->missing = false; i1++; i2++; person++; } //////////////////////////////////////// // Enough missingness to warrant a test? 
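// --- Editorial note (sketch, not part of PLINK): each SNP stores two bit
// --- vectors, `one` and `two`, with one genotype per person.  From the
// --- tests used in this file and in flip.cpp above, one==two marks a
// --- homozygote (bit patterns 00 and 11), the pattern one=1/two=0 tested
// --- here marks a missing genotype, and by elimination one=0/two=1 is the
// --- heterozygote.  Which homozygote corresponds to which allele is not
// --- established by this excerpt.  decodeGenotype() is a hypothetical
// --- helper that only makes the mapping explicit.
enum GenotypeCode { GENO_HOM_00, GENO_HET, GENO_HOM_11, GENO_MISSING };

static GenotypeCode decodeGenotype(bool one, bool two)
{
  if ( one == two )  return one ? GENO_HOM_11 : GENO_HOM_00;  // homozygote
  if ( one && !two ) return GENO_MISSING;                     // missing
  return GENO_HET;                                            // one=0, two=1
}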
if (nmiss<5) { s++; l++; continue; } /////////////////////// // Form test haplotypes haplo->reset(); // Add which allele to look for (corresponding to allele1) vector tmp; for ( int i = l - par::mishap_window ; i <= l + par::mishap_window ; i++ ) { if ( i >= 0 && i < nl_all && i != l ) if ( locus[i]->chr == locus[l]->chr ) tmp.push_back(i); } haplo->new_pred_locus[0] = tmp; haplo->new_map[0] = locus[l]; /////////////////// // Phase haplotypes par::silent = true; haplo->new_pred_allele = listPossibleHaplotypes(*this, haplo->new_pred_locus[0]); haplo->phaseAllHaplotypes(false,*PP->pperm); haplo->hname = locus[l]->name; par::silent = old_silent; /////////////////////////////////////// // Test association with each haplotype map tests; for (int h=0; h < haplo->nh; h++) { if (haplo->f[h] >= par::min_af) { tests.clear(); for (int h2=0; h2 < haplo->nh; h2++) { if (haplo->f[h2] >= par::min_af) { if (h==h2) { tests.insert(make_pair(h2,0)); } else tests.insert(make_pair(h2,1)); } } ////////////////////// // Test each haplotype int nt = 2; vector caseN(nt,0); vector controlN(nt,0); // Consider each individual for (int i=0; iaff ) { for (int z = 0 ; z < haplo->hap1[i].size(); z++) { map::iterator i1 = tests.find(haplo->hap1[i][z]); map::iterator i2 = tests.find(haplo->hap2[i][z]); if ( i1 != tests.end() ) { if (!haplo->ambig[i]) caseN[i1->second]++; else caseN[i1->second] += haplo->pp[i][z]; } if ( i2 != tests.end() ) { if (!haplo->ambig[i]) caseN[i2->second]++; else caseN[i2->second] += haplo->pp[i][z]; } } } // Or control? else { for (int z = 0 ; z < haplo->hap1[i].size(); z++) { map::iterator i1 = tests.find(haplo->hap1[i][z]); map::iterator i2 = tests.find(haplo->hap2[i][z]); if ( i1 != tests.end() ) { if (!haplo->ambig[i]) controlN[i1->second]++; else controlN[i1->second] += haplo->pp[i][z]; } if ( i2 != tests.end() ) { if (!haplo->ambig[i]) controlN[i2->second]++; else controlN[i2->second] += haplo->pp[i][z]; } } } } // next individual haplo->HTEST << setw(par::pp_maxsnp) << haplo->hname << " "; // Find test haplotype int hh=0; map::iterator i1 = tests.begin(); while ( i1 != tests.end() ) { if ( i1->second == 0 ) hh = i1->first; i1++; } haplo->HTEST << setw(10) << haplo->haplotypeName(hh) << " "; if ( caseN[0] + caseN[1] == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << caseN[0] / ( caseN[0] + caseN[1] ) << " "; if ( controlN[0] + controlN[1] == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << controlN[0] / ( controlN[0] + controlN[1] ) << " "; haplo->HTEST << setw(20) << (dbl2str(caseN[0])+"/"+dbl2str(controlN[0])) << " " << setw(20) << (dbl2str(caseN[1])+"/"+dbl2str(controlN[1])) << " "; vector rowT(nt); double caseT = 0; double controlT = 0; for (int h=0; hHTEST << setw(8) << chi2 << " " << setw(8) << chiprobP(chi2,nt-1) << " "; else haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " "; for (int snps=0; snpsns-1; snps++) haplo->HTEST << locus[haplo->S[snps]]->name << "|"; haplo->HTEST << locus[haplo->S[haplo->ns-1]]->name << "\n"; } // next haplotype } ////////////////////// // Heterozygsote test double caseHET=0; double controlHET=0; double caseHOM=0; double controlHOM=0; // Consider each individual for (int i=0; iaff ) { for (int z = 0 ; z < haplo->hap1[i].size(); z++) { if ( haplo->hap1[i][z] != haplo->hap2[i][z] ) { if (!haplo->ambig[i]) caseHET++; else caseHET += haplo->pp[i][z]; } else { if (!haplo->ambig[i]) caseHOM++; else caseHOM += haplo->pp[i][z]; } } } // Or control? 
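// --- Editorial sketch (not part of PLINK): the statistic computed above is
// --- a Pearson chi-square on the 2 x k table of (possibly fractional)
// --- haplotype counts for the "missing" versus "non-missing" pseudo-
// --- phenotype, with k-1 degrees of freedom.  Expected cell counts are
// --- row total * column total / grand total.  chiSquare2xK() is a
// --- hypothetical helper and assumes every expected count is non-zero.
#include <vector>

static double chiSquare2xK(const std::vector<double> & caseN,
                           const std::vector<double> & controlN)
{
  const size_t k = caseN.size();
  std::vector<double> colT(k, 0);
  double caseT = 0, controlT = 0;
  for (size_t j = 0; j < k; j++)
    {
      colT[j]   = caseN[j] + controlN[j];
      caseT    += caseN[j];
      controlT += controlN[j];
    }
  const double total = caseT + controlT;

  double chi2 = 0;
  for (size_t j = 0; j < k; j++)
    {
      double expCase    = caseT    * colT[j] / total;
      double expControl = controlT * colT[j] / total;
      chi2 += ( caseN[j]    - expCase    ) * ( caseN[j]    - expCase    ) / expCase
            + ( controlN[j] - expControl ) * ( controlN[j] - expControl ) / expControl;
    }
  return chi2;   // compare against a chi-square with k-1 df (e.g. chiprobP)
}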
else { for (int z = 0 ; z < haplo->hap1[i].size(); z++) { if ( haplo->hap1[i][z] != haplo->hap2[i][z] ) { if (!haplo->ambig[i]) controlHET++; else controlHET += haplo->pp[i][z]; } else { if (!haplo->ambig[i]) controlHOM++; else controlHOM += haplo->pp[i][z]; } } } } // next individual double chi2 = 0; double total = caseHET + caseHOM + controlHET + controlHOM; double total_case = caseHET + caseHOM; double total_control = controlHET + controlHOM; double total_het = caseHET + controlHET; double total_hom = caseHOM + controlHOM; double exp_caseHET = ( total_case * total_het ) / total; double exp_controlHET = ( total_control * total_het ) / total; double exp_caseHOM = ( total_case * total_hom ) / total; double exp_controlHOM = ( total_control * total_hom ) / total; chi2 += ( ( caseHET - exp_caseHET ) * ( caseHET - exp_caseHET ) ) / exp_caseHET + ( ( caseHOM - exp_caseHOM ) * ( caseHOM - exp_caseHOM ) ) / exp_caseHOM + ( ( controlHET - exp_controlHET ) * ( controlHET - exp_controlHET ) ) / exp_controlHET + ( ( controlHOM - exp_controlHOM ) * ( controlHOM - exp_controlHOM ) ) / exp_controlHOM; haplo->HTEST << setw(par::pp_maxsnp) << haplo->hname << " "; haplo->HTEST << setw(10) << "HETERO" << " "; if ( caseHET + caseHOM == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << caseHET / ( caseHET + caseHOM ) << " "; if ( controlHET + controlHOM == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << controlHET / ( controlHET + controlHOM ) << " "; haplo->HTEST << setw(20) << (dbl2str(caseHET)+"/"+dbl2str(controlHET)) << " " << setw(20) << (dbl2str(caseHOM)+"/"+dbl2str(controlHOM)) << " "; if ( realnum(chi2) ) haplo->HTEST << setw(8) << chi2 << " " << setw(8) << chiprobP(chi2,1) << " "; else haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " "; for (int snps=0; snpsns-1; snps++) haplo->HTEST << locus[haplo->S[snps]]->name << "|"; haplo->HTEST << locus[haplo->S[haplo->ns-1]]->name << "\n"; // Next test SNP s++; l++; } if (!par::silent) cout << "\n"; // Shut down results file haplo->HTEST.close(); } plink-1.07-src/cdflib.h0000644000265600020320000000746411264127626014105 0ustar tilleaadmin #ifndef __CDFLIB_H__ #define __CDFLIB_H__ double algdiv(double*,double*); double alngam(double*); double alnrel(double*); double apser(double*,double*,double*,double*); double basym(double*,double*,double*,double*); double bcorr(double*,double*); double betaln(double*,double*); double bfrac(double*,double*,double*,double*,double*,double*); void bgrat(double*,double*,double*,double*,double*,double*,int*i); double bpser(double*,double*,double*,double*); void bratio(double*,double*,double*,double*,double*,double*,int*); double brcmp1(int*,double*,double*,double*,double*); double brcomp(double*,double*,double*,double*); double bup(double*,double*,double*,double*,int*,double*); void cdfbet(int*,double*,double*,double*,double*,double*,double*, int*,double*); void cdfbin(int*,double*,double*,double*,double*,double*,double*, int*,double*); void cdfchi(int*,double*,double*,double*,double*,int*,double*); void cdfchn(int*,double*,double*,double*,double*,double*,int*,double*); void cdff(int*,double*,double*,double*,double*,double*,int*,double*); void cdffnc(int*,double*,double*,double*,double*,double*,double*, int*s,double*); void cdfgam(int*,double*,double*,double*,double*,double*,int*,double*); void cdfnbn(int*,double*,double*,double*,double*,double*,double*, int*,double*); void cdfnor(int*,double*,double*,double*,double*,double*,int*,double*); void 
cdfpoi(int*,double*,double*,double*,double*,int*,double*); void cdft(int*,double*,double*,double*,double*,int*,double*); void cumbet(double*,double*,double*,double*,double*,double*); void cumbin(double*,double*,double*,double*,double*,double*); void cumchi(double*,double*,double*,double*); void cumchn(double*,double*,double*,double*,double*); void cumf(double*,double*,double*,double*,double*); void cumfnc(double*,double*,double*,double*,double*,double*); void cumgam(double*,double*,double*,double*); void cumnbn(double*,double*,double*,double*,double*,double*); void cumnor(double*,double*,double*); void cumpoi(double*,double*,double*,double*); void cumt(double*,double*,double*,double*); double dbetrm(double*,double*); double devlpl(double [],int*,double*); double dexpm1(double*); double dinvnr(double *p,double *q); static void E0000(int,int*,double*,double*,unsigned long*, unsigned long*,double*,double*,double*, double*,double*,double*,double*); void dinvr(int*,double*,double*,unsigned long*,unsigned long*); void dstinv(double*,double*,double*,double*,double*,double*, double*); double dlanor(double*); double dln1mx(double*); double dln1px(double*); double dlnbet(double*,double*); double dlngam(double*); double dstrem(double*); double dt1(double*,double*,double*); static void E0001(int,int*,double*,double*,double*,double*, unsigned long*,unsigned long*,double*,double*, double*,double*); void dzror(int*,double*,double*,double*,double *, unsigned long*,unsigned long*); void dstzr(double *zxlo,double *zxhi,double *zabstl,double *zreltl); double erf1(double*); double erfc1(int*,double*); double esum(int*,double*); double exparg(int*); double fpser(double*,double*,double*,double*); double gam1(double*); void gaminv(double*,double*,double*,double*,double*,int*); double gamln(double*); double gamln1(double*); double Xgamm(double*); void grat1(double*,double*,double*,double*,double*,double*); void gratio(double*,double*,double*,double*,int*); double gsumln(double*,double*); double psi(double*); double rcomp(double*,double*); double rexp(double*); double rlog(double*); double rlog1(double*); double spmpar(int*); double stvaln(double*); double fifdint(double); double fifdmax1(double,double); double fifdmin1(double,double); double fifdsign(double,double); long fifidint(double); long fifmod(long,long); void ftnstop(char*); extern int ipmpar(int*); #endif plink-1.07-src/stats.cpp0000644000265600020320000007426511264127624014354 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include "stats.h" // #ifdef WITH_LAPACK // #include "lapackf.h" // #endif #include "helper.h" #include "crandom.h" #include "options.h" #include "plink.h" #include "perm.h" #include "dcdflib.h" #include "ipmpar.h" #define FPMIN 1.0e-30 extern ofstream LOG; extern Plink * PP; bool realnum(double d) { double zero = 0; if (d != d || d == 1/zero || d == -1/zero) return false; else return true; } long double factorial(int x) { int i; long double result = 1; for (i = 2; i <= x; i++) result *= i; return result; } double normdist(double z) { double sqrt2pi = 2.50662827463; double t0, z1, p0 ; t0 = 1 / (1 + 0.2316419 * fabs(z)); z1 = exp(-0.5 * z*z ) / sqrt2pi; p0 = z1 * t0 * (0.31938153 + t0 * (-0.356563782 + t0 * (1.781477937 + t0 * (-1.821255978 + 1.330274429 * t0)))); return z >= 0 ? 1 - p0 : p0 ; } double chiprobP(double x, double df) { if ( ! realnum(x) ) return -9; double p, q; int st = 0; // error variable int w = 1; // function variable double bnd = 1; // boundary function // NCP is set to 0 cdfchi(&w,&p,&q,&x,&df,&st,&bnd); // Check status if (st != 0 ) return -9; // Return p-value return q; } double inverse_chiprob(double q, double df) { if ( ! realnum(q) ) return -9; else if (q>=1) return 0; double x; double p = 1 - q; int st = 0; // error variable int w = 2; // function variable double bnd = 1; // boundary function // NCP is set to 0 cdfchi(&w,&p,&q,&x,&df,&st,&bnd); // Check status if (st != 0 ) return -9; // Return p-value return x; } double gammln(double xx) { double x, y, tmp, ser; static double cof[6]={76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5}; int j; y=x=xx; tmp=x+5.5; tmp -= (x+0.5)*log(tmp); ser=1.000000000190015; for (j=0; j<=5; j++) ser += cof[j]/++y; return -tmp+log(2.5066282746310005*ser/x); } // Inverse normal distribution /* * Lower tail quantile for standard normal distribution function. * * This function returns an approximation of the inverse cumulative * standard normal distribution function. I.e., given P, it returns * an approximation to the X satisfying P = Pr{Z <= X} where Z is a * random variable from the standard normal distribution. * * The algorithm uses a minimax approximation by rational functions * and the result has a relative error whose absolute value is less * than 1.15e-9. * * Author: Peter J. Acklam * Time-stamp: 2002-06-09 18:45:44 +0200 * E-mail: jacklam@math.uio.no * WWW URL: http://www.math.uio.no/~jacklam * * C implementation adapted from Peter's Perl version */ /* Coefficients in rational approximations. 
*/ static const double a[] = { -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00 }; static const double b[] = { -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01 }; static const double c[] = { -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00 }; static const double d[] = { 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00 }; #define LOW 0.02425 #define HIGH 0.97575 double ltqnorm(double p) { double q, r; errno = 0; if (p < 0 || p > 1) { return 0.0; } else if (p == 0) { return -HUGE_VAL /* minus "infinity" */; } else if (p == 1) { return HUGE_VAL /* "infinity" */; } else if (p < LOW) { /* Rational approximation for lower region */ q = sqrt(-2*log(p)); return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1); } else if (p > HIGH) { /* Rational approximation for upper region */ q = sqrt(-2*log(1-p)); return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1); } else { /* Rational approximation for central region */ q = p - 0.5; r = q*q; return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / (((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1); } } double pT(double T, double df) { if ( ! realnum(T) ) return -9; T = abs(T); double p, q; int st = 0; // error variable int w = 1; // function variable double bnd = 1; // boundary function // NCP is set to 0 cdft(&w,&p,&q,&T,&df,&st,&bnd); // Check status if (st != 0 ) return -9; // Return two-sided p-value return 2*q; } double pF(const double F, const int df1, const int df2) { return betai(0.5*df2,0.5*df1,(double)df2/(double)(df2+df1*F)); } double betai(const double a, const double b, const double x) { double bt; if (x < 0.0 || x > 1.0) error("Internal error: bad x in routine betai"); if (x == 0.0 || x == 1.0) bt=0.0; else bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x)); if (x < (a+1.0)/(a+b+2.0)) return bt*betacf(a,b,x)/a; else return 1.0-bt*betacf(b,a,1.0-x)/b; } double betacf(const double a, const double b, const double x) { const int MAXIT = 100; const double EPS = 3e-7; int m,m2; double aa,c,d,del,h,qab,qam,qap; qab=a+b; qap=a+1.0; qam=a-1.0; c=1.0; d=1.0-qab*x/qap; if (fabs(d) < FPMIN) d=FPMIN; d=1.0/d; h=d; for (m=1;m<=MAXIT;m++) { m2=2*m; aa=m*(b-m)*x/((qam+m2)*(a+m2)); d=1.0+aa*d; if (fabs(d) < FPMIN) d=FPMIN; c=1.0+aa/c; if (fabs(c) < FPMIN) c=FPMIN; d=1.0/d; h *= d*c; aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2)); d=1.0+aa*d; if (fabs(d) < FPMIN) d=FPMIN; c=1.0+aa/c; if (fabs(c) < FPMIN) c=FPMIN; d=1.0/d; del=d*c; h *= del; if (fabs(del-1.0) <= EPS) break; } if (m > MAXIT) error("Internal error in betacf() function (please report)"); return h; } vector< vector > inverse(vector< vector > & m ) { double d; int i, j; if (m.size() == 0) error("Internal error: matrix with no rows (inverse function)"); if (m.size() != m[0].size() ) error("Internal error: cannot invert non-square matrix"); int n = m.size(); // indx is an integer array vector indx(n); vector col(n); vector > y(n); for (int i=0; i > tm; tm = m; ludcmp(tm,indx,d); for (j=0; j eigenvalues(vector > & a) { // 'a' should be a square, symmetric matrix int n=a.size(); vector e(n); vector d(n); tred2(a,d,e); vector > z; // dummy tqli(d,e,z); return d; } // Householder method to reduce real, symmetric matrix // to 
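// --- Editorial sketch (not part of PLINK): a quick consistency check for
// --- the rational approximation above.  The exact standard normal CDF can
// --- be written with the complementary error function,
// --- Phi(z) = erfc( -z / sqrt(2) ) / 2, so ltqnorm(Phi(z)) should return z
// --- to within the ~1.15e-9 relative error quoted in the comment above.
// --- checkInverseNormal() is a hypothetical test harness, not PLINK code.
#include <cmath>
#include <cstdio>

double ltqnorm(double p);   // defined above in this file

static void checkInverseNormal()
{
  for (double z = -3.0; z <= 3.0; z += 1.5)
    {
      double p = 0.5 * std::erfc( -z / std::sqrt(2.0) );  // exact Phi(z)
      std::printf("z = %6.2f   ltqnorm(Phi(z)) = %12.9f\n", z, ltqnorm(p));
    }
}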
tridiagonal form // Modified to return only eigenvalues. void tred2(vector > & a, vector & d, vector &e) { int l,k,j,i; double scale,hh,h,g,f; int n=d.size(); for (i=n-1;i>0;i--) { l=i-1; h=scale=0.0; if (l > 0) { for (k=0;k= 0.0 ? -sqrt(h) : sqrt(h)); e[i]=scale*g; h -= f*g; a[i][l]=f-g; f=0.0; for (j=0;j &d, vector&e, vector > &z) { int m,l,iter,i,k; double s,r,p,g,f,dd,c,b; double volatile temp; int n=d.size(); for (i=1;i=l;i--) { f=s*e[i]; b=c*e[i]; e[i+1]=(r=pythag(f,g)); if (r == 0.0) { d[i+1] -= p; e[m]=0.0; break; } s=f/r; c=g/r; g=d[i+1]-p; r=(d[i]-g)*s+2.0*c*b; d[i+1]=g+(p=s*r); g=c*r-b; // Next loop can be omitted if eigenvectors not wanted /* for (k=0;k= l) continue; d[l] -= p; e[l]=g; e[m]=0.0; } } while (m != l); } } ////////////////////////////////////////////////// // As above, but with eigenvectors returned also Eigen eigenvectors(vector > & a) { // 'a' should be a square, symmetric matrix int n=a.size(); Eigen E; E.set(n); vector e(n,0); EV_tred2(a,E.d,e); EV_tqli(E.d,e,a); E.z = a; return E; } // Householder method to reduce real, symmetric matrix // to tridiagonal form // Modified to return both eigenvalues and eigenvectors void EV_tred2(vector > & a, vector & d, vector &e) { int l,k,j,i; double scale,hh,h,g,f; int n=d.size(); for (i=n-1;i>0;i--) { l=i-1; h=scale=0.0; if (l > 0) { for (k=0;k= 0.0 ? -sqrt(h) : sqrt(h)); e[i]=scale*g; h -= f*g; a[i][l]=f-g; f=0.0; for (j=0;j &d, vector&e, vector > &z) { int m,l,iter,i,k; double s,r,p,g,f,dd,c,b; int n=d.size(); for (i=1;i=l;i--) { f=s*e[i]; b=c*e[i]; e[i+1]=(r=pythag(f,g)); if (r == 0.0) { d[i+1] -= p; e[m]=0.0; break; } s=f/r; c=g/r; g=d[i+1]-p; r=(d[i]-g)*s+2.0*c*b; d[i+1]=g+(p=s*r); g=c*r-b; for (k=0;k= l) continue; d[l] -= p; e[l]=g; e[m]=0.0; } } while (m != l); } } ///////////////////////// // Romberg integration double qromb(double func(const double), double a, double b) { const int JMAX=20, JMAXP=JMAX+1, K=5; const double EPS=1.0e-10; double ss,dss; vector_t s(JMAX),h(JMAXP),s_t(K),h_t(K); int i,j; h[0]=1.0; for (j=1;j<=JMAX;j++) { s[j-1]=trapzd(func,a,b,j); if (j >= K) { for (i=0;i > & v, vector & w, vector > & cvm) { int i,j,k; double sum; int ma=w.size(); vector wti(ma); for (i=0;i > &u, vector &w, vector > &v, vector &b, vector &x) { int jj,j,i; double s; // int us = u.size()>0 ? u[0].size() : 0; // int vs = v.size()>0 ? v[0].size() : 0; // cout << "U = " << u.size() << " " << us<< "\n"; // cout << "V = " << v.size() << " " << vs << "\n"; // cout << "w = " << w.size() << "\n"; // cout << "b = " << b.size() << "\n"; // cout << "x = " << x.size() << "\n"; int m=u.size(); int n=u[0].size(); vector tmp(n); for (j=0;j wmax ? w[i] : wmax; // double wmin = wmax * eps; // for (int i=0; i > svd_inverse(vector< vector > & u , bool & flag ) { const double eps = 1e-24; if (u.size() == 0) error("Internal problem: matrix with no rows (inverse function)"); if (u.size() != u[0].size() ) error("Internal problem: Cannot invert non-square matrix"); int n = u.size(); vector w(n,0); vector > v(n); for (int i=0; i wmax ? w[i] : wmax; double wmin = wmax * eps; for (int i=0; i > r(n); for (int i=0; i > msqrt(vector > & u) { // Using SVD, square root is U . sqrt(D) . 
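// --- Editorial sketch (not part of PLINK): what svd_inverse() above does.
// --- Given the decomposition A = U * diag(w) * V^T, reciprocals of
// --- singular values smaller than wmax * eps are set to zero so that
// --- near-singular matrices do not blow up, and the (pseudo-)inverse is
// --- assembled as V * diag(w+) * U^T.  Plain nested vectors stand in for
// --- PLINK's matrix type; pseudoInverse() is hypothetical and, like the
// --- original, assumes a square n x n input.
#include <vector>

typedef std::vector< std::vector<double> > Mat2D;

static Mat2D pseudoInverse(const Mat2D & u, std::vector<double> w,
                           const Mat2D & v, double eps = 1e-24)
{
  const int n = w.size();
  double wmax = 0;
  for (int i = 0; i < n; i++) if ( w[i] > wmax ) wmax = w[i];
  const double wmin = wmax * eps;

  // zero the reciprocal of any singular value below the tolerance
  for (int i = 0; i < n; i++) w[i] = w[i] < wmin ? 0 : 1.0 / w[i];

  Mat2D r(n, std::vector<double>(n, 0));
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      for (int k = 0; k < n; k++)
        r[i][j] += v[i][k] * w[k] * u[j][k];   // V * diag(w+) * U^T
  return r;
}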
V_T // msqrt <- function(m) { // m <- svd(m) // m$u %*% sqrt(diag(m$d)) %*% t(m$v) } const double eps = 1e-12; int n = u.size(); vector d(n,0); vector > v(n); for (int i=0; i > r(n); for (int i=0; i > r2 = r; for (int i=0; i > &a, vector &indx, double &d) { int i, imax = 0, j, k; double big, dum, sum, temp; int n = a.size(); vector vv(n); d=1; for (i=0; i big) big=temp; if (big==0) error("singular matrix in ludcmp"); vv[i]=1/big; } for (j=0; j= big) { big = dum; imax = i; } } if (j != imax) { for (k=0; k > &a, vector &indx, vector &b) { int i, ii=0, ip, j; double sum; int n = a.size(); for (i=0; i=0; i--) { sum=b[i]; for (j=i+1; j > & a, vector & w, vector > &v) { bool flag; int i,its,j,jj,k,l,nm; double anorm,c,f,g,h,s,scale,x,y,z; double volatile temp; int m=a.size(); if (m==0) error("Internal problem in SVD function (no observations left?)"); int n=a[0].size(); vector rv1(n); g=scale=anorm=0.0; for (i=0;i=0;i--) { if (i < n-1) { if (g != 0.0) { for (j=l;j=0;i--) { l=i+1; g=w[i]; for (j=l;j=0;k--) { for (its=0;its<30;its++) { flag=true; for (l=k;l>=0;l--) { nm=l-1; temp=fabs(rv1[l])+anorm; if (temp == anorm) { flag=false; break; } temp=fabs(w[nm])+anorm; if (temp == anorm) break; } if (flag) { c=0.0; s=1.0; for (i=l;i absb) return absa*sqrt(1.0+SQR(absb/absa)); else return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb))); } double SQR(double a) { return a*a; } void multMatrix(vector > & a, vector > & b, vector > & c) { int ar = a.size(); int br = b.size(); if (ar == 0 || br == 0) error("Internal error: multiplying 0-sized matrices"); int ac = a[0].size(); int bc = b[0].size(); if ( ac != br ) error("Internal error: non-conformable matrices in multMatrix()"); int cr = ar; int cc = bc; c.clear(); sizeMatrix(c,cr,cc); for (int i=0; i U.D.Vt // Return UD, and # of PCs we should look at (0 means error) // Handle missing data by mean imputation (i.e. set to 0 after centering) // Edit g to equal U.W.V' (after any editing) int nrow = x.size(); if ( nrow == 0 ) return 0; int ncol = x[0].size(); vector_t means(ncol); vector cnt(nrow,0); for ( int r = 0 ; r < nrow ; r++) { for ( int c = 0 ; c < ncol ; c++) { if ( ! mask[r][c] ) { means[c] += x[r][c]; ++cnt[c]; } } } for ( int c = 0 ; c < ncol; c++) means[c] /= (double)cnt[c]; // Center on column means if ( mean_centre ) { for ( int r = 0 ; r < nrow ; r++) for ( int c = 0 ; c < ncol ; c++) { if ( mask[r][c] ) x[r][c] = 0; else x[r][c] -= means[c]; } } else { // ALTERNATE: no mean centering for ( int r = 0 ; r < nrow ; r++) for ( int c = 0 ; c < ncol ; c++) { if ( mask[r][c] ) x[r][c] = means[c]; } } // Perform SVD on X vector_t p2; svd(x,p2,v); // Figure out how many component to return // Use Dunn & Everitt (2001) rule of s^2 above 0.7/n double thresh = 0.7 / (double)ncol; double totvar = 0; vector keep; for ( int i=0; i= thresh ) { p2[i] = 1; keep.push_back(i); } else p2[i] = 0; } // What to return? If in 2sided mode, just return all // PC scores that meet criterion // Return PC scores in s, PCs in v matrix_t w = vec2diag(p2); matrix_t z, z2; // S = X %*% P multMatrix( x , w , z ); if ( par::elf_pcmode_2sided ) { // Calculate scores, then return sizeMatrix(s, nrow, keep.size() ); for ( int r = 0 ; r < nrow ; r++) for (int c = 0 ; c < keep.size(); c++) { s[r][c] = z[r][keep[c]]; } p.resize(keep.size()); for (int c = 0 ; c < keep.size(); c++) p[c] = ( p2[keep[c]] * p2[keep[c]] ) / totvar; // cout << "sizes = " << keep.size() << " " << s[0].size() << " " << ncol << "\n"; return keep.size(); } // Otherwise, reconstruct X as U.W.V' if ( ! 
par::elf_pcmode_2sided ) { multMatrix( z , v , x ); // For now return everything --- add back in the generic PCA // later -- but for now, we will use this special version just // for ELF calculations return ncol; } // Otherwise, returned pruned x multMatrix( z , v , z2 ); sizeMatrix(x, nrow, keep.size() ); for ( int r = 0 ; r < nrow ; r++) for (int c = 0 ; c < keep.size(); c++) { x[r][c] = z2[r][keep[c]]; } p.resize(keep.size()); for (int c = 0 ; c < keep.size(); c++) p[c] = ( p2[keep[c]] * p2[keep[c]] ) / totvar; return keep.size(); } matrix_t vec2diag(vector_t & v) { matrix_t d; sizeMatrix(d,v.size(),v.size()); for (int i = 0; i < v.size(); i++) d[i][i] = v[i]; return d; } double rnorm() { double u1 = CRandom::rand(); double u2 = CRandom::rand(); return sqrt(-2*log(u1)) * cos(2*M_PI*u2); // z2 = sqrt(-2*log(u1)) * sin(2*M_PI*u2); } plink-1.07-src/phase.cpp0000644000265600020320000012475611264127624014317 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "genogroup.h" #include "phase.h" #include "haplowindow.h" extern ofstream LOG; using namespace std; vector_t HaploPhase::phaseAllHaplotypes(bool display, Perm & perm ) { vector_t results; ///////////////////////////////////////////// // Set individual to follow in verbose mode? if ( par::haplo_plem_follow ) { par::haplo_plem_follow = false; for (int i=0; ifid == par::haplo_plem_follow_fid && P.sample[i]->iid == par::haplo_plem_follow_iid ) { P.printLOG("Following individual [ " + par::haplo_plem_follow_fid + " " + par::haplo_plem_follow_iid + " ] in EM\n"); par::haplo_plem_follow_ind = i; par::haplo_plem_follow = true; } } } ////////////////////////////// // Begin phasing (and testing) int nms = new_map.size(); for (int l=0; lchr]; haploid = par::chr_haploid[new_map[l]->chr]; // Same window as previous, unless in weighted // multimarker test mode, in which case do all bool same = true; if (l==0 || par::weighted_mm) same = false; else { if (new_pred_locus[l].size() != new_pred_locus[l-1].size() ) { same = false; } else for (int j=0; jname; if (nm.substr(nm.size()-1) == "_") name(new_map[l]->name.substr(0, new_map[l]->name.find("_"))); else name(new_map[l]->name); S = new_pred_locus[l]; ns = S.size(); for (int i=0; ifounder) { if (haploid || (X && person->sex)) validN++; else validN+=2; } includeIndividuals(i); } //////////////////////////////////////// // Verbose recording? 
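// --- Editorial note (sketch, not part of PLINK): rnorm() in stats.cpp above
// --- is one half of the Box-Muller transform; a single pair of uniforms in
// --- fact yields two independent standard normal deviates, and the sine
// --- term the original leaves commented out is the second one.  The helper
// --- below is hypothetical and uses std::rand purely for illustration,
// --- whereas PLINK draws its uniforms from CRandom.
#include <cmath>
#include <cstdlib>
#include <utility>

static std::pair<double,double> drawNormalPair()
{
  double u1 = ( std::rand() + 1.0 ) / ( RAND_MAX + 2.0 );   // in (0,1)
  double u2 = ( std::rand() + 1.0 ) / ( RAND_MAX + 2.0 );
  double r  = std::sqrt( -2.0 * std::log(u1) );
  return std::make_pair( r * std::cos( 2.0 * M_PI * u2 ),
                         r * std::sin( 2.0 * M_PI * u2 ) );
}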
(one window only) if ( par::haplo_plem_verbose || par::haplo_plem_follow ) VPHASE.open("phased.verbose",ios::out); /////////////////////////////////////////////////// // Define 'windows' within the 'region' int pos = 0; // Number of possible stub haplotypes nsh = (int) pow(2.0, (double) par::haplo_plem_overlap ); // Check we have a valid overlap size par::haplo_plem_overlap = par::haplo_plem_original_overlap; if ( par::haplo_plem_window >= ns || par::haplo_plem_overlap >= par::haplo_plem_window ) par::haplo_plem_overlap = 0; while (1) { HaploWindow * window = new HaploWindow( this , & P ); ////////////////////////////////// // Define SNP span for this window window->start = pos; window->stop = pos + par::haplo_plem_window - 1; if (window->stop >= ns ) window->stop = ns-1; ///////////////////////////////////////////////////////////////// // Enumerate haplotypes/genoGroups/phases and tally // unambiguous window->enumerateHaplotypes(new_pred_locus[l]); window->enumerateGenogroups(); set::iterator im = window->genotypes.begin(); while (im != window->genotypes.end() ) { if (P.sample[ (*im)->reference ]->founder) window->enumeratePhase( (*im)->reference ); ++im; } window->tallyUnambiguousCounts(); /////////////////////////////// // Add the window to the list windows.push_back( window ); /////////////////////////// // Reached end of region? if (window->stop == ns-1) break; /////////////////////////////////// // Otherwise advance by one window pos += par::haplo_plem_window - par::haplo_plem_overlap; if (par::haplo_plem_verbose) { cout << windows.size() << " windows added \n"; cout << window->stop << " ... " << ns-1 << "\n"; } } if ( par::haplo_plem_verbose ) cout << "\n"; // Set number of windows; nw = windows.size(); if (par::haplo_plem_verbose) P.printLOG("Constructed "+int2str(nw)+" windows\n"); ///////////////////////////////////// // Use offspring to reduce ambiguity // To be added in; not yet implemented // // if (nonfounders) // for (int i=0; ifounder && ambig[i]) // resolveWithChildren(i) //////////////////////////////// // E-M phasing based on founders // Currently three versions of EM // performAlternEM() drive chain-of-windows EM // performEM() individual-window EM // performEM_original() used for meta-EM performAlternEM(); ////////////////////////////////// // Free window storage for (int w=0; wfounder) prunePhase(i); //////////////////////////////////////// // Fill-in phasing for any non-founders // These functions work at per-individual level, i.e. 
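// --- Editorial sketch (not part of PLINK): how the overlapping EM windows
// --- defined above tile the region.  Each window covers `windowSize` SNPs,
// --- consecutive windows share `overlap` SNPs (giving 2^overlap possible
// --- "stub" haplotypes at each junction), and the last window is clipped
// --- at the final SNP.  windowSpans() is a hypothetical helper; as in the
// --- code above, it assumes 0 <= overlap < windowSize and nSnps >= 1.
#include <utility>
#include <vector>

static std::vector< std::pair<int,int> >
windowSpans(int nSnps, int windowSize, int overlap)
{
  std::vector< std::pair<int,int> > spans;
  int pos = 0;
  while ( true )
    {
      int start = pos;
      int stop  = pos + windowSize - 1;
      if ( stop >= nSnps ) stop = nSnps - 1;
      spans.push_back( std::make_pair(start, stop) );
      if ( stop == nSnps - 1 ) break;      // reached the end of the region
      pos += windowSize - overlap;         // advance, keeping the overlap
    }
  return spans;
}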
we do // not use genogroups here if ( nonfounders ) { phasemap.clear(); phasemap.resize(P.n); if (par::test_hap_TDT || par::proxy_TDT ) { trans.clear(); untrans.clear(); trans.resize(nh,0); untrans.resize(nh,0); transmissionX.clear(); transmissionX2.clear(); transmissionX.resize(nh,0); transmissionX2.resize(nh,0); } if (haploid ) error("Family-based haplotyping only for autosomes and X chromosome"); // List all possible non-rare phases // enumerateAllPhases(); for (int i=0; ifounder) { // Only phase affecteds, if performing TDT if (par::test_hap_TDT || par::proxy_TDT ) if ( !P.sample[i]->aff) continue; phaseAndScoreNonfounder(i); prunePhase(i); } } if ( par::haplo_plem_verbose || par::haplo_plem_follow ) VPHASE.close(); /////////////////////////////////////////////////////////////// // // // Post-phasing actions // // // /////////////////////////////////////////////////////////////// //////////////////////////////// // Haplotype frequencies if (par::display_hap_freqs) reportHaplotypeFrequencies(); //////////////////////////////// // Haplotype association tests if (par::test_hap_CC || par::test_hap_GLM || par::test_hap_TDT || par::test_hap_QTL ) { // Are we testing all haplotypes, or a specified // pre-determined one? if ( !par::phase_hap_all ) setTestHaplotype(new_pred_allele[l]); // Perform the actual tests: ignore the results for now // (these will be used when we incorporate permutation) vector_t t = performHaplotypeTests(display, perm ); for (int i=0; i -1 ) imputeThisHaplotype(l); else { for ( test_hap = 0; test_hap < nh; test_hap++ ) if ( f[test_hap] >= par::min_hf && f[test_hap] <= par::max_hf ) { imputeThisHaplotype(l); } } } //////////////////////////////////// // Next set of SNPs to be phased } if ( par::haplo_plem_verbose ) cout << "\n"; return results; } void HaploPhase::enumerateHaplotypes(vector & s) { // Make list of haplotypes, and code as +/- S = s; ns = s.size(); nh = (int)pow((double)2, ns); f.resize(nh); ph_hap1.clear(); ph_hap2.clear(); ph_freq.clear(); // Optionally, for haplotype-based TDT if (par::test_hap_TDT || par::proxy_TDT ) { trans.clear(); untrans.clear(); trans.resize(nh, 0); untrans.resize(nh, 0); } unsigned int h=0; while (h m1; unsigned int p=1; for (int s=0; sfreq; else f[h] *= ( 1 - P.locus[S[s]]->freq); } } } void HaploPhase::includeIndividuals(int i) { // Do not look at non-reference individuals in some circumstances if ( reference_only && ! 
P.sample[i]->missing ) { include[i] = false; if (P.sample[i]->founder) { if (haploid || (X && P.sample[i]->sex)) validN--; else validN-=2; } return; } vector s1(ns); vector s2(ns); // Flipping allele-coding for homozygotes for (int s=0; sone[i]; s2[s] = P.SNP[S[s]]->two[i]; } else { s1[s] = P.sample[i]->one[S[s]]; s2[s] = P.sample[i]->two[S[s]]; } if (s1[s] == s2[s]) { s1[s] = !s1[s]; s2[s] = !s2[s]; } } ////////////////////////////////////////////////////////// // Count amount of missing genotype data at this position int nm = 0; for (int s=0; s= par::hap_missing_geno ) { include[i] = false; if (P.sample[i]->founder) { if (haploid || (X && P.sample[i]->sex)) validN--; else validN-=2; } return; } } void HaploPhase::performAlternEM() { // Working variables for convergence for first // pass meta-EM (chain of EMs) int iter = 0; bool converged; int num_converged = 0 ; // Start multiple EM runs, one per window, allowing each to update // the adjoining window's EM state startWindow = 0; finishWindow = nw-1; bool verboseOutputInFirstRound = true; if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "\n\nFOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " PRIOR TO START OF ANY EM\n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } do { if ( par::haplo_plem_verbose && verboseOutputInFirstRound ) VPHASE << "OUTER LOOP ITERATION " << iter << "\n\n"; // Iterate forward through windows int w = 0; while (w < nw ) { // if ( ! par::silent ) // { // cout << num_converged << " converged windows at iteration " // << iter << " forwards chain, window " << w << " \r"; // cout.flush(); // } if ( par::haplo_plem_verbose && verboseOutputInFirstRound ) VPHASE << "\n\nFORWARD WINDOW " << w << "\n"; HaploWindow * currentWindow = windows[w]; if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "\n\nFOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " FORWARD LOOP, WINDOW " << w << " PRIOR TO ADJUSTMENT \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } /////////////////////////////////////////////////// // Only adjust freq if this window isn't converged if ( !currentWindow->converged ) { if ( par::haplo_plem_verbose && verboseOutputInFirstRound ) VPHASE << "WINDOW NOT YET CONVERGED\n"; /////////////////////////////////////////////////// // Only pass freq information once after convergence if ( false && w > 0 && !windows[w-1]->right_passed ) { if ( par::haplo_plem_verbose && verboseOutputInFirstRound ) VPHASE << "ADJUSTING FREQUENCIES BASED ON PREVIOUS WINDOW " << w-1 << "\n"; HaploWindow * previousWindow = windows[w-1]; // Populate stub frequencies for both windows vector_t currentStub = currentWindow->leftStubFrequency(); vector_t previousStub = previousWindow->rightStubFrequency(); if ( par::haplo_plem_verbose ) { for (int z=0; zf; // Adjust haplotype frequencies for (int h = 0; h < currentWindow->nh; h++) { int stub = currentWindow->leftStub[h]; if ( abs( previousStub[ stub ] - currentStub[ stub ] ) > 0.02 ) { double nf; if (currentStub[ stub ] == 0) nf = 0; else nf = currentWindow->f[h] * ( previousStub[ stub ] / currentStub[ stub ] ); currentWindow->f[h] = ( currentWindow->f[h] + nf ) * 0.5; } } if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "FOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " FORWARD LOOP, WINDOW " << w << " POST ADJUSTMENT \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } if ( par::haplo_plem_verbose ) { VPHASE << "OLD, ADJUSTED FREQS\n"; for (int h = 0; 
h < currentWindow->nh; h++) { if ( of[h] > 0.001 || currentWindow->f[h] > 0.001 ) VPHASE << "ORIGINAL HAP " << currentWindow->haplotypeName(h) << "\t" << currentWindow->leftStub[h] << " " << of[h] << "\t" << currentWindow->f[h] << "\n"; } VPHASE << "\n"; } // Once window has converged pass freq and keep track if (previousWindow->converged) { previousWindow->right_passed = true; if ( par::haplo_plem_verbose ) VPHASE << "\nPREVIOUS WINDOW CONVERGED AND PASSED ON: NOW FIXED\n"; } } if ( par::haplo_plem_verbose ) VPHASE << "\nENTERING INNER EM FOR THIS WINDOW\n"; // Do some more EM iterations currentWindow->performEM(); if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "FOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " FORWARD LOOP, WINDOW " << w << " POST EM PHASING \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } if ( par::haplo_plem_verbose ) VPHASE << "\nENTERING PRUNING FOR THIS WINDOW\n"; // Get rid of unlikely phases currentWindow->pruneGenogroups(); if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "FOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " FORWARD LOOP, WINDOW " << w << " POST EM PRUNING \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } if ( par::haplo_plem_verbose ) VPHASE << "\nDONE WITH WINDOW " << w << " IN FORWARDS LOOP\n"; } // Next window ++w; } ////////////////////// // Now move backwards w = windows.size() - 1; while (w >= 0) { if ( par::haplo_plem_verbose ) cout << num_converged << " converged windows at iteration " << iter << " backwards chain, window " << w << " \r"; if ( par::haplo_plem_verbose && verboseOutputInFirstRound ) VPHASE << "\n\nBACKWARD WINDOW " << w << "\n"; HaploWindow * currentWindow = windows[w]; if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "\n\nFOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " BACKWARDS LOOP, WINDOW " << w << " PRIOR ADJUSTMENT \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } // Only adjust freq if window isn't converged if ( !currentWindow->converged) { if ( false && w+1< windows.size() && !windows[w+1]->left_passed) { if ( par::haplo_plem_verbose && verboseOutputInFirstRound ) VPHASE << "ADJUSTING FREQUENCIES BASED ON PREVIOUS WINDOW " << w+1 << "\n"; HaploWindow * previousWindow = windows[w+1]; // Populate stub frequencies for both windows vector_t currentStub = currentWindow->rightStubFrequency(); vector_t previousStub = previousWindow->leftStubFrequency(); vector_t of = currentWindow->f; // Adjust haplotype frequencies for (int h = 0; h < currentWindow->nh; h++) { int stub = currentWindow->rightStub[h]; if ( abs( previousStub[ stub ] - currentStub[ stub ] ) > 0.02 ) { double nf; if (currentStub[ stub ] == 0) nf = 0; else nf = currentWindow->f[h] * ( previousStub[ stub ] / currentStub[ stub ] ); currentWindow->f[h] = ( currentWindow->f[h] + nf ) * 0.5; } } if ( par::haplo_plem_verbose ) { VPHASE << "OLD, ADJUSTED FREQS\n"; for (int h = 0; h < currentWindow->nh; h++) { if ( of[h] > 0.001 || currentWindow->f[h] > 0.001 ) VPHASE << "ORIGINAL HAP " << currentWindow->haplotypeName(h) << "\t" << currentWindow->leftStub[h] << " " << of[h] << "\t" << currentWindow->f[h] << "\n"; } VPHASE << "\n"; } // Once window has converged pass freq and keep // track if (previousWindow->converged) previousWindow->left_passed = true; } if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "FOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " BACKWARDS 
LOOP, WINDOW " << w << " POST ADJUSTMENT \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } // Do some more EM iterations currentWindow->performEM(); if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "FOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " BACKWARDS LOOP, WINDOW " << w << " POST EM PHASING \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } // Get rid of unlikely phases currentWindow->pruneGenogroups(); if ( par::haplo_plem_follow && verboseOutputInFirstRound ) { VPHASE << "FOLLOWED INDIVIDUAL " << par::haplo_plem_follow_ind << "\n"; VPHASE << " BACKWARDS LOOP, WINDOW " << w << " POST EM PRUNING \n"; verboseDisplayWindows(par::haplo_plem_follow_ind); } } // Next window --w; } converged = true; num_converged = 0; for (int w = 0; w < nw; w++) if ( !windows[w]->converged) converged = false; else ++num_converged; // Next regional iteration iter++; } while ( !converged ); if ( par::debug ) cout << "Chain of EMs has convered, now putting together... \n"; if (par::haplo_plem_verbose) P.printLOG("Chain of E-Ms converged\n"); //////////////////////////////////////////////// // Finished chain of window-EMs; now perform // the meta-EM (to stitch windows together), // if needed //////////////////////////////////////////////// ////////////////////////////////// // Expand genoGroups // if ( ! par::silent ) // cout << "Expanding genogroups...\n"; for (int w = 0; w < nw; w++) { // A more intense pruning now at the end windows[w]->pruneGenogroups( par::haplo_plem_meta_prune_phase ); // before we expand out... windows[w]->expandGenogroups(); } //////////////////////////////////////////////////////////// // If only a single window, just swap in the relevant variables from // the window if ( nw == 1 ) { if ( par::meta_large_phase ) { mainImputation(); return; } HaploWindow * thisWindow = windows[0]; f = thisWindow->f; hap = thisWindow->hap; hapmapb = thisWindow->hapmap; nh = thisWindow->nh; S = thisWindow->S; ns = thisWindow->ns; if ( par::test_hap_TDT ) { trans.clear(); untrans.clear(); trans.resize(nh,0); untrans.resize(nh,0); } pp.resize(P.n); hap1.resize(P.n); hap2.resize(P.n); for (int i = 0; i < P.n; i++) { if ( include[i] ) { pp[i] = thisWindow->pp[i]; hap1[i] = thisWindow->hap1[i]; hap2[i] = thisWindow->hap2[i]; if ( hap1[i].size() == 1 ) ambig[i] = false; else ambig[i] = true; if ( hap1[i].size() == 0 ) { include[i] = false; } else include[i] = true; } } // Now we are done return; } //////////////////////////////////////////////////////////// // Combine haplotypes and define initial variables (f, pp) int waplotypeSize = par::haplo_plem_meta_window > nw ? nw : par::haplo_plem_meta_window; if (par::haplo_plem_verbose) P.printLOG("Window size is " + int2str( waplotypeSize ) + " in meta-EM\n"); startWindow = 0; finishWindow = startWindow + waplotypeSize - 1; if ( finishWindow >= nw ) finishWindow = nw-1; actual_nw = finishWindow - startWindow + 1; if ( par::haplo_plem_follow ) { VPHASE << "\n\n--------------\nENTERING NOW META-EM STAGE\n\n"; if ( par::meta_large_phase ) VPHASE << " ** IN IMPUTE MODE ** \n"; else VPHASE << " ** IN HAPLOTYING MODE ** \n"; } //////////////////////////////////////////////////////// // Double up phase possibilities for all windows beyond // the first for (int w=1; wfounder ) continue; HaploWindow * currentWindow = windows[w]; if ( ! 
currentWindow->ambig[i] ) { if ( currentWindow->hap1[i][0] != currentWindow->hap2[i][0] ) { currentWindow->hap1[i].push_back( currentWindow->hap2[i][0] ); currentWindow->hap2[i].push_back( currentWindow->hap1[i][0] ); } currentWindow->ambig[i] = true; currentWindow->pp[i].resize(2,0.5); } else { int o = currentWindow->pp[i].size(); for ( int z=0; zhap1[i][z] != currentWindow->hap2[i][z] ) { currentWindow->hap1[i].push_back( currentWindow->hap2[i][z] ); currentWindow->hap2[i].push_back( currentWindow->hap1[i][z] ); currentWindow->pp[i][z] /= 2; currentWindow->pp[i].push_back( currentWindow->pp[i][z] ); } } } } } ////////////////////////////////// // Some temporary descriptions long int cnt_phase = 0; long int cnt_hap = 0; long int cnt_ambig = 0; for (int w = 0; w < nw; w++) { for (int i = 0; i < P.n; i++) if ( include[i] && P.sample[i]->founder ) { cnt_phase += windows[w]->hap1[i].size(); if ( windows[w]->pp[i].size() > 0 ) cnt_ambig++; } cnt_hap += windows[w]->f.size(); } // Average number of phases per person per window = double avg_phase_depth = (double)cnt_phase / ( (double)P.n * (double)nw ) ; if (par::haplo_plem_verbose) { stringstream s2; s2 << "In " << nw << " windows, " << cnt_phase << " total phases and " << cnt_hap << " haplotypes\n"; s2 << "Of these, " << cnt_ambig << " individual/windows still need resolving\n"; s2 << "Average phase depth = " << avg_phase_depth << "\n"; P.printLOG(s2.str()); } //////////////////////////////// // Begin main loop for meta-EM bool completed = false; // For imputation mode, we might want to scan the windows multiple // times bool forwards_mode = true; int meta_iter = 0; int num_meta_iter = 1; while ( 1 ) { ///////////////////////////////// // Enumerate possible waplotypes for (int i = 0; i < P.n; i++) { // Always try to phase as much as possible when // in imputation mode if ( par::meta_large_phase ) include[i] = true; if ( include[i] ) { enumeratePhasedWindows(i); if ( hap1[i].size() > 1 ) pp[i].resize(hap1[i].size()); else pp[i].resize(0); if ( par::haplo_plem_follow && par::haplo_plem_follow_ind == i ) { VPHASE << "AFTER ENUMERATED_PHASED DUMP, WINDOWS " << startWindow << " to " << finishWindow << "\n"; verboseDisplayWindows(i,false); VPHASE << "\n\n"; } } } // Generate starting values for haplotypes -- for now use a // uniform distribution if ( nh != hapmap.size() ) error("weirdness...\n"); if (par::haplo_plem_verbose) P.printLOG("Considering "+int2str(nh) + " waplotypes in second-stage EM, " + int2str(startWindow) +" to "+int2str(finishWindow)+"\n"); if ( par::haplo_plem_verbose ) VPHASE << "ENTERING PLEM WITH " << nh << " WAPLOTYPES\n"; // Set up haplotype space for HaploPhase f.resize( hapmap.size() ); for (int h=0; hns; s++) S.push_back( currentWindow->S[s] ); } ns = S.size(); if ( par::test_hap_TDT ) { trans.clear(); untrans.clear(); trans.resize(nh,0); untrans.resize(nh,0); } if ( par::haplo_plem_follow ) { VPHASE << "inclusion status = " << include[ par::haplo_plem_follow_ind ] << "\n"; VPHASE << "HAPLOPHASE PRE_MERGING STATUS \n"; for (int z=0; z= 0.01 ) VPHASE << h << "\t" << f[h] << "\t" << haplotypeName(h) << "\n"; VPHASE << "Post meta-EM phase frequencies\n"; for (int i=0; i 0.01 ) { VPHASE << haplotypeName(hap1[par::haplo_plem_follow_ind][z]) << " / " << haplotypeName(hap2[par::haplo_plem_follow_ind][z]) << "\t"; VPHASE << "(freqs = " << f[hap1[par::haplo_plem_follow_ind][z]] << " / " << f[hap2[par::haplo_plem_follow_ind][z]] << " )\t"; if ( pp[par::haplo_plem_follow_ind].size() == 0 ) VPHASE << "F" << 
ambig[par::haplo_plem_follow_ind] << "\n"; else VPHASE << pp[par::haplo_plem_follow_ind][z] << " " << ambig[par::haplo_plem_follow_ind] << "\n"; } } VPHASE << "\n\n"; } /////////////////////////////////////////////////////////////////// // Given new results in phase, copy these back into the original // windows, applying appropriate pruning of phases if ( par::meta_large_phase ) { updateForImputation(); } // In imputation mode, we mnight want to re-prune the windows // several times if ( par::meta_large_phase ) { if ( finishWindow == nw-1 && forwards_mode ) { if ( ++meta_iter == num_meta_iter ) completed = true; else forwards_mode = false; } if ( startWindow == 0 && ! forwards_mode ) { if ( ++meta_iter == num_meta_iter ) completed = true; else forwards_mode = true; } } // In haplotyping mode, we only do a single run through // the windows if ( ! par::meta_large_phase ) { if ( finishWindow == nw-1 ) completed = true; } ///////////////////////////////////////////////////////////////// // Have we finished? In this case, generate imputed solution, or // copy final solution into HaploPhase (if not already there?) if ( completed ) { // if ( !par::silent ) // cout << "Finished meta-EM: creating the final solution... \n"; //////////////////////////////////////////////////////////// // Copy back SNPs for HaploPhase -- we shouldn't need to do // this -- we only ever swapped things for debug purposes, // therefore we can remove above and this change... //////////////////////////////////// // Imputation, or haplotyping mode? if ( par::meta_large_phase ) { S.clear(); for( int w= 0; w < nw; w++) { HaploWindow * currentWindow = windows[w]; int start = par::haplo_plem_overlap; if (w==0) start = 0; for (int s = start; s < currentWindow->ns; s++) S.push_back( currentWindow->S[s] ); } ns = S.size(); mainImputation(); } ///////////////////////////////////////////////////////////// // Finish up haplotyping (copy frequencies, phases, etc back // into HaploPhase) if ( ! par::meta_large_phase ) { // The things we need / should check are all okay are: // f = thisWindow->f; // hap = thisWindow->hap; // nh = thisWindow->nh; // S = thisWindow->S; // ns = thisWindow->ns; // pp[i] // hap1[i] // hap2[i] // ambig[i] // include[i] //////////////////////// // Some final things... 
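// --- Editorial sketch (not part of PLINK): the per-window performEM() calls
// --- driven above follow the textbook haplotype-frequency EM (Excoffier-
// --- Slatkin style).  Each founder carries the list of phase pairs
// --- (hap1, hap2) consistent with its genotypes; the E-step weights each
// --- phase by the product of the current haplotype frequencies, and the
// --- M-step re-estimates frequencies from the expected counts.  This is a
// --- deliberately simplified, hypothetical version: autosomal diploids
// --- only, no genogroups, no windows or stubs, and it assumes n > 0.
#include <vector>

static void emIteration(const std::vector< std::vector<int> > & hap1,
                        const std::vector< std::vector<int> > & hap2,
                        std::vector<double> & f)
{
  const int n = hap1.size();
  std::vector<double> count(f.size(), 0);

  for (int i = 0; i < n; i++)
    {
      // E-step: posterior probability of each consistent phase
      std::vector<double> pp( hap1[i].size() );
      double sum = 0;
      for (size_t z = 0; z < pp.size(); z++)
        {
          pp[z] = f[ hap1[i][z] ] * f[ hap2[i][z] ];
          if ( hap1[i][z] != hap2[i][z] ) pp[z] *= 2;   // two orderings
          sum += pp[z];
        }
      if ( sum <= 0 ) continue;

      // M-step accumulation: expected haplotype counts for this person
      for (size_t z = 0; z < pp.size(); z++)
        {
          count[ hap1[i][z] ] += pp[z] / sum;
          count[ hap2[i][z] ] += pp[z] / sum;
        }
    }

  for (size_t h = 0; h < f.size(); h++)
    f[h] = count[h] / ( 2.0 * n );          // 2N founder chromosomes
}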
if ( par::test_hap_TDT ) { trans.clear(); untrans.clear(); trans.resize(nh,0); untrans.resize(nh,0); } } ////////////////////////////////// // Some temporary descriptions long int cnt_phase = 0; long int cnt_hap = 0; long int cnt_ambig = 0; for (int w = 0; w < nw; w++) { for (int i = 0; i < P.n; i++) if ( include[i] && P.sample[i]->founder ) { cnt_phase += windows[w]->hap1[i].size(); if ( windows[w]->pp[i].size() > 0 ) cnt_ambig++; } cnt_hap += windows[w]->f.size(); } if (par::haplo_plem_verbose) { stringstream s2; s2 << "After meta-EM, in " << nw << " windows, " << cnt_phase << " total phases and " << cnt_hap << " haplotypes\n"; s2 << "Of these, " << cnt_ambig << " individual/windows still need resolving\n"; P.printLOG(s2.str()); } //////////////////// // And now finish break; } ////////////////////////////////////////////////////////////////// // Alternatively, in haplotyping mode, we want to keep track of // all non-rare haplotypes and accumulate ////////////////////////////////////////////////////////////// // If haplotyping (estimating haplotype frequencies) as opposed // to imputation, then we use an alternate strategy: we copy // HaploPhase results back into last window in this set, and // shift window forwards (copying only common haplotypes // (default=?) and phases (default=10%) ) if ( ! par::meta_large_phase ) { HaploWindow * thisWindow = windows[ finishWindow ]; thisWindow->f.clear(); thisWindow->hap.clear(); thisWindow->nh = 0; ///////////////////////////////// // Save only non-rare haplotypes map keptHaplotypes; keptHaplotypes.clear(); int new_h = 0; for (int h=0; h par::haplo_plem_meta_prune_haplotype ) { thisWindow->f.push_back(f[h]); thisWindow->hap.push_back(hap[h]); thisWindow->nh++; keptHaplotypes.insert( make_pair(h,new_h++) ); } } ///////////////////////////////////////////////////////////////// // Update list of SNPs this HaploWindow now points to (handling // any possible overlap) vector tmp; for(int w = startWindow; w <= finishWindow; w++ ) { HaploWindow * currentWindow = windows[w]; int start = par::haplo_plem_overlap; if (w==startWindow) start = 0; for (int s = start; s < currentWindow->ns; s++) tmp.push_back( currentWindow->S[s] ); } thisWindow->S = tmp; if ( hap[0].size() > 0 ) thisWindow->ns = hap[0].size(); else thisWindow->ns = 0; //////////////////////////////////////////////////// // For this new window, set the stub codes for each // new haplotype thisWindow->setStubCodes(); ///////////////////////////////////////////////////////// // Copy per-person information (phases and probabilities) for (int i = 0; i < P.n; i++) { thisWindow->pp[i].clear(); thisWindow->hap1[i].clear(); thisWindow->hap2[i].clear(); if ( pp[i].size() == 0 ) { thisWindow->pp[i] = pp[i]; if ( hap1[i].size() == 1 ) { map::iterator k1 = keptHaplotypes.find( hap1[i][0] ); map::iterator k2 = keptHaplotypes.find( hap2[i][0] ); if ( k1 != keptHaplotypes.end() && k2 != keptHaplotypes.end() ) { thisWindow->hap1[i].push_back( k1->second ); thisWindow->hap2[i].push_back( k2->second ); } } } else { for ( int z = 0 ; z < pp[i].size() ; z++ ) { if (pp[i][z] > par::haplo_plem_meta_prune_phase ) { map::iterator k1 = keptHaplotypes.find( hap1[i][z] ); map::iterator k2 = keptHaplotypes.find( hap2[i][z] ); if ( k1 != keptHaplotypes.end() && k2 != keptHaplotypes.end() ) { thisWindow->pp[i].push_back(pp[i][z]); thisWindow->hap1[i].push_back(k1->second); thisWindow->hap2[i].push_back(k2->second); } } } // No need to rescale these, as we do not use posterior // probs (or haplotype freqs) at this stage in any 
case // yet -- but otherwise, we will want to adjust to sum // to 1 if we edit out rare phases, or set ambig codes, // etc } } // Next individual } if ( par::haplo_plem_follow ) { VPHASE << "POST WINDOW STITCHING... " << startWindow << " to " << finishWindow << "\n"; verboseDisplayWindows(par::haplo_plem_follow_ind,false); VPHASE << "\n--------------------------------------\n"; } ///////////////////////////////////////// // Reset key variables in HaploPhase nh = 0; hap.clear(); hapi.clear(); f.clear(); hapmap.clear(); ///////////////////////////////////////// // Advance to next set of windows // Imputation? if ( par::meta_large_phase ) { if ( forwards_mode ) { ++startWindow; ++finishWindow; } else { --startWindow; --finishWindow; } } // or haplotyping? else { startWindow = finishWindow; finishWindow = startWindow + waplotypeSize - 1; if ( finishWindow >= nw ) finishWindow = nw-1; actual_nw = finishWindow - startWindow + 1; } } // loop while not finished ////////////////////////////////// // End of alternate EM algorithm return; } vector_t HaploPhase::performHaplotypeTests(bool display, Perm & perm) { vector_t statistic; if ( par::test_hap_GLM ) { return P.glmHaplotypeTest(display,perm); } // Weighted multimarker test (test a single Haplotype) if ( par::weighted_mm ) { if (par::test_hap_CC) haplotypicWeightedCC(); else if (par::test_hap_TDT) haplotypicWeightedTDT(); return statistic; } // Standard haplotype tests if ( !par::phase_hap_all ) { map tests; // Of the specific, prespecified haplotype? // Test only the specific haplotype if (f[test_hap] >= par::min_hf) { for (int h2=0; h2 < nh; h2++) { if (f[h2] >= par::min_hf) { if (test_hap==h2) tests.insert(make_pair(h2, 0)); else tests.insert(make_pair(h2, 1)); } } if (par::test_hap_CC) haplotypicCC(tests, 2, true); else if (par::test_hap_QTL) haplotypicQTL(tests, 2, true); else if (par::test_hap_TDT) haplotypicTDT(tests, 2, true); } } else { // Perform an omnibus test and all haplotype-specific // tests, if the --hap-all flag has been set // Omnibus test // tests[0] = 0 // tests[1] = 1 // tests[2] = 2 // ... // tests[h] = h map tests; int nch=0; for (int h=0; h < nh; h++) if (f[h] >= par::min_hf) { tests.insert(make_pair(h, nch++)); } if (nch>2) { if (par::test_hap_CC) haplotypicCC(tests, nch, true); else if (par::test_hap_QTL) haplotypicQTL(tests, nch, true); else if (par::test_hap_TDT) haplotypicTDT(tests, nch, true); } // Haplotype-specific test // tests[0] = 0 // tests[1] = 1,2,...,h // tests[0] = 1 // tests[1] = 0,2,3,...,h // etc for (int h=0; h < nh; h++) { if (f[h] >= par::min_hf) { tests.clear(); for (int h2=0; h2 < nh; h2++) { if (f[h2] >= par::min_hf) { if (h==h2) { tests.insert(make_pair(h2, 0)); } else tests.insert(make_pair(h2, 1)); } } if (par::test_hap_CC) haplotypicCC(tests, 2, true); else if (par::test_hap_QTL) haplotypicQTL(tests, 2, true); else if (par::test_hap_TDT) haplotypicTDT(tests, 2, true); } } } // end of --hap-all routine // Dummy return vector (for now) return statistic; } void HaploPhase::prunePhase(int i) { // Prune regional phases (HaploPhase) if ( (!include[i]) ||(!ambig[i])) return; double psum = 0; vector new_pp(0); vector new_h1(0); vector new_h2(0); for (int z=0; z < hap1[i].size(); z++) { if (pp[i][z] >= par::hap_min_phase_prob) { new_pp.push_back(pp[i][z]); psum += pp[i][z]; new_h1.push_back(hap1[i][z]); new_h2.push_back(hap2[i][z]); } } // Normalise? 
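// Pruning may have dropped some low-probability phases, so the
// surviving posterior probabilities no longer sum to 1; psum holds
// their total and each kept probability is rescaled by it below.
// For example, phase posteriors {0.70, 0.25, 0.05} with a 0.10
// cutoff keep the first two, rescaled to {0.70/0.95, 0.25/0.95}
// = {~0.737, ~0.263} (the 0.10 value is illustrative only; the
// actual cutoff is par::hap_min_phase_prob).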
if (pp[i].size() > new_pp.size() ) { for (int z=0; z < new_pp.size(); z++) new_pp[z] /= psum; } // Update pp[i] = new_pp; hap1[i] = new_h1; hap2[i] = new_h2; } set HaploPhase::makeSetFromMap(map & h) { // Wrapper function for dosage that takes a set // Extract indicated haplotypes (coded 0) set tests; map::iterator ih = h.begin(); while ( ih != h.end() ) { if ( ih->second == 0 ) tests.insert( ih->first ); ++ih; } return tests; } double HaploPhase::dosage(int i, set & h) { // Assume i and h are valid double d = 0; if ( ambig[i] ) { vector_t & posterior = pp[i]; vector & h1 = hap1[i]; vector & h2 = hap2[i]; for (int z = 0; z < h1.size(); z++) { if ( h.find( h1[z] ) != h.end() ) d += posterior[z]; if ( ! (haploid || (X && P.sample[i]->sex))) if ( h.find( h2[z] ) != h.end() ) d += posterior[z]; } } else { if ( h.find( hap1[i][0] ) != h.end() ) ++d; if ( ! (haploid || (X && P.sample[i]->sex))) if ( h.find( hap2[i][0] ) != h.end() ) ++d; } return d; } plink-1.07-src/webcheck.cpp0000644000265600020320000001174011264163125014752 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "sockets.h" using namespace std; extern string PVERSION; extern string PREL; #define PORT_NUM 80 #define IP_ADDR "155.52.206.11" #define GET_STRING "GET /~purcell/plink/version2.txt HTTP/1.1\nHost: pngu.mgh.harvard.edu\nConnection: close\n\n" void Plink::webcheck(CArgs & a) { #ifdef SKIP printLOG("Web-check not implemented on this system...\n"); return; #else ////////////////////////////////////////// // First look for a local .pversion file in // the local directory // Get today's date time_t curr=time(0); string tdstamp = (string)ctime(&curr); string buf; stringstream ss(tdstamp); vector date_tokens; while (ss >> buf) date_tokens.push_back(buf); string thisDate = date_tokens[0] + date_tokens[1] + date_tokens[2]; bool hasRecord = doesFileExist(".pversion"); //////////////////////////////////////////////////////// // Web-based message (but may be cached in local file) vector tokens; bool connect2web = true; printLOG("Web-based version check ( --noweb to skip )\n"); //////////////////////////////////////////// // If we have a record, are we up-to-date? if ( hasRecord ) { ifstream VER; VER.open(".pversion",ios::in); string oldDay, oldMonth, oldDate, webVersion; VER >> oldDay >> oldMonth >> oldDate; if ( thisDate == oldDay+oldMonth+oldDate ) { printLOG("Recent cached web-check found..."); connect2web = false; // Read rest of cached web message while ( ! 
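// (cache hit: the .pversion stamp matches today's date, so the rest
//  of the previously downloaded version message is replayed from the
//  local file rather than opening a socket to the version server;
//  otherwise socketConnection() below fetches version2.txt afresh
//  and the reply is re-cached)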
VER.eof() ) { string t; VER >> t; if (t=="") break; tokens.push_back(t); } } VER.close(); } if ( connect2web ) { //printLOG("Connecting to web to get version...\n"); tokens = socketConnection( this , IP_ADDR, PORT_NUM, GET_STRING); } bool print = false; bool print2 = false; bool version_okay = true; for (int i=0; i b.pt ) return false; if ( pu < b.pu ) return true; if ( pu > b.pu ) return false; if ( mt < b.mt ) return true; if ( mt > b.mt ) return false; if ( mu < b.mu ) return true; if ( mu > b.mu ) return false; return false; } }; class HaploPhase { public: Plink & P; int ns; // Number of SNPs in haplotype (region) int nw; // Numbner of windows in region int actual_nw; // If a subset if analysed int nh; // Number of possible haplotypes int nt; // Number of downcoded haplotypes int nsh; // Number of possible stub haplotypes int np; // Number of phases, diploid int haploid_np; // As above, haploid (do we need this?) string hname; // Name of haplotype locus int test_hap; // To be imputed haplotype bool X; // Sex chromosome code bool haploid; // Haploid chromosome code int cnt_f; // Number of founders to be phased int current; // Number of current haplotype being tested bool reference_only; // Only consider reference panel ////////////////////////////////////// // Lists of SNP sets (regions) // List of SNPs in haplotypes vector > new_pred_locus; // List of 'tag' haplotypes vector new_pred_allele; // List of weighted multimarker predictors vector > new_pred_weighted_allele; ////////////////////////////////////// // Region-wide haplotype information // Coding for each haplotype (and HaploWindow coding) vector > hap; vector > hapi; // List of predictor SNP numbers intvec_t S; // Estimated haplotype frequencies vector_t f; // Individual posterior probabilities matrix_t pp; vector > hap1; vector > hap2; // Lookup table for haplotype number given SNPs map,int> hapmapb; map,int > hapmap; // Phase markers and frequencies vector ph_hap1; vector ph_hap2; vector ph_freq; vector haploid_ph_hap1; vector haploid_ph_freq; // Whether individual has ambiguous phase for region // i.e. hap1[i].size() == 1 vector ambig; // Should we skip this person? vector include; // Downcoding? 
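// When subhaplotypes is set, the full haplotype space is collapsed
// ("downcoded") onto a smaller set of test categories; downcoding
// appears to map each original haplotype code onto its downcoded
// code (nt above is the number of downcoded haplotypes).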
bool subhaplotypes; map downcoding; //////////////////////////////////////// // For EM, region is split into windows vector windows; int startWindow; int finishWindow; //////////////////////////////////////// // For EM, region is split into windows void enumeratePhasedWindows(int); bool makeWaplotype(vector &, vector &); //////////////////////////////////////// // (Regional) haplotype association // Transmission/untransmission counts vector trans; vector untrans; vector > phasemap; //////////////////////////////////////// // (Regional) haplotype imputation (ML) vector new_map; vector actual_map; vector > new_one; vector > new_two; //////////////////////////////////////// // In segment-tracking-mode, individuals int p1; int p2; bool homozyg; // Output files: haplotype frequencies ofstream HFRQ; ofstream HTEST; ofstream HIMPUTE; ofstream HPHASE; ofstream VPHASE; ofstream WGT; // Temporary storage for chi-sqs from haplotype tests // and odds ratio (haplotype specific tests) double result; double pvalue; double odds; double case_freq; // also T:U double control_freq; HaploPhase(Plink & P_) : P(P_) { ambig.resize(P.n, false); include.resize(P.n, true); pp.resize(P.n); hap1.resize(P.n); hap2.resize(P.n); X=haploid = false; subhaplotypes = false; useEmpiricalVariance = true; reference_only = false; nonfounders = false; } // Read list of tests/tags void readTagFile(); // Make sliding window list of tests void makeSlidingWindow(string); // Set a specific set of SNPs based on a command line void setSpecificSNPs(string); // Display haplotype frequencies void calculateHaplotypeFrequencies(); // Track shared haplotypes void trackSharedHaplotypes(); void trackThisSegment(); vector_t trackedIBS; vector trackedN; // Make test set for haplotype tests map makeTestSet(boolvec_t &, boolvec_t &); // Make subhaplotype identity set map makeSubHaplotypeSet(boolvec_t &); // Return subhaplotype name, formatted string getSubHaplotypeName(boolvec_t &, boolvec_t &, int); // Perform haplotype tests vector_t performHaplotypeTests(bool,Perm&); // Impute all haplotypes void imputeAllHaplotypes(); // Display haplotype phases void calculatetHaplotypePhases(); // Verbose displays void verboseDisplayWindows(int i, bool use_ref = true ); // Return of dosage for a single or set of haplotypes double dosage(int i, set & h); set makeSetFromMap(map & h); void reset() { ns = nh = np = 0; test_hap = -1; for (int i=0; i=0) return f[test_hap]; else return -1; } // Main routine to driver phasing (also set up for assoc/perm testing) vector_t phaseAllHaplotypes(bool, Perm&); // Given list of SNP numbers, set up all possible haplotypes (hap) void enumerateHaplotypes(vector&); // Give list of haplotypes in each phase void enumerateAllPhases(); // Set test haplotype (query hap with allele string) void setTestHaplotype(string); // Return possible haplotype list vector returnHaplotypes(vector&); // Don't include individuals missing too much information void includeIndividuals( int i ); // Construct and determine inclusion for nonfounder void validateNonfounder(int, vector &, vector &); // Determine possible haplotype phases for an individual void enumerateNonfounderPhase(int, vector&,vector&, int,int, int,int, vector&,vector&); // Possible offspring haplotypes given parents? (NOT USED) bool consistentNonfounderPhaseGivenParents(int,int,int,int,int,int,int); // Possible offspring haplotypes given offspring genotypes? 
bool consistentNonfounderPhaseGivenGenotypes(vector&,vector&,int,int); // Possible offspring haplotypes given offspring genotypes, exception for X? bool consistentNonfounderMalePhaseGivenXGenotypes(vector &,vector & s2,int); // Score transmissions void transmissionCount(int, map & ); void scoreTransmissions(int,int,int,int,int,int,vector&,vector&); // Get rid of unlikely phases void prunePhase(int); // Check for unlikely genotypes void queryGenotype(int); void queryThisGenotype(int,int,int,vector_t&); // Phase non-founders, given we have haplotype frequencies // and score/rescore for TDT void phaseAndScoreNonfounder(int); // Use offspring information to help resolve parental phase void resolveWithKids(int); // E-M algorithm void performEM_original(); void performAlternEM(); // Report haplotype phase for an individual void reportPhase(); // Report haplotype phase for an individual, alternate format void reportPhaseWideFormat(); // Report haplotype frequencues void reportHaplotypeFrequencies(); // Impute most likely genotype given a set of windows void mainImputation(); // Helper function in stitching together windows in imputation mode void updateForImputation(); void imputeThisHaplotype(int); double imputeHaplotypes(int, bool&, bool&); vector_t imputeGenotype(int, int); ///////////////////////////////////////////// // Convenience functions to report LD, freqs double rsq(int, int); double dprime(int, int); double rsq_internal(int, int); double rsq_internal(boolvec_t &, boolvec_t &, boolvec_t &, boolvec_t &); double freq(boolvec_t &, boolvec_t &); bool calculateDp; ////////////////////////////////////// // Haplotype-based association tests void haplotypicCC(map &, int, bool); void haplotypicWeightedCC(); void haplotypicTDT(map &, int, bool); void haplotypicWeightedTDT(); void haplotypicQTL(map &, int, bool); map testSet; set sets; // as above, slightly diff. specification int validN; // Number of non-missing founders // Perform non-founder fill-in phasing? bool nonfounders; // Empirical variance of dosage bool useEmpiricalVariance; void calculateEmpiricalVariance(int); void calculateEmpiricalVariance(set&); set returnHaplotypeSet(boolvec_t &, boolvec_t &); double empiricalVariance; double ratio; // TDT empirical variance stores vector_t transmissionX; vector_t transmissionX2; double transmissionTotal; }; #endif plink-1.07-src/homozyg.cpp0000644000265600020320000020256511264127625014707 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" #include "stats.h" #include "cnv.h" extern Plink * PP; namespace std { template<> class less { public: bool operator()(Segment const* s1, Segment const* s2) { if ( s1->start > s2->start ) return true; else if ( s1->start < s2->start ) return false; else if ( s1->finish > s2->finish ) return true; else if ( s1->p1 < s2->p1 ) return true; else if ( s1->p1 > s2->p1 ) return false; else if ( s1->p2 < s2->p2 ) return true; else if ( s1->p2 > s2->p2 ) return false; else return false; } }; }; class Pool { public: set > segs; vector > match; vector matchcount; vector group; vector index; int chr; int ng; int min; int max; int union_min; int union_max; }; namespace std { template<> class less { public: bool operator()(Pool const* p1, Pool const* p2) { if ( p1->segs.size() > p2->segs.size() ) return true; else if ( p1->segs.size() < p2->segs.size() ) return false; else { set::iterator s1 = p1->segs.begin(); set::iterator s2 = p2->segs.begin(); while ( s1 != p1->segs.end() ) { if ( (*s1)->start > (*s2)->start ) return true; else if ( (*s1)->start < (*s2)->start ) return false; else if ( (*s1)->finish > (*s2)->finish ) return true; else if ( (*s1)->finish < (*s2)->finish ) return false; s1++; s2++; } } return false; } }; }; vector_t compareCNVs(CNVIndivReport & a, CNVIndivReport & b) { vector_t res(8); // t1 = # events in cases // t2 = proportion of sample with 1+ event // t6 = proportion of sample with at least gene // t7 = if ( par::segment_test_1sided ) { res[0] = a.t1 - b.t1; // RATE res[1] = a.t2 - b.t2; // PROP res[2] = a.t3 - b.t3; // KBTOT res[3] = a.t4 - b.t4; // KBAVG res[4] = a.t5 - b.t5; // GRATE res[5] = a.t6 - b.t6; // GPROP res[6] = a.t7 - b.t7; // GRICH res[7] = a.t8 - b.t8; // GRICH2 } else { res[0] = fabs( a.t1 - b.t1 ); res[1] = fabs( a.t2 - b.t2 ); res[2] = fabs( a.t3 - b.t3 ); res[3] = fabs( a.t4 - b.t4 ); res[4] = fabs( a.t5 - b.t5 ); res[5] = fabs( a.t6 - b.t6 ); res[6] = fabs( a.t7 - b.t7 ); res[7] = fabs( a.t8 - b.t8 ); } // cout << "DETS: " // << a.count << " " << b.count << " -- G= " // << a.t5 << " " << b.t5 << " -- B= " // << a.t8 << " " << b.t8 << " -- EG= " // << a.t9 << " " << b.t9 << " EB= " // << a.t10 << " " << a.t10 << "\n"; return res; } // Helper function void summaryIndivSummaries(Plink * P, int kmask, map & segmentCount, map & segmentLength, CNVIndivReport & a, CNVIndivReport & u, vector_t & res) { // Optionally only select individuals with kmask value, // unless kmask < 0 // Return 2 (case/control) x object with // for ( int i = 0; i < P->n; i++) { Individual * person = P->sample[i]; if ( kmask >= 0 ) { if ( person->sol != kmask ) continue; } indivPair t; t.p1 = t.p2 = P->sample[i]; map::iterator ic = P->segmentCount.find(t); map::iterator il = P->segmentLength.find(t); map::iterator ic2 = P->segmentCount2.find(t); map::iterator ic2b; if ( par::cnv_count_baseline ) ic2b = P->segmentCount2Baseline.find(t); //indivPair p = ic->first; if ( person->pperson->aff ) ++a.n; else ++u.n; if ( ic == P->segmentCount.end() ) continue; CNVIndivReport * pstat = person->pperson->aff ? &a : &u; // Basic CNV properties pstat->t1 += ic->second; pstat->t2++; pstat->t3 += il->second; pstat->t4 += il->second / (double)ic->second; // Geneset count statistics pstat->t5 += ic2->second; pstat->t6 += ic2->second > 0 ? 
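// (the ternary below adds 1 only for individuals whose CNVs overlap
//  at least one gene/region; t1..t8 feed the RATE, PROP, KBTOT,
//  KBAVG, GRATE, GPROP, GRICH and GRICH2 per-individual tests
//  compared by compareCNVs() above)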
1 : 0; pstat->t7 += (double)ic2->second / (double)il->second; pstat->t9 += PP->expectedOverlap[i]; // cout << " overlap = " << ic2->second << " of " << PP->expectedOverlap[i] << "\n"; // Baseline/comparator geneset counts if ( par::cnv_count_baseline ) { if ( ic2b->second>0) pstat->count_baseline++; pstat->t8 += ic2b->second; pstat->t10 += PP->expectedOverlapBaseline[i]; // cout << "bline = " << ic2b->second << " " << PP->expectedOverlapBaseline[i] << "\n"; } pstat->count++; } // Next individual // edit to make denom of t8 test total # of individuals // a.count_baseline = a.count; // u.count_baseline = u.count; // Save actual segment counts before we make the means a.segCount = (int)a.t1; u.segCount = (int)u.t1; //////////////////////////////////////// // Get means a.calculateResults(); u.calculateResults(); res = compareCNVs(a,u); } ///////////////////////////////////////////// // Entrypoint for all homozygosity run tests void Plink::findAllHomozygousRuns(Perm & perm) { if (par::SNP_major) SNP2Ind(); string f = par::output_file_name + ".hom"; // Calculate or read homozygous segments from a file? if ( ! par::read_segment_file ) { ofstream HOM; HOM.open(f.c_str(),ios::out); HOM.precision(3); HOM.setf(ios::fixed); printLOG("\nWriting homozygosity-run information to [ "+f+" ] \n"); printLOG("Run defined as: "); if (par::homo_run_kb) { printLOG(int2str(par::homo_run_length_kb)+" kb "); if (par::homo_run_snps) printLOG(", "); } if (par::homo_run_snps) printLOG(int2str(par::homo_run_length_snps)+" SNPs"); printLOG("\n"); // printLOG("Allowing "+int2str(par::homo_run_het)+" hets per run\n"); HOM << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(8) << "PHE" << " " << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(par::pp_maxsnp) << "SNP2" << " " << setw(12) << "POS1" << " " << setw(12) << "POS2" << " " << setw(10) << "KB" << " " << setw(8) << "NSNP" << " " << setw(8) << "DENSITY" << " " << setw(8) << "PHOM" << " " << setw(8) << "PHET" << "\n"; printLOG("Homozygous segment criteria:\n"); printLOG(" length (kb) = " + int2str(par::homo_run_length_kb) + "\n"); printLOG(" # SNPs (N) = " + int2str(par::homo_run_length_snps) + "\n"); printLOG(" density (kb/SNP) = " + dbl2str(par::homo_run_density) + "\n"); printLOG(" largest gap (kb) = " + int2str(par::homo_run_gap) + "\n"); // Rescale GAP to base-pairs par::homo_run_gap *= 1000; // Find segments for each individual for (int i1=0; i1start > s2->start ? s1->start : s2->start; int finish = s1->finish < s2->finish ? s1->finish : s2->finish; // Assume individual major mode for (int l=start; l<= finish; l++) { // Only test homozygotes if ( s1->p1->one[l] == s1->p1->two[l] && s2->p1->one[l] == s2->p1->two[l] ) if ( s1->p1->one[l] != s2->p1->one[l] ) mismatch++; } if ((double)mismatch/(double)(finish-start+1) > 1-par::fuzzy_homo ) return false; else return true; } // Homozygosity Match, supplying pool consensus start/finish bool segsIBDMatchCON(Segment * s1, Segment * s2, int start, int finish) { int match = 0; int valid = 0; // 1. Determine which allele is shared for each pair // 2. 
See whether this is the same allele between pairs // Assume individual major mode for (int l=start; l<= finish; l++) { // Ignore is 1+ missing or 2 hets bool a1 = s1->p1->one[l]; bool a2 = s1->p1->two[l]; bool b1 = s1->p2->one[l]; bool b2 = s1->p2->two[l]; bool c1 = s2->p1->one[l]; bool c2 = s2->p1->two[l]; bool d1 = s2->p2->one[l]; bool d2 = s2->p2->two[l]; bool allele1 = false; bool allele2 = false; // Any missing alleles? if ( (a1 && (!a2)) || (b1 && (!b2)) || (c1 && (!c2)) || (d1 && (!d2)) ) continue; // Any double hets within each pair? if ( ((!a1) && a2) && ((!b1) && b2) ) continue; if ( (!c1) && c2 && (!d1) && d2 ) continue; // Any opposing homozygotes within pair? if ( (a1 == a2) && (b1 == b2) && (a1 != b1) ) continue; if ( (c1 == c2) && (d1 == d2) && (c1 != d1) ) continue; // Get the alleles for the pairs // (i.e. from the homozygote) if ( a1 == a2 ) allele1 = a1; else allele1 = b1; if ( c1 == c2 ) allele2 = c1; else allele2 = d1; // Do these match? if ( allele1 == allele2 ) match++; valid++; } if ( validstart > s2->start ? s1->start : s2->start; int finish = s1->finish < s2->finish ? s1->finish : s2->finish; // 1. Determine which allele is shared for each pair // 2. See whether this is the same allele between pairs // Assume individual major mode for (int l=start; l<= finish; l++) { // Ignore is 1+ missing or 2 hets bool a1 = s1->p1->one[l]; bool a2 = s1->p1->two[l]; bool b1 = s1->p2->one[l]; bool b2 = s1->p2->two[l]; bool c1 = s2->p1->one[l]; bool c2 = s2->p1->two[l]; bool d1 = s2->p2->one[l]; bool d2 = s2->p2->two[l]; bool allele1 = false; bool allele2 = false; // Any missing alleles? if ( (a1 && (!a2)) || (b1 && (!b2)) || (c1 && (!c2)) || (d1 && (!d2)) ) continue; // Any double hets within each pair? if ( ((!a1) && a2) && ((!b1) && b2) ) continue; if ( (!c1) && c2 && (!d1) && d2 ) continue; // Any opposing homozygotes within pair? if ( (a1 == a2) && (b1 == b2) && (a1 != b1) ) continue; if ( (c1 == c2) && (d1 == d2) && (c1 != d1) ) continue; // Get the alleles for the pairs // (i.e. from the homozygote) if ( a1 == a2 ) allele1 = a1; else allele1 = b1; if ( c1 == c2 ) allele2 = c1; else allele2 = d1; // Do these match? if ( allele1 == allele2 ) match++; valid++; } if ( validp1->one[l] == s1->p1->two[l] && s2->p1->one[l] == s2->p1->two[l] ) if ( s1->p1->one[l] != s2->p1->one[l] ) mismatch++; } if ((double)mismatch/(double)(finish-start+1) > 1-par::fuzzy_homo ) return false; else return true; } void displayPoolVerbose( Plink & P, Pool * pool , ofstream & OUT ) { // Figure out list of individuals, for the maximal region // i.e. union rather than intersection of the pool of segs set pset; vector > pgrpset( pool->ng ); vector plist; vector pstart; vector pend; vector pgrp; // Loop over each group in the pool for ( int g = 0; g < pool->ng; g++) { // Consider all segments in this pool set::iterator s = pool->segs.begin(); int c2=0; while ( s != pool->segs.end() ) { // Not in group 'g' ? 
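// Segments whose group code equals g contribute here: each member of
// the pair (p1, p2) is added once to the unique person list (plist,
// with its segment span) and to this group's own set pgrpset[g],
// which is used later when printing group-specific output.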
if ( pool->group[c2] == g ) { if ( pset.find( (*s)->p1 ) == pset.end() ) { pset.insert( (*s)->p1 ); plist.push_back( (*s)->p1 ); pstart.push_back( (*s)->start ); pend.push_back( (*s)->finish ); pgrp.push_back( g ); } if ( pset.find( (*s)->p2 ) == pset.end() ) { pset.insert( (*s)->p2 ); plist.push_back( (*s)->p2 ); pstart.push_back( (*s)->start ); pend.push_back( (*s)->finish ); pgrp.push_back( g ); } // Add to group-specfic unique individual list if ( pgrpset[g].find( (*s)->p1 ) == pgrpset[g].end() ) { pgrpset[g].insert( (*s)->p1 ); } if ( pgrpset[g].find( (*s)->p2 ) == pgrpset[g].end() ) { pgrpset[g].insert( (*s)->p2 ); } } // Next segment s++; c2++; continue; } } // We now a have a unique list of individuals, in the same order in plist as // the grouping variable; now display, each row is a SNP // Header row (including IDs) OUT << setw(6) << " " << " " << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(4) << "GRP" << " " << "\n"; for (int i=0; i < plist.size(); i++) { OUT << setw(6) << int2str(i+1)+") " << " " << setw(par::pp_maxfid) << plist[i]->fid << " " << setw(par::pp_maxiid) << plist[i]->iid << " "; // Display the group(s) this individual belongs to bool any = false; for ( int g = 1 ; g < pool->ng; g++ ) { if ( pgrpset[g].find(plist[i]) != pgrpset[g].end() ) { if (!any) any = true; else OUT << ", "; OUT << g; } } OUT << "\n"; } OUT << "\n" << setw(par::pp_maxsnp) << "SNP" << " "; for (int i=0; i < plist.size(); i++) OUT << setw(5) << int2str(i+1)+" " << " "; OUT << "\n\n"; /////////////////////////////////////////////////////// // Forcing a single pool? In this case, only display // the selected region, rather than the whole thing // (note: we have already displayed the union in the // *.overlap file, so okay to change this now if ( par::force_span ) { pool->union_min = par::segment_snp1; pool->union_max = par::segment_snp2; } /////////////////////////////////////////////////////// // Display all SNPs for (int l = pool->union_min ; l <= pool->union_max ; l++) { if (! par::force_span ) if ( l == pool->min ) OUT << "\n"; OUT << setw(par::pp_maxsnp) << P.locus[l]->name << " "; // Consider all individuals (in plist) for (int i=0; i < plist.size(); i++) { if ( l >= pstart[i] && l <= pend[i] ) OUT << setw(5) << "["+genotype(P,plist[i],l)+"]" << " "; else OUT << setw(5) << " "+genotype(P,plist[i],l)+" " << " "; } OUT << "\n"; if (! par::force_span ) if ( l == pool->max ) OUT << "\n"; } OUT << "\n\n"; /////////////////////////////////////////////////////// // Finally, find the consensus haplotype for each group // For example, pair within a group, for their overlap only // count number of greatest allele of the 4 // Exactly the same principle applies whether looking at homozygous // or heterozygous runs; this will implicitly ignore double hets; // and provide a sensible way of dealing with missing data, hets in // homozygous runs, etc vector ghap(pool->ng,""); for ( int g = 1; g < pool->ng; g++) { OUT << "Group " << g << "\n\n"; for (int i=0; i < plist.size(); i++) { // Is this individual in this group? 
if ( pgrpset[g].find(plist[i]) != pgrpset[g].end() ) { Individual * person = plist[i]; OUT << setw(6) << int2str(i+1)+") " << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->fid << " " << setw(8) << person->phenotype << "\n"; } } OUT << "\n"; OUT << "\n" << setw(par::pp_maxsnp) << "SNP" << " " << setw(6) << " " << " "; for (int i=0; i < plist.size(); i++) if ( pgrpset[g].find(plist[i]) != pgrpset[g].end() ) OUT << setw(5) << int2str(i+1)+" " << " "; OUT << "\n\n"; // Consider each position for (int l = pool->union_min ; l <= pool->union_max ; l++) { if (! par::force_span ) if ( l == pool->min ) OUT << "\n"; OUT << setw(par::pp_maxsnp) << P.locus[l]->name << " "; // Loop over each group in the pool (start from 1..ng) // (0 is unassigned) // Keep track of most likely allele for this position for this group int a1count = 0 ; int a2count = 0 ; // Consider all segments in this pool set::iterator s = pool->segs.begin(); int c2=0; while ( s != pool->segs.end() ) { // Only count of this segment is spanning this particular // position if ( l >= (*s)->start && l <= (*s)->finish ) { // Not in group 'g' ? if ( pool->group[c2] == g ) { // Consider genotypes of p1, p2: get count of 4 alleles bool a1 = (*s)->p1->one[l]; bool a2 = (*s)->p1->two[l]; bool b1 = (*s)->p2->one[l]; bool b2 = (*s)->p2->two[l]; // Homozygote? if ( a1 == a2 ) { if ( a1 ) a2count++; else a1count++; } if ( b1 == b2 ) { if ( b1 ) a2count++; else a1count++; } } } // Next segment s++; c2++; continue; } // Next segment in group /////////////////////////////////////////// // Display this group's consensus haplotype string str = "?"; if ( a1count > a2count ) str = P.locus[l]->allele1; else if ( a2count > a1count ) str = P.locus[l]->allele2; OUT << setw(2) << str << " " << setw(4) << " " << " "; ghap[g] += str; /////////////////////////////////////////// // Consider all individuals (in plist) for (int i=0; i < plist.size(); i++) { if ( pgrpset[g].find(plist[i]) != pgrpset[g].end() ) //if ( pgrp[i] == g ) { if ( l >= pstart[i] && l <= pend[i] ) OUT << setw(5) << "["+genotype(P,plist[i],l)+"]" << " "; else OUT << setw(5) << " "+genotype(P,plist[i],l)+" " << " "; } } OUT << "\n"; if (! par::force_span ) if ( l == pool->max ) OUT << "\n"; } // Next SNP OUT << "\n"; } // Next group OUT << "\n\n"; // Consider each position int lp=0; for (int l = pool->union_min ; l <= pool->union_max ; l++) { if (! par::force_span ) if ( l == pool->min ) OUT << "\n"; OUT << setw(par::pp_maxsnp) << P.locus[l]->name << " "; for ( int g = 1; g < pool->ng; g++) { OUT << ghap[g][lp] << " "; } lp++; OUT << "\n"; if (! 
par::force_span ) if ( l == pool->max ) OUT << "\n"; } } void Plink::groupSegmentsSpanning(int l) { // Find list of segments spanning SNP 'l' // and return as a vector of ints -- this // is just a convenience wrapper around // summariseHomoRuns() par::force_span = true; par::segment_current_focal_snp = l; par::segment_silently_return_groups = true; bool old_silent = par::silent; par::silent = true; // A value of -1 means no segment indivSegmentGroup.resize(n); for (int i=0; i 1e-6) { printLOG("Requiring allelic match, threshold " + dbl2str(par::fuzzy_homo)+" identity\n"); printLOG("Requiring at least " + int2str(par::genome_test_min_snp) + " informative SNPs to match segments\n"); } else { printLOG("Not requiring allelic match (overlapping only)\n"); par::fuzzy_homo = 0; } } if (par::segment_overlap) HOM << setw(5) << "POOL" << " " << setw(par::pp_maxfid) << "FID1" << " " << setw(par::pp_maxiid) << "IID1" << " " << setw(par::pp_maxfid) << "FID2" << " " << setw(par::pp_maxiid) << "IID2" << " " << setw(8) << "PHE" << " " << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(par::pp_maxsnp) << "SNP2" << " " << setw(14) << "BP1" << " " << setw(14) << "BP2" << " " << setw(8) << "KB" << " " << setw(8) << "NSNP" << " " << setw(4) << "NSIM" << " " << setw(6) << "GRP" << "\n"; if ( par::cnv_list ) HOM << setw(5) << "POOL" << " " << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(8) << "PHE" << " " << setw(4) << "CHR" << " " << setw(14) << "BP1" << " " << setw(14) << "BP2" << " " << setw(8) << "KB" << " " << setw(6) << "TYPE" << " " << setw(8) << "SCORE" << "\n"; else HOM << setw(5) << "POOL" << " " << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(8) << "PHE" << " " << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP1" << " " << setw(par::pp_maxsnp) << "SNP2" << " " << setw(14) << "BP1" << " " << setw(14) << "BP2" << " " << setw(8) << "KB" << " " << setw(8) << "NSNP" << " " << setw(4) << "NSIM" << " " << setw(6) << "GRP" << "\n"; } // Make groups of segments // Group by maximally over-lapping region // Both irrespective of alleles // And considering alleles // i.e. all segments in a group must have at least one position // where they overlap Then group alleles within pool based on // allelic identity // ---- ----- // ---- ---------- // ----------------------- // | | // A unique list of overlapping segments set > pools; ///////////////// // 1. Make sets // Consider each position int start = 0; int finish = nl_all-1; if ( par::segment_silently_return_groups ) start = finish = par::segment_current_focal_snp; else { if ( par::segment_m1 != "" ) { par::segment_snp1 = getMarkerNumber((*this),par::segment_m1); if (par::segment_snp1==-1) error("--segment-from {marker} not found"); start = par::segment_snp1; } if ( par::segment_m2 != "" ) { par::segment_snp2 = getMarkerNumber((*this),par::segment_m2); if (par::segment_snp2==-1) error("--segment-to {marker} not found"); finish = par::segment_snp2; } start = start > finish ? finish : start; } // A sequential position scan (i.e. to scan whole region, // potentially find non-overlapping pools, or force everything into // a single pool? if ( par::force_span ) { Pool * thispool = new Pool; // Which segments contain this SNP? 
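// When par::force_span is set, a single pool is built from every
// segment that overlaps the requested [start, finish] region; in the
// general case (the else branch below) one candidate pool is built
// per position, holding all segments spanning that SNP, and pools
// with fewer than par::pool_size_min segments are discarded.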
vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->start <= finish && s->finish >= start ) { thispool->segs.insert( &(*s) ); } s++; } // Add this pool to the overall list, if it is unique if ( thispool->segs.size() >= par::pool_size_min ) { pools.insert( thispool ); } else { delete thispool; if ( par::segment_silently_return_groups) return; printLOG("No segments found in the spanned region, exiting now.\n"); return; } } else { for (int l=start; l<=finish; l++) { if (!par::silent) cout << "Considering position " << l+1 << " of " << nl_all << " \r"; Pool * thispool = new Pool; // Which segments contain this SNP? vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->start <= l && s->finish >= l ) { thispool->segs.insert( &(*s) ); } s++; } // Add this pool to the overall list, if it is unique if ( thispool->segs.size() >= par::pool_size_min ) { pools.insert( thispool ); } else delete thispool; } if (!par::silent) cout << "\n"; printLOG("Found " +int2str(pools.size()) +" unique overlapping sets of segments\n"); } ////////////////////////////////////////////////////// // 2. Prune, i.e. drop AB if ABC and/or ABD exists? // Start at end (smallest set) and go upwards vector redundant(pools.size(),false); set::reverse_iterator p = pools.rbegin(); int c=pools.size()-1; while ( p != pools.rend()) { if (!par::silent) cout << c << " pools left to prune \r"; // Do we find all segments in this pool in a larger pool? set::iterator p2 = pools.begin(); int c2=0; while ( p2 != pools.end() ) { // Already redundant if ( c2 >= c || redundant[c2] ) { c2++; p2++; continue; } // Same pool if ( *p == *p2 ) { c2++; p2++; continue; } set::iterator s = (*p)->segs.begin(); bool embedded = true; while ( s != (*p)->segs.end() ) { // Do we find this segment? set::iterator s2 = (*p2)->segs.begin(); bool found = false; while ( s2 != (*p2)->segs.end() ) { if ( *s == *s2 ) { found = true; break; } s2++; } if ( ! found ) embedded = false; s++; } if (embedded) { redundant[c] = true; break; } c2++; p2++; } c--; p++; } if (!par::silent) cout << "\n"; ////////////////////////////////////////////// // 3. Determine allelic matching // Determine consensus region for each pool set::iterator pC = pools.begin(); while ( pC != pools.end()) { (*pC)->min=0; (*pC)->max=nl_all; (*pC)->union_min=nl_all; (*pC)->union_max=0; set::iterator s = (*pC)->segs.begin(); while ( s != (*pC)->segs.end() ) { (*pC)->min = (*s)->start > (*pC)->min ? (*s)->start : (*pC)->min; (*pC)->max = (*s)->finish < (*pC)->max ? (*s)->finish : (*pC)->max; (*pC)->union_min = (*s)->start < (*pC)->union_min ? (*s)->start : (*pC)->union_min; (*pC)->union_max = (*s)->finish > (*pC)->union_max ? (*s)->finish : (*pC)->union_max; s++; } pC++; } // Find matches set::iterator pA = pools.begin(); int cA=0; while ( pA != pools.end()) { // Skip this pool? 
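// Pools flagged as redundant in step 2 (i.e. wholly contained in a
// larger pool) are skipped; for each remaining pool every pair of
// segments is compared with segsMatch()/segsIBDMatch() (or their
// *CON consensus-region variants) and matching pairs are recorded.
// The match lists are then grouped greedily: the ungrouped segment
// with the most matches becomes the index segment ('*' in the
// output) of a new group and its matches join that group, repeating
// until every segment is assigned.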
if ( redundant[cA] ) { cA++; pA++; continue; } // For this pool, consider all segments in the pool in a pairwise // manner, and populate the match matrix set::iterator s1 = (*pA)->segs.begin(); int c1=0; // Resize matching variables (*pA)->match.resize( (*pA)->segs.size() ); for (int i=0; i< (*pA)->segs.size(); i++) (*pA)->match[i].clear(); (*pA)->group.resize( (*pA)->segs.size() , 0 ); (*pA)->matchcount.resize((*pA)->segs.size() , 0 ); (*pA)->index.resize((*pA)->segs.size() , false ); // Conisder each pair of segments while ( s1 != (*pA)->segs.end() ) { set::iterator s2 = (*pA)->segs.begin(); int c2=0; while ( s2 != (*pA)->segs.end() ) { if ( c2 >= c1 ) { s2++; c2++; continue; } // Determine match function: // Based on homozygosity or pairwise sharing // Based on whole segments, or just pool consensus region if (par::segment_overlap) { // PAIRWISE SEGMENTAL MATCH if ( par::homo_run_consensus_match) { // Consensus match if ( segsIBDMatchCON( *s1, *s2, (*pA)->min, (*pA)->max ) ) { (*pA)->match[c1].push_back( c2 ); (*pA)->match[c2].push_back( c1 ); (*pA)->matchcount[c1]++; (*pA)->matchcount[c2]++; } } else // else match whole segments (default) { if ( segsIBDMatch( *s1, *s2 ) ) { (*pA)->match[c1].push_back( c2 ); (*pA)->match[c2].push_back( c1 ); (*pA)->matchcount[c1]++; (*pA)->matchcount[c2]++; } } } else { // HOMOZYGOSITY MATCH // If a match, add pairs to lists if ( par::homo_run_consensus_match) { // Consensus match if ( segsMatchCON( *s1, *s2, (*pA)->min, (*pA)->max ) ) { (*pA)->match[c1].push_back( c2 ); (*pA)->match[c2].push_back( c1 ); (*pA)->matchcount[c1]++; (*pA)->matchcount[c2]++; } } else // else match whole segments (default) { if ( segsMatch( *s1, *s2 ) ) { (*pA)->match[c1].push_back( c2 ); (*pA)->match[c2].push_back( c1 ); (*pA)->matchcount[c1]++; (*pA)->matchcount[c2]++; } } } // Next segment (B) s2++; c2++; } // Next segment (A) s1++; c1++; } // Parse the list bool done = false; (*pA)->ng = 1; while ( ! done ) { // Find largest, ungrouped list int maxlist = 0; int maxlisti = -1; for (int i=0; i < (*pA)->group.size() ; i++) { if ( (*pA)->group[i] == 0 ) { if ( (*pA)->match[i].size() >= maxlist ) { maxlist = (*pA)->match[i].size(); maxlisti = i; } } } // Set group for this, and matches (*pA)->index[maxlisti] = true; (*pA)->group[maxlisti] = (*pA)->ng; for ( int j=0; j < maxlist; j++) (*pA)->group[ (*pA)->match[maxlisti][j] ] = (*pA)->ng; // Advance to next group (*pA)->ng++; // Are all segments grouped? bool ungroup = false; for (int i=0; i < (*pA)->group.size() ; i++) { if ( (*pA)->group[i] == 0 ) { ungroup = true; //break; } } if ( ! ungroup ) done = true; } // Remove temporary storage (*pA)->match.clear(); // Next pool pA++; cA++; } ////////////////////////////////////////// // Populate indivSegmentGroup and return? if ( par::segment_silently_return_groups ) { // Should only be a single pool -- but for now, let's ignore and // just use this code... check later... set::iterator p2 = pools.begin(); c=0; while ( p2 != pools.end()) { if (redundant[c]) { c++; p2++; continue; } // Loop over each group in the pool for ( int g = 0; g < (*p2)->ng; g++) { // Consider all segments in this pool set::iterator s = (*p2)->segs.begin(); int c2=0; while ( s != (*p2)->segs.end() ) { // Not in group 'g' ? if ( (*p2)->group[c2] != g ) { s++; c2++; continue; } // Find person, given pointer.... yes, yes, I know // this is a terrible way to do things... for now // just assume homozygous segments (i.e. 
p1==p2) for (int i=0; ip1 ) indivSegmentGroup[i] = (*p2)->group[c2]; } s++; c2++; } // Next segment } // Next group c++; p2++; } // Next pool return; } ////////////// // 4. Display set::iterator p2 = pools.begin(); c=0; while ( p2 != pools.end()) { if (redundant[c]) { c++; p2++; continue; } int ncase=0; int ncontrol=0; // Loop over each group in the pool for ( int g = 0; g < (*p2)->ng; g++) { // Consider all segments in this pool set::iterator s = (*p2)->segs.begin(); int c2=0; while ( s != (*p2)->segs.end() ) { // Not in group 'g' ? if ( (*p2)->group[c2] != g ) { s++; c2++; continue; } HOM << setw(5) << "S"+int2str(c+1) << " "; if (par::segment_overlap) { HOM << setw(par::pp_maxfid) << (*s)->p1->fid << " " << setw(par::pp_maxiid) << (*s)->p1->iid << " " << setw(par::pp_maxfid) << (*s)->p2->fid << " " << setw(par::pp_maxiid) << (*s)->p2->iid << " "; if (par::bt) { if ( (*s)->p1->missing || (*s)->p2->missing ) HOM << setw(8) << "NA" << " "; else if ( (!(*s)->p1->aff) && (!(*s)->p2->aff) ) { HOM << setw(8) << "-1" << " "; ncontrol++; } else if ( (*s)->p1->aff && (*s)->p2->aff ) { HOM << setw(8) << "1" << " "; ncase++; } else { HOM << setw(8) << "0" << " "; ncontrol++; } } else HOM << setw(8) << "NA" << " "; } else { HOM << setw(par::pp_maxfid) << (*s)->p1->fid << " " << setw(par::pp_maxiid) << (*s)->p1->iid << " "; if (par::bt) HOM << setw(8) << (*s)->p1->phenotype << " "; else { HOM.precision(4); HOM << setw(8) << (*s)->p1->phenotype << " "; HOM.precision(8); } if ( (*s)->p1->aff ) ncase++; else ncontrol++; } HOM << setw(4) << locus[(*s)->start]->chr << " "; if ( ! par::cnv_list ) HOM << setw(par::pp_maxsnp) << locus[(*s)->start]->name << " " << setw(par::pp_maxsnp) << locus[(*s)->finish]->name << " "; HOM << setw(14) << locus[(*s)->start]->bp << " " << setw(14) << locus[(*s)->finish]->bp << " " << setw(8) << (double)(locus[(*s)->finish]->bp - locus[(*s)->start]->bp ) / 1000.0 << " "; // CNV-specific info: type and score if ( par::cnv_list ) { if ( (*s)->type == 1 ) HOM << setw(6) << "DEL" << " " << setw(8) << (*s)->score << " "; else HOM << setw(6) << "DUP" << " " << setw(8) << (*s)->score << " "; } // Non-CNV specific info if ( !par::cnv_list ) { HOM<< setw(8) << (*s)->finish - (*s)->start + 1 << " " << setw(4) << (*p2)->matchcount[c2] << " "; if ( (*p2)->index[c2] ) HOM << setw(6) << int2str((*p2)->group[c2])+"*" << " "; else HOM << setw(6) << int2str((*p2)->group[c2])+" " << " "; } HOM << "\n"; s++; c2++; } // Next segment } // Next group // Consensus region if (! par::force_span ) { HOM << setw(5) << "S"+int2str(c+1) << " "; HOM << setw(par::pp_maxfid) << "CON" << " " << setw(par::pp_maxiid) << (*p2)->segs.size() << " "; if (par::segment_overlap) HOM << setw(par::pp_maxfid) << "NA" << " " << setw(par::pp_maxiid) << "NA" << " "; HOM << setw(8) << int2str(ncase)+":"+int2str(ncontrol) << " " << setw(4) << locus[(*(*p2)->segs.begin())->start]->chr << " "; if ( ! par::cnv_list ) HOM << setw(par::pp_maxsnp) << locus[(*p2)->min]->name << " " << setw(par::pp_maxsnp) << locus[(*p2)->max]->name << " "; HOM << setw(14) << locus[(*p2)->min]->bp << " " << setw(14) << locus[(*p2)->max]->bp << " " << setw(8) << (double)(locus[(*p2)->max]->bp - locus[(*p2)->min]->bp ) / 1000.0 << " "; if ( par::cnv_list ) { HOM << setw(6) << "NA" << " " << setw(8) << "NA" << " "; } if ( ! 
par::cnv_list ) { HOM << setw(8) << (*p2)->max - (*p2)->min + 1; HOM << setw(6) << "NA" << " " << setw(6) << "NA" << " "; } HOM << "\n"; } else { HOM << setw(5) << "S"+int2str(c+1) << " "; HOM << setw(par::pp_maxfid) << "FORCE" << " " << setw(par::pp_maxiid) << (*p2)->segs.size() << " "; if (par::segment_overlap) HOM << setw(par::pp_maxfid) << "NA" << " " << setw(par::pp_maxiid) << "NA" << " "; HOM << setw(8) << int2str(ncase)+":"+int2str(ncontrol) << " " << setw(4) << locus[(*(*p2)->segs.begin())->start]->chr << " "; if ( ! par::cnv_list ) HOM << setw(par::pp_maxsnp) << locus[par::segment_snp1]->name << " " << setw(par::pp_maxsnp) << locus[par::segment_snp2]->name << " "; HOM << setw(14) << locus[par::segment_snp1]->bp << " " << setw(14) << locus[par::segment_snp2]->bp << " " << setw(8) << (double)(locus[par::segment_snp2]->bp - locus[par::segment_snp1]->bp ) / 1000.0 << " "; if ( ! par::cnv_list ) { HOM << setw(8) << par::segment_snp2 - par::segment_snp1 + 1; HOM << setw(6) << "NA" << " " << setw(6) << "NA" << " "; } HOM<< "\n"; } // Union region HOM << setw(5) << "S"+int2str(c+1) << " "; HOM << setw(par::pp_maxfid) << "UNION" << " " << setw(par::pp_maxiid) << (*p2)->segs.size() << " "; if (par::segment_overlap) HOM << setw(par::pp_maxfid) << "NA" << " " << setw(par::pp_maxiid) << "NA" << " "; HOM << setw(8) << int2str(ncase)+":"+int2str(ncontrol) << " " << setw(4) << locus[(*(*p2)->segs.begin())->start]->chr << " "; if ( ! par::cnv_list ) HOM << setw(par::pp_maxsnp) << locus[(*p2)->union_min]->name << " " << setw(par::pp_maxsnp) << locus[(*p2)->union_max]->name << " "; HOM << setw(14) << locus[(*p2)->union_min]->bp << " " << setw(14) << locus[(*p2)->union_max]->bp << " " << setw(8) << (double)(locus[(*p2)->union_max]->bp - locus[(*p2)->union_min]->bp ) / 1000.0 << " "; if ( par::cnv_list ) { HOM << setw(6) << "NA" << " " << setw(8) << "NA" << " "; } if ( !par::cnv_list ) HOM << setw(8) << (*p2)->union_max - (*p2)->union_min + 1 << setw(6) << "NA" << " " << setw(6) << "NA" << " "; HOM << "\n\n"; // Verbose mode? (not for CNV lists) if ( ( par::homozyg_verbose || par::segment_verbose ) && !par::cnv_list ) { string f; if (par::segment_overlap) f = par::output_file_name + ".segment.overlap.S"+int2str(c+1)+".verbose"; else f = par::output_file_name + ".hom.overlap.S"+int2str(c+1)+".verbose"; ofstream VHOM(f.c_str(),ios::out); VHOM.precision(2); displayPoolVerbose( *this , *p2 , VHOM ); VHOM.close(); } c++; p2++; } // Next pool HOM.close(); } void Plink::findHomoRuns(Individual * person, ofstream & HOM) { int l=0; int lasthom=0; int nmiss = 0; int nhet = 0; bool run = false; int start = 0; int end = 0; while ( l < nl_all ) { // Skip haploid chromosomes / end any existing run if ( ( par::chr_sex[locus[l]->chr] && person->sex ) || par::chr_haploid[locus[l]->chr] ) { if (run) { end = l-1; run = false; } else { l++; continue; } } // Outside of a run? if (!run) { // A new run? if (person->one[l] == person->two[l]) { start = lasthom = l; nmiss=0; nhet=0; run=true; } } else // if already in a run, either end or increase length? { if ( locus[l]->chr != locus[start]->chr ) // different chromosome? { end = l-1; run = false; } else if ( l == (nl_all -1) ) // or end of all SNPs? { if ( person->one[l] == person->two[l] ) lasthom=l; end = lasthom; run = false; } // found a het? 
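// In the two-bit genotype coding used here, one==two means a
// homozygote, (!one)&&two a heterozygote, and one&&(!two) a missing
// genotype; hence the three tests in this if/else chain.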
else if ( (!person->one[l]) && person->two[l]) { if (nhet==par::homo_run_het) { end = lasthom; run = false; } else nhet++; } else // ...continue run { lasthom=l; if ( person->one[l] && (!person->two[l]) ) nmiss++; } } // Check run length? if (!run) { bool accept = true; if (par::homo_run_kb) if ( locus[end]->bp - locus[start]->bp < par::homo_run_length_kb * 1000 ) accept = false; if (par::homo_run_snps) if ( end - start +1 < par::homo_run_length_snps ) accept = false; if (accept) { HOM << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(4) << locus[start]->chr << " " << setw(par::pp_maxsnp) << locus[start]->name << " " << setw(par::pp_maxsnp) << locus[end]->name << " " << setw(12) <bp << " " << setw(12) <bp << " " << setw(10) << (double)(locus[end]->bp - locus[start]->bp)/(double)1000 << " " << setw(10) << end - start + 1 << " " << setw(4) << nhet << " " << setw(4) << nmiss << "\n"; Segment s; s.p1 = s.p2 = person; s.start = start; s.finish = end; segment.push_back(s); } ////////////////// // Clear counters start = end = nmiss = 0; } /////////////// // Next locus l++; } } class HWindow { public: int start, stop; bool leftHomozyg, leftMissing; bool rightHomozyg, rightMissing; bool finished; bool valid; Individual * person; Plink * P; int homCount, hetCount, misCount; // Constructor: must specify a person HWindow(Plink*,Individual*); // Set Window bounaries void set(int,int); // Full count void recount(); // Shift update void shift(); }; HWindow::HWindow(Plink * plink, Individual * p) { P = plink; person = p; finished = false; valid = true; start = stop = 0; } void HWindow::recount() { // Assume individual-major mode // Reset counts homCount = hetCount = misCount = 0; for (int l = start; l <= stop; l++) { if ( person->one[l] ) { if ( person->two[l] ) homCount++; else misCount++; } else { if ( person->two[l] ) hetCount++; else homCount++; } } } void HWindow::shift() { // Find a new, valid (i.e. all on same autosomal chromosome) // window valid = false; bool moreThanOne = false; while ( ! valid ) { set( ++start, ++stop ); if ( ! valid ) moreThanOne = true; if ( finished ) return; } // Typically, we will just shift a single SNP, so we do not // need to recount eveything if ( ! moreThanOne ) { // Update counts: remove left edge int trailing = start - 1; if ( person->one[trailing] == person->two[trailing] ) --homCount; else if ( person->one[trailing] ) --misCount; else --hetCount; // Add right edge; leading edge is now 'stop' if ( person->one[stop] == person->two[stop] ) ++homCount; else if ( person->one[stop] ) ++misCount; else ++hetCount; } else { // A full recount recount(); } } void HWindow::set(int s1, int s2) { start = s1; stop = s2; // Can we set this window? if ( s1 < 0 || s2 >= P->nl_all || s1 > s2 || P->locus[s1]->chr != P->locus[s2]->chr || ( par::chr_sex[P->locus[s1]->chr] && person->sex ) || par::chr_haploid[P->locus[s1]->chr] ) { // Finished all SNPS? if ( s2 == P->nl_all ) finished = true; valid = false; return; } // Set poisitions valid = true; return; } void Plink::findHomoWindow(Individual * person, ofstream & HOM) { // Window properties // Only count a window if not too many missing genotypes // and if distance spanned if not too great. Then score // as 0 or 1 depending on whether we see too many hets. // Record homozygosity state vector totalWindows(nl_all,0); vector homozygWindows(nl_all,0); // Create an initial window and place just before // first SNP HWindow window(this,person); window.set(0,par::homo_windowSize-1); if ( ! 
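// (if the initial window is not valid, shift() walks it forward
//  until it fits entirely on a single autosomal chromosome)
// The loop below slides a window of par::homo_windowSize SNPs across
// the genome; a window counts as homozygous if it has no more than
// par::homo_windowAllowedMissing missing and par::homo_windowAllowedHet
// heterozygous calls, and each SNP is then scored by the fraction of
// overlapping windows that were homozygous, thresholded against
// par::homo_threshold when segments are extracted further down.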
window.valid ) window.shift(); window.recount(); while ( 1 ) { // End of genome if ( window.finished ) break; // A valid window? Then record if ( window.valid ) { // Is this also homozygous enough ? bool homozyg = false; if ( window.misCount <= par::homo_windowAllowedMissing && window.hetCount <= par::homo_windowAllowedHet ) homozyg = true; // Score for all SNPs for ( int l = window.start ; l <= window.stop ; l++) { totalWindows[l]++; if (homozyg) homozygWindows[l]++; } } // Move window window.shift(); } // Extract segments -- above threshold values for ( int l = 0 ; l < nl_all ; l++ ) { if ( totalWindows[l] == 0 ) homozygWindows[l] == 0; else homozygWindows[l] /= totalWindows[l]; } if (par::verbose) HOM << "Segments for " << person->fid << " " << person->iid << "\n"; int bp1 = 0; int l1 = 0; // Find segments int allSegs = 0; int incSegs = 0; bool inseg = false; for ( int l = 0 ; l < nl_all ; l++ ) { if ( ( !inseg ) && homozygWindows[l] >= par::homo_threshold ) { inseg = true; bp1 = locus[l]->bp; l1 = l; } else if ( inseg ) { bool ending = homozygWindows[l] < par::homo_threshold; bool bigGap = ( locus[l]->bp - locus[l-1]->bp ) > par::homo_run_gap; bool newChr = locus[l]->chr != locus[l-1]->chr; bool lastSNP = l == ( nl_all - 1) ; if ( ending || bigGap || newChr || lastSNP ) { inseg = false; // Does this potential segment shape up? // Length, number of SNPs, density of SNPs, // largest gap int l2 = l - 1; // Check this is not the final, homozygous SNP if ( lastSNP && ! ending ) l2 = l; // We might want to start a new segment on this SNP also if ( ( newChr || bigGap ) && ! ending ) --l; double length = ( locus[l2]->bp - bp1 ) / 1000.0; int snps = l2 - l1 + 1; double density = length / (double)snps ; if ( length >= par::homo_run_length_kb && snps >= par::homo_run_length_snps && length/(double)snps <= par::homo_run_density ) { incSegs++; // Some sanity checks double proHet = 0; double proHom = 0; for (int j= l1; j <= l2; j++) { bool s1 = person->one[j]; bool s2 = person->two[j]; if ( s1 == s2 ) ++proHom; else if ( s2 ) ++proHet; } proHom /= (double)snps; proHet /= (double)snps; HOM << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(8) << person->phenotype << " " << setw(4) << locus[l1]->chr << " " << setw(par::pp_maxsnp) << locus[l1]->name << " " << setw(par::pp_maxsnp) << locus[l2]->name << " " << setw(12) << locus[l1]->bp << " " << setw(12) << locus[l2]->bp << " " << setw(10) << length << " " << setw(8) << snps << " " << setw(8) << density << " " << setw(8) << proHom << " " << setw(8) << proHet << "\n"; Segment s; s.p1 = s.p2 = person; s.start = l1; s.finish = l2; segment.push_back(s); } else if ( par::verbose ) { // Some sanity checks double proHet = 0; double proHom = 0; for (int j= l1; j <=l2; j++) { bool s1 = person->one[j]; bool s2 = person->two[j]; if ( s1 == s2 ) ++proHom; else if ( s2 ) ++proHet; } proHom /= (double)snps; proHet /= (double)snps; HOM <<"* " << setw(par::pp_maxfid) << person->fid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(4) << person->phenotype << " " << setw(4) << locus[l1]->chr << " " << setw(par::pp_maxsnp) << locus[l1]->name << " " << setw(par::pp_maxsnp) << locus[l2]->name << " " << setw(12) << locus[l1]->bp << " " << setw(12) << locus[l2]->bp << " " << setw(10) << length << " " << setw(8) << snps << " " << setw(8) << density << " " << setw(8) << proHom << " " << setw(8) << proHet << "\n"; } allSegs++; } } } if ( par::verbose ) { for ( int l = 0 ; l < nl_all ; l++ ) { HOM << "SX " << locus[l]->chr 
<< "\t" << locus[l]->name << "\t" << (double)locus[l]->bp / (1000.0*1000.0) << "\t" << homozygWindows[l] << "\t"; if ( homozygWindows[l] >= par::homo_threshold ) HOM << "1\t"; else HOM << "0\t"; if ( person->one[l] == person->two[l] ) HOM << "1\n"; else if ( person->two[l] ) HOM << "0\n"; //het else HOM << "-1\n"; } HOM << "\n"; } } bool segsOverlap(Segment * s1, Segment * s2) { if ( s1->finish < s2->start ) return false; else if ( s2->finish < s1->start ) return false; else return true; } void Plink::homozygousSegmentPermutationTest(Perm & perm, string f, vector & coverage_aff, vector & coverage_unaff ) { // Permutation test for excess of case homozygous segments in a // particular region (one-sided test) // Also applies for CNV data // Optionally allowed for this to operate on smoothed data (i.e. // average of event count over a KB window, forwards and backwards // from the given position) // Also, adds in summary statistics permutation double tot_aff=0; double tot_not=0; for (int i1=0; i1aff ) tot_aff++; else tot_not++; printLOG(int2str((int)tot_aff)+" affected individuals out of " +int2str(int(tot_not+tot_aff))+" in total\n"); ////////////////////////////////////////// // Test positons = MAP positions (nl_all) // Test positions = summed segment counts ( get from original counts ) // Test position = aggregate statistics ( 7 tests) int nt = nl_all; if ( par::seg_test_region ) nt = coverage_aff.size(); else if ( par::cnv_indiv_perm ) { nt = 7; if ( par::cnv_count_baseline ) ++nt; } // Option per-individual summary tests? (4 tests) // Cases - controls: total # segs // # people w/ 1+ seg // total kb length // mean segment length // gene-count // atleast-1-gene-count // gene-enrichment perm.setTests(nt); perm.setPermClusters(*this); perm.originalOrder(); vector original(nt); ////////////////////////////////////////////////// // Genic/regional, or standard positional tests? if ( ! par::cnv_indiv_perm ) { // // Get genome-wide means // double Ag = 0; // double Ug = 0; // for (int l=0; l1 ) { vector ak(nk); vector uk(nk); for (int k=0; k pr(nt); if ( par::cnv_pos_perm ) { // Offset randomisation method, i.e. shift // gene list by a random constant positionPermuteSegments(); // Recount for each individual indivSegmentSummaryCalc(segmentCount, segmentLength, true, true); // for (int j=0; j<20; j++) // cout << j << " is " << segment[j].count << "\n"; // cout << "----\n"; } else { // Label swapping for phenotype perm.permuteInCluster(); } ////////////////////////////// // Retest permuted dataset if ( ! 
par::cnv_indiv_perm ) { vector coverage_aff(nt,0); vector coverage_unaff(nt,0); /////////////////////////// // Re-calculate counts if ( par::seg_test_region ) { countCNVPerRegion(coverage_aff, coverage_unaff); } else { ///////////////////////// // Actual segments vector::iterator s = segment.begin(); while ( s != segment.end() ) { if ( s->p1->pperson->aff) for (int l = s->start ; l <= s->finish; l++) coverage_aff[l]++; else for (int l = s->start ; l <= s->finish; l++) coverage_unaff[l]++; s++; } // Optionally, if we allow 'wings' to increase span // of events (and so, each data point represents the // number of events with X kb of that position) if ( par::seg_test_window ) { vector::iterator s = segment.begin(); while ( s != segment.end() ) { // Shift left from start int l = s->start; Locus * loc1 = locus[s->start]; while ( 1 ) { --l; if ( l < 0 ) break; Locus * loc2 = locus[l]; if ( loc2->chr != loc1->chr ) break; if ( loc1->bp - loc2->bp > par::seg_test_window_bp ) break; if ( s->p1->pperson->aff ) ++coverage_aff[l]; else ++coverage_unaff[l]; } // Shift right from start l = s->finish; loc1 = locus[s->finish]; while ( 1 ) { ++l; if ( l == nl_all ) break; Locus * loc2 = locus[l]; if ( loc2->chr != loc1->chr ) break; if ( loc2->bp - loc1->bp > par::seg_test_window_bp ) break; if ( s->p1->pperson->aff ) ++coverage_aff[l]; else ++coverage_unaff[l]; } // Next segment s++; } } } ////////////////////////// // Get genome-wide average // double Ag = 0; // double Ug = 0; // for (int l=0; l testname(8); testname[0] = "RATE"; testname[1] = "PROP"; testname[2] = "KBTOT"; testname[3] = "KBAVG"; testname[4] = "GRATE"; testname[5] = "GPROP"; testname[6] = "GRICH"; testname[7] = "GRICH2"; if ( par::seg_test_region ) { set::iterator i1 = geneList.begin(); int gCount = 0; while ( i1 != geneList.end() ) { PHOM << setw(4) << i1->chr << " " << setw(16) << i1->name << " "; PHOM << setw(12) << perm.pvalue( gCount ) << " "; PHOM << setw(12) << perm.max_pvalue( gCount ) << "\n"; ++i1; ++gCount; } } else for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; PHOM << setw(12) << perm.pvalue(l) << " "; PHOM << setw(12) << perm.max_pvalue(l) << "\n"; } } PHOM.close(); } plink-1.07-src/Rconnection.cpp0000644000265600020320000005300711264127626015470 0ustar tilleaadmin/* * C++ Interface to Rserve * Copyright (C) 2004-8 Simon Urbanek, All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; version 2.1 of the License * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Leser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Although this code is licensed under LGPL v2.1, we strongly encourage * everyone modifying this software to contribute back any improvements and * bugfixes to the project for the benefit all other users. Thank you. 
* * $Id: Rconnection.cc 248 2008-12-03 08:39:47Z urbanek $ */ /* external defines: SWAPEND - needs to be defined for platforms with inverse endianess related to Intel see also SOCK_ERROR, MAIN and other defines in sisocks.h */ #include "Rconnection.h" #include #include "sisocks.h" #ifdef unix #include #include #else #define AF_LOCAL -1 #endif #if defined HAVE_NETINET_TCP_H && defined HAVE_NETINET_IN_H #define CAN_TCP_NODELAY #include #include #endif #ifdef Win32 #define CAN_TCP_NODELAY #endif #include "Rsrv.h" #ifndef AF_LOCAL #define AF_LOCAL AF_UNIX #endif // NOTE: 0103 compatibility has not been established! use at your own risk! static char *myID= "Rsrv0103QAP1"; /* this client supports up to protocol version 0103 */ static Rexp *new_parsed_Rexp(unsigned int *d, Rmessage *msg) { int type=ptoi(*d)&0x3f; #ifdef DEBUG_CXX printf("new_parsed_Rexp(%p, %p) type=%d\n", d, msg, type); #endif if (type==XT_ARRAY_INT || type==XT_INT) return new Rinteger(d,msg); if (type==XT_ARRAY_DOUBLE || type==XT_DOUBLE) return new Rdouble(d,msg); if (type==XT_LIST || type == XT_LIST_NOTAG || type == XT_LIST_TAG) return new Rlist(d,msg); if (type==XT_VECTOR) return new Rvector(d,msg); if (type==XT_STR) return new Rstring(d,msg); if (type==XT_SYM || type==XT_SYMNAME) return new Rsymbol(d,msg); if (type==XT_ARRAY_STR) return new Rstrings(d,msg); return new Rexp(d,msg); } static Rexp *new_parsed_Rexp_from_Msg(Rmessage *msg) { int hl=1; unsigned int *hp=msg->par[0]; Rsize_t plen=hp[0]>>8; if ((hp[0]&DT_LARGE)>0) { hl++; plen|=((Rsize_t)hp[1])<<24; } return new_parsed_Rexp(hp+hl,msg); } Rmessage::Rmessage() { complete=0; data=0; len=0; } Rmessage::Rmessage(int cmd) { memset(&head,0,sizeof(head)); head.cmd=cmd&0x3f; data=0; len=0; complete=1; } Rmessage::Rmessage(int cmd, const char *txt) { memset(&head,0,sizeof(head)); int tl=strlen(txt)+1; if ((tl&3)>0) tl=(tl+4)&0xffffc; // allign the text len=tl+4; // message length is tl + 4 (short format only) head.cmd=cmd&0x3f; head.len=len; data=(char*)malloc(tl+16); memset(data,0,tl+16); *((int*)data)=itop(SET_PAR(DT_STRING,tl)); strcpy(data+4,txt); complete=1; } Rmessage::Rmessage(int cmd, const void *buf, int dlen, int raw_data) { memset(&head,0,sizeof(head)); len=(raw_data)?dlen:(dlen+4); head.cmd=cmd&0x3f; head.len=len; data=(char*)malloc(len); memcpy(data, (raw_data)?buf:((char*)buf+4), dlen); if (!raw_data) *((int*)data)=itop(SET_PAR(DT_BYTESTREAM,dlen)); complete=1; } Rmessage::Rmessage(int cmd, int i) { memset(&head,0,sizeof(head)); len=8; // DT_INT+len (4) + payload-1xINT (4) head.cmd=cmd&0x3f; head.len=len; data=(char*)malloc(8); *((int*)data)=itop(SET_PAR(DT_INT,4)); ((int*)data)[1]=itop(i); complete=1; } Rmessage::~Rmessage() { if(data) free(data); complete=0; } int Rmessage::read(int s) { complete=0; int n=recv(s,(char*)&head,sizeof(head),0); if (n!=sizeof(head)) { closesocket(s); s=-1; return (n==0)?-7:-8; } Rsize_t i=len=head.len=ptoi(head.len); head.cmd=ptoi(head.cmd); head.dof=ptoi(head.dof); head.res=ptoi(head.res); if (head.dof>0) { // skip past DOF if present char sb[256]; int k=head.dof; while (k>0) { n=recv(s,(char*)sb,(k>256)?256:k,0); if (n<1) { closesocket(s); s=-1; return -8; // malformed packet } k-=n; } } if (i>0) { data=(char*) malloc(i); if (!data) { closesocket(s); s=-1; return -10; // out of memory } char *dp=data; while(i>0 && (n=recv(s,(char*)dp,i,0))>0) { dp+=n; i-=n; } if (i>0) { closesocket(s); s=-1; return -8; } } parse(); complete=1; return 0; } void Rmessage::parse() { pars=0; if (len<4) return; char *c=data, *eop=c+len; while (c>8; if 
((p1&DT_LARGE)>0) { hs+=4; unsigned int p2=ptoi(pp[1]); len|=((Rsize_t)p2)<<24; } #ifdef DEBUG_CXX printf(" par %d: %d length %d\n", pars, p1&0x3f, len); #endif par[pars++]=(unsigned int*)c; c+=hs; c+=len; if (pars>15) break; // max 16 pars } } int Rmessage::send(int s) { int failed=0; head.cmd=itop(head.cmd); head.len=itop(head.len); head.dof=itop(head.dof); head.res=itop(head.res); if (::send(s,(char*)&head,sizeof(head),0)!=sizeof(head)) failed=-1; if (!failed && len>0 && (Rsize_t)::send(s,data,len,0)!=len) failed=-1; head.cmd=ptoi(head.cmd); head.len=ptoi(head.len); head.dof=ptoi(head.dof); head.res=ptoi(head.res); return failed; } Rexp::Rexp(Rmessage *msg) { #ifdef DEBUG_CXX printf("new Rexp@%x\n", this); #endif master=0; rcount=0; attr=0; attribs=0; this->msg=msg; int hl=1; unsigned int *hp=msg->par[0]; Rsize_t plen=hp[0]>>8; if ((hp[0]&DT_LARGE)>0) { hl++; plen|=((Rsize_t)hp[1])<<24; } next=parse(hp+hl); } Rexp::Rexp(unsigned int *pos, Rmessage *msg) { #ifdef DEBUG_CXX printf("new Rexp@%x\n", this); #endif attr=0; master=0; this->msg=msg; rcount=0; attribs=0; next=parse(pos); } Rexp::Rexp(int type, const char *data, int len, Rexp *attr) { this->attr=attr; master=this; rcount=0; attribs=0; this->type=type; this->msg=0; if (len>0) { this->data=(char*) malloc(len); memcpy(this->data, data, len); this->len=len; } else this->len=0; next=(char*)data+this->len; } Rexp::~Rexp() { #ifdef DEBUG_CXX printf("releasing Rexp@%p\n", this); #endif if (attr) delete(attr); attr=0; if (master) { if (master==this) { free(data); len=0; } else master->rcount--; master=0; } if (msg) { if (rcount>0) fprintf(stderr, "WARNING! Rexp master %lx delete requested, but %d object(s) are using our memory - refusing to free, leaking...\n", (long)this, rcount); else delete(msg); } msg=0; } void Rexp::set_master(Rexp *m) { if (master) master->rcount--; master=m; if (m) m->rcount++; } char *Rexp::parse(unsigned int *pos) { // plen is not used this->pos=pos; int hl=1; unsigned int p1=ptoi(pos[0]); len=p1>>8; if ((p1&XT_LARGE)>0) { hl++; len|=((Rsize_t)(ptoi(pos[1])))<<24; } data=(char*)(pos+hl); if (p1&XT_HAS_ATTR) { attr=new_parsed_Rexp((unsigned int*)data, 0); len-=attr->next-data; data=attr->next; if (master || msg) attr->set_master(master?master:this); } type=p1&0x3f; #ifdef DEBUG_CXX printf("Rexp(type=%d, len=%d, attr=%p)\n", type, len, attr); #endif return data+len; } void Rexp::store(char *buf) { int hl=4; unsigned int *i = (unsigned int*)buf; i[0]=SET_PAR(type, len); i[0]=itop(i[0]); if (len>0x7fffff) { buf[0]|=XT_LARGE; i[1]=itop(len>>24); hl+=4; } memcpy(buf+hl, data, len); } Rexp *Rexp::attribute(const char *name) { return (attr && attr->type==XT_LIST)?((Rlist*)attr)->entryByTagName(name):0; } char **Rexp::attributeNames() { if (!attr || attr->type!=XT_LIST) return 0; if (attribs==0) { // let us cache attribute names Rlist *l = (Rlist*) attr; while (l && l->type==XT_LIST) { if (l->tag && l->tag->type==XT_SYM) attribs++; l=l->tail; } attrnames=(char**) malloc(sizeof(char*)*(attribs+1)); l = (Rlist*) attr; while (l && l->type==XT_LIST) { if (l->tag && l->tag->type==XT_SYM) attrnames[attribs++]=((Rsymbol*)l->tag)->symbolName(); l=l->tail; } attrnames[attribs]=0; } return attrnames; } void Rinteger::fix_content() { if (len<0 || !data) return; #ifdef SWAPEND int *i = (int*) data; int *j = (int*) (data+len); while (inext; if (ptrnext; if (ptrtype!=XT_LIST) { // if tail is not a list, then something is wrong - just delete it delete(tail); tail=0; } } } } } else if (type == XT_LIST_NOTAG) { /* new style list w/o tags 
*/ Rlist *lt = this; int n = 0; while (ptr < eod) { Rexp *h = new_parsed_Rexp((unsigned int*) ptr, 0); if (!h) break; if (n) lt = lt->tail = new Rlist(type, h, 0, h->next, msg); else lt->head = h; n++; ptr = h->next; } } else if (type == XT_LIST_TAG) { /* new style list with tags */ Rlist *lt = this; int n = 0; while (ptr < eod) { Rexp *h = new_parsed_Rexp((unsigned int*) ptr, 0); #ifdef DEBUG_CXX printf(" LIST_TAG: n=%d, ptr=%p, h=%p\n", n, ptr, h); #endif if (!h) break; ptr = h->next; Rexp *t = new_parsed_Rexp((unsigned int*) ptr, 0); #ifdef DEBUG_CXX printf(" tag=%p (ptr=%p)\n", t, ptr); #endif if (!t) break; if (n) lt = lt->tail = new Rlist(type, h, t, t->next, msg); else { lt->head = h; lt->tag = t; } ptr = t->next; n++; } next = ptr; } #ifdef DEBUG_CXX printf(" end of list %p, ptr=%p\n", this, ptr); #endif } Rvector::~Rvector() { int i=0; while(itype==XT_STR) sc++; i++; } if (sc==0) return 0; strs=(char**)malloc(sizeof(char*)*(sc+1)); i=0; sc=0; while (itype==XT_STR) strs[sc++]=((Rstring*)cont[i])->string(); i++; } strs[sc]=0; return strs; } int Rvector::indexOf(Rexp *exp) { int i=0; while (itype==XT_STR && !strcmp(((Rstring*)cont[i])->string(),str)) return i; i++; } return -1; } int Rstrings::indexOfString(const char *str) { unsigned int i = 0; while (i < nel) { if (cont[i] && !strcmp(cont[i], str)) return i; i++; } return -1; } Rexp* Rvector::byName(const char *name) { if (count<1 || !attr || (attr->type!=XT_LIST && attr->type != XT_LIST_TAG)) return 0; Rexp *e = ((Rlist*) attr)->head; if (((Rlist*) attr)->tag) e=((Rlist*) attr)->entryByTagName("names"); if (!e || (e->type!=XT_VECTOR && e->type!=XT_ARRAY_STR && e->type!=XT_STR)) return 0; if (e->type==XT_VECTOR) { int pos = ((Rvector*)e)->indexOfString(name); if (pos>-1 && postype == XT_ARRAY_STR) { int pos = ((Rstrings*)e)->indexOfString(name); if (pos>-1 && posstring(),name)) return cont[0]; } return 0; } void Rvector::fix_content() { char *ptr = data; char *eod = data+len; capacity=16; cont=(Rexp**) malloc(sizeof(Rexp*)*capacity); while (ptrnext; else break; count++; } } Rconnection::Rconnection(const char *host, int port) { if (!host) host="127.0.0.1"; this->host=(char*)malloc(strlen(host)+1); strcpy(this->host, host); this->port=port; family=(port==-1)?AF_LOCAL:AF_INET; s=-1; auth=0; salt[0]='.'; salt[1]='.'; } Rconnection::~Rconnection() { if (host) free(host); host=0; if (s!=-1) closesocket(s); s=-1; } int Rconnection::connect() { #ifdef unix struct sockaddr_un sau; #endif SAIN sai; char IDstring[33]; if (family==AF_INET) { memset(&sai,0,sizeof(sai)); build_sin(&sai,host,port); } else { #ifdef unix memset(&sau,0,sizeof(sau)); sau.sun_family=AF_LOCAL; strcpy(sau.sun_path,host); // FIXME: possible overflow! 
#else return -11; // unsupported #endif } IDstring[32]=0; int i; s=socket(family,SOCK_STREAM,0); if (family==AF_INET) { #ifdef CAN_TCP_NODELAY int opt=1; setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (const char*) &opt, sizeof(opt)); #endif i=::connect(s,(SA*)&sai,sizeof(sai)); } #ifdef unix else i=::connect(s,(SA*)&sau,sizeof(sau)); #endif if (i==-1) { closesocket(s); s=-1; return -1; // connect failed } int n=recv(s,IDstring,32,0); if (n!=32) { closesocket(s); s=-1; return -2; // handshake failed (no IDstring) } if (strncmp(IDstring,myID,4)) { closesocket(s); s=-1; return -3; // invalid IDstring } if (strncmp(IDstring+8,myID+8,4) || strncmp(IDstring+4,myID+4,4)>0) { closesocket(s); s=-1; return -4; // protocol not supported } { int i=12; while (i<32) { if (!strncmp(IDstring+i, "ARuc", 4)) auth|=A_required|A_crypt; if (!strncmp(IDstring+i, "ARpt", 4)) auth|=A_required|A_plain; if (IDstring[i]=='K') { salt[0]=IDstring[i+1]; salt[1]=IDstring[i+2]; } i+=4; } } return 0; } int Rconnection::disconnect() { if (s>-1) { closesocket(s); s=-1; } return 0; } /**--- low-level functions --*/ int Rconnection::request(Rmessage *msg, int cmd, int len, void *par) { struct phdr ph; if (s==-1) return -5; // not connected memset(&ph,0,sizeof(ph)); ph.len=itop(len); ph.cmd=itop(cmd); if (send(s,(char*)&ph,sizeof(ph),0)!=sizeof(ph)) { closesocket(s); s=-1; return -9; } if (len>0 && send(s,(char*)par,len,0)!=len) { closesocket(s); s=-1; return -9; } return msg->read(s); } int Rconnection::request(Rmessage *targetMsg, Rmessage *contents) { if (s==-1) return -5; // not connected if (contents->send(s)) { closesocket(s); s=-1; return -9; // send error } return targetMsg->read(s); } /** --- high-level functions -- */ int Rconnection::shutdown(const char *key) { Rmessage *msg = new Rmessage(); Rmessage *cm = key?new Rmessage(CMD_shutdown, key):new Rmessage(CMD_shutdown); int res = request(msg, cm); delete cm; delete msg; return res; } int Rconnection::assign(const char *symbol, Rexp *exp) { Rmessage *msg=new Rmessage(); Rmessage *cm=new Rmessage(CMD_setSEXP); int tl=strlen(symbol)+1; if (tl&3) tl=(tl+4)&0xfffc; Rsize_t xl=exp->storageSize(); Rsize_t hl=4+tl+4; if (xl>0x7fffff) hl+=4; cm->data=(char*) malloc(hl+xl); cm->head.len=cm->len=hl+xl; ((unsigned int*)cm->data)[0]=SET_PAR(DT_STRING, tl); ((unsigned int*)cm->data)[0]=itop(((unsigned int*)cm->data)[0]); strcpy(cm->data+4, symbol); ((unsigned int*)(cm->data+4+tl))[0]=SET_PAR((Rsize_t) ((xl>0x7fffff)?(DT_SEXP|DT_LARGE):DT_SEXP), (Rsize_t) xl); ((unsigned int*)(cm->data+4+tl))[0]=itop(((unsigned int*)(cm->data+4+tl))[0]); if (xl>0x7fffff) ((unsigned int*)(cm->data+4+tl))[1]=itop(xl>>24); exp->store(cm->data+hl); int res=request(msg,cm); delete (cm); if (res) { delete(msg); return res; } // we should check response code here ... 
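// A sketch of one way to act on the reply before the unconditional success
// return below, mirroring how openFile()/createFile() later in this file map
// the response to a status via CMD_STAT (illustration only):
//
//   int stat = CMD_STAT(msg->command());   // 0 when the server answered RESP_OK
//   delete msg;
//   return stat;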
return 0; } int Rconnection::voidEval(const char *cmd) { int status=0; eval(cmd, &status, 1); return status; } Rexp *Rconnection::eval(const char *cmd, int *status, int opt) { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage((opt&1)?CMD_voidEval:CMD_eval, cmd); int res=request(msg,cmdMessage); delete (cmdMessage); if (opt&1 && !res) { if (status) *status=0; // we should put response code here delete(msg); return 0; } if (!res && (msg->pars!=1 || (ptoi(msg->par[0][0])&0x3f)!=DT_SEXP)) { delete(msg); if (status) *status=-10; // returned object is not SEXP return 0; } if (res) { delete(msg); if (status) *status=res; return 0; } if (status) *status=0; return new_parsed_Rexp_from_Msg(msg); } int Rconnection::openFile(const char *fn) { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_openFile, fn); int res=request(msg,cmdMessage); delete (cmdMessage); if (!res) res=CMD_STAT(msg->command()); delete (msg); return res; } int Rconnection::createFile(const char *fn) { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_createFile, fn); int res=request(msg,cmdMessage); delete (cmdMessage); if (!res) res=CMD_STAT(msg->command()); delete (msg); return res; } int Rconnection::readFile(char *buf, unsigned int len) { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_readFile, len); int res=request(msg,cmdMessage); delete(cmdMessage); if (!res) { // FIXME: Rserve up to 0.4-0 actually sends buggy response - it ommits DT_BYTESTREAM header! if (msg->len > len) { // we're in trouble here - techincally we should not get this delete(msg); return CERR_malformed_packet; } if (msg->len > 0) memcpy(buf, msg->data, msg->len); int rl = msg->len; delete(msg); return rl; } delete(msg); return CERR_io_error; } int Rconnection::writeFile(const char *buf, unsigned int len) { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_writeFile, buf, len); int res=request(msg,cmdMessage); delete(cmdMessage); if (!res && msg->command()==RESP_OK) { delete(msg); return 0; } delete(msg); // FIXME: this is not really true ... return (res==0)?CERR_io_error:res; } int Rconnection::closeFile() { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_closeFile); int res=request(msg,cmdMessage); delete(cmdMessage); if (!res && msg->command()==RESP_OK) { delete(msg); return 0; } delete(msg); // FIXME: this is not really true ... 
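// The FIXME above notes that a blanket CERR_io_error is only an approximation;
// a more specific status could be taken from the reply itself, e.g. by capturing
// CMD_STAT(msg->command()) before the delete above, as openFile() and
// createFile() do.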
return (res==0)?CERR_io_error:res; } int Rconnection::removeFile(const char *fn) { Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_removeFile, fn); int res=request(msg,cmdMessage); delete (cmdMessage); if (!res) res=CMD_STAT(msg->command()); delete (msg); return res; } int Rconnection::login(const char *user, const char *pwd) { char *authbuf, *c; if (!(auth&A_required)) return 0; authbuf=(char*) malloc(strlen(user)+strlen(pwd)+22); strcpy(authbuf, user); c=authbuf+strlen(user); *c='\n'; c++; strcpy(c,pwd); #ifdef unix if (auth&A_crypt) strcpy(c,crypt(pwd,salt)); #else if (!(auth&A_plain)) { free(authbuf); return CERR_auth_unsupported; } #endif Rmessage *msg=new Rmessage(); Rmessage *cmdMessage=new Rmessage(CMD_login, authbuf); int res=request(msg,cmdMessage); delete (cmdMessage); if (!res) res=CMD_STAT(msg->command()); delete (msg); free(authbuf); return res; } plink-1.07-src/haplowindow.cpp0000644000265600020320000004717111264127625015546 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2007 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "stats.h" #include "phase.h" #include "genogroup.h" #include "haplowindow.h" class HaploPhase; void verboseDisplayWindows2(HaploPhase * haplo, int i, bool use_ref = true ) { for (int w = haplo->startWindow; w <= haplo->finishWindow ; w++) { int r = haplo->windows[w]->genoGroup[i]->reference; if ( ! use_ref ) r = i; haplo->VPHASE << "WINDOW " << w << "\n"; for (int z = 0; z < haplo->windows[w]->hap1[r].size(); z++) { HaploWindow * thisWindow = haplo->windows[w]; haplo->VPHASE << setw(w) << " " << thisWindow->haplotypeName(thisWindow->hap1[r][z]) << "/" << thisWindow->haplotypeName(thisWindow->hap2[r][z]) << " "; haplo->VPHASE << "( " << thisWindow->f[ thisWindow->hap1[r][z] ] << " / " << thisWindow->f[ thisWindow->hap2[r][z] ] << " ) "; if ( thisWindow->hap1[r].size() == 1) haplo->VPHASE << "[1]\n"; else haplo->VPHASE << thisWindow->pp[r][z]<< "\n"; } } } string HaploWindow::haplotypeName(int h) { string str; for (int s=0; sP.locus[S[s]]->allele1; string a2 = haplo->P.locus[S[s]]->allele2; if ( a1 == "" ) a1 = "X"; if ( a2 == "" ) a2 = "X"; if (h == -1) str += "-"; // haploid gap else if (hap[h][s]) str += a1; else str += a2; } return str; } HaploWindow::HaploWindow(HaploPhase * hp, Plink * plinkp) { haplo = hp; P = plinkp; converged = false; left_passed = false; right_passed = false; genoGroup.resize(P->n, (MultiLocusGenotype*)0); ambig.resize(P->n, false); pp.resize(P->n); hap1.resize(P->n); hap2.resize(P->n); } HaploWindow::~HaploWindow() { set::iterator im = genotypes.begin(); while (im != genotypes.end() ) { delete *im; ++im; } } void HaploWindow::setStubCodes() { ///////////////////////////////////////////////////////////// // Set stub codes (if no overlap, all codes will be 0, okay) leftStub.clear(); rightStub.clear(); for (int h=0; h m1; unsigned int p=1; for (int s=0; slocus[S[s]]->freq; } else { f[h] *= ( 1 - P->locus[S[s]]->freq); } } fsum += f[h]; // Consider next haplotype h++; } ////////////////// // Set stub codes setStubCodes(); } void HaploWindow::enumeratePhase(int i) { vector s1(ns); vector s2(ns); hap1[i].clear(); hap2[i].clear(); // 
Flipping allele-coding for homozygotes for (int s = 0; s < ns; s++) { if (par::SNP_major) { s1[s] = P->SNP[S[s]]->one[i]; s2[s] = P->SNP[S[s]]->two[i]; } else { s1[s] = P->sample[i]->one[S[s]]; s2[s] = P->sample[i]->two[S[s]]; } if (s1[s] == s2[s]) { s1[s] = !s1[s]; s2[s] = !s2[s]; } } ////////////////////////////////////////////////////////// // Count amount of missing genotype data at this position int nm = 0; for (int s = 0; s < ns; s++) if (s1[s] && !s2[s]) nm++; // If any missing genotypes, this person counts // as ambiguous if (nm>0) ambig[i] = true; // We only worry about too much missing data at the HaploPhase // stage, not here /////////////////////////////////////////////// // 2 or more hets at any loci -> ambiguous // Haploid genotypes should never be heterozygous, // so we are okay here w.r.t X chromosome int het=0; for (int s=0; s1) ambig[i] = true; ////////////////////////////////////// // Construct list of consistent phases if (!ambig[i]) { // Unambiguous means all no missing genotypes // and less than 2 hets // Match haplotype alleles: haploid individuals // will just be coded as homozygous here (but // when considering phases, frequencies, etc // we will take care of this downstream) hap1[i].push_back(hapmap.find(s1)->second); hap2[i].push_back(hapmap.find(s2)->second); } else { // For individuals with ambiguity // Which are the ambiguous sites // (missing or heterozygous) // We will not observe any hets for haploid // individuals: but we do need to make sure // that missing haploid genotypes are not // allowed to be heterozygous vector het_site(ns, false); vector mis_site(ns, false); int firstHeterozygote = ns; int ambig_cnt=0; for (int s=0; shaploid || (haplo->X && P->sample[i]->sex)) ambig_cnt++; else ambig_cnt+=2; } } int ambig_nh = (int)pow((double)2, ambig_cnt); vector h1(ns); vector h2(ns); int original_firstHeterozygote = firstHeterozygote; int h=0; while (h m1; unsigned int p=1; for (int s=0; shaploid || (haplo->X && P->sample[i]->sex)) { // select a hemizygote/homozygote if (m1[ac]) h1[s] = h2[s] = false; else h1[s] = h2[s] = true; ac++; } else { // Make het if (m1[ac]) { if (m1[ac+1]) { h1[s] = false; h2[s] = true; } else { if (s < firstHeterozygote ) { skip = true; } else { h1[s] = true; h2[s] = false; } } firstHeterozygote = s; } else { // otherwise, select a homozygote if (m1[ac+1]) h1[s] = h2[s] = false; else h1[s] = h2[s] = true; } ac+=2; } } else { // Maintain unambigous site // (which might be 1st het) h1[s] = s1[s]; h2[s] = s2[s]; } } if ( !skip ) { // Add to (non-redundant) list? 
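// hapmap maps an enumerated allele vector to its haplotype index, so each
// consistent phase is stored below as a pair of haplotype indices (n1, n2).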
int n1 = hapmap.find( h1 )->second; int n2 = hapmap.find( h2 )->second; hap1[i].push_back(n1 ); hap2[i].push_back(n2 ); } // Consider next haplotype pair h++; } } // Make space for posterior probabilities, and skip codes if (ambig[i]) pp[i].resize(hap1[i].size()); genoGroup[i]->skip.resize(hap1[i].size() , false); } void HaploWindow::pruneGenogroups(double t ) { set::iterator im = genotypes.begin(); while (im != genotypes.end() ) { int i = (*im)->reference; if (P->sample[i]->founder) prunePhase(i,t); ++im; } } void HaploWindow::prunePhase(int i, double t) { // pp[i][z] // hap1[i][z] // hap2[i][z] // skip[z] if ( (!haplo->include[i]) ||(!ambig[i])) return; MultiLocusGenotype * mlg = genoGroup[i]; double psum = 0; vector new_pp(0); vector new_h1(0); vector new_h2(0); vector new_skip(0); for (int z=0; z < hap1[i].size(); z++) { if ( pp[i][z] >= t ) { new_pp.push_back(pp[i][z]); psum += pp[i][z]; new_h1.push_back(hap1[i][z]); new_h2.push_back(hap2[i][z]); new_skip.push_back(mlg->skip[z]); } } // Normalise? if (pp[i].size() > new_pp.size() ) { for (int z=0; z < new_pp.size(); z++) new_pp[z] /= psum; } // Update pp[i] = new_pp; hap1[i] = new_h1; hap2[i] = new_h2; mlg->skip = new_skip; } vector_t HaploWindow::leftStubFrequency() { vector_t freq(haplo->nsh, 0); for (int h=0; h< nh; h++) freq[ leftStub[h] ] += f[h]; return freq; } vector_t HaploWindow::rightStubFrequency() { vector_t freq(haplo->nsh, 0); for (int h=0; h< nh; h++) freq[ rightStub[h] ] += f[h]; return freq; } void HaploWindow::tallyUnambiguousCounts() { uc.resize(nh, 0); set::iterator im = genotypes.begin(); while (im != genotypes.end() ) { int i = (*im)->reference; if (!ambig[i]) { uc[hap1[i][0]] += (*im)->count; if ( ! (haplo->haploid || (haplo->X && P->sample[i]->sex))) uc[hap2[i][0]] += (*im)->count; } ++im; } } void HaploWindow::expandGenogroups() { for (int i=0; in; i++) { if ( ! ( P->sample[i]->founder && haplo->include[i] ) ) continue; int r = genoGroup[i]->reference; if (r != i ) { pp[i] = pp[r]; hap1[i] = hap1[r]; hap2[i] = hap2[r]; } } } void HaploWindow::performEM() { ////////////////// // Begin E-M if ( par::haplo_plem_verbose ) haplo->VPHASE << "\nWINDOW spanning " << start << " to " << stop << "\n" << "\nINNER EM LOOP FOR " << par::haplo_plem_iter << " ITERATIONS "; for (int j=0; j<=par::haplo_plem_iter; j++) { ////////////////////////// // E-step for genoGroups set::iterator im = genotypes.begin(); while (im != genotypes.end() ) { int i = (*im)->reference; if (ambig[i]) { double s=0; // Haploid phases... 
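// (each phase consistent with the observed genotypes is weighted by the product
// of its haplotype frequencies, doubled when the two haplotypes differ, and the
// weights are rescaled further down so each individual's posteriors sum to one)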
if (haplo->haploid || (haplo->X && P->sample[i]->sex)) { for (int z=0; zskip[z]) continue; int h1 = hap1[i][z]; int h2 = hap2[i][z]; if (zero[h1] || zero[h2]) { (*im)->skip[z] = true; continue; } pp[i][z] = f[h1] * f[h2]; if (h1 != h2) pp[i][z] *= 2; s += pp[i][z]; } } ///////////////////////////////////////// // Check for single phase with 0 probability if ( s == 0 ) { if ( pp[i].size()==1 ) { pp[i][0] = s = 1; if ( par::haplo_plem_verbose ) haplo->VPHASE << "\n*** WARNING *** FIXED INDIVIDUAL " << P->sample[i]->fid << " " << P->sample[i]->iid << " TO PP=1 FOR SINGLE IMPOSS PHASE\n"; } else { if ( par::haplo_plem_verbose ) { haplo->VPHASE << "\n*** ERROR *** INDIVIDUAL " << P->sample[i]->fid << " " << P->sample[i]->iid << " HAS >1 PHASE BUT PP SUMS TO 0\n"; verboseDisplayWindows2(haplo,i,true); haplo->VPHASE.close(); error("See phased.verbose (--em-verbose) file"); } } } ///////////////////////////////////////// // Rescale haplotype phase probabilities for (int z=0; zskip[z] ) { pp[i][z] /= s; if ( par::haplo_plem_verbose ) { if ( (!realnum(pp[i][z])) || pp[i][z] < 0 || pp[i][z] > 1 ) haplo->VPHASE << "\n*** WARNING *** PROBLEM PP FOR INDIVIDUAL " << P->sample[i]->fid << " " << P->sample[i]->iid << "\n"; } } } } im++; } ///////////////////////////////////// // M-step for pre-counted haplotypes // unambiguous counts for (int h=0; hreference; if (ambig[i]) { if (haplo->haploid || (haplo->X && P->sample[i]->sex)) { for (int z=0; zcount; } } else { for (int z=0; zskip[z]) { continue; } // haplo->VPHASE << "considering " << haplotypeName( hap1[i][z] ) // << " and " << haplotypeName( hap2[i][z] ) << " for " // << P->sample[i]->fid << " " << P->sample[i]->iid << "\t" // << " times " << (*im)->count << " " << pp[i][z] // << " and hap codes are " << hap1[i][z] << " " << hap2[i][z] << "\n" ; f[hap1[i][z]] += pp[i][z] * (*im)->count; f[hap2[i][z]] += pp[i][z] * (*im)->count; } } } ++im; } // validN is the total number of *chromosomes* for (int h=0; hvalidN; ////////////////////////////////////////// // Update likelihood (not every iteration) if ( j == par::haplo_plem_iter - 1 || j % par::haplo_plem_likelihood_iter == 0) { // Zero out unlikely haplotypes? 
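// (haplotypes whose current frequency estimate is negligible can be flagged in
// zero[], so that subsequent E-step passes skip any phase involving them --
// see the zero[h1] / zero[h2] checks in the E-step above)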
for (int h=0; hreference; double lk = 0; if (haplo->haploid || (haplo->X && P->sample[i]->sex)) { for (int z=0; zskip[z] || zero[hap1[i][z]] || zero[hap2[i][z]]) continue; lk += f[hap1[i][z]] * f[hap2[i][z]]; if (hap1[i][z] != hap2[i][z]) lk += f[hap1[i][z]] * f[hap2[i][z]]; } if (lk > 0) lnl -= log(lk) * (*im)->count; ++im; } if ( par::haplo_plem_verbose ) { haplo->VPHASE << "INNER_LNL " << lnl << "\n"; } if (j > 0 && sampleLogLikelihood - lnl < par::haplo_plem_window_tol ) { if ( par::haplo_plem_verbose ) haplo->VPHASE << "INNER_CONVERGED AT " << j << " ITERATIONS\n"; iter = j; converged = true; break; } sampleLogLikelihood = lnl; } // End of likelihood calculation } // Next EM iteration if ( par::haplo_plem_verbose ) haplo->VPHASE << "INNER_EM HAS FINISHED/CONVERGED\n\n"; if ( par::haplo_plem_verbose ) { haplo->VPHASE << "INNER_FREQS "; for (int h=0; h 0.001 ) haplo->VPHASE << h << " " << haplotypeName(h) << "\t" << f[h] << "\n"; haplo->VPHASE << "\n--------------------\n"; } // EM has converged/finished } void HaploWindow::reportPhase() { string fn = par::output_file_name+".phase"; ofstream PHASE(fn.c_str(), ios::out); //P.printLOG("Writing phased haplotypes for " + hname + " to [ "+ fn + " ]\n"); cout << setw(12) << "FID"<< " "<< setw(12) << "IID"<< " "<< setw(4)<< "PH" << " "<< setw(10) << "HAP1"<< " "<< setw(10) << "HAP2"<< " " << setw(12) << "POSTPROB"<< " "<< setw(6) << "BEST"<< " "<< "\n"; PHASE.precision(4); for (int i = 0; i < haplo->P.n; i++) { if (haplo->include[i]) { for (int z = 0; z < hap1[i].size(); z++) { cout << setw(12) << haplo->P.sample[i]->fid<< " "<< setw(12) << haplo->P.sample[i]->iid<< " "<< setw(4) << z << " " << setw(10) << haplotypeName(hap1[i][z]) << " "; if (haplo->haploid || (haplo->X && haplo->P.sample[i]->sex)) cout << setw(10) << haplotypeName( -1) << " "; else cout << setw(10) << haplotypeName(hap2[i][z]) << " "; if (ambig[i]) { cout << setw(12) << pp[i][z]<< " "; int max_z = 0; for (int z2=0; z2 pp[i][max_z] ? z2 : max_z ; // int fac=1; // if ( hap1[i][max_z] != hap2[i][max_z] ) fac=2; // double w = ( pp[i][max_z] - fac*f[hap1[i][max_z]]*f[hap1[i][max_z]] ) // / ( 1 - fac*f[hap1[i][max_z]]*f[hap1[i][max_z]] ); // Do not output weight for now // if (max_z==z) PHASE << setw(12) << w << " " << setw(6) << 1 << " " << "\n"; // else PHASE << setw(12) << "." << " " << setw(6) << 0 << " " << "\n"; if (max_z == z) cout << setw(6) << 1<< " "<< " "; else cout << setw(6) << 0<< " "<< " "; } else cout << setw(12) << 1<< " "<< setw(6) << 1<< " "<< " "; // Genotypes //for (int s=0; sP.sample[i]->fid<< " "<< setw(12) << haplo->P.sample[i]->iid<< " "<< setw(4) << "NA"<< " " << setw(10) << "NA"<< " "<< setw(10) << "NA"<< " " << setw(12) << "NA"<< " "<< setw(6) << "NA"<< " "; // genotypes //for (int s=0; s #include "plink.h" using namespace std; vector socketConnection( Plink * P, string ip_addr , int port , string message ) ; #endif plink-1.07-src/simul.cpp0000644000265600020320000007550011264127625014341 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include "plink.h" #include "helper.h" #include "options.h" #include "crandom.h" #include "stats.h" #include using namespace std; ////////////////////////////////////////////////////////////////////// // A simple routine to simulate a dataset of unlinked case/control SNPs // or QTs class SimParameters { public: int nsnp; double lfreq; double ufreq; double hetOdds; double homOdds; double missing; string name; double lmarker; double umarker; double dprime; SimParameters() { name = ""; nsnp = 0; missing = 0.00; lfreq = ufreq = hetOdds = homOdds = 0; lmarker = umarker = 0; dprime = 1; } }; class SimParametersQT { public: int nsnp; double lfreq; double ufreq; double variance; double dom; double gAA, gAB, gBB; double missing; string name; double lmarker; double umarker; double dprime; SimParametersQT() { name = ""; nsnp = 0; missing = 0.00; variance = dom = 0; lfreq = ufreq = 0; lmarker = umarker = 0; dprime = 1; } }; vector_t instanceSNP(SimParameters & s) { // Return: // 0 Population allele frequency (disease variant) // 1 Population allele frequency (marker) // 2 Case AA (marker) // 3 Case AB (marker) // 4 Control AA (marker) // 5 Control AB (marker) // 6 P( disease A | marker A ) // 7 P( disease A | marker B ) vector_t freqs(38,0); // Calculate actual population allele frequency for this SNP double freq = s.lfreq + CRandom::rand() * ( s.ufreq - s.lfreq ) ; // And a marker frequency double mfreq = par::simul_tags ? s.lmarker + CRandom::rand() * ( s.umarker - s.lmarker ) : freq; // Handle LD double dmax; double ld = 0; double h11, h12, h21, h22; dmax = freq * (1-mfreq); if ( (1-freq) * mfreq < dmax) dmax = (1-freq) * mfreq; ld = s.dprime * dmax; // Haplotype frequencies in general population h11 = freq * mfreq + ld; h12 = freq * ( 1 - mfreq ) - ld; h21 = ( 1 - freq ) * mfreq - ld; h22 = ( 1 - freq ) * ( 1 - mfreq ) + ld; // Get case and control allele frequencies given GRR, // population frequency and disease frequency // Need to model // AM / AM // 2 * AM / Am // Am / Am // 2* AM / aM // 4* AM / am // 2* Am / am // aM / aM // 2 * aM / am // am / am // P( disease | causal variant ) double f0, f1, f2; double g0 = freq * freq; double g1 = 2 * freq * ( 1-freq ); double g2 = 1 - g0 - g1; f2 = par::simul_prevalence / ( g0 * s.homOdds + g1 * s.hetOdds + g2 ); f0 = f2 * s.homOdds; f1 = f2 * s.hetOdds; // P ( disease | diplotype ) double gh_11_11 = h11*h11; double gh_11_12 = h11*h12; double gh_12_11 = h12*h11; double gh_12_12 = h12*h12; double gh_11_21 = h11*h21; double gh_11_22 = h11*h22; double gh_12_21 = h12*h21; double gh_12_22 = h12*h22; double gh_21_11 = h21*h11; double gh_21_12 = h21*h12; double gh_22_11 = h22*h11; double gh_22_12 = h22*h12; double gh_21_21 = h21*h21; double gh_21_22 = h21*h22; double gh_22_21 = h22*h21; double gh_22_22 = h22*h22; // P( disease | diplotype ) double fh_11_11 = f0; double fh_11_12 = f0; double fh_12_11 = f0; double fh_12_12 = f0; double fh_11_21 = f1; double fh_11_22 = f1; double fh_12_21 = f1; double fh_12_22 = f1; double fh_21_11 = f1; double fh_21_12 = f1; double fh_22_11 = f1; double fh_22_12 = f1; double fh_21_21 = f2; double fh_21_22 = f2; double fh_22_21 = f2; double fh_22_22 = f2; double mg0 = mfreq * mfreq; double mg1 = 2 * mfreq * ( 1 - mfreq); double mg2 = 1 - mg0 - mg1; double mf0 = ( f0 * h11*h11 + f1 * 2 * h11 * h21 + f2 * h21*h21 ) / mg0; double mf1 = ( f0 * 2*h11*h12 + f1 * ( 2 * h11 * h22 + 2 * h12 * h21 ) + f2 * 
2*h21*h22 ) / mg1; double mf2 = ( f0 * h12*h12 + f1 * 2 * h12 * h22 + f2 * h22*h22 ) / mg2; // P(G|X) double d0 = mg0 * mf0; double d1 = mg1 * mf1; double d2 = mg2 * mf2; double dSb = d0 + d1 + d2; d0 /= dSb; d1 /= dSb; d2 /= dSb; double u0 = mg0 * (1-mf0); double u1 = mg1 * (1-mf1); double u2 = mg2 * (1-mf2); double uSb = u0 + u1 + u2; u0 /= uSb; u1 /= uSb; u2 /= uSb; // P( diplotype | affected ) double ah_11_11 = fh_11_11 * gh_11_11; double ah_11_12 = fh_11_12 * gh_12_11; double ah_12_11 = fh_12_11 * gh_12_11; double ah_12_12 = fh_12_12 * gh_12_12; double ah_11_21 = fh_11_21 * gh_11_21; double ah_11_22 = fh_11_22 * gh_11_22; double ah_12_21 = fh_12_21 * gh_12_21; double ah_12_22 = fh_12_22 * gh_12_22; double ah_21_11 = fh_21_11 * gh_21_11; double ah_21_12 = fh_21_12 * gh_21_12; double ah_22_11 = fh_22_11 * gh_22_11; double ah_22_12 = fh_22_12 * gh_22_12; double ah_21_21 = fh_21_21 * gh_21_21; double ah_21_22 = fh_21_22 * gh_21_22; double ah_22_21 = fh_22_21 * gh_22_21; double ah_22_22 = fh_22_22 * gh_22_22; double aS = ah_11_11+ah_11_12+ah_12_11+ah_12_12+ ah_11_21+ah_11_22+ah_12_21+ah_12_22+ ah_21_11+ah_21_12+ah_22_11+ah_22_12+ ah_21_21+ah_21_22+ah_22_21+ah_22_22; ah_11_11 /= aS; ah_11_12 /= aS; ah_12_11 /= aS; ah_12_12 /= aS; ah_11_21 /= aS; ah_11_22 /= aS; ah_12_21 /= aS; ah_12_22 /= aS; ah_21_11 /= aS; ah_21_12 /= aS; ah_22_11 /= aS; ah_22_12 /= aS; ah_21_21 /= aS; ah_21_22 /= aS; ah_22_21 /= aS; ah_22_22 /= aS; // P( diplotype | unaffected ) double uh_11_11 = (1-fh_11_11) * gh_11_11; double uh_11_12 = (1-fh_11_12) * gh_12_11; double uh_12_11 = (1-fh_12_11) * gh_12_11; double uh_12_12 = (1-fh_12_12) * gh_12_12; double uh_11_21 = (1-fh_11_21) * gh_11_21; double uh_11_22 = (1-fh_11_22) * gh_11_22; double uh_12_21 = (1-fh_12_21) * gh_12_21; double uh_12_22 = (1-fh_12_22) * gh_12_22; double uh_21_11 = (1-fh_21_11) * gh_21_11; double uh_21_12 = (1-fh_21_12) * gh_21_12; double uh_22_11 = (1-fh_22_11) * gh_22_11; double uh_22_12 = (1-fh_22_12) * gh_22_12; double uh_21_21 = (1-fh_21_21) * gh_21_21; double uh_21_22 = (1-fh_21_22) * gh_21_22; double uh_22_21 = (1-fh_22_21) * gh_22_21; double uh_22_22 = (1-fh_22_22) * gh_22_22; double uS = uh_11_11+uh_11_12+uh_12_11+uh_12_12+ uh_11_21+uh_11_22+uh_12_21+uh_12_22+ uh_21_11+uh_21_12+uh_22_11+uh_22_12+ uh_21_21+uh_21_22+uh_22_21+uh_22_22; uh_11_11 /= uS; uh_11_12 /= uS; uh_12_11 /= uS; uh_12_12 /= uS; uh_11_21 /= uS; uh_11_22 /= uS; uh_12_21 /= uS; uh_12_22 /= uS; uh_21_11 /= uS; uh_21_12 /= uS; uh_22_11 /= uS; uh_22_12 /= uS; uh_21_21 /= uS; uh_21_22 /= uS; uh_22_21 /= uS; uh_22_22 /= uS; // Return vector freqs[0] = freq; // P(variant) freqs[1] = mfreq; // P(marker) freqs[2] = d0; // P(marker-het|affected) freqs[3] = d1; // P(marker-hom|affected) freqs[4] = u0; // P(marker-het|unaffected) freqs[5] = u1; // P(marker-hom|unaffected) freqs[6] = ah_11_11 ; freqs[7] = ah_11_12 ; freqs[8] = ah_12_11 ; freqs[9] = ah_12_12 ; freqs[10] = ah_11_21 ; freqs[11] = ah_11_22 ; freqs[12] = ah_12_21 ; freqs[13] = ah_12_22 ; freqs[14] = ah_21_11 ; freqs[15] = ah_21_12 ; freqs[16] = ah_22_11 ; freqs[17] = ah_22_12 ; freqs[18] = ah_21_21 ; freqs[19] = ah_21_22 ; freqs[20] = ah_22_21 ; freqs[21] = ah_22_22 ; // Control diplotype freqs freqs[22] = uh_11_11 ; freqs[23] = uh_11_12 ; freqs[24] = uh_12_11 ; freqs[25] = uh_12_12 ; freqs[26] = uh_11_21 ; freqs[27] = uh_11_22 ; freqs[28] = uh_12_21 ; freqs[29] = uh_12_22 ; freqs[30] = uh_21_11 ; freqs[31] = uh_21_12 ; freqs[32] = uh_22_11 ; freqs[33] = uh_22_12 ; freqs[34] = uh_21_21 ; freqs[35] = uh_21_22 ; freqs[36] = 
uh_22_21 ; freqs[37] = uh_22_22 ; return freqs; } vector_t instanceSNP_QT(SimParametersQT & s) { // Return: // 0 Population allele frequency (disease variant) // 1 Population allele frequency (marker) // 16 haplotype frequencies vector_t freqs(18,0); // Calculate actual population allele frequency for this SNP double freq = s.lfreq + CRandom::rand() * ( s.ufreq - s.lfreq ) ; // And a marker frequency double mfreq = par::simul_tags ? s.lmarker + CRandom::rand() * ( s.umarker - s.lmarker ) : freq; // Given the specified allele frequency, now calcuate the // additive genetic value and domiance deviation double p = freq; double q = 1-p; double a = sqrt( ( s.variance ) / ( (2*p*q)* (1+s.dom*(q-p))*(1+s.dom*(q-p)) + (2*p*q*s.dom)*(2*p*q*s.dom) ) ); double d = s.dom * a; // Mean center s.gBB = a -(a*(p-(1-p))+ (2*p*(1-p)*d)); s.gAB = d -(a*(p-(1-p))+ (2*p*(1-p)*d)); s.gAA = -a -(a*(p-(1-p))+ (2*p*(1-p)*d)); // cout << "p = " << p << "\n"; // cout <<"G = " << s.gBB << " " // << s.gAB << " " // << s.gAA << "\n"; // Handle LD double dmax; double ld = 0; double h11, h12, h21, h22; dmax = freq * (1-mfreq); if ( (1-freq) * mfreq < dmax) dmax = (1-freq) * mfreq; ld = s.dprime * dmax; // Haplotype frequencies in general population h11 = freq * mfreq + ld; h12 = freq * ( 1 - mfreq ) - ld; h21 = ( 1 - freq ) * mfreq - ld; h22 = ( 1 - freq ) * ( 1 - mfreq ) + ld; double h_11_11 = h11*h11; double h_11_12 = h11*h12; double h_12_11 = h12*h11; double h_12_12 = h12*h12; double h_11_21 = h11*h21; double h_11_22 = h11*h22; double h_12_21 = h12*h21; double h_12_22 = h12*h22; double h_21_11 = h21*h11; double h_21_12 = h21*h12; double h_22_11 = h22*h11; double h_22_12 = h22*h12; double h_21_21 = h21*h21; double h_21_22 = h21*h22; double h_22_21 = h22*h21; double h_22_22 = h22*h22; // Return vector freqs[0] = freq; // P(variant) freqs[1] = mfreq; // P(marker) freqs[2] = h_11_11 ; freqs[3] = h_11_12 ; freqs[4] = h_12_11 ; freqs[5] = h_12_12 ; freqs[6] = h_11_21 ; freqs[7] = h_11_22 ; freqs[8] = h_12_21 ; freqs[9] = h_12_22 ; freqs[10] = h_21_11 ; freqs[11] = h_21_12 ; freqs[12] = h_22_11 ; freqs[13] = h_22_12 ; freqs[14] = h_21_21 ; freqs[15] = h_21_22 ; freqs[16] = h_22_21 ; freqs[17] = h_22_22 ; return freqs; } void Plink::simulateSNPs() { // Read in SNP parameters // Number of SNPs // Lower allele frequency for '1' (versus '2') allele // Upper allele frequency (population) // Odds ratio ('1' allele) checkFileExists(par::simul_file); printLOG("Reading simulation parameters from [ " + par::simul_file + " ]\n"); printLOG("Writing SNP population frequencies to [ " + par::output_file_name + ".simfreq ]\n"); ofstream SOUT( ( par::output_file_name+".simfreq").c_str(), ios::out); ifstream SIM; SIM.open( par::simul_file.c_str(), ios::in ); if ( par::simul_label != "" ) par::simul_label += "-"; vector sp; while ( ! SIM.eof() ) { SimParameters s; vector tokens = tokenizeLine( SIM ); if ( tokens.size() == 0 ) continue; if ( par::simul_tags ) { if ( tokens.size() != 9 ) error("Problem with format of simulation parameter file: expecting 9 fields\n"); if( ! from_string(s.nsnp , tokens[0] , std::dec) ) error("Expecting numeric value for 1st field, # SNPs\n"); s.name = tokens[1]; if( ! from_string(s.lfreq , tokens[2] , std::dec) ) error("Expecting numeric value for 3rd field, lower variant freq.\n"); if( ! from_string(s.ufreq , tokens[3] , std::dec) ) error("Expecting numeric value for 4th field, upper variant freq.\n"); if( ! 
from_string(s.lmarker , tokens[4] , std::dec) ) error("Expecting numeric value for 5th field, lower marker freq.\n"); if( ! from_string(s.umarker , tokens[5] , std::dec) ) error("Expecting numeric value for 6th field, upper marker freq.\n"); if( ! from_string(s.dprime , tokens[6] , std::dec) ) error("Expecting numeric value for 7th field, d-prime\n"); if( ! from_string(s.hetOdds , tokens[7] , std::dec) ) error("Expecting numeric value for 8th field, het odds\n"); if ( ! from_string( s.homOdds , tokens[8] , std::dec ) ) s.homOdds = s.hetOdds * s.hetOdds; } else { if ( tokens.size() != 6 ) error("Problem with format of simulation parameter file: expecting 6 fields\n"); if( ! from_string(s.nsnp , tokens[0] , std::dec) ) error("Expecting numeric value for first field, # SNPs\n"); s.name = tokens[1]; if( ! from_string(s.lfreq , tokens[2] , std::dec) ) error("Expecting numeric value for 3rd field, lower variant freq.\n"); if( ! from_string(s.ufreq , tokens[3] , std::dec) ) error("Expecting numeric value for 4th field, upper variant freq.\n"); s.lmarker = s.lfreq; s.umarker = s.ufreq; s.dprime = 1; if( ! from_string(s.hetOdds , tokens[4] , std::dec) ) error("Expecting numeric value for 5th field, het odds\n"); if ( ! from_string( s.homOdds , tokens[5] , std::dec ) ) s.homOdds = s.hetOdds * s.hetOdds; } // Read odds ratio; unless a specific number is given, assume // multiplicative sp.push_back(s); if ( SIM.eof() ) break; } SIM.close(); //////////////////////////////////////////// // Make room for total number of SNPs, etc int tsnp = 0; for (int s=0; s > haps; for (int cv1=0; cv1<2; cv1++) for (int cv2=0; cv2<2; cv2++) for (int mk1=0; mk1<2; mk1++) for (int mk2=0; mk2<2; mk2++) { vector t(4); t[0] = cv1; t[1] = mk1; t[2] = cv2; t[3] = mk2; haps.push_back(t); } int pos = 0; for (int s=0; s 1 ) loc->name = sp[s].name+"_"+int2str(l); else loc->name = sp[s].name; loc->chr = 1; loc->allele1 = "D"; loc->allele2 = "d"; loc->bp = ++pos; loc->pos = 0; locus.push_back(loc); CSNP * newset = new CSNP; newset->one.resize(nind); newset->two.resize(nind); if ( par::simul_haps ) { Locus * loc2 = new Locus; loc2->name = loc->name + "_M"; loc2->chr = 1; loc2->allele1 = "A"; loc2->allele2 = "B"; loc2->bp = ++pos; loc2->pos = 0; locus.push_back(loc2); } // Sample case and control population genotype frequencies vector_t f = instanceSNP(sp[s]); // f Information // 0 Population allele frequency (disease variant/marker) // 1 Population allele frequency (marker) // 2 Case AA (marker) // 3 Case AB (marker) // 4 Control AA (marker) // 5 Control AB (marker) // 6+ full diplotype frequencies, for // cases and controls if ( par::simul_tags ) { SOUT << 1 << " " << loc->name << "\t" << f[0] << " " << f[0] << "\t" << f[1] << " " << f[1] << "\t" << sp[s].dprime << "\t" << sp[s].hetOdds << "\t" << sp[s].homOdds << "\n"; } else { SOUT << 1 << " " << loc->name << "\t" << f[0] << " " << f[0] << "\t" << sp[s].hetOdds << "\t" << sp[s].homOdds << "\n"; } if ( ! 
par::simul_haps ) { // Simulate only a single SNP (either the marker, // if --simulate-tags, otherwise the CV itself // Genotype frequencies in cases and controls const double caseAA = f[2]; const double caseAB = f[2] + f[3]; const double contAA = f[4]; const double contAB = f[4] + f[5]; ////////////////////////////////////////////////// // Generate each individual, simulating genotypes // rather than alleles for ( int i = 0 ; i < nind ; i++ ) { // Simple missingness if ( CRandom::rand() < sp[s].missing ) { newset->one[i] = true; newset->two[i] = false; } else { bool isCase = i < par::simul_ncases ? true : false; double r = CRandom::rand(); int g = 0; if ( isCase ) { if ( r > caseAB ) g = 2; else if ( r > caseAA ) g = 1; } else { if ( r > contAB ) g = 2; else if ( r > contAA ) g = 1; } if ( g == 2 ) { newset->one[i] = false; newset->two[i] = false; } else if ( g == 1 ) { newset->one[i] = false; newset->two[i] = true; } else { newset->one[i] = true; newset->two[i] = true; } } } SNP.push_back(newset); } else { /////////////////////////////////// // Simulate diplotype pair CSNP * newset2 = new CSNP; newset2->one.resize(nind); newset2->two.resize(nind); // Cases: 6 to 21 // Controls: 22 to 37 vector_t freqA; vector_t freqU; double cumA = 0, cumU = 0; for (int j=6; j<=21; j++) { cumA += f[j]; freqA.push_back(cumA); } for (int j=22; j<=37; j++) { cumU += f[j]; freqU.push_back(cumU); } ////////////////////////////// // Generate each individual, for ( int i = 0 ; i < nind ; i++ ) { // Simple missingness bool miss_marker = false; bool miss_causal = false; if ( CRandom::rand() < sp[s].missing ) miss_marker = true; if ( CRandom::rand() < sp[s].missing ) miss_causal = true; bool isCase = i < par::simul_ncases ? true : false; // Simulate diplotype double r = CRandom::rand(); int h = 0; for ( int j=14;j>=0;j--) { if ( isCase ) { if ( r > freqA[j] ) { h = j+1; break; } } else { if ( r > freqU[j] ) { h = j+1; break; } } } // We now have selected 'h', a number between 0 and 15 vector & hp = haps[h]; //cout << "h = " << h << "\n"; // cout << "size haps=" << haps.size() << "\n"; // display(freqA); ////////////////////////// // Set both genotypes if ( miss_marker ) { newset->one[i] = true; newset->two[i] = false; } else { if ( hp[0] && hp[2] ) { newset->one[i] = true; newset->two[i] = true; } else if ( (!hp[0]) && (!hp[2]) ) { newset->one[i] = false; newset->two[i] = false; } else { newset->one[i] = false; newset->two[i] = true; } } if ( miss_causal ) { newset2->one[i] = true; newset2->two[i] = false; } else { if ( hp[1] && hp[3] ) { newset2->one[i] = true; newset2->two[i] = true; } else if ( (!hp[1]) && (!hp[3]) ) { newset2->one[i] = false; newset2->two[i] = false; } else { newset2->one[i] = false; newset2->two[i] = true; } } } // Add markers and then CV SNP.push_back(newset); SNP.push_back(newset2); } // Next SNP/SNP-pair to simulate } // Phenotypes for (int i=0;ifid = person->iid = par::simul_label + "per"+int2str(i); person->missing = false; person->pat = "0"; person->mat = "0"; if ( i < par::simul_ncases ) person->phenotype = 2; else person->phenotype = 1; person->sex = false; person->sexcode = "2"; sample.push_back(person); } SOUT.close(); } void Plink::simulateSNPs_QT() { par::qt = true; par::bt = false; // Read in SNP parameters // for QUANTITATIVE TRAIT simulations // Number of SNPs // Lower allele frequency for '1' (versus '2') allele // Upper allele frequency (population) // Additive genetic effect // Dominance deviation checkFileExists(par::simul_file); printLOG("Reading QT simulation parameters 
from [ " + par::simul_file + " ]\n"); printLOG("Writing SNP population frequencies to [ " + par::output_file_name + ".simfreq ]\n"); ofstream SOUT( ( par::output_file_name+".simfreq").c_str(), ios::out); ifstream SIM; SIM.open( par::simul_file.c_str(), ios::in ); if ( par::simul_label != "" ) par::simul_label += "-"; vector sp; double totvar = 0; while ( ! SIM.eof() ) { SimParametersQT s; vector tokens = tokenizeLine( SIM ); if ( tokens.size() == 0 ) continue; if ( par::simul_tags ) { if ( tokens.size() != 9 ) error("Problem with format of simulation parameter file: expecting 9 fields\n"); if( ! from_string(s.nsnp , tokens[0] , std::dec) ) error("Expecting numeric value for 1st field, # SNPs\n"); s.name = tokens[1]; if( ! from_string(s.lfreq , tokens[2] , std::dec) ) error("Expecting numeric value for 3rd field, lower variant freq.\n"); if( ! from_string(s.ufreq , tokens[3] , std::dec) ) error("Expecting numeric value for 4th field, upper variant freq.\n"); if( ! from_string(s.lmarker , tokens[4] , std::dec) ) error("Expecting numeric value for 5th field, lower marker freq.\n"); if( ! from_string(s.umarker , tokens[5] , std::dec) ) error("Expecting numeric value for 6th field, upper marker freq.\n"); if( ! from_string(s.dprime , tokens[6] , std::dec) ) error("Expecting numeric value for 7th field, d-prime\n"); if( ! from_string(s.variance , tokens[7] , std::dec) ) error("Expecting numeric value for 8th field, variance\n"); if ( ! from_string( s.dom , tokens[8] , std::dec ) ) error("Expecting numeric value for 9th field, dom \n"); } else { if ( tokens.size() != 6 ) error("Problem with format of simulation parameter file: expecting 6 fields\n"); if( ! from_string(s.nsnp , tokens[0] , std::dec) ) error("Expecting numeric value for first field, # SNPs\n"); s.name = tokens[1]; if( ! from_string(s.lfreq , tokens[2] , std::dec) ) error("Expecting numeric value for 3rd field, lower variant freq.\n"); if( ! from_string(s.ufreq , tokens[3] , std::dec) ) error("Expecting numeric value for 4th field, upper variant freq.\n"); if( ! from_string(s.variance , tokens[4] , std::dec) ) error("Expecting numeric value for 5th field, variance\n"); if ( ! 
from_string( s.dom , tokens[5] , std::dec ) ) error("Expecting numeric value for 6th field, dom \n"); s.lmarker = s.lfreq; s.umarker = s.ufreq; s.dprime = 1; } // Keep track of total QTL variance totvar += s.nsnp * s.variance; sp.push_back(s); if ( SIM.eof() ) break; } SIM.close(); //////////////////////////////////////////// // Make room for total number of SNPs, etc int tsnp = 0; for (int s=0; s 1 ) error("Specific QTL variance is greater than 100%"); // Phenotypes for (int i=0;ifid = person->iid = par::simul_label + "per"+int2str(i); person->missing = false; person->pat = "0"; person->mat = "0"; person->sex = false; person->sexcode = "2"; // Residual variance component person->phenotype = rnorm() * sqrt( 1 - totvar ); sample.push_back(person); } // Genotypes vector > haps; for (int cv1=0; cv1<2; cv1++) for (int cv2=0; cv2<2; cv2++) for (int mk1=0; mk1<2; mk1++) for (int mk2=0; mk2<2; mk2++) { vector t(4); t[0] = cv1; t[1] = mk1; t[2] = cv2; t[3] = mk2; haps.push_back(t); } int pos = 0; for (int s=0; s 1 ) loc->name = sp[s].name+"_"+int2str(l); else loc->name = sp[s].name; loc->chr = 1; loc->allele1 = "H"; loc->allele2 = "L"; loc->bp = ++pos; loc->pos = 0; locus.push_back(loc); CSNP * newset = new CSNP; newset->one.resize(nind); newset->two.resize(nind); if ( par::simul_haps ) { Locus * loc2 = new Locus; loc2->name = loc->name + "_M"; loc2->chr = 1; loc2->allele1 = "A"; loc2->allele2 = "B"; loc2->bp = ++pos; loc2->pos = 0; locus.push_back(loc2); } // Get haplotype frequencies vector_t f = instanceSNP_QT(sp[s]); // f Information // 0 Population allele frequency (disease variant/marker) // 1 Population allele frequency (marker) // 2+ 16 haplotype frequencies if ( par::simul_tags ) { SOUT << 1 << " " << loc->name << "\t" << f[0] << " " << f[0] << "\t" << f[1] << " " << f[1] << "\t" << sp[s].dprime << "\t" << sp[s].variance << "\t" << sp[s].dom << "\n"; } else { SOUT << 1 << " " << loc->name << "\t" << f[0] << " " << f[0] << "\t" << sp[s].variance << "\t" << sp[s].dom << "\n"; } if ( ! 
par::simul_haps ) { // Simulate only a single SNP (either the marker, // if --simulate-tags, otherwise the CV itself // Genotype frequencies in cases and controls const double freq = f[0]; ////////////////////////////////////////////////// // Generate each individual, simulating genotypes // rather than alleles for ( int i = 0 ; i < nind ; i++ ) { // Simple missingness if ( CRandom::rand() < sp[s].missing ) { newset->one[i] = true; newset->two[i] = false; } else { int g = 0; if ( CRandom::rand() > freq ) ++g; if ( CRandom::rand() > freq ) ++g; if ( g == 2 ) { newset->one[i] = false; newset->two[i] = false; sample[i]->phenotype += sp[s].gAA; } else if ( g == 1 ) { newset->one[i] = false; newset->two[i] = true; sample[i]->phenotype += sp[s].gAB; } else { newset->one[i] = true; newset->two[i] = true; sample[i]->phenotype += sp[s].gBB; } } } SNP.push_back(newset); } else { /////////////////////////////////// // Simulate diplotype pair CSNP * newset2 = new CSNP; newset2->one.resize(nind); newset2->two.resize(nind); // Cases: 2 to 17 vector_t freqA; vector_t freqU; double cumA = 0, cumU = 0; for (int j=2; j<=17; j++) { cumA += f[j]; freqA.push_back(cumA); } ////////////////////////////// // Generate each individual, for ( int i = 0 ; i < nind ; i++ ) { // Simple missingness bool miss_marker = false; bool miss_causal = false; if ( CRandom::rand() < sp[s].missing ) miss_marker = true; if ( CRandom::rand() < sp[s].missing ) miss_causal = true; // Simulate diplotype double r = CRandom::rand(); int h = 0; for ( int j=14;j>=0;j--) { if ( r > freqA[j] ) { h = j+1; break; } } // We now have selected 'h', a number between 0 and 15 vector & hp = haps[h]; cout << "h = " << h << "\n"; cout << "size haps=" << haps.size() << "\n"; display(freqA); ////////////////////////// // Set both genotypes if ( miss_marker ) { newset->one[i] = true; newset->two[i] = false; } else { if ( hp[0] && hp[2] ) { newset->one[i] = true; newset->two[i] = true; } else if ( (!hp[0]) && (!hp[2]) ) { newset->one[i] = false; newset->two[i] = false; } else { newset->one[i] = false; newset->two[i] = true; } } if ( miss_causal ) { newset2->one[i] = true; newset2->two[i] = false; } else { if ( hp[1] && hp[3] ) { newset2->one[i] = true; newset2->two[i] = true; sample[i]->phenotype += sp[s].gAA; } else if ( (!hp[1]) && (!hp[3]) ) { newset2->one[i] = false; newset2->two[i] = false; sample[i]->phenotype += sp[s].gAB; } else { newset2->one[i] = false; newset2->two[i] = true; sample[i]->phenotype += sp[s].gBB; } } } // Add markers and then CV SNP.push_back(newset); SNP.push_back(newset2); } // Next SNP/SNP-pair to simulate } SOUT.close(); } plink-1.07-src/linput.cpp0000644000265600020320000002321211264127625014514 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" extern ofstream LOG; void Plink::readDataLongFormat() { ////////////////////// // Check files exist checkFileExists(par::lpedfile); checkFileExists(par::mapfile); checkFileExists(par::famfile); /////////////////////////////////////////////// // .map file vector include; vector include_pos(0); int nl_actual=0; // Read in MAP file: this function also allocates // slots for SNP-major mode readMapFile(par::mapfile, include, include_pos, nl_actual); ////////////////////////////////////////////// // First read a reference file? map refallele; map refallele2; if ( par::ref_file ) { set mset; for (int l=0; l< locus.size(); l++) mset.insert( locus[l]->name ); int notfound = 0; checkFileExists( par::ref_file_name ); ifstream REF( par::ref_file_name.c_str(), ios::in ); while ( ! REF.eof() ) { vector tok = tokenizeLine( REF ); if ( tok.size() == 0 ) continue; if ( tok.size() != 2 && tok.size() != 3 ) error("Problem with line in [ " + par::ref_file_name + " ] :\n " + displayLine( tok ) ); if ( mset.find( tok[0] ) == mset.end() ) { ++notfound; continue; } refallele.insert( make_pair( tok[0], tok[1] )); // A second allele also specified? if ( tok.size() == 3 ) refallele2.insert( make_pair( tok[0], tok[2] )); } printLOG("Read reference alleles for " + int2str( refallele.size() ) + " sites\n"); if ( notfound>0 ) printLOG(int2str( notfound ) + " SNPs in reference but not in map file\n"); if ( refallele.size() < locus.size() ) printLOG(int2str( locus.size() - refallele.size() ) + " SNPs in map file but not in reference\n"); REF.close(); } /////////////////////////////////////////////// // .fam readFamFile(par::famfile); // Allocate space for individual-major mode, set to // missing by default... // Either missing (TF) by default; or reference allele (FF) bool code = ! par::ref_file; if ( ! par::SNP_major) { for (int i=0; ione.resize(nl_actual,code); sample[i]->two.resize(nl_actual,false); } } else { for (int l=0; lone.resize(sample.size(),code); SNP[l]->two.resize(sample.size(),false); } } if ( par::ref_file ) { for (int l=0; l< locus.size(); l++) { map::iterator i = refallele.find( locus[l]->name ); map::iterator i2 = refallele2.find( locus[l]->name ); // If we cannot find, we need to set genotypes to missing instead if ( i != refallele.end() ) { locus[l]->allele1 = i->second; if ( i2 != refallele2.end() ) locus[l]->allele2 = i2->second; else if ( par::lfile_allele_count ) locus[l]->allele2 = i->second + "v"; } else { for (int i=0; ione[i] = true; SNP[l]->two[i] = false; } else { sample[i]->one[l] = true; sample[i]->two[l] = false; } } } } } /////////////////////////////////////////////// // .lgen FILE * PED; PED = fopen64(par::lpedfile.c_str(),"r"); if ( PED == NULL ) error("Problem opening LGEN file, errno = "+int2str(errno)); // We can now read any number of individual/genotype lines, in any // order; we also do not assume that all genotypes are given -- // these will be missing by default map imap; map iperson; for (int i=0; iname , k ) ); } } for (int i=0; ifid + "_" + sample[i]->iid , i ) ); } // Whether or not we want to look at a locus is in the include[] vector // The genomic position of locus i is k=include_pos[i] -> locus[k] bool fatal = false; string fmsg = ""; while( ! 
feof(PED) ) { string fid = ""; string iid = ""; string snp = ""; string one = ""; string two = ""; int f = 0; if ( readString( PED , fid ) ) f++; if ( fid == "" ) continue; if ( readString( PED , iid ) ) f++; if ( readString( PED , snp ) ) f++; if ( readString( PED , one ) ) f++; map::iterator im = imap.find(snp); int k = im != imap.end() ? im->second : -1; // Need to read second allele? if ( ! ( par::compound_genotype_code || par::lfile_allele_count ) ) { if ( readString( PED , two ) ) f++; } else { if ( par::compound_genotype_code ) { if ( one.size() != 2 ) error("Problem with compound genotype not of length 2: [ " + one + " ]"); two = one[1]; one = one[0]; } else if ( par::lfile_allele_count && k != -1 ) { // expect either a 0,1 or 2, or missing code (anything other than 0,1 or 2) int a; if ( ! from_string( a, one, std::dec ) ) a = -1; if ( a < 0 || a > 2 ) { one = two = par::missing_genotype; } else if ( a == 1 ) { one = locus[k]->allele1; two = locus[k]->allele2; } else if ( a == 2 ) one = two = locus[k]->allele2; else if ( a == 0 ) one = two = locus[k]->allele1; } } // cout << f << " " << "[" << fid << "] " // << "[" << iid << "] " // << "[" << snp << "] " // << "[" << one << "] " // << "[" << two << "] \n"; map::iterator peri = iperson.find( fid+"_"+iid ); Individual * person = peri != iperson.end() ? sample[peri->second] : NULL ; // Ignore this genotype? if ( ( ! person ) || k < 0 ) continue; int ip = peri->second; Locus * loc = locus[k]; ///////////////////////////////////////// // Add allele names to list, if needed // If allele is not missing... if (one!=par::missing_genotype && two!=par::missing_genotype) { // ...and not already listed if (one!=loc->allele1 && one!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="") loc->allele1=one; else if(loc->allele2=="") loc->allele2=one; else { // .. or show an error if no empty slots if (!fatal) fmsg = "Locus " + loc->name + " has >2 alleles:\n individual " + person->fid + " " + person->iid + " has genotype [ " + one +" "+two+" ]\n" + " but we've already seen [ " + loc->allele1 + " ] and [ " + loc->allele2 + " ]\n"; fatal=true; } } } // Repeat for second allele, if different if (two!=one) { // If allele is not missing... 
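// (same bookkeeping as for the first allele: record 'two' in the first empty
// allele slot for this locus, or flag the >2-allele error)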
if (one!=par::missing_genotype) // ...and not already listed if (two!=loc->allele1 && two!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="") loc->allele1=two; else if(loc->allele2=="") loc->allele2=two; else { if (!fatal) fmsg = "Locus " + loc->name + " has >2 alleles:\n individual " + person->fid + " " + person->iid + " has genotype [ " + one +" "+two+" ]\n" + " but we've already seen [ " + loc->allele1 + " ] and [ " + loc->allele2 + " ]\n"; fatal=true; } } } // Give an error message if ( fatal ) error(fmsg); ///////////////////////////// // Add specific genotypes if (par::SNP_major) { // 00 hom if (one==loc->allele1 && two==loc->allele1) { SNP[k]->one[ip] = false; SNP[k]->two[ip] = false; } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { SNP[k]->one[ip] = false; SNP[k]->two[ip] = true; } // 11 hom else if (one==loc->allele2 && two==loc->allele2) { SNP[k]->one[ip] = true; SNP[k]->two[ip] = true; } // 10 missing else if (one==par::missing_genotype || two==par::missing_genotype) { SNP[k]->one[ip] = true; SNP[k]->two[ip] = false; } } else { // 00 hom if (one==loc->allele1 && two==loc->allele1) { person->one[k]=false; person->two[k]=false; } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { person->one[k]=false; person->two[k]=true; } // 11 hom else if (one==loc->allele2 && two==loc->allele2) { person->one[k]=true; person->two[k]=true; } // 10 missing else if (one==par::missing_genotype || two==par::missing_genotype) { person->one[k]=true; person->two[k]=false; } } } fclose(PED); } plink-1.07-src/plink.h0000644000265600020320000005737411264127626014004 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __PLINK_H__ #define __PLINK_H__ #include #include #include #include #include #include #include #include #include "zed.h" class CArgs; class Perm; class Set; class Family; class Cluster; class HaploPhase; class Locus; class WMLocus; class Individual; class CSNP; class Model; class Chap; class Variant; class GVariant; using namespace std; typedef vector > table_t; typedef vector > matrix_t; typedef vector vector_t; typedef vector boolvec_t; typedef vector > boolmatrix_t; typedef vector intvec_t; typedef vector > fmatrix_t; typedef vector floatvec_t; typedef vector::iterator iIndividual; typedef vector::iterator iLocus; typedef vector::iterator iSNP; typedef vector::iterator iAllele; class int2 { public: int p1; int p2; int2() { p1=p2=0; } int2(int a, int b) { p1=a; p2=b; } bool operator< (const int2 & b) const { return (p1 < b.p1 || (p1 == b.p1 && p2 < b.p2) ); } bool operator== (const int2 & b) const { return (p1 == b.p1 && p2 == b.p2 ); } }; class double2 { public: double p1; double p2; double2() { p1=p2=0; } double2(double a, double b) { p1=a; p2=b; } bool operator< (const double2 & b) const { return (p1 < b.p1 || (p1 == b.p1 && p2 < b.p2) ); } bool operator== (const double2 & b) const { return (p1 == b.p1 && p2 == b.p2 ); } }; class Pair2 { public: double p; int l; bool operator< (const Pair2 & p2) const { return ( p < p2.p ); } }; class indivPair { public: Individual * p1; Individual * p2; bool operator< (const indivPair & b) const { return (p1 < b.p1 || (p1 == b.p1 && p2 < b.p2) ); } }; class Range { public: int chr; int start; int stop; string name; int group; static map groupNames; Range() { } Range(int p1, int p2, int p3, string p4) { chr = p1; start = p2; stop = p3; name = p4; } bool operator< (const Range & b) const { if ( chr < b.chr ) return true; else if ( chr > b.chr ) return false; if ( start < b.start ) return true; else if ( start > b.start ) return false; return stop < b.stop; } bool operator== (const Range & b) const { return chr == b.chr && start == b.start && stop == b.stop; } }; class WMLocus { public: WMLocus() { chr=0; name=""; } void reset() { allele.clear(); weight.clear(); } int chr; string name; vector allele; vector weight; }; class Individual { public: Individual() { fid=iid=pat=mat=""; ip=im=-1; sex=false; phenotype=-9; sexcode=""; aff=false; covar=-9; bcovar=false; clist.resize(0); clistMissing.resize(0); plist.resize(0); plistMissing.resize(0); missing=false; missing2=false; flag=true; one.resize(0); two.resize(0); sol=0; founder=true; pp=pm=NULL; family=NULL; kids.resize(0); pperson=this; T=W=B=0; gvar.resize(0); } string fid; string iid; // Parental codes string pat; string mat; // Pointers to parents Individual * pp; Individual * pm; // Parent slot number int ip; int im; // Relatedness functions int countMeioses(Individual*); // Permuted self Individual * pperson; // Children (pointers, slot numbers) vector kids; vector ikids; bool sex; string sexcode; double phenotype; bool aff; double covar; bool bcovar; vector_t clist; // multiple covariates vector clistMissing; vector_t plist; // multiple phenotypes vector plistMissing; bool missing; bool missing2; bool flag; int sol; bool founder; Family * family; // SNP data vector one; // Person-major mode genotypes vector two; vector::iterator i1; vector::iterator i2; // Generic variant data vector gvar; // Weighted, multi-allelic single marker WMLocus wmlocus; // For QFAM, within 
and total scores (temporary variables) double T; double B; double W; }; // Main genotype storage, ordered by SNP class CSNP { public: vector one; // SNP-major mode genotypes vector two; }; class Cluster { public: vector person; }; class Family { public: Family() { include = false; parents = false; discordant_parents = false; singleton = false; sibship = false; TDT = false; pat = mat = NULL; kid.clear(); } void copy(const Family & rhs) { include = rhs.include; pat = rhs.pat; mat = rhs.mat; kid.clear(); for (unsigned int c=0; c kid; // Between-family genotypic score double B; }; class Locus { public: Locus() { chr=0; name=""; allele1=""; allele2=""; freq=0; pos=0; bp=0; nm=0; } int chr; string name; string allele1; string allele2; double freq; // of allele1 double pos; // cM map positions int bp; // base-pair position int nm; // number of non-missing alleles // Copy constructor Locus(const Locus& h1) { copy(h1); } Locus & operator= (const Locus & h1) { copy(h1); return *this; } void copy(const Locus &h1) { chr = h1.chr; name = h1.name; allele1 = h1.allele1; allele2 = h1.allele2; freq = h1.freq; pos = h1.pos; bp = h1.bp; nm = h1.nm; } bool operator< (const Locus & p2) const { return (chr < p2.chr || (chr == p2.chr && bp < p2.bp) ); } bool operator== (const Locus & p2) const { return ( name == p2.name ); } }; namespace std { template<> class less { public: bool operator()(Locus const* p1, Locus const* p2) { // Locus comparison based first on distance, // but then pointers in case we have a degenerate map // file (i.e. so we can sort on position, but so that // set still works if(!p1) return true; if(!p2) return false; if (p1->chr < p2->chr) return true; if (p1->chr > p2->chr) return false; if (p1->bp < p2->bp) return true; return false; } }; }; class MainExitException { public: void exitMessage() { cout << "Exception: ending...\n"; } }; class Z { public: Z() { z0=z1=z2=0;} double z0; double z1; double z2; }; class CInfo { public: int lstart; int lstop; int bpstart; int bpstop; }; class Segment { public: Segment() { start = finish = 0; p1 = p2 = NULL; count = baseline = freq = type = sites = 0; score = 0.0; } int start; // based on map position [0..nl_all] int finish; Individual * p1; Individual * p2; // Generics int count; int baseline; double weightedCount; double weightedBaseline; int freq; int type; int sites; double score; // Just base for CNVs for now (i.e. 
only consider p1) bool operator< (const Segment & b) const { if ( start < b.start ) return true; if ( start > b.start ) return false; if ( finish < b.finish ) return true; if ( finish > b.finish ) return false; if ( p1 < b.p1 ) return true; return false; } bool operator== (const Segment & b) const { return ( start == b.start && finish == b.finish && p1 == b.p1 ); } }; class ZZ { public: ZZ() { z00=z01=z02=0; z10=z11=z12=0; z20=z21=z22=0; } double z00; double z10; double z20; double z01; double z11; double z21; double z02; double z12; double z22; }; class Plink { public: Plink() { sample.resize(0); locus.resize(0); phenotype.resize(0); clistname.resize(0); plistname.resize(0); m1.resize(0); m2.resize(0); pos.resize(0); pihat_G.resize(0); warnings=false; n=0; nl_all=0; ngvar=0; cnt_f=npheno=nl=0; nk=1; kname.resize(1,"0"); phenotype_name = ""; scaffold.clear(); } // Genotype/phenotype per individual file vector sample; // SNP information (ordered by SNP/individual) vector SNP; // Locus information vector locus; // Marker scaffold map scaffold; // Genetic variant information vector gvar; // Family data vector family; // Phenotype names string phenoLabel; vector clistname; vector plistname; // number of individuals, pairs int n; // total number of individuals int cnt_f; // number of founders int npheno; // number of individuals with informative phenotypes int np; int nl_all; // all loci int ngvar; // generic variants (non-SNP) int nl; // test loci int nk; // number of clusters string phenotype_name; // Generic output file ofstream OUTFILE; ZOutput ZOUTFILE; // Were any warnings set? bool warnings; // Cluster names map kmap; vector kname; vector klist; // List of oblig-missing SNP/clusters set oblig_missing; // Conditioning SNPs, and mask vector conditioner; vector conditioner_mask; // Skip the pair if not informative vector skip_pair; // String for current multipoint pair IDs string pairid; // Singlepoint locus-specific IBD, singlepoint // for each pair (one at a time) vector Zlocus; // Store genome-wide IBD only for informative pairs vector saved_IBDg; // Multipoint map variables vector m1; // left flanking marker vector m2; // right flanking marker vector pos; // relative position between // Final matrices: pihats and squared differences, cross-products vector< vector > pihat; // row=marker, col=pair // Segments; CNVs, ROHs, IBD segments vector segment; // condensed IBD segment record set geneList; // Genic/regional intersection range list map > gene2segment; // Map of segments per gene vector indivSegmentGroup; // allelic group for each segment(per-ind) vector pihat_G; // global pi-hat set related; // set of T pairs above threshold vector phenotype; // SD or CP vector pair1; // First member of pair vector pair2; // Second member of pair vector in_anal; // Unique'd list of inds in regression // Variances, means double m_phenotype; double v_phenotype; double prev_bt; vector m_pihat; vector v_pihat; // Expected frequencies for IBS|IBD double E00, E10, E20; double E01, E11, E21; double E02, E12, E22; // T matrix elements double T00, T01, T02; double T10, T11, T12; double T20, T21, T22; // Storage for genome-wide max(r^2) values vector maxr2; // Association SETs vector setname; vector > snpset; // Storage for original results vector > original; // IBS matrix and cluster variables vector > mdist; // IBS metric double pv; // temporary holder of p-value double dst; // temporary holder of IBS double pvIBS0; // holder for IBS0 pvalue count double pvIBS2het; // holder for IBS2 het/het count // 
Epistasis tests vector epi1; vector epi2; // Lists of individuals set gset1; set gset2; // Working variables for merge_mode >=6 long int diff_overlap; long int diff_nonmissing_overlap; long int diff_concordant_overlap; // Association test p-value storage: # inds vector_t tcnt; // Cache for LD values in proxy-windows map proxyLD; // Segmental test help variables map segmentCount; map segmentLength; map segmentCount2; map segmentCount2Baseline; // Expected overlap vector_t expectedOverlap; vector_t expectedOverlapBaseline; // Pointer to permutation class Perm * pperm; /////////////////////// // Functions // Input/output functions void readData(); void readDataLongFormat(); void readFamFile(string); void readMapFile(string,vector&,vector&,int&); void readTransposedData(); void readGenericVariantData(); void outputGenericVariantFile(); void convertGenericVariantData(); void updateMapFile(); void updateFamFile(); void updateAlleles(); void readStdIn(); void mergeData(); bool reconcileMerge(int,int,string,string,bool,bool,ofstream&,map&); void mergeBinaryData(); void mergeList(); void dummyLoader(); void simulateSNPs(); void simulateSNPs_QT(); bool readPhenoFile(); bool readMultiplePhenoFile(); bool readCovariateFile(); bool readCovListFile(); bool readClusterFile(bool verbose=true); void readConditioningList(); void readBinData(); void readSet(); void prettyPrintLengths(); void printLOG(string); void outputSetFile(); void setAssocSummary(); void Ind2SNP(); void SNP2Ind(); // Summary statistic / data cleaning functions void filterSNPs(); void processGVAR(); void calcStratifiedAlleleFreqs(); void hardyWeinbergCheck(); double calcInbreeding(Individual *,int,int,ofstream&); void sexCheck(); void calcFst(); void findAllHomozygousRuns(Perm &); void findHomoRuns(Individual *,ofstream&); void findHomoWindow(Individual *,ofstream&); void summariseHomoRuns(); void findIBSRuns(Individual *,Individual *,ofstream&); void findMissRuns(Individual *,ofstream&); void groupSegmentsSpanning(int); void displaySegmentsLong(); void displaySegmentsBED(); // CNV segment functions void setUpForCNVList(); void readCNVList(); void processCNVList(); vector_t glmCNVBurdenModel(Perm &, bool); // Helper functions bool missingGenotype(int,int); bool obligMissing(int,int); void outputPermedPhenotypes(Perm &); void countCNVPerRegion(vector&,vector&); void initialiseGeneCountAssociation(Perm &); // Family-based functions void parseTrios(); void makeFounders(); void makeMissingParents(); void linkRelateds(map &, map &); void checkMendel(); void pseudoCaseControl(); vector testTDT(bool,bool, Perm &, vector &, vector & ); void perm_testTDT(Perm &); vector testSibTDT(bool,bool, Perm &, vector &, vector & ); void perm_testQTDT(Perm &); vector calcQTDT(vector &, ofstream&, bool, Perm &, vector &, vector &); vector testTDT_POO(bool,bool, Perm &, vector &, vector & ); void perm_testTDT_POO(Perm &); // IBS sharing test statistics vector sharingIBSTest(Perm &); void perm_sharingIBSTest(Perm &); //////////////////////////////////// // Main pointers to other classes // Haplotype phasing/testing HaploPhase * haplo; // GLM models Model * model; // Conditional haplotype tests (WHAP) Chap * whap; // Set-based functions Set * pS; // PLINK Functions int readInformative(); int calcInformative(); void writeInformative(); void displayGenomeWideInfo(); void testGenomeIBDByCovariate(Perm &); void permutationIBSTest(Perm &); void displayGMULTI(Individual *, Individual *, int, ofstream &); void preCalcGenomeIBD(); void preCalcMultiPoint(); void 
preCalcSinglePoint(); void preCalcPhenotypes(); Z calcGenomeIBS(Individual *, Individual *); void calcGenomeIBM(Individual *, Individual *); Z calcGenomeIBD(Individual *, Individual *, Z); vector calcLocusIBD(Individual *, Individual *, Z); vector calcMultiPoint(vector &, Z, ofstream &); vector calcSinglePoint(vector &, Z); short calcPhenotypes(vector &, Individual *p1, Individual *p2); void calcRegression(int); vector doRegression(int,vector&); void preCalcRegression_PHENO(vector&); void preCalcRegression_PIHAT(); // Association tests void calcAssociationWithPermutation(Perm&); void calcAssociationWithBootstrap(); void perm_testGXE2(Perm &); vector testQAssocGXE2(bool,Perm &); void calcGXE(Perm&); void perm_testHotel(Perm &); vector calcHotel(bool, Perm &, Set &,int,int); void calcMH(); void calcHomog(); vector calcMantelHaenszel_2x2xK(Perm &, bool); vector calcMantelHaenszel_ORD(vector&,vector&,vector&); vector calcMantelHaenszel_IxJxK(vector&,vector&,vector&); void calcLDStatistics(); void calcPairwiseLD(); double correlation2SNP(int,int,bool,bool,bool useFlag=false); void pruneLD(); void calcFlipScan(); void setReferenceAllele(); map > mkBlks(int, int ); void setFlagToCase(); void setFlagToControl(); void calcEpistasis(); void driverSCREEPI(); vector testMiss(Perm &,bool); void performMisHapTests(); void proxyWrapper(); void performProxyTests(int); void scoreIndividuals(); void calculateProfile(map &, map &, vector_t &, matrix_t &, vector &,vector &); vector testAssoc(int &, int &, vector &, vector &, vector &, vector &, vector &,vector &, vector &,vector &, Perm &, bool); vector testQAssoc(bool, Perm &); vector fullModelAssoc(bool, Perm &); void displayQTMeans(ofstream &, int l); vector_t glmAssoc(bool, Perm &); vector_t conditionalHaplotypeTest(bool, Perm &); vector_t glmHaplotypeTest(bool, Perm &); void multcomp(vector&,string); void buildT(double,bool,double,double); void setMarkerRange(); void buildCluster(); void generateMDS(); void groupGenome(); void summaryIBD(); void findSegments(int,int,vector_t &,ofstream &); void summaryIBDsegments(Perm & perm); void summaryIBSsegments(Perm & perm); void indivSegmentSummary(); void indivSegmentSummaryCalc(map&, map&,bool,bool); void readSegmentFile(ifstream &); void readSegmentFileMinimal(ifstream &); void readHomozygSegmentFile(ifstream &); void segmentPermutationTest(Perm &,bool,string,vector&,vector&,vector&); void segmentIndividualTest(Perm &); vector_t perm_segmentIndividualTest(Perm&,bool,int,int,map&); void homozygousSegmentPermutationTest(Perm &,string,vector&,vector&); void validateSegments(); void positionPermuteSegments(); void runTestCNVwithQT(Perm &); vector_t testCNVwithQT(double,int,int,vector_t&,vector_t&,vector_t&); void runTestCNVwithGLM(Perm &); vector_t testCNVwithGLM(bool, Perm &, vector & ); void displayGenomePV(); void extractExcludeSet(bool); void removeIndividuals(bool); void keep2SetsForGenome(); void filterQualSNPs(); void filterQualGenotypes(); void makePhenotype(); void filterOnCovariate(); void filterOnCase(); void filterOnControl(); void filterOnMale(); void filterOnFemale(); void filterOnFounder(); void filterOnNonFounder(); void attribFilterSNP(); void attribFilterInd(); void zeroOnCluster(); void setObligMissing(); int deleteSNPs(vector&); int deleteSNPs(set&); int deleteSNPs(set&); int deleteIndividuals(vector&); int deleteIndividuals(set&); void thinSNPs(); int keepSNPs(set&); int keepSNPs(set&); int keepIndividuals(set&); void flipStrand(); void alleleRecoding(); void display_recoded_PEDFILE(); void 
display_recoded_PEDFILE_transpose(); void display_recoded_PEDFILE_AD(); void display_recoded_LONG(); void display_recoded_MUTLIST(); void output_fastphase_format(); void output_bimbam_format(); void output_structure_format(); void display_listByAllele(); void display_twolocus(); void display_pairList(); void display_indivReport(); void write_BITFILE(); void write_covariates(); void write_clusters(); void write_snplist(); bool openBinaryFile(string,ifstream&); void setTable(); void writeSetFile(); void tagMode(); void processDosageFile(); void displayGeneReport(); void annotateFile(); void metaAnalysis(); void webcheck(CArgs &); void lookup(); void lookup2(); void Rfunc(); void cleanUp(); // Misc help functions void setFlags(bool f) { vector::iterator person = sample.begin(); while ( person != sample.end() ) { (*person)->flag = f; person++; } } // Additional functions void permTestRareDistribution(Perm &); void elfBaseline(); void displayRareRange(); vector_t testRareDistribution(Perm &,bool,map & ranges); }; #endif plink-1.07-src/hotel.cpp0000644000265600020320000002744511264127625014330 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "plink.h" #include "sets.h" #include "options.h" #include "helper.h" #include "stats.h" #include "perm.h" using namespace std; // Helper function void calcHotelSetMeanVariance(vector &, vector &, vector &, vector > &, vector&, int,int); void Plink::perm_testHotel(Perm & perm) { if (!par::SNP_major) Ind2SNP(); // Do not allow monomorphic alleles if (par::min_af==0) error("Cannot specify --maf 0 when using --T2; set --maf > 0"); // Do not allow completely missing SNPs if (par::MAX_GENO_MISSING==1) error("Cannot specify --geno 1 when using --T2; set --geno < 1"); // Are we using sets? 
If so, construct these now if (!par::set_test) error("You need to specify sets (--set option) with --T2"); // Do not allow quantitative traits if (!par::bt) error("Cannot specify --T2 with quantitative traits"); // Prune SET (0-sized sets, MAF==0 SNPs, etc) pS->pruneSets(*this); int ns = snpset.size(); /////////////////////////////////////////// // Count how many cases, how many controls int caseN = 0; int controlN = 0; for (int i=0; i < n; i++) if (!sample[i]->missing) if (sample[i]->aff) caseN++; else controlN++; if ( caseN == 0 || controlN == 0 ) error("No cases / no controls for T(2) test"); // Multi-collinearity SNP pruning setFlags(true); pS->pruneMC(*this,true,par::vif_threshold); // Empirical p-values (1 per set) perm.setTests(ns); //////////////////////////////// // Set up permutation structure // (we need to perform this step // whether or not we also // subsequently permute) perm.setPermClusters(*this); perm.originalOrder(); vector original = calcHotel(true, perm, *pS, caseN, controlN); //////////////////////////// // If no permutation, then // leave now if (!par::permute) return; ////////////////////// // Begin permutations bool finished = false; while(!finished) { // Store permuted results vector pr(ns); if (par::perm_genedrop) perm.geneDrop(); else perm.permuteInCluster(); pr = calcHotel(false, perm, *pS, caseN, controlN); //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } // next permutation if (!par::silent) cout << "\n\n"; //////////////////// // Display results ofstream ASC; string f; if (par::adaptive_perm) f = par::output_file_name + ".T2.perm"; else f = par::output_file_name + ".T2.mperm"; ASC.open(f.c_str(),ios::out); ASC.precision(4); printLOG("Writing permutation T2 test results to [ " + f + " ] \n"); ASC << setw(12) << "SET" << " " << setw(4)<< "SIZE" << " " << setw(12) << "EMP1" << " "; if (par::adaptive_perm) ASC << setw(12)<< "NP" << " "; else ASC << setw(12)<< "EMP2" << " "; ASC << "\n"; for (int s=0; s Plink::calcHotel(bool disp, Perm & perm, Set & S, int ncase, int ncontrol) { ofstream ASC; if (disp) { string f = par::output_file_name + ".T2"; ASC.open(f.c_str(),ios::out); ASC.precision(4); printLOG("Writing T2 test results to [ " + f + " ] \n"); ASC << setw(12) << "SET" << " " << setw(4)<< "SIZE" << " " << setw(12)<< "T2" << " " << setw(12) << "DF1" << " " << setw(12)<< "DF2" << " " << setw(12)<< "P_HOTEL" << "\n"; } // Number of SETs int ns = pS->snpset.size(); vector T2(ns,0); // Consider each SET for (int s=0; s pSNP(0); for (int j=0; jcur[s][j] ) { // Add to set list pSNP.push_back( SNP[snpset[s][j]] ); // Increase the actual number of snps in set nss++; } } vector mean2(nss,0); // Case mean vector mean1(nss,0); // Control mean vector > pooled; // Covariance matrix /////////////////////////////////////// // Calculate mean and variance (pooled) // after imputing missing SNPs calcHotelSetMeanVariance(pSNP,mean1,mean2,pooled,sample,ncase,ncontrol); /////////////////////////////// // 2. 
Calculate test statistic for (int j1=0; j1 tmp(nss,0); for (int j1=0; j1 & pSNP, vector & mean1, vector & mean2, vector > & pooled, vector & sample, int ncase, int ncontrol) { int nss = mean1.size(); vector mean(nss,0); vector cnt1(nss,0); vector cnt2(nss,0); //////////////////////////// // Iterate over SNPs in SET vector::iterator ps = pSNP.begin(); int j=0; while ( ps != pSNP.end() ) { /////////////////////////// // Iterate over individuals vector::iterator gperson = sample.begin(); vector::iterator i1 = (*ps)->one.begin(); vector::iterator i2 = (*ps)->two.begin(); int i=0; while ( gperson != sample.end() ) { // Permuted self Individual * pperson = (*gperson)->pperson; // Affected individuals if ( ! pperson->missing ) { if (pperson->aff) { if ( *i1 ) { if ( *i2 ) // 11 homozygote { mean[j]++; cnt2[j]++; mean2[j]++; } } else { cnt2[j]++; if ( ! *i2 ) // 00 homozygote { mean[j]--; mean2[j]--; } } } else { if ( *i1 ) { if ( *i2 ) // 11 homozygote { mean[j]++; cnt1[j]++; mean1[j]++; } } else { cnt1[j]++; if ( ! *i2 ) // 00 homozygote { mean[j]--; mean1[j]--; } } } } // Next individual gperson++; i1++; i2++; i++; } // Next SNP in set ps++; j++; } // Having iterated over all individuals, we can now calculate the mean // values, perform mean-substitution of missing data, and calculate the // second order terms cout.precision(8); for (int j=0; j::iterator ps1 = pSNP.begin(); int j1=0; while ( ps1 != pSNP.end() ) { // Second SNP vector::iterator ps2 = ps1; int j2=j1; while ( ps2 != pSNP.end() ) { /////////////////////////// // Iterate over individuals vector::iterator gperson = sample.begin(); vector::iterator i1_1 = (*ps1)->one.begin(); vector::iterator i2_1 = (*ps1)->two.begin(); vector::iterator i1_2 = (*ps2)->one.begin(); vector::iterator i2_2 = (*ps2)->two.begin(); while ( gperson != sample.end() ) { // Permuted self Individual * pperson = (*gperson)->pperson; // Set both values to sample mean double v1 = mean[j1]; double v2 = mean[j2]; // First SNP if ( *i1_1 ) { if ( *i2_1 ) // 11 homozygote { v1 = 1; } } else { if ( ! *i2_1 ) // 00 homozygote { v1 = -1; } else v1 = 0; // 01 heterozygote } // Second SNP if ( *i1_2 ) { if ( *i2_2 ) // 11 homozygote { v2 = 1; } } else { if ( ! *i2_2 ) // 00 homozygote { v2 = -1; } else v2 = 0; // 01 heterozygote } // Contribution to covariance term if (! pperson->missing) { if (pperson->aff) // affecteds pooled[j1][j2] += ( v1 - mean2[j1] ) * ( v2 - mean2[j2] ); else // unaffecteds pooled[j1][j2] += ( v1 - mean1[j1] ) * ( v2 - mean1[j2] ); } // Next individual gperson++; i1_1++; i2_1++; i1_2++; i2_2++; } // Next second SNP ps2++; j2++; } // Next first SNP ps1++; j1++; } // Make matrix symmetric for (int i=0; i #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" #include "crandom.h" #include "sets.h" #include "perm.h" #include "stats.h" //////////////////////////// // Parent-of-origin analysis void Plink::perm_testTDT_POO(Perm & perm) { ////////////////////////////////// // Individual-major mode analysis if (par::SNP_major) SNP2Ind(); /////////////////////////////////////////// // Calculate original results for true data vector dummy(family.size(),false); perm.setTests(nl_all); perm.setPermClusters(*this); vector original = testTDT_POO(true, false, perm, dummy, dummy); //////////////////////////// // Display corrected p-values? 
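// Hedged summary of the driver logic below: `original` holds the
// per-SNP parent-of-origin statistics returned by testTDT_POO()
// (a chi-square or z^2, with NA coded as -1). When par::multtest is
// set they are presumably handed to the multiple-test correction
// routine (multcomp()) for adjusted p-values. The permutation loop
// then repeatedly re-runs testTDT_POO() with per-family flip flags
// (fA/fB), which either swap parental origin (par::perm_POO_poo) or
// swap transmitted vs. untransmitted alleles within each parent, and
// perm.update() accumulates empirical p-values against `original`.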
if (par::multtest) { vector obp(0); for (int l=0; lcumulativeSetSum_WITHLABELS(*this,original); ////////////////////// // Begin permutations bool finished = false; while(!finished) { /////////////////////////////////// // Set up permutation list for TDT // Permutations are constant across family and markers // flipA/B[permutation][family] vector fA(family.size(),false); vector fB(family.size(),false); for (int f=0; f pr = testTDT_POO(false, true, perm, fA, fB); ////////////////////// // Make sets? if (par::set_test) pS->cumulativeSetSum_WITHOUTLABELS(pr,perm.current_reps()+1); //////////////////////////////// // Standard permutation counting finished = perm.update(pr,original); } // next permutation cout << "\n\n"; /////////////////////////////////////////// // Calculate SET-based empirical p-values if (par::set_test) { printLOG("Calculating empirical SET-based p-values\n"); pS->empiricalSetPValues(); } //////////////////// // Display results ofstream TDT; string f; if (par::adaptive_perm) f = par::output_file_name + ".tdt.poo.perm"; else f = par::output_file_name + ".tdt.poo.mperm"; TDT.open(f.c_str(),ios::out); printLOG("Writing TDT parent-of-origin permutation results to [ " + f + " ] \n"); TDT.precision(4); TDT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " "; TDT << setw(12) << "CHISQ_TDT" << " "; TDT << setw(12) << "EMP1" << " "; if (par::adaptive_perm) TDT << setw(12) << "NP" << " " << "\n"; else TDT << setw(12) << "EMP2" << " " << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " "; if (original[l] < -0.5) TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " " << setw(12) << "NA"; else { TDT << setw(12) << original[l] << " " << setw(12) << perm.pvalue(l) << " "; if (par::adaptive_perm) TDT << setw(12) << perm.reps_done(l); else TDT << setw(12) << perm.max_pvalue(l); } TDT << "\n"; } TDT.close(); //////////////////////////// // Display SET-based results if (par::set_test) { f = par::output_file_name + ".tdt.poo.set"; TDT.open(f.c_str(),ios::out); printLOG("Writing set-based TDT parent-of-origin results to [ " +f+ " ] \n"); TDT.clear(); // Header row TDT << setw(12) << "SET" << " " << setw(6) << "S" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "T" << " " << setw(12) << "P_0" << " " << setw(12) << "P_1" << " " << setw(12) << "P_2" << " " << "\n"; for (int i=0;ipv_set.size();i++) { TDT << "\n"; for (int j=0;jpv_set[i].size();j++) { TDT << setw(12) << setname[i] << " " << setw(6) << string("S<"+int2str(j+1)) << " " << setw(par::pp_maxsnp) << pS->setsort[i][j] << " " << setw(12) << pS->stat_set[i][j][0] << " " << setw(12) << pS->pv_set[i][j][0] << " " << setw(12) << pS->pv_maxG_set[i][j]/(par::replicates+1) << " " << setw(12) << pS->pv_maxE_set[i][j]/(par::replicates+1) << " " << "\n"; } } TDT.close(); } } vector Plink::testTDT_POO(bool print_results, bool permute, Perm & perm, vector & flipA, vector & flipB) { /////////////////////////// // Vector to store results vector res(nl_all); double zt; ofstream TDT; if (print_results) { string f = par::output_file_name + ".tdt.poo"; TDT.open(f.c_str(),ios::out); printLOG("Writing TDT parent-of-origin results (asymptotic) to [ " + f + " ] \n"); TDT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(6) << "A1:A2" << " " << setw(12) << "T:U_PAT" << " " << setw(12) << "CHISQ_PAT" << " " << setw(12) << "P_PAT" << " " << setw(12) << "T:U_MAT" << " " << setw(12) << "CHISQ_MAT" << " " << setw(12) << "P_MAT" << " " << setw(12) << "Z_POO" << " " << setw(12) << 
"P_POO" << " "; // if (par::display_ci) // TDT << setw(12) << string("L"+int2str(int(par::ci_level*100))) << " " // << setw(12) << string("U"+int2str(int(par::ci_level*100))) << " "; // if (par::display_ci) // zt = ltqnorm( 1 - (1 - par::ci_level) / 2 ) ; TDT << "\n"; } /////////////////////////////////// // Perform analysis for each locus for (int l=0; lTDT ) continue; int trP = 0; // transmitted allele from het father int unP = 0; // untransmitted allele from het father int trM = 0; // transmitted allele from het mother int unM = 0; // untransmitted allele from het mother Individual * pat = family[f]->pat; Individual * mat = family[f]->mat; vector kid = family[f]->kid; bool pat1 = pat->one[l]; bool pat2 = pat->two[l]; bool mat1 = mat->one[l]; bool mat2 = mat->two[l]; // We need two genotyped parents, with // at least one het if ( pat1 == pat2 && mat1 == mat2 ) continue; if ( ( pat1 && !pat2 ) || ( mat1 && !mat2 ) ) continue; // Consider all offspring in nuclear family for (int c=0; caff ) continue; bool kid1 = kid[c]->one[l]; bool kid2 = kid[c]->two[l]; // Skip if offspring has missing genotype if ( kid1 && !kid2 ) continue; // We've now established: no missing genotypes // and at least one heterozygous parent bool hhh = false; // flag for het X het => het // Kid is 00 if ( (!kid1) && (!kid2) ) { // Paternal transmission? if ( (!pat1) && pat2 ) { trP=1; unP=2; } // Maternal transmission? if ( (!mat1) && mat2 ) { trM=1; unM=2; } } else if ( (!kid1) && kid2 ) // Kid is 01 { // Everybody heterozygous? if ( pat1 != pat2 && mat1 != mat2 ) hhh = true; else { // het father if ( pat1 != pat2 ) { // what did mother transmit? if ( !mat1 ) { trP=2; unP=1; } else { trP=1; unP=2; } } else { // what did father transmit? if ( !pat1 ) { trM=2; unM=1; } else { trM=1; unM=2; } } } } else // kid is 1/1 { // Paternal transmission? if ( (!pat1) && pat2 ) { trP=2; unP=1; } // Maternal transmission? if ( (!mat1) && mat2 ) { trM=2; unM=1; } } /////////////// // Permutation? if (permute) { // Determine whether to flip parental origin... if (par::perm_POO_poo) { if (flipA[f]) { int t=trP; trP=trM; trM=t; } if (flipB[f]) { int t=unP; unP=unM; unM=t; } } else // ... 
or allelic transmission { if (flipA[f]) { int t=trP; trP=unP; unP=t; } if (flipB[f]) { int t=trM; trM=unM; unM=t; } } } // Increment transmission counts if (hhh) { p1 += 0.5; p2 += 0.5; m1 += 0.5; m2 += 0.5; } else { if (trP==1) p1++; if (trM==1) m1++; if (trP==2) p2++; if (trM==2) m2++; } } // next offspring in family } // next nuclear family ///////////////////////////// // Finished counting: now compute // the statistics double pat_chisq, mat_chisq, tot_chisq; pat_chisq = mat_chisq = tot_chisq = -1; // Basic TDT test if (p1+p2 > 0) pat_chisq = ((p1-p2)*(p1-p2))/(p1+p2); if (m1+m2 > 0) mat_chisq = ((m1-m2)*(m1-m2))/(m1+m2); double t1 = p1 + m1; double t2 = p2 + m2; if (t1+t2 > 0) tot_chisq = ((t1-t2)*(t1-t2))/(t1+t2); double pat_OR = p1 / p2; double pat_VOR = 1/p1 + 1/p2; double mat_OR = m1 / m2; double mat_VOR = 1/m1 + 1/m2; // Test of POO effect double z = ( log(pat_OR) - log(mat_OR) ) / sqrt( pat_VOR + mat_VOR ); // Display asymptotic results if (print_results) { TDT.precision(4); TDT << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(6) << string(locus[l]->allele1 + ":" + locus[l]->allele2) << " "; // Paternal transmissions TDT << setw(12) << dbl2str(p1)+":"+dbl2str(p2) << " "; if (pat_chisq>=0) TDT << setw(12) << pat_chisq << " " << setw(12) << chiprobP(pat_chisq,1) << " "; else TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; // Maternal transmissions TDT << setw(12) << dbl2str(m1)+":"+dbl2str(m2) << " "; if (mat_chisq>=0) TDT << setw(12) << mat_chisq << " " << setw(12) << chiprobP(mat_chisq,1) << " "; else TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; if ( realnum(z) ) TDT << setw(12) << z << " " << setw(12) << normdist(-fabs(z)) * 2 << " "; else { TDT << setw(12) << "NA" << " " << setw(12) << "NA" << " "; } TDT << "\n"; } /////////////////////////////////////////// // Choose which statistic for permutation if (par::perm_POO_poo) res[l] = realnum(z) ? z*z : -1 ; else if (par::perm_POO_pat) res[l] = pat_chisq; else if (par::perm_POO_mat) res[l] = mat_chisq; else if (par::perm_POO_best) res[l] = pat_chisq > mat_chisq ? pat_chisq : mat_chisq; } // next locus ////////////////////////////// // Close output file, if open if (print_results) TDT.close(); /////////////////////////////////////////// // Return chosen statistic for permutation return res; } plink-1.07-src/perm.cpp0000644000265600020320000002555211264127625014155 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include "perm.h" #include "helper.h" #include "stats.h" #include "sets.h" Perm::Perm(Plink & pref) : P(pref) { if (par::adaptive_perm) { adaptive = true; replicates = par::adaptive_max; } else { adaptive = false; replicates = par::replicates; } count = par::perm_count; min = par::adaptive_min; interval = par::adaptive_interval; performed = 0; dump_all = par::mperm_save_all; dump_best = par::mperm_save_best; if (dump_all) { PDUMP.open((par::output_file_name+".mperm.dump.all").c_str(),ios::out); } else if (dump_best) { PDUMP.open((par::output_file_name+".mperm.dump.best").c_str(),ios::out); } } void Perm::setTests(int x) { performed = 0; t = x; R.clear(); N.clear(); test.clear(); snp_test.clear(); maxR.clear(); R.resize(t,0); if (adaptive) { N.resize(t,0); test.resize(t,true); snp_test.resize(t,true); if ( par::set_test ) snp_test.resize(P.nl_all,true); // Given t tests, set the threshold to be // p +/- Phi^{-1} (1 - \gamma/2t ) sqrt( p(1-p)/N ) zt = ltqnorm( 1 - par::adaptive_ci / ( 2 * t ) ) ; } else { maxR.resize(t,0); } // For gene-dropping, set up some family-information if (par::perm_genedrop) preGeneDrop(); } // Redundant void Perm::setAdaptiveSetSNPs(int x) { // snp_test.clear(); // snp_test.resize(x,true); } void Perm::originalOrder() { for (int i=0; ipperson = P.sample[i]; } bool Perm::finished() { if (performed>=replicates) return true; else return false; } void Perm::permuteInCluster() { // Store remapped IDs vector > i(ns); // Permute phenotypes, within cluster for (int k=0; k p(s[k].size()); permute(p); i[k]=p; } ////////////////////////// // Post-permutation: // Iterate over clusters { s[][] } // i[][] holds the permuted codes // s[][] points to individuals (non-missing) // Genotype = sample[s[j][k]]; // Matching phenotype = sample[s[j][(int)i[j][k]]]; // Create pheno[] with label-swapped codes for (int j=0; jpperson = P.sample[s[j][(int)i[j][k]]]; } void Perm::setPermClusters(Plink & P) { // Permute within clusters only // (stored in sample[i]->sol) // Get list of non-missing individuals, and number of solutions // These are always numbered 0,1,2,3,.. // -1 indicates do not permute this individual // 0..ns indicate cluster numbers // Count the number of clusters: 'ns' ns=-1; for (int i=0; imissing) && P.sample[i]->sol > ns) ns=P.sample[i]->sol; ns++; // store set membership is 's' s.resize(ns); for (int i=0; imissing && P.sample[i]->sol>=0) s[P.sample[i]->sol].push_back(i); pheno.resize(P.n); if (par::permute && ! par::QTDT_test ) P.printLOG("Set to permute within "+int2str(ns)+" cluster(s)\n"); } void Perm::setOriginalRanking(vector_t & original) { vector o; for (int i=0; i & result, vector & original) { // Increment number of permutations performed performed++; // Finished all perms? bool done = false; ////////////////////////////// // Update number of successes if (!adaptive) { for (int l=0; l= original[l] || !realnum(original[l]) ) R[l]++; } else { for (int l=0; l= original[l] || !realnum(original[l]) ) R[l]++; N[l]++; } } // Stopping rules for adaptive permutation? 
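// Adaptive stopping rule used below (restating the formula noted in
// setTests() above): after N permutations with R successes at a test,
// the current empirical p-value and its standard error are
//
//      p  = (R + 1) / (N + 1),     sd = sqrt( p * (1 - p) / N )
//
// and a confidence interval p +/- zt * sd is formed, where
// zt = Phi^-1( 1 - adaptive_ci / (2t) ) for t tests. A test is dropped
// from further permutation once this interval lies entirely above or
// entirely below par::adaptive_alpha, i.e. once we are already
// confident which side of the significance threshold its empirical
// p-value will fall on.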
int todo = 0; if (adaptive && performed > min && performed % interval == 0) { // Update interval interval = (int)(par::adaptive_interval + performed * par::adaptive_interval2); // Consider each test for (int l=0; l0) { double pv = (double)(R[l]+1)/(double)(performed+1); double sd = sqrt( pv * (1-pv) / performed ); double lower = pv - zt * sd; double upper = pv + zt * sd; //double cv = sd/(performed*pv); if (lower<0) lower = 0; if (lower>1) upper = 0; // Is lower bound greater than threshold, or // upper bound smaller than threshold? if (lower > par::adaptive_alpha || upper < par::adaptive_alpha ) { N[l] = performed; test[l] = false; if ( par::set_test ) { for (int j=0;j< P.pS->snpset[l].size();j++) snp_test[ P.pS->snpset[l][j] ] = false; } else snp_test[l] = false; } else todo++; } else todo++; } } if (!par::silent) { if ( par::set_test ) cout << "Adaptive permutation: " << performed << " of (max) " << replicates << " : " << todo << " sets left" << " \r"; else cout << "Adaptive permutation: " << performed << " of (max) " << replicates << " : " << todo << " SNPs left" << " \r"; } if (todo==0) done = true; } /////////////////////////////////////////////////////////// // For non-adaptive permutation, keep track of the maximum if (!adaptive) { if (dump_all) { if (performed==1) { PDUMP << 0 << " "; for (int l=0; lmx) if ( realnum(mx) ) mx=original[l]; PDUMP << mx << "\n"; } PDUMP << performed << " "; } // Find maximum, or sort all results double mx=0; if ( par::mperm_rank ) { // Ranked permutation // populate mx vector // Set any NA to 0 for (int l=0; lmx) if ( realnum(mx) ) mx=result[l]; } if (dump_best) PDUMP << mx << "\n"; else if (dump_all) { for (int l=0; l= original[l] || !realnum(original[l]) ) maxR[l]++; } // Rank(T) permutation -- compare against similar rank else { for (int l=0; l= original[order[l]] || !realnum(original[order[l]]) ) maxR[order[l]]++; } } if (!par::silent) { cout << "maxT permutation: " << performed << " of " << par::replicates << " \r"; cout.flush(); } } // Have we hit the maximum number of replicates? if (performed>=replicates) done = true; return done; } void Perm::nextSNP() { // Reset Perm class for next SNP when in adaptive, SNP-by-SNP mode performed = 0; originalOrder(); } bool Perm::updateSNP(double result, double original, int l) { ///////////////////////////////////////// // Single SNP adaptive permutation update // for QFAM -- do not allow set-based tests // here ///////////////////////////////////////// // Increment number of permutations performed for this SNP performed++; // Finished all perms for this SNP? bool done = false; ////////////////////////////// // Update number of successes if (test[l]) { if (result >= original || !realnum(original) ) R[l]++; N[l]++; } // Stopping rules for adaptive permutation? if (adaptive && performed > min && performed % interval == 0) { // Update interval interval = (int)(par::adaptive_interval + performed * par::adaptive_interval2); // Consider this specific SNP if (test[l]) { // Check for at least one success if (R[l]>0) { double pv = (double)(R[l]+1)/(double)(performed+1); double sd = sqrt( pv * (1-pv) / performed ); double lower = pv - zt * sd; double upper = pv + zt * sd; //double cv = sd/(performed*pv); if (lower<0) lower = 0; if (lower>1) upper = 0; // Is lower bound greater than threshold, or // upper bound smaller than threshold? if (lower > par::adaptive_alpha || upper < par::adaptive_alpha ) { N[l] = performed; test[l] = false; done = true; } } } } // Have we hit the maximum number of replicates? 
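// How the accumulated counts are reported (see pvalue() and
// max_pvalue() below): the pointwise empirical p-value EMP1 is
// (R[l]+1)/(N[l]+1) under adaptive permutation, or
// (R[l]+1)/(replicates+1) under maxT; the family-wise EMP2 is
// (maxR[l]+1)/(replicates+1), where maxR[l] counts the replicates in
// which the genome-wide maximum statistic (or, with par::mperm_rank,
// the permuted statistic of matching rank) was at least as large as
// the observed statistic at locus l.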
if (performed>=replicates) done = true; return done; } int Perm::rank(int l) { if ( ! par::mperm_rank ) return 0; else return t - reorder[l]; } double Perm::pvalue(int l) { if (count) return (double)R[l]; if (adaptive) return (double)(R[l]+1) / (double)(N[l]+1); else return (double)(R[l]+1) / (double)(replicates+1); } double Perm::max_pvalue(int l) { if (adaptive) return -1; else return (double)(maxR[l]+1) / (double)(replicates+1); } int Perm::reps_done(int l) { if (adaptive) return N[l]; else return replicates; } plink-1.07-src/locus.cpp0000644000265600020320000001365611264127624014340 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2006 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include "plink.h" #include "options.h" using namespace std; vector Plink::calcLocusIBD(Individual * p1, Individual * p2, Z I) { // ** TODO ** -- development code -- lots of possible speedups here. // 1) Remove I as an argument // 2) Now no need to calculate all probabilities -- just do the three required for that // particular locus // Store calculated P(M|Z) here vector ZL(nl); // Locus counter int l = 0; ////////////////////////////// // All SNPs in the scan region for (int l2=par::run_start; l2<=par::run_end; l2++) { ///////////////////// // Allele frequencies double p = locus[l2]->freq; double q = 1 - p; // Na = # alleles = 2N where N is number of individuals double Na = locus[l]->nm; double x = p * Na; double y = q * Na; ///////////////////////////////////////////////// // Assign P(M|Z) based on genotype for each SNP bool a1 = p1->one[l2]; bool a2 = p1->two[l2]; bool b1 = p2->one[l2]; bool b2 = p2->two[l2]; // Assign unit vector if either genotype is missing if ( ( a1 && (!a2) ) || ( b1 && (!b2) ) ) { ZL[l].z0 = ZL[l].z1 = ZL[l].z2 = 1; l++; continue; } if ( a1 ) { if ( a2 ) { if ( b1 ) { if ( b2 ) { // aa, aa ZL[l].z0 = q*q*q*q * ( (y-1)/y * (y-2)/y * (y-3)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = q*q*q * ( (y-1)/y * (y-2)/y * Na/(Na-1) * Na/(Na-2)); ZL[l].z2 = q*q * ( (y-1)/y * Na/(Na-1)); } } else { if ( b2 ) { // aa, Aa ZL[l].z0 = 2*p*q*q*q * ( (y-1)/y * (y-2)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = p*q*q * ((y-1)/y * Na/(Na-1) * Na/(Na-2)); ZL[l].z2 = 0; } else { // aa, AA ZL[l].z0 = p*p*q*q * ( (x-1)/x * (y-1)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = 0; ZL[l].z2 = 0; } } } } else { if ( a2 ) { if ( b1 ) { if ( b2 ) { // Aa, aa ZL[l].z0 = 2*p*q*q*q * ( (y-1)/y * (y-2)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = p*q*q * ((y-1)/y * Na/(Na-1) * Na/(Na-2)); ZL[l].z2 = 0; } } else { if ( b2 ) { // Aa, Aa ZL[l].z0 = 4*p*p*q*q * ( (x-1)/x * (y-1)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = p*p*q * ( (x-1)/x * Na/(Na-1) * Na/(Na-2) ) + p*q*q * ((y-1)/y * Na/(Na-1) * Na/(Na-2)); ZL[l].z2 = 2*p*q * Na/(Na-1) ; } else { // Aa, AA ZL[l].z0 = 2*p*p*p*q * ( (x-1)/x * (x-2)/x * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = p*p*q * ( (x-1)/x * Na/(Na-1) * Na/(Na-2) ); ZL[l].z2 = 0; } } } else { if ( b1 ) { if ( b2 ) { // AA, aa ZL[l].z0 = p*p*q*q * ( (x-1)/x * (y-1)/y * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = 0; ZL[l].z2 = 0; } } else { if ( b2 ) { // AA, Aa ZL[l].z0 = 2*p*p*p*q * ( (x-1)/x * (x-2)/x * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3)) ); ZL[l].z1 = p*p*q 
* ( (x-1)/x * Na/(Na-1) * Na/(Na-2) ); ZL[l].z2 = 0; } else { // AA, AA ZL[l].z0 = p*p*p*p * ( (x-1)/x * (x-2)/x * (x-3)/x * (Na/(Na-1)) * (Na/(Na-2)) * (Na/(Na-3))); ZL[l].z1 = p*p*p * ( (x-1)/x * (x-2)/x * Na/(Na-1) * Na/(Na-2)); ZL[l].z2 = p*p * ( (x-1)/x * Na/(Na-1)); } } } } ///////////////////////////////////// // Fudge factor for genotyping error if ( ZL[l].z1 < 0.0001 ) { ZL[l].z1 = 0.0001; double S = ZL[l].z0 + ZL[l].z1 + ZL[l].z2; ZL[l].z0 /= S; ZL[l].z1 /= S; ZL[l].z2 /= S; } ///////////////// // Next SNP l++; } return ZL; } // //////////////////////////////////////////// // // 2. Allow for possible genotyping error // // sum_{all possible true genotypes} P(observed G|true G) P(true G |IBD) // // double e = 0.005; // // double f = 0.001; // // double ER_AA_AA__AA_AA = 1- 2e - 2f - 2e*f - e*e - f*f; // // double ER_AA_AB__AA_AA = e; // // double ER_AA_BB__AA_AA = f; // // double ER_AB_AA__AA_AA = e; // // double ER_AB_AB__AA_AA = e*e; // // double ER_AB_BB__AA_AA = e*f; // // double ER_BB_AA__AA_AA = f; // // double ER_BB_AB__AA_AA = e*f; // // double ER_BB_BB__AA_AA = f*f; // // double ER_AA_AA__AA_AB = e; // // double ER_AA_AB__AA_AB = 1 - 3*e - 2*e*e - 2*e*f - f; // // double ER_AA_BB__AA_AB = e; // // double ER_AB_AA__AA_AB = e*e; // // double ER_AB_AB__AA_AB = e; // // double ER_AB_BB__AA_AB = e*e; // // double ER_BB_AA__AA_AB = f*e; // // double ER_BB_AB__AA_AB = f; // // double ER_BB_BB__AA_AB = f*e; // // Sum over all possible true genotypes P(Observed G | True G) P(True G |IBD = 1) plink-1.07-src/cnvqt.cpp0000644000265600020320000001711611264127626014343 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include "plink.h" #include "helper.h" #include "options.h" #include "perm.h" extern Plink * PP; void Plink::runTestCNVwithQT(Perm & perm) { // Permutation test for mean difference in QT between people with // versus without a CNV. By default two-sided, unless // par::segment_test_force_1sided = T // Optionally allowed for this to operate on smoothed data (i.e. // average of event count over a KB window, forwards and backwards // from the given position) // Also performs genome-wide burden analyses for QTs -- is there // an association between CNV size and QT, for example, etc. These // are based on standard correlation int validN = 0; double grandMean = 0; for (int i=0; imissing ) { grandMean += sample[i]->phenotype; ++validN; } } grandMean /= (double)validN; printLOG("Total sample mean is " + dbl2str(grandMean) + ", based on " + int2str( validN ) + " individuals\n"); ////////////////////////////////////////// // Test positons = MAP positions (nl_all) // Test positions = summed segment counts ( get from original counts ) // Test position = aggregate statistics ( 7 tests) int nt = nl_all; // IGNORE THIS FOR NOW... // if ( par::seg_test_region ) // nt = coverage_aff.size(); //////////////////////////////////////////////////////////////////// // // // Set up for individual burden tests? // // // //////////////////////////////////////////////////////////////////// // if ( par::cnv_indiv_perm ) // nt = 7; // Option per-individual summary tests? 
(4 tests) // Correlation between QT and these measures: // total # segs // # people w/ 1+ seg // total kb length // mean segment length // gene-count // atleast-1-gene-count // gene-enrichment //////////////////////////////////////////////////////////////////// // // // Initialise permutation procedures // // // //////////////////////////////////////////////////////////////////// perm.setTests(nt); perm.setPermClusters(*this); perm.originalOrder(); vector_t original(nt); //////////////////////////////////////////////////////////////////// // // // Standard positional tests // // // //////////////////////////////////////////////////////////////////// if ( par::cnv_indiv_perm ) error("Not implemented --cnv-indiv-perm for QTs yet"); // Test statistic is difference in QT bewteen people with // versus without a CNV at this position vector_t count; vector_t m1; vector_t m0; original = testCNVwithQT(grandMean, validN, nt, count, m1, m0); //////////////////////////////////////////////////////////////////// // // // Report to summary file // // // //////////////////////////////////////////////////////////////////// string f = par::output_file_name + ".cnv.qt.summary"; printLOG("Writing CNV QT summary to [ "+f+" ]\n"); ofstream FOUT; FOUT.open( f.c_str() , ios::out ); FOUT.precision(4); FOUT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(8) << "NCNV" << " " << setw(12) << "M1" << " " << setw(12) << "M0" << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(8) << count[l] << " "; if ( count[l] > 0 ) FOUT << setw(12) << m1[l] << " "; else FOUT << setw(12) << "NA" << " "; FOUT << setw(12) << m0[l] << "\n"; } FOUT.close(); //////////////////////////////////////////////////////////////////// // // // Run permutations // // // //////////////////////////////////////////////////////////////////// bool finished = false; while(!finished) { perm.permuteInCluster(); vector_t pr = testCNVwithQT(grandMean, validN, nt, count, m1, m0); finished = perm.update(pr,original); } if (!par::silent) cout << "\n\n"; //////////////////////////////////////////////////////////////////// // // // Display permuted results // // // //////////////////////////////////////////////////////////////////// f += ".mperm"; printLOG("Writing CNV QT permutation results to [ "+f+" ]\n"); FOUT.open( f.c_str() , ios::out ); FOUT.precision(4); FOUT << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(12) << "BP" << " " << setw(12) << "EMP1" << " " << setw(12) << "EMP2" << "\n"; for (int l=0; lchr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(12) << perm.pvalue( l ) << " " << setw(12) << perm.max_pvalue( l ) << "\n"; } FOUT.close(); } vector_t Plink::testCNVwithQT( double grandMean, int validN, int nt , vector_t & count, vector_t & m1, vector_t & m0 ) { vector_t score(nt,0); m1.clear(); m0.clear(); count.clear(); m1.resize(nt,0); m0.resize(nt, grandMean * validN) ; count.resize(nt,0); // Calculate QT mean for people with CNVs vector::iterator s = segment.begin(); while ( s != segment.end() ) { for (int l = s->start ; l <= s->finish; l++) { ++count[ l ]; m1[ l ] += s->p1->pperson->phenotype; } ++s; } // Calculate QT mean for all other people, given grand mean for ( int l = 0 ; l < nl_all ; l++ ) { int k = validN - (int)count[l] ; m0[l] = k > 0 ? ( m0[l] - m1[l] ) / (double)k : 0 ; m1[l] = count[ l ] > 0 ? 
m1[l] / count[ l ] : 0 ; score[ l ] = m1[l] - m0[l]; if ( par::segment_test_force_1sided ) { if ( score[ l ] < 0 ) score[ l ] = 0; } } return score; } plink-1.07-src/tinput.cpp0000644000265600020320000003271611264127624014534 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include "plink.h" #include "options.h" #include "helper.h" extern ofstream LOG; void Plink::readTransposedData() { ////////////////////// // Check files exist checkFileExists(par::tpedfile); checkFileExists(par::tfamfile); /////////////////////////////////////////////// // .tfam file readFamFile(par::tfamfile); /////////////////////////////////////////////// // .tped file (map and genotype file) // Read through twice -- once as MAP (first four columns) // Then take genotypes // First rows columns of .tped file: // chromosome code // SNP identifier // cM / M // Base Position (-ve implies exclude) vector include; vector ordered; FILE * MAP; MAP = fopen64(par::tpedfile.c_str(),"r"); if ( MAP == NULL ) error("Problem opening TPED file, errno = "+int2str(errno)); int c=0; while( ! feof(MAP) ) { // read first 4 entries, then ignore rest of line string chr; string name; string cm; string bp; long int inc; int f = 0; // Read chromosome code if ( readString(MAP,chr )) f++; // Empty line? if (chr=="") continue; Locus * loc = new Locus; if ( readString(MAP,name )) f++; if ( readString(MAP,cm )) f++; if ( readString(MAP,bp )) f++; // Ignore rest of line while (fgetc(MAP) != '\n' && !feof(MAP)) {} // Store in locus information loc->name = name; loc->pos = atof(cm.c_str()); loc->bp = (long int)atoi(bp.c_str()); inc = loc->bp; // Check that cM/M specification looks correct, if // we want to perform a plink-based analysis if (par::plink && (!par::cm_map) && (loc->pos > 50) ) error("Looks like you need to specify --cm ??"); // Convert cM to M map distances if (par::cm_map) loc->pos /= 100; // Chromosome coding loc->chr = getChromosomeCode(chr); // Use the frequency slot temporarily to // store order information loc->freq = c++; // Are we including this locus? if (loc->name!="") { if (inc<0) { include.push_back(false); } else { include.push_back(true); locus.push_back(loc); } ordered.push_back(*loc); } } // Done extracting initial SNP list fclose(MAP); printLOG(int2str(locus.size()) + " (of " + int2str(include.size()) + ") markers to be included from [ " + par::tpedfile + " ]\n"); if ( locus.size() == 0 ) shutdown(); /////////////////////////////////////////////// // Build ordered table, so that genotypes can // be inserted in correct order; then swap locus // file over // Sorting a vector of pointers, so we need this special fix stable_sort(locus.begin(),locus.end(),less()); // Sorting a normal vector stable_sort(ordered.begin(),ordered.end()); c=0; for (int i=0; iafreq // p2 p3 p1 p5 p4 : genetic position // 0 1 2 3 4 : file order // 1 0 1 0 1 : include // sort by cM // p1 p2 p3 p4 p5 : genetic // 2 0 1 4 3 : file order // 1 1 0 1 0 : include // 0 1 2 : add genetic order: nonmissing... 
// // sort by file order again // p2 p3 p1 p5 p4 : genetic // 0 1 2 3 4 : file // 1 0 1 0 1 : include // 1 0 2 : position to put in locus[l] /////////////////////////////////////////////// // Do we want to look at all the data? vector include_pos(0); int nl_actual = locus.size(); if ( (!par::plink) && (!par::run_chr==0) ) { // Get range setMarkerRange(); // And set to 'exclude' all markers outside of this range // (in physical distance terms) nl_actual = 0; for (int j=0; j par::run_end ) { include_pos.push_back(-1); include[j] = false; } else { include_pos.push_back(fp); nl_actual++; } } else // if already excluded { include_pos.push_back(-1); } } // 0 1 2 3 4 5 6 7 8 9 // We now have -1 -1 -1 3 4 5 6 -1 -1 -1 // but we want -1 -1 -1 0 1 2 3 -1 -1 -1 for (int j=0; j -1 ) include_pos[j] -= par::run_start ; } } else { // If we do want to look at all the data for (int j=0; j l0(0); for(int l=0; l < locus.size(); l++) { // If not in range if ( l < par::run_start || l > par::run_end ) { // Free memory for original element delete locus[l]; } else { l0.push_back(locus[l]); } } ///////////////// // And copy back locus.clear(); locus = l0; } /////////////////////////////////////////////////// // Add necessary locus space, if in SNP-major mode if (par::SNP_major) { for (int i=0; ione.resize(nl_actual); sample[i]->two.resize(nl_actual); } } /////////////////////////////////////////////// // .tped, take 2 // Now re-read in the .tped file // Assume: the order of the FAM file specifies the // genotypes going across rows, so we should already // have that information FILE * PED; PED = fopen64(par::tpedfile.c_str(),"r"); int i=0; // SNP count while( ! feof(PED) ) { string dummy; int f=0; if (readString(PED,dummy )) f++; // End of file? if ( dummy=="" ) { continue; } // Is this line a comment, or are we skipping it? if ( dummy.substr(0,1)=="#" ) { // Ignore rest of line while (fgetc(PED) != '\n' && !feof(PED)) {} continue; } if ( !include[i] ) { // Ignore rest of line and advance to next SNP while (fgetc(PED) != '\n' && !feof(PED)) {} i++; continue; } // Skip next 3 fields (SNP, cM, bp) if (readString(PED,dummy)) f++; if (readString(PED,dummy)) f++; if (readString(PED,dummy)) f++; ///////////////////// // Read genotypes now int gn=0; int c=0; // individual count bool linedone = false; bool fatal = false; string fmsg; while ( ! linedone ) { Individual * person = sample[c]; string one=""; string two=""; while (1) { char ch = fgetc(PED); // Delimiter? if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\r' || feof(PED) ) { if (ch=='\n' || ch=='\r' || feof(PED)) linedone = true; // have we already seen something? if (one.length()>0) { gn++; break; } if (ch=='\n' || ch=='\r' || feof(PED)) break; } else { one += ch; } } // Second allele if (!linedone) while (1) { char ch = fgetc(PED); // Delimiter? if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\r' || feof(PED) ) { if (ch=='\n' || ch=='\r' || feof(PED)) linedone = true; // have we already seen something? if (two.length()>0) { gn++; break; } if (ch=='\n' || ch=='\r' || feof(PED)) break; } else { two += ch; } } if (linedone && one.length()==0 && two.length()==0 ) break; ///////////////////////////////////// // Only consider loci to be included if (include[i]) { ////////////////////////////// // Look up genomic order, // insert in slot k in locus[] int k = include_pos[i]; Locus * loc = locus[k]; ///////////////////////////////////////// // Add allele names to list, if needed // If allele is not missing... 
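// For reference, the bit-pair genotype encoding used in the
// "Add specific genotypes" block further down (and in PLINK's
// CSNP / Individual storage generally) is:
//
//   one=false, two=false  ->  homozygous allele1   (A1 A1)
//   one=false, two=true   ->  heterozygous         (A1 A2)
//   one=true,  two=true   ->  homozygous allele2   (A2 A2)
//   one=true,  two=false  ->  missing genotype
//
// so a set first bit with a clear second bit is the missing code.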
if (one!=par::missing_genotype && two!=par::missing_genotype) { // ...and not already listed if (one!=loc->allele1 && one!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="") loc->allele1=one; else if(loc->allele2=="") loc->allele2=one; else { // .. or show an error if no empty slots if (!fatal) fmsg = "Locus " + loc->name + " has >2 alleles:\n individual " + person->fid + " " + person->iid + " has genotype [ " + one +" "+two+" ]\n" + " but we've already seen [ " + loc->allele1 + " ] and [ " + loc->allele2 + " ]\n"; fatal=true; } } } // Repeat for second allele, if different if (two!=one) { // If allele is not missing... if (one!=par::missing_genotype) // ...and not already listed if (two!=loc->allele1 && two!=loc->allele2) { // ...then add to first empty slot if(loc->allele1=="") loc->allele1=two; else if(loc->allele2=="") loc->allele2=two; else { if (!fatal) fmsg = "Locus " + loc->name + " has >2 alleles:\n individual " + person->fid + " " + person->iid + " has genotype [ " + one +" "+two+" ]\n" + " but we've already seen [ " + loc->allele1 + " ] and [ " + loc->allele2 + " ]\n"; fatal=true; } } } ///////////////////////////// // Add specific genotypes if (par::SNP_major) { // 00 hom if (one==loc->allele1 && two==loc->allele1) { SNP[k]->one.push_back(false); SNP[k]->two.push_back(false); } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { SNP[k]->one.push_back(false); SNP[k]->two.push_back(true); } // 11 hom else if (one==loc->allele2 && two==loc->allele2) { SNP[k]->one.push_back(true); SNP[k]->two.push_back(true); } // 10 missing else if (one==par::missing_genotype || two==par::missing_genotype) { SNP[k]->one.push_back(true); SNP[k]->two.push_back(false); } } else { // 00 hom if (one==loc->allele1 && two==loc->allele1) { person->one[k]=false; person->two[k]=false; } // 01 het else if (one!=par::missing_genotype && two!=par::missing_genotype && one!=two) { person->one[k]=false; person->two[k]=true; } // 11 hom else if (one==loc->allele2 && two==loc->allele2) { person->one[k]=true; person->two[k]=true; } // 10 missing else if (one==par::missing_genotype || two==par::missing_genotype) { person->one[k]=true; person->two[k]=false; } } } // Advance to next individual c++; if ( c > sample.size()) { fmsg += "\nProblem with line "+int2str(i+1)+" in [ "+par::tpedfile+" ]\n"; fmsg += "Expecting 4 + 2 * " + int2str(sample.size()) + " = " + int2str(4+2*sample.size())+ " columns, but found more\n"; error(fmsg); } } // line done? Next SNP // check size of line length somewhere if ( gn != 2 * sample.size() ) { fmsg += "\nProblem with line "+int2str(i+1)+" in [ "+par::tpedfile+" ]\n"; fmsg += "Expecting 4 + 2 * " + int2str(sample.size()) + " = " + int2str(4+2*sample.size())+ " columns, but found " + int2str(f+gn) + "\n"; fatal=true; } if (fatal) error(fmsg); // Increase SNP counter i++; } // Next SNP // Close TPED file fclose(PED); } plink-1.07-src/linear.h0000644000265600020320000000311511264127626014121 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. 
Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __LINEAR_H__ #define __LINEAR_H__ #include #include "plink.h" #include "model.h" using namespace std; class LinearModel : public Model { public: LinearModel(Plink *); ~LinearModel() { }; void setDependent(); void fitLM(); void fitUnivariateLM(); void pruneY(); void standardise(); void reset(); vector_t getCoefs(); vector_t getVar(); vector_t getSE(); vector_t getPVals(); double getPValue(); void HuberWhite(); void displayResults(ofstream &, Locus *); double calculateRSS(); double calculateRSquared(); double calculateAdjustedRSquared(); double calculateMallowC(LinearModel *); double calculateFTest(LinearModel *); private: vector_t Y; vector C; vector se; double chisq; vector sig; vector w; vector > u; vector > v; double varY; double meanY; double RSS; void function(const int i, vector & p ); void setVariance(); }; #endif plink-1.07-src/idhelp.cpp0000644000265600020320000015424311264127626014460 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// // Rules: // ID cannot have any spaces, tabs, commas (,) or plus (+) signs // Attributes can have commas #include "idhelp.h" #include "options.h" #include "helper.h" #include "nlist.h" #include extern Plink * PP; map > IDHelper::parseQuery(string q) { map > lookup; // Convert a query string into a searchable form NList tlist(0); vector ids = tlist.deparseStringList( q ); // Consider each term (comma-delimited list) map qmap; map qjoint; for ( int i = 0 ; i < ids.size() ; i++) { string s = ids[i]; if ( s.find("=") == string::npos ) error("Query should be: ID=value or ATTRIB=value[,ATTRIB=value]"); string fs = s.substr(0, s.find("=")); string vs = s.substr(s.find("=")+1); // Is this a joint field? if ( fs.find("+") != string::npos ) { // We need to split both fs and vs: they must have the same // number of entries on both sides vector flist = tlist.deparseStringList( searchAndReplace( fs,"+",",") ); vector vlist = tlist.deparseStringList( searchAndReplace( vs,"+",",") ); if ( flist.size() != vlist.size() ) error("Joint query wrong: " + fs + " = " + vs); for (int i=0; i::iterator i = qmap.begin(); while ( i != qmap.end() ) { string fs = i->first; map::iterator f = fieldMap.find( fs ); if ( f == fieldMap.end() ) error("Cannot find field " + fs ); IDField * thisField = f->second; IDValue t; t.field = thisField; t.value = i->second; if ( t.field->equiv ) t.updateAlias(); ////////////////////////////////////////////////// // Is this query being framed as a joint field? 
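// Illustrative example (hypothetical values): a query string such as
//   FID+IID=FAM1+ID1,SITE=Boston
// is split so that FID=FAM1 and IID=ID1 are treated as one joint term,
// matched together as a single joint value, while SITE=Boston remains a
// separate (attribute) term; FAM1, ID1 and Boston are example values only.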
map::iterator k = qjoint.find( fs ); if ( k != qjoint.end() ) { t.jointValue = k->second; t.field->joint = true; } else { t.field->joint = false; } map >::iterator j = lookup.find( thisField ); if ( j == lookup.end() ) { set ts; ts.insert(t); lookup.insert(make_pair(thisField,ts) ); } else j->second.insert(t); ++i; } return lookup; } bool IDHelper::matchIndividual(IDGroup * group, map > & matchTemplate ) { bool match = true; // Compare // map > lookupValues; // with this group int found = 0; for (int g=0; gvalues.size(); g++) { IDField * f = group->values[g]->field; map >::iterator mf = matchTemplate.find( f ); if ( mf == matchTemplate.end() ) continue; ++found; // The observed value IDValue * myValue = group->values[g]; set::iterator j = mf->second.begin(); bool matchField = false; while ( j != mf->second.end() ) { if ( *myValue == *j ) { matchField = true; } ++j; } if ( ! matchField ) return false; } // Did we get a look at all the match fields? if ( found != matchTemplate.size() ) { return false; } // If still here, we must match return true; } void IDHelper::setJointValues( set & val ) { // Make a dummy ID group, with pointers to the originals IDGroup g; set::iterator i = val.begin(); while ( i != val.end() ) { g.values.push_back( (IDValue*)&(*i) ); ++i; } // Use main joint-value update function setJointValues( &g ); // We will add "jointValue" attribs to val, but no need // to remove any items } void IDHelper::setJointValues( IDGroup * group ) { // Find the joint fields and edit in values // Similar to the above function, except here if the joint values // are completely missing, then we need to remove the entire set // from the ID group for (int j = 0 ; j < jointField.size(); j++ ) { set & jf = jointField[j]; vector & jo = jointOrder[j]; // Does this group contain one of these // joint fields? bool hasJoint = false; map mapback; for (int j = 0 ; j < group->values.size(); j++) { if ( jf.find( group->values[j]->field ) != jf.end() ) { hasJoint = true; mapback.insert( make_pair( group->values[j]->field , j ) ); } } if ( ! hasJoint ) continue; string jointValue = ""; bool doneFirst = false; bool jointMissing = false; bool jointOneSeen = false; set::iterator k = jf.begin(); while ( k != jf.end() ) { map::iterator mi = mapback.find( *k ); if ( mi == mapback.end() ) { jointMissing = true; if ( doneFirst ) jointValue += "+."; else { jointValue += "."; doneFirst = true; } } else { jointOneSeen = true; if ( doneFirst ) jointValue += "+" + group->values[ mi->second ]->value; else { jointValue += group->values[mi->second]->value; doneFirst = true; } } ++k; } // Update values accordingly // Except, remove entirely missing values vector mask( group->values.size(), false); set::iterator k2 = jf.begin(); while ( k2 != jf.end() ) { map::iterator mi = mapback.find( *k2 ); if ( mi != mapback.end() ) { int j = mi->second; if ( jointMissing && ! jointOneSeen ) mask[j] = true; else group->values[j]->jointValue = jointValue; } ++k2; } // Remove entirely missing values if ( jointMissing && ! jointOneSeen ) { vector newValues = group->values; group->values.clear(); for ( int i = 0 ; i < mask.size() ; i++) { if ( ! mask[i] ) group->values.push_back( newValues[i] ); } } } return; } IDGroup * IDHelper::findUniqueIndividual( set & matchTemplate ) { set thisGroup; set::iterator k = matchTemplate.begin(); while ( k != matchTemplate.end() ) { // Hmm need to decide how to handle: is default joint or single lookup? // for IDValues? 
map >::iterator i = idmap.find( *k ); if ( i != idmap.end() ) { set::iterator j = i->second.begin(); while ( j != i->second.end() ) { if ( ! (*j)->resolved ) thisGroup.insert( *j ); ++j; } } ++k; } if ( thisGroup.size() > 1 ) error("Internal problem: found more than one match when expecting a unique match"); return *thisGroup.begin(); } set IDHelper::findAllIndividuals( map > & matchTemplate ) { set matched; for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; if ( matchIndividual( group , matchTemplate ) ) matched.insert( group ); } return matched; } bool IDHelper::setAlias(IDField * myField, string val, int f, map & originalEquivalence) { bool needToStore = true; map::iterator i = originalEquivalence.find( myField ); if ( i == originalEquivalence.end() ) { set::iterator k = myField->aliasList.find( val ); if ( k != myField->aliasList.end() ) error("Alias specified more than once in " + files[f].filename + " : " + myField->name + " " + val ); myField->masterList.insert( val ); originalEquivalence.insert(make_pair( myField, val )); } else { // We need to add an equivalence value to this field? map::iterator k = myField->eqid.find( val ); // We only let each alias be specified once if ( k != myField->eqid.end() ) error("Alias specified more than once in " + files[f].filename + " : " + myField->name + " " + val ); k = myField->eqid.find( i->second ); if ( k != myField->eqid.end() ) error("Alias specified more than once in " + files[f].filename + " : " + myField->name + " " + i->second); if ( myField->masterList.find( val ) != myField->masterList.end() ) error("Alias specified more than once in " + files[f].filename + " : " + myField->name + " " + val ); // Keep track of this alias myField->eqid.insert( make_pair ( val , i->second ) ); myField->aliasList.insert( val ); // And now we do not need to store this particular value // as a distinct field needToStore = false; } return needToStore; } void IDHelper::idHelp() { // Turn off the "-" initiated range-delimiting (i.e. so that // IDs can have hyphens) par::range_delimiter = " "; // Are we only performing a "simple match", performed without // a dictionary being specified? If so, jump there now, creating // the dictionary on the fly. if ( par::idhelp_match && par::idhelp_no_dict ) { idMatch(); return; } // 1. Read in dictionary, and make the fields and files // Contains files (and can be full path) and description of each field // {file name} {col names } : { rules } // e.g. // ../files/id1.txt FID IID FID1 FID2 : uniq=FID,IID uniq=FID1,IID2 // ../names/id.lst ID2 ID23 : missing=NA,---,0 // ../names/id2.lst ID3 : equiv // // note: for "equiv" files, assume all IDs on same line are equivalent, // only one ID can be specified here PP->printLOG("ID helper, with dictionary [ " + par::idhelp_dictionary + " ]\n"); checkFileExists( par::idhelp_dictionary ); ifstream DICT( par::idhelp_dictionary.c_str() , ios::in ); while ( ! DICT.eof() ) { vector tokens = tokenizeLine( DICT ); // Needs atleast three fields: // filename, and two IDs if ( tokens.size() == 0 ) continue; if ( tokens.size() < 2 || tokens[1] == ":" ) error("Expecting at least 2 fields w/ 1 ID in every dictionary row\n"); IDFile d; checkFileExists( tokens[0] ); d.filename = tokens[0]; int p = 1; while (1) { if ( tokens[p] == ":" ) { ++p; break; } // Is this a new field name? IDField f; f.name = tokens[p]; // Set to skip this field? if ( f.name == "." 
) f.null = f.attribute = true; iField = fields.find( f ); if ( iField == fields.end() ) { fields.insert( f ); iField = fields.find( f ); } // Track that this field was found in this file IDField * ip = (IDField*)&( *iField ); d.fields.push_back( ip ); ++d.uniqFieldCount; // Consider the next field in this file if ( ++p == tokens.size() ) break; } bool seenJoint = false; bool seenAttrib = false; // Now read any rules if ( p < tokens.size() ) { while (1) { string cmd = tokens[p]; // Commands // attrib=X,Y,Z // joint=X,Y // set:X=Y // header // missing=NA,-9 // alias-delimit=| bool parsed = false; if ( cmd.size() > 6 && cmd.substr(0,6) == "joint=" ) { parsed = true; bool seenJoint = true; string u = cmd.substr(6); string jName = searchAndReplace(u,","," "); // This should contain at least two ID fields // These fields must always then appear together in // any file that features at least one NList tlist(0); vector ids = tlist.deparseStringList( u ); if ( ids.size() < 2 ) error("Problem with specification of : " + cmd ); set t; for (int i = 0 ; i < ids.size() ; i++) { t.insert(ids[i]); } for (int i = 0 ; i < ids.size() ; i++) jointMap.insert( make_pair( ids[i] , t ) ); set jointf; vector jointo; for (int i = 0 ; i < ids.size() ; i++) { IDField f; f.name = ids[i]; iField = fields.find( f ); if ( iField == fields.end() ) { error("Could not find field " + ids[i] + " which is specified as joint"); } IDField * ip = (IDField*)&( *iField ); ip->joint = true; ip->jointName = jName; jointf.insert( ip ); jointo.push_back( ip ); } jointField.push_back( jointf ); jointOrder.push_back( jointo ); } if ( cmd.size() > 7 && cmd.substr(0,7) == "attrib=" ) { parsed = true; seenAttrib = true; string u = cmd.substr(7); NList tlist(0); vector ids = tlist.deparseStringList( u ); for (int i = 0 ; i < ids.size() ; i++) attribFields.insert( ids[i] ); } if ( cmd.size() > 8 && cmd.substr(0,8) == "missing=" ) { parsed = true; string u = cmd.substr(8); NList tlist(0); vector ids = tlist.deparseStringList( u ); for (int i = 0 ; i < ids.size() ; i++) d.missingValues.insert( ids[i] ); } if ( cmd.size() > 4 && cmd.substr(0,4) == "set:" ) { parsed = true; if ( seenJoint ) error("Must specify set:X=Y before joint=X,Z"); if ( seenAttrib ) error("Must specify set:X=Y before attrib=X"); string u = cmd.substr(4); bool okay = true; if ( u.find("=") == string::npos ) okay = false; else { string u1 = u.substr(0,u.find("=")); string u2 = u.substr(u.find("=")+1); if ( u1.size() < 1 ) okay = false; if ( u2.size() < 1 ) okay = false; if ( okay ) { d.injections.insert(make_pair(u1,u2)); IDField f; f.name = u1; if ( f.name == "." ) error("Cannot set field value name to ."); iField = fields.find( f ); if ( iField == fields.end() ) { fields.insert( f ); iField = fields.find( f ); } // Track that this field was found in this file IDField * ip = (IDField*)&( *iField ); d.fields.push_back( ip ); ++d.uniqFieldCount; } } if ( ! 
okay ) error("Badly formed set:X=V command"); } if( cmd == "header" || cmd == "hasHeader" ) { parsed = true; d.hasHeader = true; } // if ( cmd.size() > 10 && cmd.substr(0,10) == "delimiter=" ) // { // string u = cmd.substr(10); // if ( u.size() != 1 ) // error("Delimiters can only be a single character length"); // d.delimit = u; // } if ( cmd.size() > 16 && cmd.substr(0,16) == "alias-delimiter=" ) { parsed = true; string u = cmd.substr(16); if ( u.size() != 1 ) error("Delimiters can only be a single character length"); if ( u == d.delimit ) error("Delimiter and alias delimiter cannot be the same value"); d.alias_delimit = u; } if ( ! parsed ) error("Could not parse the following rule in the ID dictionary: " + cmd ); if ( ++p == tokens.size() ) break; } } files.push_back(d); // Next line } DICT.close(); PP->printLOG("Read " + int2str( fields.size() ) + " unique fields\n"); set::iterator i = fields.begin(); while ( i != fields.end() ) { fieldMap.insert(make_pair( i->name , (IDField*)&(*i) )); ++i; } ///////////////////////////////////////////////// // Set flags for any attribute fields if ( attribFields.size() > 0 ) { PP->printLOG(" Attribute fields: "); set::iterator i = attribFields.begin(); while ( i != attribFields.end() ) { if ( fieldMap.find( *i ) == fieldMap.end() ) error("Cannot find specified attribute " + *i + " -- please check your dictionary file"); // Set as an attribute field fieldMap.find(*i)->second->attribute = true; PP->printLOG( *i + " " ); ++i; } PP->printLOG("\n"); } ///////////////////////////////////////////////// // If set fields, check either joint or attrib for ( int f = 0 ; f < files.size() ; f++ ) { map::iterator i = files[f].injections.begin(); while ( i != files[f].injections.end() ) { IDField * f = fieldMap.find( i->first )->second; if ( ! ( f->joint || f->attribute ) ) error("Any set:field=value should be an attribute or a joint field"); ++i; } } //////////////////////////////////////////////// // If joint fields, check always all specified for (int j = 0 ; j < jointField.size(); j++ ) { set & jf = jointField[j]; for ( int f = 0 ; f < files.size() ; f++ ) { for ( int j = 0; j < files[f].fields.size(); j++) { if ( jointMap.find( files[f].fields[j]->name ) != jointMap.end() ) { map >::iterator i = jointMap.find( files[f].fields[j]->name ); set & ss = i->second; set::iterator is = ss.begin(); while ( is != ss.end() ) { bool okay = false; for ( int j = 0; j < files[f].fields.size(); j++) if ( *is == files[f].fields[j]->name ) okay =true; if ( ! okay ) error("Need to specify all joint fields in dictionary, [" + files[f].filename + " ]"); ++is; } } } } } if ( jointField.size() > 0 ) { PP->printLOG(" Joint fields:"); for (int j = 0 ; j < jointField.size(); j++ ) { set & jf = jointField[j]; set::iterator j = jf.begin(); PP->printLOG(" { "); while ( j != jf.end() ) { PP->printLOG( (*j)->name + " " ); ++j; } PP->printLOG(" }"); } PP->printLOG("\n"); } ///////////////////////////////// // 2. 
Read in and index all IDs for ( int f = 0 ; f < files.size() ; f++ ) { IDFile * file = &files[f]; PP->printLOG("Reading [ " + files[f].filename + " ] with fields : "); for ( int j = 0 ; j < files[f].fields.size(); j++ ) { if (j>0 ) PP->printLOG(", "); PP->printLOG( files[f].fields[j]->name ); } //////////////////////////// // Find any equivalence sets bool foundEquiv = false; map > equiv; map > equivMap; for ( int j = 0 ; j < files[f].fields.size(); j++) { string n = files[f].fields[j]->name; map >::iterator i = equiv.find( n ); if ( i == equiv.end() ) { vector t; t.push_back(j); equiv.insert(make_pair( n , t )); } else { i->second.push_back(j); files[f].fields[j]->equiv = true; foundEquiv = true; } } if ( foundEquiv ) { PP->printLOG(" : "); map >::iterator i = equiv.begin(); while ( i != equiv.end() ) { if ( i->second.size() > 1 ) { map tmap; PP->printLOG(" "+files[f].fields[i->second[0]]->name + "("); for (int k = 0 ; k < i->second.size(); k++) { if ( k>0 ) { if ( k==1 ) PP->printLOG("<-"); else PP->printLOG(","); tmap.insert(make_pair( i->second[k] , i->second[0] ) ); } PP->printLOG(int2str( i->second[k]+1 )); } PP->printLOG(")"); equivMap.insert( make_pair( files[f].fields[i->second[0]]->name , tmap )); } ++i; } } PP->printLOG("\n"); //////////////////////////////////// // Read the raw data ifstream ID1( files[f].filename.c_str() , ios::in ); if ( files[f].hasHeader ) { vector header = tokenizeLine( ID1 ); } while ( !ID1.eof() ) { vector tokens = tokenizeLine( ID1 ); if ( tokens.size() == 0 ) continue; // Insert SET:X=Y values here map::iterator i = files[f].injections.begin(); while ( i != files[f].injections.end() ) { tokens.push_back( i->second ); // Note -- this won't be same order -- need to check/fix this??? ++i; } if ( tokens.size() != file->uniqFieldCount ) { PP->printLOG("\n\nIn [ " + file->filename + " ] encountered a row with the wrong number of fields\n"); PP->printLOG("Found " + int2str( tokens.size() ) + " fields but expecting " + int2str( file->fields.size() ) + "\n"); int mx = tokens.size() > file->uniqFieldCount ? tokens.size() : file->uniqFieldCount ; for (int j = 0 ; j < mx ; j++) { if ( j < file->uniqFieldCount ) PP->printLOG( " " + file->fields[j]->name + ":\t" ); else PP->printLOG( " {?}:\t" ); if ( j < tokens.size() ) PP->printLOG( " " + tokens[j] + "\n" ); else PP->printLOG( " {?}\n" ); } error("Problem with [ " + file->filename + " ]\n" ); } IDGroup * g = new IDGroup; // Track which file this group of IDs came form g->file = file; // Track what the original eq-value is for this line map originalEquivalence; for ( int j = 0 ; j < files[f].fields.size(); j++ ) { // Is this a missing value? if ( files[f].missingValues.find( tokens[j] ) != files[f].missingValues.end() ) continue; IDField * myField = files[f].fields[j]; if ( myField->null ) continue; string val = tokens[j]; // Handle equivalence specifications (aliases) bool needToStore = true; if ( val.find( files[f].alias_delimit ) != string::npos ) { // This is now an equiv. 
field myField->equiv = true; NList nl(0); nl.setDelimiter( files[f].alias_delimit ); nl.setRangeChar(" "); // allow hyphens vector atoken = nl.deparseStringList( val ); string first = ""; for ( int i=0; iequiv ) needToStore = setAlias( myField, tokens[j] , f , originalEquivalence ); if ( needToStore ) { IDValue * v = new IDValue; v->field = myField; v->value = tokens[j]; g->values.push_back(v); // for pretty-printing if ( v->value.size() + 3 > myField->width ) myField->width = v->value.size() + 3 ; } } idgroup.push_back(g); } ID1.close(); } ///////////////////////////////// // Done reading in the raw data // cout << "DISPLAY LEVEL 0\n"; // for ( int g = 0 ; g < idgroup.size(); g++ ) // idgroup[g]->display(); // cout << "-------------------------------------------\n"; ////////////////////////////////////////////////////////////// // 1. Swap in preferred values for any equiv fields for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; for (int j = 0 ; j < group->values.size(); j++) if ( group->values[j]->field->equiv ) group->values[j]->updateAlias(); } ////////////////////////////////////////////////////////////// // 1b. Compile joint fields into joint values for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; setJointValues( idgroup[g] ); } //////////////////////////////// // 2. Create the idmap for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; for (int j = 0 ; j < group->values.size(); j++) { IDValue & v = *(group->values[j]); map >::iterator i = idmap.find( *(group->values[j]) ); if ( i == idmap.end() ) { set t; t.insert(group); idmap.insert(make_pair( *(group->values[j]) , t) ); } else { i->second.insert( group ); } } } /////////////////////////////////////////////// // 2.5 Simple match in any file? if ( par::idhelp_dump_from_dict ) { idDump(); return; } /////////////////////////////////////////////// // 3. Attempt to resolve into a single table bool okay = true; map problem ; while (1) { bool allDone = true; for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; // Has this group already been assigned to a person? if ( group->resolved ) continue; // Find all other groups (resolved or otherwise) that this group // matches with, but ignoring attributes set matches; for (int j = 0 ; j < group->values.size(); j++) { // Skip matching on attributes if ( group->values[j]->field->attribute ) continue; map >::iterator i = idmap.find( *(group->values[j]) ); if ( i != idmap.end() ) { set::iterator i2 = i->second.begin(); while ( i2 != i->second.end() ) { matches.insert( *i2 ); ++i2; } } } ////////////////////////////// // Merge into the key group // Make a set of the key groups IDValues map keyValues; for ( int j = 0; j < group->values.size(); j++ ) { keyValues.insert( make_pair( group->values[j]->field, group->values[j] )); } set::iterator i0 = matches.begin(); while ( i0 != matches.end() ) { if ( *i0 == group ) { ++i0; continue; } // Step through all the values in this matching group for ( int k = 0; k < (*i0)->values.size(); k++) { // Does the key have this field? 
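// Clarifying note on the merge step below: for each group that matches the
// key group on some ID value, any field the key group lacks is copied in;
// if both groups already carry a value for the same field and the values
// differ (attributes being exempt from uniqueness by definition), the pair
// is recorded in 'problem' and reported at the end. Each merged group is
// then flagged as resolved so it is only folded in once.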
IDField * f = (*i0)->values[k]->field; map::iterator i = keyValues.find( f ); if ( i == keyValues.end() ) { // Insert this key value into place IDValue * t = new IDValue; t->field = f; t->value = (*i0)->values[k]->value; t->jointValue = (*i0)->values[k]->jointValue; group->values.push_back(t); allDone = false; // Keep track of what has been added, so we don't add twice keyValues.insert( make_pair( f , (*i0)->values[k] )); } else { // Something already exists -- check it is not inconsistent // as if is an ID, i.e. attributes are allowed to not be // unique by definition if ( *((*i0)->values[k]) != *(i->second) ) { okay = false; string title = "Two unique entries [ " + (*i0)->values[k]->field->name + " = "; if ( (*i0)->values[k]->field->joint ) { if ( (*i0)->values[k]->jointValue <= i->second->jointValue ) title += (*i0)->values[k]->jointValue + " and " + i->second->jointValue + " (joint)"; else title += i->second->jointValue + " and " + (*i0)->values[k]->jointValue + " (joint)"; } else { if ( (*i0)->values[k]->value <= i->second->value ) title += (*i0)->values[k]->value + " and " + i->second->value; else title += i->second->value + " and " + (*i0)->values[k]->value; } title += " ] that match elsewhere"; if ( problem.find( title ) == problem.end() ) { string p = "\n a) "; for (int z=0; zvalues.size(); z++) p += group->values[z]->field->name + "=" + group->values[z]->value + " "; p += "\n"; p += " b) "; for (int z=0; z<(*i0)->values.size(); z++) p += (*i0)->values[z]->field->name + "=" + (*i0)->values[z]->value + " "; p += "\n\n"; problem.insert(make_pair(title,p)); } } } } // We are now done with this IDGroup (*i0)->resolved = true; ++i0; } } if ( allDone ) break; } if ( ! okay ) { PP->printLOG("\n\n*** Problems were detected in the ID lists:\n\n"); map::iterator p = problem.begin(); while ( p != problem.end() ) { PP->printLOG( p->first + p->second ); ++p; } error("You need to fix the above problems"); } ////////////////////////////////////////////////////////////// // Update the IDMAP, now only for resolved values // value -> idgroup idmap.clear(); for ( int g = 0 ; g < idgroup.size(); g++ ) { if ( idgroup[g]->resolved ) continue; IDGroup * group = idgroup[g]; for (int j = 0 ; j < group->values.size(); j++) { IDValue & v = *(group->values[j]); map >::iterator i = idmap.find( *(group->values[j]) ); if ( i == idmap.end() ) { set t; t.insert(group); idmap.insert(make_pair( *(group->values[j]) , t) ); } else { i->second.insert( group ); } } } // cout << "DISPLAY LEVEL 1\n"; // for ( int g = 0 ; g < idgroup.size(); g++ ) // idgroup[g]->display(); // cout << "-------------------------------------------\n"; ////////////////////////////////////////////////////////////// // 4. Figure out actual transformation required, and perform // Rules: ID cannot contain whitespace or commas // Can contain "-", "_", "=", ".", etc. // IDs are case-sensitive // Cannot contain "+" or "," // Values if "." 
are taken to mean not known, n/a // Functions: dump all fields on this person, or group if ( par::idhelp_list_aliases ) // Lookup a single person --id-lookup ID=27364883-1 // or match on an attribute --id-lookup SITE=Boston // Dump the entire table (DEFAULT) // or subset of cols --id-table ID,CLIN_ID,BSP_ID // Take an existing file and replace a field // --id-replace [header|noheader,field=N,skip|miss|warn|list] mydata.txt ID1 ID2 // Default behavior is to put 'missing' as the field // default = to autodetect a header field // skip (do not print these lines) // miss (print a missing ID code) // warn (do not allow if >1 missing) // list (print only these lines, ID in file not in DB) // Take an existing file and replace a field {file} {field} {old ID} {new ID} // --id-replace mydata.txt ID1 CLIN_ID // par::idhelp_command = // dump_table, dump_subtable, replace, lookup if ( par::idhelp_list_aliases ) { idListAlias(); return; } //////////////////////////////////////////////////////////////////////// // Replace mode if ( par::idhelp_replace ) { idReplace(); return; } /////////////////////////////////////////////////////////////// // Line up 1 or more files based on the first file if ( par::idhelp_match ) { idMatch(); return; } ///////////////////////////////////////////////////////////////////////// // Lookup functions set subsetFields; map > lookupValues; if ( par::idhelp_subset ) { NList tlist(0); vector ids = tlist.deparseStringList( par::idhelp_subset_string ); for (int i=0; iprintLOG("Looking up items matching: "); // These will be sorted in field name order, so we // can easily figure out OR versus AND conditions map >::iterator i = lookupValues.begin(); while ( i != lookupValues.end() ) { set::iterator j = i->second.begin(); PP->printLOG( "\n " + i->first->name + " = " ); while ( j != i->second.end() ) { PP->printLOG( j->value + " " ); ++j; } if ( i->first->attribute ) PP->printLOG(" (attribute)"); else PP->printLOG(" (id)"); ++i; } PP->printLOG("\n"); } ////////////////////////////////////////////////////////// // Main output routine PP->printLOG("Writing output to [ " + par::output_file_name + ".id ]\n"); ofstream OFILE( (par::output_file_name+".id").c_str() , ios::out ); // Header row set::iterator f = fields.begin(); while ( f != fields.end() ) { if ( f->null ) { ++f; continue; } if ( (! par::idhelp_subset ) || subsetFields.find( f->name )!=subsetFields.end() ) OFILE << setw(f->width) << f->name << par::idhelp_output_delimit << " "; ++f; } OFILE << "\n"; // Keep track of how many records we retrieve int numMatched = 0; for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; // Has this group already been assigned to a person? if ( group->resolved ) continue; // Make a set of the key groups IDValues // If doing a lookup, we might also need to include // all fields here map keyValues; for ( int j = 0; j < group->values.size(); j++ ) { if ( par::idhelp_lookup || ( ! par::idhelp_subset ) || subsetFields.find( group->values[j]->field->name )!=subsetFields.end() ) keyValues.insert( make_pair( group->values[j]->field, group->values[j] )); } /////////////////////////////////////////////////// // If we are filtering, does this person match? if ( par::idhelp_lookup ) { if ( ! matchIndividual( group, lookupValues ) ) continue; } ++numMatched; /////////////////////////////////////////// // Print row, in same order for all fields set::iterator f = fields.begin(); while ( f != fields.end() ) { if ( f->null ) { ++f; continue; } if ( (! 
par::idhelp_subset) || subsetFields.find( f->name ) != subsetFields.end() ) { map::iterator k = keyValues.find( (IDField*)&(*f) ); if ( k == keyValues.end() ) OFILE << setw( f->width ) << "." << par::idhelp_output_delimit << " "; else { OFILE << setw( f->width ) << k->second->value << par::idhelp_output_delimit << " "; } } ++f; } OFILE << "\n"; } OFILE.close(); PP->printLOG( int2str( numMatched ) + " unique records retrieved\n"); } void IDHelper::idListAlias() { PP->printLOG("Listing ID equivalents/aliases to [ " + par::output_file_name + ".id.eq ]\n"); ofstream O1( (par::output_file_name + ".id.eq").c_str() , ios::out ); O1 << setw(20) << "FIELD" << " " << setw(20) << "PREF" << " " << setw(20) << "EQUIV" << "\n"; map::iterator i1 = fieldMap.begin(); while ( i1 != fieldMap.end() ) { if ( i1->second->equiv ) { IDField * f = i1->second; map::iterator j = f->eqid.begin(); while ( j != f->eqid.end() ) { O1 << setw(20) << i1->first << " " << setw(20) << j->second << " " << setw(20) << j->first << "\n"; ++j; } } ++i1; } O1.close(); return; } void IDHelper::idReplace() { NList nl(0); vector tok = nl.deparseStringList( par::idhelp_replace_string ); if ( tok.size() != 3 ) error("Problem with --id-replace string format\n"); checkFileExists( tok[0] ); // This/these are the fields are vector rep_field; NList tlist(0); string t = searchAndReplace( tok[1] , "+" , "," ); vector targetFields = tlist.deparseStringList( t ); for ( int i=0; isecond; if ( f->joint ) fname = f->jointName; PP->printLOG("Replacing " + tok[1] + " with " + fname + " from [ " + tok[0] + " ]\n"); PP->printLOG("Writing new file to [ " + par::output_file_name + ".rep ]\n"); OptionSet * id_opt = par::opt.getOptions("IDHELP"); bool skipMode = id_opt->isSet("skip"); bool missMode = id_opt->isSet("miss"); bool warnMode = id_opt->isSet("warn"); bool listMode = id_opt->isSet("list"); int c = 0; if ( skipMode ) { PP->printLOG("Set to skip unmatched observations\n"); ++c; } if ( missMode ) { PP->printLOG("Set to set unmatched observations to NA\n"); ++c; } if ( warnMode ) { PP->printLOG("Set to give error for first unmatched observation\n"); ++c; } if ( listMode ) { PP->printLOG("Set to list only IDs in file but not in database\n"); ++c; } if ( c == 0 ) PP->printLOG("Set to keep original value for unmatched observations\n"); if ( c>1 ) error("Can only specify one of [miss|warn|skip|list] options in --id-replace"); // Do we have a header row? bool header = false; if ( id_opt->isSet("header") ) header = true; // If not, we need a number specified if ( ! header ) { string field_str; if ( id_opt->isSet("field") ) field_str = id_opt->getValue("field"); else error("Need to specify field={N,N} if no header"); // Convert field str to vector of fields NList tlist(0); string t = searchAndReplace( field_str , "+" , "," ); vector ids = tlist.deparseStringList( t ); if ( ids.size() != targetFields.size() ) error("Must specify the same number of fields/cols"); rep_field.resize( ids.size() , -1 ); for (int i=0; i( rep_field[i], ids[i] , std::dec ) ) error("Problem with field specified in --id-replace options"); // Make 0-based --rep_field[i]; } } else { // Lookup in header row ifstream IN1( tok[0].c_str() , ios::in ); vector tokens = tokenizeLine( IN1 ); rep_field.resize( targetFields.size() , -1 ); for (int f = 0 ; f < targetFields.size(); f++) for (int i = 0 ; i < tokens.size(); i++) { if ( tokens[i] == targetFields[f] ) rep_field[f] = i; } for (int i=0; i maxfield ) maxfield = rep_field[i]; while( ! 
IN1.eof() ) { vector tokens = tokenizeLine( IN1 ); if ( tokens.size() == 0 ) continue; if ( tokens.size() <= maxfield ) error("Not enough columns here"); bool changed = false; // Deal with header row? if ( ! readHeader ) { for (int i=0; i myTemplate; for (int i=0; isecond; findField.value = tokens[ rep_field[i] ]; if ( findField.field->equiv ) findField.updateAlias(); myTemplate.insert( findField ); } // Connect up any joint fields setJointValues( myTemplate ); // Find the match IDGroup * thisGroup = findUniqueIndividual( myTemplate ); if ( thisGroup == NULL ) { ++notFound; } else { // Find each old field for (int f=0; fvalues.size(); i++) if ( thisGroup->values[i]->field->name == tok[2] ) { if ( f>0 ) tokens[ rep_field[f] ] = "."; else { // Replace this item: handle if the replacing // field is itself a joint one if ( thisGroup->values[i]->field->joint ) { tokens[ rep_field[f] ] = searchAndReplace( thisGroup->values[i]->jointValue,"+"," " ); } else tokens[ rep_field[f] ] = thisGroup->values[i]->value; } changed = true; } } } // Done reading, processng this line } /////////////////////////////// // Output this line if ( ! changed ) { if ( listMode ) { for (int i=0; i 0 ) PP->printLOG("Could not find matches for " + int2str( notFound ) + " lines\n"); OUT1.close(); IN1.close(); return; } void IDHelper::idMatch() { // e.g. --id-match myfile.fam FID+IID 1+2 file1.txt CLIN_ID,1 file2.txt ID // in form {file} {id,{col}} // where joint IDs are ID1+ID2, or with fields: ID1+ID2,5+7 // If joint IDs, then all must be specified. // Cannot specify more than 1 non-joint ID though // Can be different IDs in different files if ( par::idhelp_match_string.size() < 4 ) error("Must specify more than 1 file to match"); // Assemble all data here: vector > > table; vector tableSize; // And keep track of which files which individuals are in map > foundIn; // Keep track of order from first file map fileOrder; set seenBefore; // do we see at least 1 header bool atleastOneHeader = false; vector< vector > headers; // Create a single field on the fly, for use in simple match mode IDField * nullField = new IDField; nullField->name = "tmp1"; ////////////////// // Read each file for (int s=0; s< par::idhelp_match_string.size(); s+=2) { PP->printLOG("Matching [ " + par::idhelp_match_string[s] + " ] on " + par::idhelp_match_string[s+1] + "\n"); int t = (int)s/2; if ( par::idhelp_no_dict ) { IDFile f; f.filename = "F" + int2str(t); files.push_back(f); } map > inserts; // Each element should be in form: filename ID filename2 ID+ID filename3 ID ... 
checkFileExists( par::idhelp_match_string[s] ); ifstream I1( (par::idhelp_match_string[s]).c_str() , ios::in ); string id = par::idhelp_match_string[s+1]; // A Implies header row // A,2 Implies no header row // A+B Implies header row // A+B,2+3 Implies no header row // If in "quick-match" mode, then we assume that the ID column is always the // same, whether or not it is explicitly named differently here bool jointQuery = id.find("+") != string::npos; // Fields to match on vector fieldCodes; vector fieldNames; int maxF = -1; if ( id.find(",") != string::npos ) { NList nl(0); nl.setDelimiter("+"); nl.setRangeChar(" "); vector fstr = nl.deparseStringList( id.substr( id.find(",")+1 ) ); for (int i=0; i( myf, id.substr( id.find(",")+1 ) , std::dec ) ) error("Trouble converting to a field number"); // Make zero-based --myf; if ( myf < 0 ) error("Invalid value for field # specified"); if ( myf > maxF ) maxF = myf; fieldCodes.push_back( myf ); } string tmp = id.substr( 0, id.find(",") ); NList nl2(0); nl2.setDelimiter("+"); nl2.setRangeChar(" "); fieldNames = nl2.deparseStringList( tmp ); if ( fieldNames.size() != fieldCodes.size() ) error("Problem with joint ID specification in: " + id ); if ( ! par::idhelp_no_dict ) for (int f = 0; fieldNames.size(); f++) if ( fieldMap.find( fieldNames[f] ) == fieldMap.end() ) error("Field " + fieldNames[f] + " does not exist in the database"); // Read rest of line; insert dummy headers vector h = tokenizeLine(I1); I1.close(); I1.clear(); I1.open( (par::idhelp_match_string[s]).c_str() , ios::in ); vector header; for (int k=0; k header = tokenizeLine(I1); NList nl2(0); nl2.setDelimiter("+"); nl2.setRangeChar(" "); fieldNames = nl2.deparseStringList( id ); // Find each field for (int f = 0; f maxF ) maxF = i; foundField = true; break; } } if ( ! foundField ) error("Could not find field " + fieldNames[f] + " in [ " + par::idhelp_match_string[s] + " ]"); } headers.push_back(header); } vector thisField; if ( ! par::idhelp_no_dict ) { for (int f = 0 ; f < fieldNames.size(); f++) thisField.push_back( fieldMap.find( fieldNames[f] )->second ); } else { thisField.push_back( nullField ); } // Keep track of the # of columns per file, as a check // for rectangular files tableSize.push_back(-1); //////////////////////////////////////// // Read each row of the data files: int notFound = 0; string missingList = ""; while ( ! 
I1.eof() ) { vector tok = tokenizeLine(I1); if ( tok.size() == 0 ) continue; if ( tableSize[t] == -1 ) tableSize[t] = tok.size(); else if ( tok.size() != tableSize[t] ) { string msg = "Problem with non-rectangular file [ " + par::idhelp_match_string[s] + " ]\n"; msg += "Execting " + int2str( tableSize[t] ) + " fields but found " + int2str(tok.size()) + "\n"; for (int k=0; k= tok.size() ) error("Line does not contain enough columns"); set myTemplate; for (int f=0; fequiv ) findField.updateAlias(); myTemplate.insert( findField ); } // Connect up any joint fields setJointValues( myTemplate ); // If running without a dictionary, now add this person in if ( par::idhelp_no_dict ) { IDGroup * thisGroup = findUniqueIndividual( myTemplate ); if ( thisGroup == NULL ) { IDGroup * g = new IDGroup; g->resolved = false; g->file = &files[t]; set::iterator k = myTemplate.begin(); while ( k != myTemplate.end() ) { IDValue * nv = new IDValue; *nv = *k; g->values.push_back( nv ); idgroup.push_back(g); ++k; } // Add to the ID Map set t; t.insert( idgroup[ idgroup.size()-1] ); for (int v=0; vvalues.size(); v++) idmap.insert(make_pair( *(g->values[v]) ,t )); } } // Find the match IDGroup * thisGroup = findUniqueIndividual( myTemplate ); if ( thisGroup != NULL ) { // Add this row to the collection inserts.insert(make_pair( thisGroup, tok ) ); // Keep track of order in which we came across this person for the // first time if ( seenBefore.find( thisGroup ) == seenBefore.end() ) { int sz = fileOrder.size(); fileOrder.insert(make_pair(sz,thisGroup)); seenBefore.insert( thisGroup ); } // Keep track that this person was in this file map >::iterator i1 = foundIn.find( thisGroup ); if ( i1 == foundIn.end() ) { set t; t.insert(int(s/2)); foundIn.insert(make_pair( thisGroup, t ) ); } else i1->second.insert(int(s/2)); } else { set::iterator i = myTemplate.begin(); while ( i != myTemplate.end() ) { missingList += i->field->name + " = " + i->value + "\t"; ++i; } missingList += "\n"; ++notFound; } // Get next line from this file } // Add this file to big collection table.push_back( inserts ); if ( notFound > 0 ) { PP->printLOG("Could not find " + int2str( notFound ) + " individuals from [ " + par::idhelp_match_string[s] + " ] in database\n"); PP->printLOG("Writing this list to [ " + par::output_file_name + ".noid ]\n"); ofstream O1( ( par::output_file_name+".noid").c_str() , ios::out ); O1 << "FIELD = VALUE\n"; O1 << missingList ; O1.close(); } I1.close(); // Get next file } // Now output all files tied together // Output, in order of the order in which we encountered // each unique individual // Only output complete rows? OptionSet * id_opt = par::opt.getOptions("IDHELP"); bool complete = false; if ( id_opt->isSet("complete") ) complete = true; if ( id_opt->isSet("noheader") ) atleastOneHeader = false; PP->printLOG("Writing output file to [ " + par::output_file_name + ".matched ]\n"); ofstream O1( (par::output_file_name+".matched").c_str() , ios::out ); // What about header row? // skip for now... if ( atleastOneHeader ) { for (int k=0; k::iterator j = fileOrder.begin(); while ( j != fileOrder.end() ) { map >::iterator i = foundIn.find( (IDGroup* const)j->second ); int f = i->second.size(); if ( complete && f != tfiles ) { ++j; continue; } for (int t = 0 ; t < tfiles; t++) { // Does this individual exist for this file? 
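// Clarifying note: matched rows are written in the order in which each
// individual was first encountered; with the 'complete' option only
// individuals seen in every input file are written, and when an individual
// is absent from a given file that file's columns are padded with
// placeholders rather than the row being dropped.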
map > & thisTable = table[t]; if ( thisTable.find( i->first ) == thisTable.end() ) { for (int k=0; k & thisLine = thisTable.find( i->first )->second; for (int k = 0 ; k < thisLine.size() ; k++ ) O1 << setw(12) << thisLine[k] << " "; } } O1 << "\n"; // Next individual ++j; } O1.close(); return; } // end of id-match routine void IDHelper::idDump() { // Note: use of parseQuery modifies the data structure, setting // fields to non-joint potentially, and so subsequent attempts to // line things up will not work; therefore, for now we stop here. PP->printLOG("\nReporting rows that match [ " + par::idhelp_dump_from_dict_cmd + " ] \n\n"); map > myTemplate = parseQuery( par::idhelp_dump_from_dict_cmd ); for ( int g = 0 ; g < idgroup.size(); g++ ) { IDGroup * group = idgroup[g]; if ( matchIndividual( group , myTemplate ) ) { for (int j = 0 ; j < group->values.size(); j++ ) { PP->printLOG( group->file->filename + " : " ); PP->printLOG( group->values[j]->field->name + " = " ); PP->printLOG( group->values[j]->value + "\n" ); } PP->printLOG("\n"); } } PP->printLOG("---------------------------------\n"); // Important: joint status of fields will have changed; in any // case, let's stop here return; } plink-1.07-src/proxy.cpp0000644000265600020320000011454311264127625014372 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2008 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include "options.h" #include "helper.h" #include "plink.h" #include "phase.h" #include "model.h" #include "linear.h" #include "stats.h" ////////////////////////////////////////// // Helper classes to store and sort proxies class ProxyResult { public: string name; double f; double r2; double odds; double chisq; double pvalue; ProxyResult(string n, double frq, double r, double o, double c, double p) : name(n), f(frq), r2(r), odds(o), chisq(c), pvalue(p) { } bool operator< (const ProxyResult & b) const { return ( pvalue == b.pvalue ? 
name < b.name : pvalue < b.pvalue ); } }; class LDPair { public: int s1; int s2; double ld; bool operator< (const LDPair & b) const { return ld > b.ld; } }; /////////////////////////////////////////////// // Helper function to find n of m combinations void combinations_recursive(const vector &elems, unsigned long req_len, vector &pos, unsigned long depth, unsigned long margin, vector > & collection) { if (depth >= req_len) { vector t; for (unsigned long ii = 0; ii < pos.size(); ++ii) t.push_back( elems[pos[ii]] ); collection.push_back(t); return; } if ((elems.size() - margin) < (req_len - depth)) return; for (unsigned long ii = margin; ii < elems.size(); ++ii) { pos[depth] = ii; combinations_recursive(elems, req_len, pos, depth + 1, ii + 1, collection); } return; } /////////////////////////////////////////////// // For locus l, main proxy association function void Plink::performProxyTests(int l) { // Consider a particular SNP, l, and // a) form haplotypes surrounding SNPs // b) calculate association with phenotype for these SNPs // c) calculate r^2 (haplotypic) with allele and haplotypes // If this is a rarer SNP, use a slightly broader search strategy if ( locus[l]->freq < par::proxy_planB_threshold ) { par::proxy_kb = par::proxy_kb_planB; par::proxy_window = par::proxy_window_planB; par::proxy_snp_filter = par::proxy_snp_filter_planB; par::proxy_r2_filter_A = par::proxy_r2_filter_A_planB; par::proxy_r2_filter_B = par::proxy_r2_filter_B_planB; par::proxy_r2_filter_C = par::proxy_r2_filter_C_planB; } else { par::proxy_kb = par::proxy_kb_planA; par::proxy_window = par::proxy_window_planA; par::proxy_snp_filter = par::proxy_snp_filter_planA; par::proxy_r2_filter_A = par::proxy_r2_filter_A_planA; par::proxy_r2_filter_B = par::proxy_r2_filter_B_planA; par::proxy_r2_filter_C = par::proxy_r2_filter_C_planA; } // Form haplotypes based on the surrounding SNPs // Form phenotype based on the patterns of missingness for test SNP // Is there any association? bool old_silent = par::silent; //////////////////////////////////////////////////////////////// // If we are in 'impute' mode: this is to evaluate imputation, in a // 'leave-one-out' manner; here we assume the reference panel is // coded as 'missing phenotype' and the rest of the sample is coded // as non-missing phenotype; so we must first blank out (but later // replace) any genotype data for these individuals vector tmp1, tmp2; if ( par::proxy_impute || par::proxy_leave_out ) { // Pretend that we do not have these genotypes, except for the // reference panel (i.e. individuals with a missing phenotype) for ( int i = 0 ; i < n ; i++ ) if ( ! sample[i]->missing ) { tmp1.push_back( SNP[l]->one[i] ); tmp2.push_back( SNP[l]->two[i] ); SNP[l]->one[i] = true; SNP[l]->two[i] = false; } } /////////////////////// // Form test haplotypes CSNP * s = SNP[l]; vector proxyHaplotypePlusSNP; // Add reference SNP proxyHaplotypePlusSNP.push_back(l); // Either a fixed maximum number of SNPs left and right (allowing // for different filters, and chromosome ends) or read from a file // (these SNPs must be on same chromosome) if ( par::proxy_list ) { checkFileExists( par::proxy_list_file ); printLOG("Reading proxy list from [ " + par::proxy_list_file + " ]\n"); ifstream PL( par::proxy_list_file.c_str(), ios::in); map mlocus; for (int j=0;jname,j)); while ( ! 
PL.eof() ) { string psnp; PL >> psnp; if ( psnp == "" ) continue; map::iterator m = mlocus.find( psnp ); if ( m == mlocus.end() ) continue; // Add if okay w/ MAF and genotyping thresholds int pn = m->second; if ( pn != l && locus[pn]->chr == locus[l]->chr && locus[pn]->freq >= par::proxy_maf && abs(double((locus[l]->bp - locus[pn]->bp)/1000.0)) <= par::proxy_kb && locus[pn]->pos <= par::proxy_geno ) { proxyHaplotypePlusSNP.push_back( pn ); } } PL.close(); } else // ... use window approach { int i = l-1; int added = 0; while ( added < par::proxy_window ) { if ( i >= 0 && locus[i]->chr == locus[l]->chr ) { // Add MAF and genotyping thresholds here if ( locus[i]->freq >= par::proxy_maf && abs(double((locus[l]->bp - locus[i]->bp)/1000.0)) <= par::proxy_kb && locus[i]->pos <= par::proxy_geno ) { proxyHaplotypePlusSNP.push_back(i); added++; } // Shift left --i; } else { // Cannot add any more added = par::proxy_window; } } // Now move right added = 0; i = l+1; while ( added < par::proxy_window ) { if ( i < nl_all && locus[i]->chr == locus[l]->chr ) { // Add MAF, kb and genotyping thresholds here if ( locus[i]->freq >= par::proxy_maf && abs(double((locus[l]->bp - locus[i]->bp)/1000.0)) <= par::proxy_kb && locus[i]->pos <= par::proxy_geno ) { proxyHaplotypePlusSNP.push_back(i); added++; } //Shift right ++i; } else { // Cannot add any more added = par::proxy_window; } } } /////////////////////////////////////////////////////////////////////////// // // Optionally, filter list based on LD with reference and with eachother // /////////////////////////////////////////////////////////////////////////// if ( par::proxy_r2_filter ) { // Only use Reference Panel for these r-sq calculations at this stage; // add flag to modify this if ( par::proxy_reference_only ) haplo->reference_only = true; set added; // Examine SNP number list: proxyHaplotypePlusSNP // First entry is always the reference SNP set proxies; // Skip first entry for (int i=1; irsq(p.s1,p.s2); //p.ld = haplo->dprime(p.s1,p.s2); proxies.insert(p); } set::iterator pi = proxies.begin(); while ( pi != proxies.end() ) { // Enough proxies already? if ( added.size() >= par::proxy_snp_filter ) break; if ( ( added.size() < 2 && pi->ld >= par::proxy_r2_filter_A ) // low filter || pi->ld >= par::proxy_r2_filter_B ) // higher filter, once 2 proxies found { // But does this already correlate too strongly with an // existing proxy? 
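// Clarifying note on the r-squared filtering below: candidates are taken in
// decreasing LD with the reference SNP until par::proxy_snp_filter proxies
// have been selected; a candidate qualifies under the lower threshold
// (filter A) while fewer than two proxies have been chosen, and only under
// the stricter threshold (filter B) thereafter. In either case it is then
// checked against every proxy already selected and dropped if any pairwise
// r-squared exceeds filter C.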
set::iterator si = added.begin(); bool okay = true; while ( si != added.end() ) { int2 snps; if ( snps.p1 > snps.p2 ) { snps.p1 = *si; snps.p2 = pi->s2; } else { snps.p1 = pi->s2; snps.p2 = *si; } double ld; map::iterator f = proxyLD.find(snps); if ( f != proxyLD.end() ) { ld = f->second; } else { ld = haplo->rsq(snps.p1,snps.p2); //ld = haplo->dprime(snps.p1,snps.p2); proxyLD.insert(make_pair(snps,ld)); } if ( ld > par::proxy_r2_filter_C ) { okay = false; break; } ++si; } if ( okay ) { added.insert( pi->s2 ); } } // Consider next proxy SNP ++pi; } ////////////////////////////////// // Update the fitlered proxy list proxyHaplotypePlusSNP.clear(); proxyHaplotypePlusSNP.push_back(l); set::iterator si = added.begin(); while ( si != added.end() ) { proxyHaplotypePlusSNP.push_back( *si ); ++si; } // And remember to reset haplo->reference_only = false; } ////////////////////////////////////////////////////////////////// // // Sort, and select reference SNP // ////////////////////////////////////////////////////////////////// sort( proxyHaplotypePlusSNP.begin(), proxyHaplotypePlusSNP.end()); int cnt = proxyHaplotypePlusSNP.size(); int ref; for (int i=0; iHTEST << setw( 4 ) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(10) << 1 - locus[l]->pos << " " << setw(4) << proxyHaplotypePlusSNP.size() - 1 << " " << setw(8) << "NA" << " "; // Display C/C or T/U for case/control and TDT, else F and BETA if ( par::qt || par::proxy_glm ) haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; haplo->HTEST << setw(10) << "NA" << " "; if ( par::proxy_list_proxies ) haplo->HTEST << "(NONE)"; haplo->HTEST << "\n"; haplo->HTEST.flush(); // Finally, replace any genotypes made temporarily missing if ( par::proxy_leave_out ) { int cnt = 0; for ( int i = 0 ; i < n ; i++ ) if ( ! sample[i]->missing ) { SNP[l]->one[i] = tmp1[cnt]; SNP[l]->two[i] = tmp2[cnt++]; } tmp1.clear(); tmp2.clear(); } return; } } } /////////////////////////////////////////////////////////////////////////////// // // Phase haplotypes // /////////////////////////////////////////////////////////////////////////////// haplo->reset(); haplo->new_pred_locus.resize(1); haplo->new_map.resize(1); haplo->new_pred_locus[0] = proxyHaplotypePlusSNP; haplo->new_map[0] = locus[l]; par::silent = true; haplo->phaseAllHaplotypes(true,*pperm); haplo->hname = locus[l]->name; par::silent = old_silent; /////////////////////////////////////////////////////////////////////////////// // // Per-genotype error screen // /////////////////////////////////////////////////////////////////////////////// if ( par::proxy_error ) { // Identify test SNP with respect to 0..cnt phased region // (i.e. 'ref' and not 'l') haplo->queryGenotype( ref ); return; } ////////////////////////////////////////////////////////////////////////////// // // If we are in 'impute' mode: do not perform association tests, but // just compare imputed genotypes to actual (left-out) genotypes, // and report on this. // ////////////////////////////////////////////////////////////////////////////// if ( par::proxy_leave_out ) { int cnt = 0; for ( int i = 0 ; i < n ; i++ ) if ( ! 
sample[i]->missing ) { SNP[l]->one[i] = tmp1[cnt]; SNP[l]->two[i] = tmp2[cnt++]; } tmp1.clear(); tmp2.clear(); } else if ( par::proxy_impute ) { //////////////////////////////////////// // Replace genotypes, and record dosage // For imputation quality score boolvec_t m1(cnt,false); m1[ref] = true; boolvec_t a1(cnt,false); map tests = haplo->makeTestSet(m1,a1); set hs; map::iterator i1 = tests.begin(); while ( i1 != tests.end() ) { if ( i1->second == 0 ) hs.insert( i1->first); ++i1; } haplo->calculateEmpiricalVariance(hs); int con[4][4]; for (int j=0;j<4;j++) for (int k=0; k<4; k++) con[j][k] = 0; if ( par::proxy_record_dosage ) OUTFILE << locus[l]->name << "\t" << locus[l]->allele1 << "\t" << locus[l]->allele2 << "\t" << haplo->ratio << "\t"; int cnt = 0; for ( int i = 0 ; i < n ; i++ ) if ( ! sample[i]->missing ) { // Actual genotypes bool a1 = tmp1[cnt]; bool a2 = tmp2[cnt++]; int og; if ( a1 ) { if ( a2 ) og = 2; else og = 3; } else { if ( a2 ) og = 1; else og = 0; } // Imputed genotypes vector_t g = haplo->imputeGenotype(i,ref); // Call imputed bool i1, i2; if ( g[0] > par::proxy_impute_threshold ) { i1 = i2 = false; con[og][0]++; } else if ( g[1] > par::proxy_impute_threshold ) { i1 = false; i2 = true; con[og][1]++; } else if ( g[2] > par::proxy_impute_threshold ) { i1 = i2 = true; con[og][2]++; } else { i1 = true; i2 = false; con[og][3]++; } if ( par::proxy_full_report ) haplo->HTEST << locus[l]->name << "\t" << sample[i]->fid << " " << sample[i]->iid << "\t" << a1<one[i] = i1; SNP[l]->two[i] = i2; } else { SNP[l]->one[i] = a1; SNP[l]->two[i] = a2; } // } } if ( par::proxy_record_dosage ) OUTFILE << "\n"; // Report matrix of concordance int total = 0; for (int j=0;j<4;j++) for (int k=0;k<4;k++) total += con[j][k]; int concordant = con[0][0] + con[1][1] + con[2][2]; int both_geno = 0; for (int j=0;j<3;j++) for (int k=0;k<3;k++) both_geno += con[j][k]; int observed_geno = total; for (int j=0; j<4; j++) observed_geno -= con[3][j]; int imputed_geno = total; for (int j=0; j<4; j++) imputed_geno -= con[j][3]; double rate = both_geno == 0 ? 
-1 : (double)concordant/(double)both_geno ; double rate_obs = (double)observed_geno/(double)total; double rate_imp = (double)imputed_geno/(double)total; double rate_ovr = (double)both_geno/(double)total; if ( par::proxy_full_report ) { haplo->HTEST << "\nImputation matrix " << "(rows observed, columns imputed)\n\n"; for (int j=0;j<4;j++) { if ( j == 0 ) haplo->HTEST << locus[l]->allele1 << "/" << locus[l]->allele1 << "\t"; else if ( j == 1 ) haplo->HTEST << locus[l]->allele1 << "/" << locus[l]->allele2 << "\t"; else if ( j == 2 ) haplo->HTEST << locus[l]->allele2 << "/" << locus[l]->allele2 << "\t"; else haplo->HTEST << par::missing_genotype << "/" << par::missing_genotype << "\t"; for (int k=0; k<4; k++) haplo->HTEST << con[j][k] << "\t"; haplo->HTEST << "\n"; } haplo->HTEST << setw(4) << "CHR" << " " << setw(par::pp_maxsnp) << "SNP" << " " << setw(8) << "INFO" << " " << setw(4) << "NPRX" << " " << setw(8) << "TOTAL_N" << " " << setw(8) << "OBSERVD" << " " << setw(8) << "IMPUTED" << " " << setw(8) << "OVERLAP" << " " << setw(8) << "CONCORD" << " "; if ( par::proxy_impute_genotypic_concordance ) haplo->HTEST << setw(8) << "F_AA" << " " << setw(8) << "I_AA" << " " << setw(8) << "C_AA" << " " << setw(8) << "F_AB" << " " << setw(8) << "I_AB" << " " << setw(8) << "C_AB" << " " << setw(8) << "F_BB" << " " << setw(8) << "I_BB" << " " << setw(8) << "C_BB" << " "; if ( par::proxy_list_proxies ) haplo->HTEST << "SNPS"; haplo->HTEST << "\n"; } haplo->HTEST << setw(4) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(4) << proxyHaplotypePlusSNP.size() - 1 << " " << setw(8) << haplo->ratio << " " << setw(8) << total << " " << setw(8) << rate_obs << " " << setw(8) << rate_imp << " " << setw(8) << rate_ovr << " "; if ( rate >= 0 ) haplo->HTEST << setw(8) << rate << " "; else haplo->HTEST << setw(8) << "NA" << " "; if ( par::proxy_impute_genotypic_concordance ) { int impAA = con[0][0]+con[0][1]+con[0][2]; int impAB = con[1][0]+con[1][1]+con[1][2]; int impBB = con[2][0]+con[2][1]+con[2][2]; double fAA = (double)(con[0][0]+con[1][0]+con[2][0]) / (double)observed_geno; double fAB = (double)(con[0][1]+con[1][1]+con[2][1]) / (double)observed_geno; double fBB = (double)(con[0][2]+con[1][2]+con[2][2]) / (double)observed_geno; if ( observed_geno == 0 ) fAA = fAB = fBB = 0; int obsAA = impAA+con[0][3]; int obsAB = impAB+con[1][3]; int obsBB = impBB+con[2][3]; haplo->HTEST << setw(8) << fAA << " "; if ( obsAA == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << (double)(impAA)/(double)(impAA+con[0][3]) << " "; if ( impAA==0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << (double)(con[0][0])/(double)impAA << " "; haplo->HTEST << setw(8) << fAB << " "; if ( obsAB == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << (double)(impAB)/(double)(impAB+con[1][3]) << " "; if ( impAB==0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << (double)(con[1][1])/(double)impAB << " "; haplo->HTEST << setw(8) << fBB << " "; if ( obsBB == 0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << (double)(impBB)/(double)(impBB+con[2][3]) << " "; if ( impBB==0 ) haplo->HTEST << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << (double)(con[2][2])/(double)impBB << " "; } if ( par::proxy_list_proxies ) { bool printed = false; for (int l0=0; l0< proxyHaplotypePlusSNP.size(); l0++) { if ( proxyHaplotypePlusSNP[ l0 ] != l ) { if ( printed ) haplo->HTEST << "|"; haplo->HTEST << locus[ 
proxyHaplotypePlusSNP[ l0 ] ]->name; printed = true; } } } haplo->HTEST << endl; haplo->HTEST.flush(); tmp1.clear(); tmp2.clear(); return; } //////////////////////////////////////////////////////////////// // // // Consider all subsets of subhaplotypes, if in verbose mode // // // //////////////////////////////////////////////////////////////// if ( ! ( par::proxy_all || par::proxy_full_report ) ) printLOG("Estimated haplotype frequencies: now considering combinations...\n"); //////////////////////////////////////////////////// // Display haplotype frequencies and individual r^2 if ( ( ! par::proxy_all ) || par::proxy_full_report ) haplo->HTEST << "\n" << " *** Proxy haplotype association report for " << haplo->hname << " *** \n\n"; //////////////////////////////////////// // Report SNPs and single-SNP r-squared if ( ( ! par::proxy_all) || par::proxy_full_report ) { haplo->HTEST << setw(par::pp_maxsnp) << "SNP" << " " << setw(8) << "MAF" << " " << setw(8) << "GENO" << " " << setw(8) << "KB" << " " << setw(8) << "RSQ" << " "; if ( par::qt ) haplo->HTEST << setw(8) << "BETA" << " " << setw(8) << "STAT" << " " << setw(8) << "P" << "\n"; else haplo->HTEST << setw(8) << "OR" << " " << setw(8) << "CHISQ" << " " << setw(8) << "P" << "\n"; for ( int s = 0 ; s < cnt ; s++) { haplo->HTEST << setw(par::pp_maxsnp) << locus[ haplo->new_pred_locus[0][s] ]->name << " " << setw(8) << locus[ haplo->new_pred_locus[0][s] ]->freq << " " << setw(8) << 1- locus[ haplo->new_pred_locus[0][s] ]->pos << " " << setw(8) << (double)(locus[ haplo->new_pred_locus[0][s]]->bp - locus[ haplo->new_pred_locus[0][ref]]->bp)/1000<<" "; // R-squared if ( s == ref ) haplo->HTEST << setw(8) << "*" << " "; else haplo->HTEST << setw(8) << haplo->rsq_internal(s,ref) << " "; // Single SNP association boolvec_t snpmask(cnt,false); boolvec_t dummy_allele(cnt,true); snpmask[s] = true; haplo->testSet = haplo->makeTestSet(snpmask,dummy_allele); if (par::proxy_CC) { // GLM or standard test? if ( par::proxy_glm ) { glmAssoc(false,*pperm); } else { if ( par::qt ) haplo->haplotypicQTL(haplo->testSet,2,false); else haplo->haplotypicCC(haplo->testSet,2,false); } } else if (par::proxy_TDT) { haplo->subhaplotypes = true; haplo->downcoding = haplo->testSet; haplo->trans.clear(); haplo->untrans.clear(); haplo->trans.resize(2,0); haplo->untrans.resize(2,0); haplo->nt = 2; // First rescore transmissions for (int i=0; ifounder) && haplo->include[i] ) { haplo->transmissionCount(i,haplo->phasemap[i]); } } // Then recount T:U based on revised transmission counts haplo->haplotypicTDT(haplo->testSet,2,false); haplo->subhaplotypes = false; haplo->downcoding.clear(); } // Recover main results from Model, if GLM used if ( par::proxy_glm ) { vector_t coef = model->getCoefs(); haplo->odds = par::bt ? exp(coef[1]) : coef[1]; haplo->result = model->isValid() ? model->getStatistic() : 0; haplo->pvalue = par::bt ? chiprobP(haplo->result,1) : ((LinearModel*)model)->getPValue(); delete model; } haplo->HTEST << setw(8) << haplo->odds << " " << setw(8) << haplo->result << " " << setw(8) << haplo->pvalue << "\n"; } haplo->HTEST << "\n\n"; } /////////////////////////////////////////////////// // Report haplotypes and frequencies, and also all // haplotype-specific tests and omnibus test result if ( ( ! 
par::proxy_all ) || par::proxy_full_report ) { string str = ""; for ( int i = 0; i < cnt ; i++) if ( i == ref ) str += "*"; else str += "."; haplo->HTEST << setw(14) << str << " " << setw(10) << "FREQ" << " "; if ( par::qt ) haplo->HTEST << setw(8) << "BETA" << " " << setw(8) << "STAT" << " " << setw(8) << "P" << "\n"; else haplo->HTEST << setw(10) << "OR" << " " << setw(10) << "CHISQ" << " " << setw(10) << "P" << "\n"; for (int h=0; h< haplo->nh; h++) { if (haplo->f[h] >= par::proxy_mhf ) { haplo->HTEST << setw(14) << haplo->haplotypeName(h) << " " << setw(10) << haplo->f[h] << " "; haplo->testSet.clear(); for (int h2=0; h2 < haplo->nh; h2++) { if ( haplo->f[h2] >= par::proxy_mhf) { if (h==h2) { haplo->testSet.insert(make_pair(h2,0)); } else { haplo->testSet.insert(make_pair(h2,1)); } } } if (par::proxy_CC) { // GLM or standard test? if ( par::proxy_glm ) { glmAssoc(false,*pperm); } else { if ( par::qt) haplo->haplotypicQTL(haplo->testSet,2,false); else haplo->haplotypicCC(haplo->testSet,2,false); } } else if (par::proxy_TDT) { haplo->subhaplotypes = true; haplo->downcoding = haplo->testSet; haplo->trans.clear(); haplo->untrans.clear(); haplo->trans.resize(2,0); haplo->untrans.resize(2,0); haplo->nt = 2; // First rescore transmissions for (int i=0; ifounder) && haplo->include[i] ) { haplo->transmissionCount(i,haplo->phasemap[i]); } } // Then recount T:U based on revised transmission counts haplo->haplotypicTDT(haplo->testSet,2,false); haplo->subhaplotypes = false; haplo->downcoding.clear(); } // Recover main results from Model, if GLM used if ( par::proxy_glm ) { vector_t coef = model->getCoefs(); haplo->odds = par::bt ? exp(coef[1]) : coef[1]; haplo->result = model->isValid() ? model->getStatistic() : 0; haplo->pvalue = par::bt ? chiprobP(haplo->result,1) : ((LinearModel*)model)->getPValue(); delete model; } haplo->HTEST << setw(10) << haplo->odds << " " << setw(10) << haplo->result << " " << setw(10) << haplo->pvalue << "\n"; haplo->HTEST.flush(); } } haplo->HTEST << "\nHaplotype frequency estimation based on " << haplo->validN; if ( haplo->X ) { int found_chr = 0; for (int i=0; ifounder ) { if ( sample[i]->sex ) found_chr++; else found_chr+=2; } haplo->HTEST << " of " << found_chr << " founder chromosomes\n"; } else if ( haplo->haploid ) haplo->HTEST << " of " << haplo->cnt_f << " founder chromosomes\n"; else haplo->HTEST << " of " << haplo->cnt_f * 2 << " founder chromosomes\n"; /////////////////////////////////// // Omnibus test: C/C only if ( par::proxy_CC && ! par::qt ) { map tests; int nch=0; for (int h=0; h < haplo->nh; h++) if ( haplo->f[h] >= par::proxy_mhf) tests.insert(make_pair(h,nch++)); if (nch>2) { haplo->haplotypicCC(tests,nch,false); haplo->HTEST << "Omnibus haplotype test statistic: " << haplo->result << ", df = " << nch-1 << ", " << "p = " << chiprobP( haplo->result , nch-1 ) << "\n\n"; } } } /////////////////////////////////////////// // Create masks // Reference SNP (fix here) boolvec_t m1(cnt,false); m1[ref] = true; boolvec_t a1(cnt,true); // Proxy haplotype (populated below) boolvec_t m2(cnt,false); boolvec_t a2(cnt); /////////////////////////////////////////// // Report actual SNP // /////////////////////////////////////////// haplo->testSet = haplo->makeTestSet(m1,a1); if (par::proxy_CC) { // GLM or standard test? 
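// ------------------------------------------------------------------
// [Editor annotation, not in the original source] The dispatch that
// follows is the pattern repeated throughout this routine: when
// par::proxy_glm is set, the test is fitted as a generalized linear
// model via glmAssoc(), and the effect size, statistic and p-value are
// then recovered from the Model object ("model") before it is deleted;
// otherwise the haplotype-based tests are called directly
// (haplotypicQTL for quantitative traits, haplotypicCC for
// case/control, haplotypicTDT for family data).
// ------------------------------------------------------------------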
if ( par::proxy_glm ) { glmAssoc(false,*pperm); } else { if ( par::qt ) haplo->haplotypicQTL(haplo->testSet,2,false); else haplo->haplotypicCC(haplo->testSet,2,false); } } else if (par::proxy_TDT) { haplo->subhaplotypes = true; haplo->downcoding = haplo->testSet; haplo->trans.clear(); haplo->untrans.clear(); haplo->trans.resize(2,0); haplo->untrans.resize(2,0); haplo->nt = 2; // First rescore transmissions for (int i=0; ifounder) && haplo->include[i] ) { haplo->transmissionCount(i,haplo->phasemap[i]); } } // Then recount T:U based on revised transmission counts haplo->haplotypicTDT(haplo->testSet,2,false); haplo->subhaplotypes = false; haplo->downcoding.clear(); // Also, calculate the information score set t1 = haplo->makeSetFromMap(haplo->testSet); haplo->calculateEmpiricalVariance(t1); } // Recover main results from Model, if GLM used if ( par::proxy_glm ) { vector_t coef = model->getCoefs(); haplo->odds = par::bt ? exp(coef[1]) : coef[1]; haplo->result = model->isValid() ? model->getStatistic() : 0; haplo->pvalue = par::bt ? chiprobP(haplo->result,1) : ((LinearModel*)model)->getPValue(); delete model; // Also, calculate the information score set t1 = haplo->makeSetFromMap(haplo->testSet); haplo->calculateEmpiricalVariance(t1); } ////////////////////////////////////////////////////////////////////// // // Just report the single SNP result (based on haplotype test)? // ////////////////////////////////////////////////////////////////////// if ( par::proxy_all && ( ! par::proxy_full_report ) ) { haplo->HTEST << setw( 4 ) << locus[l]->chr << " " << setw(par::pp_maxsnp) << locus[l]->name << " " << setw(12) << locus[l]->bp << " " << setw(4) << locus[l]->allele1 << " " << setw(4) << locus[l]->allele2 << " " << setw(10) << 1 - locus[l]->pos << " " << setw(4) << proxyHaplotypePlusSNP.size() - 1 << " " << setw(8) << haplo->ratio << " "; // Display C/C or T/U for case/control and TDT, else F and BETA if ( haplo->pvalue < -1 ) { // Not a valid test, e.g. monomorphic, and so p value is returned as -9 if ( par::qt || par::proxy_glm ) haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; else haplo->HTEST << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " " << setw(8) << "NA" << " "; } else { if ( par::qt || par::proxy_glm ) { // Get population haplotype frequency for imputed SNP double f = haplo->freq(m1,a1); haplo->HTEST << setw(8) << f << " " << setw(8) << haplo->odds << " "; } else haplo->HTEST << setw(8) << haplo->case_freq << " " << setw(8) << haplo->control_freq << " " << setw(8) << haplo->odds << " "; haplo->HTEST << setw(10) << haplo->pvalue << " "; } if ( par::proxy_list_proxies ) { bool printed = false; for (int l0=0; l0< proxyHaplotypePlusSNP.size(); l0++) { if ( proxyHaplotypePlusSNP[ l0 ] != l ) { if ( printed ) haplo->HTEST << "|"; haplo->HTEST << locus[ proxyHaplotypePlusSNP[ l0 ] ]->name; printed = true; } } } haplo->HTEST << endl; haplo->HTEST.flush(); return; } //////////////////////////////////////////////////////////////////////// // // Rest of this function is for the extended report mode // //////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// // Consider from cnt-1 to 1 SNP haplotypes int search_cnt = par::proxy_include_reference ? cnt : cnt - 1; int maxsnp = search_cnt < par::proxy_maxhap ? 
search_cnt : par::proxy_maxhap; int num_subhaps_total = 0; int num_subhaps_valid = 0; set presults; for (int i=1; i<=maxsnp; i++) { // For an i-SNP of cnt-SNP, consider all the permutations vector pos1(i); vector d(search_cnt); int i2=0; for (int z=0; z > collection; combinations_recursive(d,i,pos1,0,0,collection); vector mapback; for (int s=0;s posit = collection[c1]; // Now consider all search_cnt SNP haplotypes // i.e. excluding reference SNP int hapcnt = (int)pow((double)2,i); int h = 0; while ( h < hapcnt ) { // Skip redundant second allele of // SNPs if ( collection[c1].size() == 1 && h == 1 ) { h++; continue; } vector tmp; unsigned int p=1; for (int s=0;sfreq(m2,a2); //////////////////////////////////////////////////////////// // Calculate r^2 between haplotype and reference SNP allele double r2 = haplo->rsq_internal(m1,a1,m2,a2); //////////////////////////////////////////// // Is this haplotype not worth considering? ++num_subhaps_total; if ( r2 < par::proxy_r2 || f < par::proxy_mhf ) { h++; continue; } ++num_subhaps_valid; //////////////////////////////////////////////////////////// // Calculte association between proxy haplotype and disease // If only two haplotypes, report only 1 // (note may only be two *common* haplotypes, but // in that case, we should report both haplo->testSet = haplo->makeTestSet(m2,a2); if (par::proxy_CC) { // GLM or standard test? if ( par::proxy_glm ) { glmAssoc(false,*pperm); } else { if ( par::qt ) haplo->haplotypicQTL(haplo->testSet,2,false); else haplo->haplotypicCC(haplo->testSet,2,false); } } else if (par::proxy_TDT) { haplo->subhaplotypes = true; haplo->downcoding = haplo->testSet; haplo->trans.clear(); haplo->untrans.clear(); haplo->trans.resize(2,0); haplo->untrans.resize(2,0); haplo->nt = 2; // First rescore transmissions for (int i=0; ifounder) && haplo->include[i] ) { haplo->transmissionCount(i,haplo->phasemap[i]); } } // Then recount T:U based on revised transmission counts haplo->haplotypicTDT(haplo->testSet,2,false); haplo->subhaplotypes = false; haplo->downcoding.clear(); } string str = par::proxy_include_reference ? haplo->getSubHaplotypeName(m2,a2,-1) : haplo->getSubHaplotypeName(m2,a2,ref); // Recover main results from Model, if GLM used if ( par::proxy_glm ) { vector_t coef = model->getCoefs(); haplo->odds = par::bt ? exp(coef[1]) : coef[1]; haplo->result = model->isValid() ? model->getStatistic() : 0; haplo->pvalue = par::bt ? 
chiprobP(haplo->result,1) : ((LinearModel*)model)->getPValue(); delete model; } ////////////////////// // Store this result ProxyResult r(str,f,r2, haplo->odds, haplo->result, haplo->pvalue); presults.insert(r); // Consider next haplotype h++; } } } haplo->HTEST << "Of " << num_subhaps_total << " subhaplotypes considered, " << num_subhaps_valid << " met proxy criteria\n\n"; // Report results if ( presults.size() == 0 ) haplo->HTEST << "No proxies found above r-sq " << par::proxy_r2 << "\n"; else { haplo->HTEST << setw(14) << "HAP" << " " << setw(10) << "FREQ" << " " << setw(10) << "RSQ" << " "; if ( par::qt ) haplo->HTEST << setw(8) << "BETA" << " " << setw(8) << "STAT" << " " << setw(8) << "P" << "\n"; else haplo->HTEST << setw(10) << "OR" << " " << setw(10) << "CHISQ" << " " << setw(10) << "P" << "\n"; set::iterator i = presults.begin(); while ( i != presults.end() ) { haplo->HTEST << setw(14) << i->name << " " << setw(10) << i->f << " " << setw(10) << i->r2 << " " << setw(10) << i->odds << " " << setw(10) << i->chisq << " " << setw(10) << i->pvalue << "\n"; i++; } } if ( par::proxy_full_report ) haplo->HTEST << "\n+--------------------------------------------------------------------+\n\n"; return; } plink-1.07-src/helper.h0000644000265600020320000001315711264127626014135 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #ifndef __HELPER_H__ #define __HELPER_H__ #include #include #include #include #include #include "plink.h" #include "options.h" template inline const T SQR(const T a) {return a*a;} template inline const T MAX(const T &a, const T &b) {return b > a ? (b) : (a);} template inline const T MIN(const T &a, const T &b) {return b < a ? (b) : (a);} template inline const T SIGN(const T &a, const T &b) {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);} template inline void SWAP(T &a, T &b) {T dum=a; a=b; b=dum;} class Plink; class Individual; class CSNP; using namespace std; void sizeMatrix(matrix_t &, int,int); void sizefMatrix(fmatrix_t &,int,int); void sizeTable(table_t & , int, int); void NoMem(); vector nvec_bool(); class CArgs { public: CArgs(int,char**); int count() { return n; } bool any() { return n > 1 ? 
true : false; } void fromScript(string); void fromPriorLog(string); bool find(string); string value(string); int value_int(string); double value_double(string); long unsigned int value_lui(string); void check_unused_options(Plink &); bool parseOptions(string,string); vector value(string,int); vector varValue(string); vector a; private: int n; vector parsed; vector option; vector root_command; vector original; map optionLabel; }; vector parse2str(string); vector parse2int(string); string searchAndReplace(string,string,string); vector commaParse(string); template bool from_string(T& t, const std::string& s, std::ios_base& (*f)(std::ios_base&)) { std::istringstream iss(s); return !(iss >> f >> t).fail(); } string display(vector &); string displayLine(vector &); void error(string); void shutdown(); void checkDupes(Plink&); bool readString(FILE *,string &); void summaryBasics(Plink&); string relType(Individual *, Individual *); typedef vector > matrix_t; typedef vector vector_t; void display(matrix_t &); void display(vector_t &); void display(vector &); double genotypingRate(Plink &, int); bool identicalSNPs(Plink *, int, int); vector listPossibleHaplotypes(Plink &, vector); void geno2matrix(vector & snps, matrix_t &, boolmatrix_t &,bool); int getInt(string,string); long unsigned int getLongUnsignedInt(string,string); double getDouble(string,string); void permute(vector&); void permute(vector&); vector FDR_BH(vector&); void affCoding(Plink &); void removeMissingPhenotypes(Plink & ); string genotype(Plink &, int i, int l); string genotype(Plink & P, Individual *, int); string genotypeToFile(Plink &, int i, int l); int getChromosomeCode(string); string chromosomeName(int); int getMarkerChromosome(Plink &,string); int getMarkerNumber(Plink &,string); string leftWindowEdge(Plink & P, int bp, int chr); string rightWindowEdge(Plink & P, int bp, int chr); vector getChromosomeMarkerRange(Plink &, int); bool seeChromosome(Plink &,int); vector getChromosomeRange(Plink &); vector getWindowRange(Plink &P, int); vector > two_locus_table(int,int); void makePersonMap(Plink&,map&); void makeLocusMap(Plink&,map&); double SNPHWE(int obs_hets, int obs_hom1, int obs_hom2); string int2str(int); string dbl2str(double,int prc = -1); string dbl2str_fixed(double, int prc = -1); string longint2str(long int); std::string sw(std::string s , int n); std::string sw(double d , int n); std::string sw(double d , int f, int n); std::string sw(int i , int n); std::string itoa(int, int); void checkFileExists(string); void checkFileExists(vector); bool doesFileExist(string); bool compressed(string); vector tokenizeLine(ifstream&); vector tokenizeLine(string); vector tokenizeLine(ifstream &,string); void defineDogChromosomes(); void defineMouseChromosomes(); void defineCowChromosomes(); void defineSheepChromosomes(); void defineHorseChromosomes(); void defineHumanChromosomes(); void defineRiceChromosomes(); vector vif_prune(vector > , double threshold,vector&); vector > calcSetCovarianceMatrix(vector & nSNP); void smoother(Plink & P, vector_t & input, int n, vector_t & output1, vector_t & output2, vector & count); map > readRange(string); double modelComparisonPValue(Model * alternate, Model * null); set rangeIntersect(Range & r1, map > & ranges); set mapRanges2SNP(int l, map > & ranges); int2 mapSNPs2Range(Plink & P, const Range * range); void makeScaffold(Plink & P); void mapRangesToSNPs(string, map > & ranges, map > & snp2range); map > filterRanges(map > & ranges, string filename); #endif 
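// ------------------------------------------------------------------
// [Editor annotation, not part of the original header] Illustrative use
// of the from_string() template defined above -- this is the parsing
// idiom used in dosage.cpp when converting whitespace-delimited tokens
// to numbers (hypothetical snippet):
//
//   double dose;
//   if ( ! from_string<double>( dose, tok[j++], std::dec ) )
//     problem = true;   // token was not a valid number
//
// A failed conversion is reported through the bool return value rather
// than by throwing, so callers simply flag the bad value and move on.
// ------------------------------------------------------------------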
plink-1.07-src/dosage.cpp0000644000265600020320000007625511264127626014463 0ustar tilleaadmin ////////////////////////////////////////////////////////////////// // // // PLINK (c) 2005-2009 Shaun Purcell // // // // This file is distributed under the GNU General Public // // License, Version 2. Please see the file COPYING for more // // details // // // ////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "options.h" #include "helper.h" #include "plink.h" #include "phase.h" #include "stats.h" #include "zed.h" #include "model.h" #include "logistic.h" #include "linear.h" extern Plink * PP; class Var { public: string snp; string a1; bool operator< (const Var & b) const { return (snp < b.snp); } bool operator== (const Var & b) const { return (snp == b.snp && a1 == b.a1 ); } }; // Helper functions void setUpQScoring( map & , vector &, vector &); void Plink::processDosageFile() { // For scroring procedure, if used map > qflag; map wt; matrix_t scores; map qscore; vector qthresh; vector qlabel; if ( par::score_risk ) { if ( par::score_risk_on_qrange ) setUpQScoring( qscore, qthresh, qlabel); // Read in score(s) ZInput zin( par::score_risk_file , compressed( par::score_risk_file ) ); while ( ! zin.endOfFile() ) { vector tok = zin.tokenizeLine(); if ( tok.size() != 3 ) continue; Var v; v.snp = tok[0]; v.a1 = tok[1]; double s; if ( ! from_string( s, tok[2] , std::dec ) ) continue; // Store weight wt.insert(make_pair( v, s )); // Which scores should we place this in? if ( par::score_risk_on_qrange ) { vector qq; // Can we find a q-score? map::iterator i1 = qscore.find( v.snp ); if ( i1 == qscore.end() ) continue; double sc = i1->second; for ( int q = 0 ; q < qthresh.size() ; q++) { if ( sc >= qthresh[q].p1 && sc <= qthresh[q].p2 ) qq.push_back(q); } qflag.insert(make_pair(v,qq)); } // Read next score } zin.close(); printLOG("Done reading scores\n"); } printLOG("\nReading dosage information from [ " + par::dosage_file + " ]\n"); // Expect a single file, with header bool compressed_in = false; bool compressed_out = false; bool filelist = false; bool countOccur = false; bool header = true; bool sepHeader = false; int skip0 = 0; int skip1 = 0; int skip2 = 0; bool dosageScale2 = true; bool snpBatch = false; // {skip0} SNP {skip1} A1 A2 {skip2} DATA... OptionSet * dosage_opt = par::opt.getOptions("DOSAGE"); if ( dosage_opt->isSet("Z") ) compressed_in = compressed_out = true; if ( dosage_opt->isSet("Zout") ) { compressed_out = true; } if ( dosage_opt->isSet("Zin") ) compressed_in = true; if ( dosage_opt->isSet("list") ) filelist = true; if ( dosage_opt->isSet("occur") ) countOccur = true; if ( dosage_opt->isSet("noheader") ) header = false; if ( dosage_opt->isSet("sepheader") ) sepHeader = true; if ( sepHeader && ! filelist ) error("The 'sepheader' option requires the 'list' option to be set"); if ( sepHeader && ! 
header ) error("Cannot specify both 'sepheader' and 'noheader'"); if ( dosage_opt->getValue("skip0") != "" ) skip0 = atoi( dosage_opt->getValue("skip0").c_str() ); if ( dosage_opt->getValue("skip1") != "" ) skip1 = atoi( dosage_opt->getValue("skip1").c_str() ); if ( dosage_opt->getValue("skip2") != "" ) skip2 = atoi( dosage_opt->getValue("skip2").c_str() ); if ( dosage_opt->isSet("dose1") ) dosageScale2 = false; // Get relevant columns codes int snp_field = skip0 + 0; int a1_field = snp_field + skip1 + 1; int a2_field = a1_field + 1; int geno_field = a2_field + skip2 + 1; int pre_fields = 3 + skip0 + skip1 + skip2; if ( par::dosage_hasMap ) { printLOG("A MAP file has been specified: only these markers will be processed\n"); if ( par::dosage_hard_call ) { printLOG("Going to make hard-calls at " + dbl2str( par::dosage_hard_call_thresh ) + " threshold\n"); printLOG("and write remaining dosages to [ " + par::output_file_name + ".dosage" ); if ( compressed_out ) printLOG(".gz ]"); else printLOG(" ]"); printLOG(" if more than " + int2str(par::dosage_hard_call_thresh2) + " non-calls\n"); par::SNP_major = true; // Consider each new variant // Space will already have been added for (int l=0; lone.resize(n,true); newlocus->two.resize(n,false); } } } double thresh1_AA = (1-par::dosage_hard_call_thresh)/2.0; double thresh1_AB = 0.5 - ((1-par::dosage_hard_call_thresh)/2.0); double thresh2_AB = 0.5 + ((1-par::dosage_hard_call_thresh)/2.0); double thresh1_BB = 1 - (1-par::dosage_hard_call_thresh)/2.0; //////////////////////////////////// // Genotype dosage format options bool oneDose = false; bool twoProbs = false; bool threeProbs = false; if ( dosage_opt->getValue("format") == "1" ) oneDose = true; else if ( dosage_opt->getValue("format") == "3" ) threeProbs = true; else twoProbs = true; if ( oneDose ) printLOG("Format set to one dosage per genotype\n"); else if ( twoProbs ) printLOG("Format set to two genotype probabilities\n"); else printLOG("Format set to three genotype probabilities\n"); int step = 1; if ( twoProbs ) step = 2; else if ( threeProbs ) step = 3; // Still assume a single FAM file; but allow for people/SNPs to be // spread across multiple files // *** Currently, the order still needs to be similar across files in the // same SNP batch vector dosageFilename_all; vector headerFilename_all; vector batchName; set batchNameSet; if ( ! filelist ) { dosageFilename_all.push_back( par::dosage_file ); } else { ZInput IN1( par::dosage_file , false ); bool batchModeSet = false; while ( ! 
IN1.endOfFile() ) { vector f = IN1.tokenizeLine(); if ( f.size() == 0 ) break; // possible formats, if !sepHeader // dosage-file // snp-batch dosage-file // if sepHeader // dosage-file header-file // snp-batch dosage-file header-file if ( batchModeSet ) { if ( sepHeader && snpBatch && f.size() != 3 ) error("Expecting 3 entries: batch dosage header"); else if ( sepHeader && (!snpBatch) && f.size() != 2 ) error("Expecting 2 entries: dosage header"); else if ( (!sepHeader) && snpBatch && f.size() != 2 ) error("Expecting 2 entries: batch dosage header"); else if ( (!sepHeader) && (!snpBatch) && f.size() != 1 ) error("Expecting 1 entry: dosage"); } else { if (sepHeader) { if (f.size() == 3 ) snpBatch = true; else if (f.size() == 2) snpBatch = false; else error("Problem with dosage file list format"); } else { if ( f.size() == 2 ) snpBatch = true; else if ( f.size() == 1 ) snpBatch = false; else error("Problem with dosage file list format"); } batchModeSet = true; } // Store information int term = 0; if ( snpBatch ) { int nm; if ( ! from_string( nm, f[term] , std::dec ) ) error("Problem reading SNP batch number"); batchName.push_back( nm ); batchNameSet.insert( nm ); ++term; } dosageFilename_all.push_back(f[term++]); if ( sepHeader ) headerFilename_all.push_back(f[term++]); } IN1.close(); printLOG("Expecting " + int2str( dosageFilename_all.size() ) + " total files, in " + int2str(batchNameSet.size()) + " distinct batches of SNPs\n"); } /////////////////////////////////////////// // // Set up some basic things, headers, etc // /////////////////////////////////////////// map msample; for (int i=0; ifid+"_"+sample[i]->iid,i)); map mlocus; if ( par::dosage_hasMap ) for (int l=0; lname,l)); string ext = countOccur ? ".occur.dosage" : par::write_dosage ? ".out.dosage" : ".assoc.dosage"; map occur; ZOutput detout; if ( par::dosage_hard_call ) { ext = ".dosage"; detout.open( par::output_file_name + ".dosage.det" , false ); std::ostringstream s2( std::stringstream::out ); s2 << sw("CHR",4) << sw("SNP",par::pp_maxsnp) << sw("BP", 12) << sw("A1", 4) << sw("A2", 4) << sw("MAF", 8) << sw("INFO", 8) << sw("ABOVE", 8) << sw("BELOW", 8) << sw("RATE", 8) << "\n"; detout.write( s2.str() ); printLOG("Writing additional information to [ " + par::output_file_name + ".dosage.det ]\n"); } string tail = compressed_out ? ".gz" : ""; ZOutput zout( par::output_file_name + ext + tail , compressed_out ); if ( ! par::dosage_hard_call ) printLOG("Writing results to [ " + par::output_file_name + ext + tail + " ]\n"); if ( par::dosage_hard_call || par::write_dosage ) { // Write header to dosage output file std::ostringstream s2( std::stringstream::out ); s2 << "SNP A1 A2 "; for (int i=0; imissing ) s2 << sample[i]->fid << " " << sample[i]->iid << " "; s2 << "\n"; zout.write( s2.str() ); } // Do we need a header? if ( ! ( countOccur || par::dosage_hard_call || par::write_dosage ) ) { string es = par::bt ? 
"OR" : "BETA"; if ( par::dosage_hasMap ) zout << sw("CHR",4) << sw("SNP",12) << sw("BP",12) << sw("A1",4) << sw("A2",4) << sw("FRQ",8) << sw("INFO",8) << sw(es,8) << sw("SE",8) << sw("P",8) << "\n"; else zout << sw("SNP" ,12) << sw("A1" ,4) << sw("A2" ,4) << sw("FRQ" ,8) << sw("INFO" ,8) << sw(es ,8) << sw("SE",8) << sw("P" ,8) << "\n"; } /////////////////////////////////////////////////// // Set up association model bool OLD_assoc_glm_without_main_snp = par::assoc_glm_without_main_snp; bool OLD_clist = par::clist; par::assoc_glm_without_main_snp = true; par::clist = true; // Add an extra covariate slot ++par::clist_number; clistname.resize( par::clist_number ); clistname[ par::clist_number - 1 ] = "DOSAGE"; for (int i=0; iclist.resize( par::clist_number ); int term = par::clist_number -1; int vcount = 0; /////////////////////////////////////////////////// // Set up scoring procedure if ( par::score_risk ) { if ( par::score_risk_on_qrange ) sizeMatrix(scores,n,qthresh.size()); else sizeMatrix(scores,n,1); } /////////////////////////////////////////// // // Start looping through SNP batches // /////////////////////////////////////////// set::iterator bi = batchNameSet.begin(); while (1) { // Pull out the relevant set of files vector dosageFilename; vector headerFilename; if ( snpBatch ) { for (int i=0; i expected( nFiles ); vector vzin( nFiles ); vector vhead( nFiles ); vector< vector > personMap( nFiles); vector npeople( nFiles ); int found = 0; int totpeople = 0; set inDosage; if ( nFiles > 1 && ! header ) error("Can only specify noheader when reading a single dosage file\n"); for (int f = 0 ; f < vzin.size() ; f++ ) { ZInput * d = new ZInput( dosageFilename[f] , compressed_in ); vzin[f] = d; if ( sepHeader ) vhead[f] = new ifstream(headerFilename[f].c_str() , ios::in ); // Read header, or use FAM file if ( header ) { vector tok; if ( ! sepHeader ) tok = vzin[f]->tokenizeLine(); else { // read all ID pairs while (1) { string fid, iid; (*vhead[f]) >> fid >> iid; if ( fid == "" || vhead[f]->eof() ) break; tok.push_back(fid); tok.push_back(iid); } } int firstCol = sepHeader ? 0 : pre_fields ; if ( tok.size() < firstCol ) error("Bad format fdr dosage file, expecting more columns"); if ( ! 
sepHeader ) { if ( tok[snp_field] != "SNP" || tok[a1_field] != "A1" || tok[a2_field] != "A2" ) error("Badly aligned columns for: SNP A1 A2"); } if ( (tok.size() - firstCol ) % 2 != 0 ) error("Expecting 3 + 2 * N columns in header\n"); // Based on header field, two entries per person (FID, IID) npeople[f] = (tok.size()- firstCol )/2; expected[f] = pre_fields + npeople[f] * step; ////////////////////////////////////////////////////// // Process header row for (int i=firstCol; i::iterator m = msample.find(id); if ( m == msample.end() ) { personMap[f].push_back( (Individual*)NULL ) ; } else { if ( inDosage.find( sample[ m->second ] ) != inDosage.end() ) error("The person appears in >1 dosage file: " + id ); Individual * person = sample[m->second]; personMap[f].push_back( person ); inDosage.insert( person ); ++found; } } totpeople += npeople[f]; } else { // If no explicit header given npeople[f] = n; expected[f] = pre_fields + npeople[f] * step; totpeople += npeople[f]; for (int i=0; ifid+ "_" + sample[i]->iid; map::iterator m = msample.find(id); if ( m == msample.end() ) { personMap[f].push_back( (Individual*)NULL ) ; } else { if ( inDosage.find( sample[ m->second ] ) != inDosage.end() ) error("The person appears in >1 dosage file: " + id ); Individual * person = sample[m->second]; personMap[f].push_back( person ); inDosage.insert( person ); } } } } // next dosage file if ( ! snpBatch && header ) { printLOG("Matched to " + int2str(found) + " of " + int2str( totpeople ) + " individuals"); if ( filelist ) printLOG(" in " + int2str( nFiles ) + " files\n"); else printLOG(" in [ " + par::dosage_file + " ]\n"); } if ( sepHeader ) { for (int f = 0 ; f < nFiles; f++ ) vhead[f]->close(); } // Remove missing individuals if ( ! par::dosage_hard_call ) { int n_removed = keepIndividuals( inDosage ); if ( n_removed > 0 ) { printLOG("Removed " + int2str(n_removed) + " individuals not in dosage file\n"); // Update person map, given we've removed some people msample.clear(); for (int i=0; ifid+"_"+sample[i]->iid,i)); } } // Create final file column position -> sample # mapping, for each file vector< vector > personPosition( nFiles ); for (int f = 0 ; f < nFiles; f++ ) { for (int i = 0; i < personMap[f].size(); i++) { Individual * person = personMap[f][i]; if ( person == NULL ) { personPosition[f].push_back(-1); } else { string id = person->fid + "_" + person->iid; map::iterator p = msample.find( id ); personPosition[f].push_back( p->second ); } } } ///////////////////////////////////////////////// // Read main dosage data, and analyse SNP by SNP while ( 1 ) { bool done = false; bool skip = false; string snp_id; string a1_id; string a2_id; int snp_code; int goodCall = 0; int badCall = 0; map dose1; map dose2; for ( int f = 0 ; f < nFiles ; f++ ) { if ( vzin[f]->endOfFile() ) { done = true; break; } // Read line from the correct file vector tok = vzin[f]->tokenizeLine(); if ( tok.size() == 0 ) { skip = true; break; } if ( tok.size() != expected[f] ) error("Problem with line:\n" + displayLine(tok) ); if ( tok[ snp_field ] == "" ) { skip = true; break; } if ( f == 0 ) { snp_id = tok[ snp_field ]; a1_id = tok[ a1_field ]; a2_id = tok[ a2_field ]; } else { if ( snp_id != tok[ snp_field ] ) error("Misaligned SNPs in dosage files"); if ( a1_id != tok[a1_field ] ) error("Misaligned allele codes in dosage file"); } // If we've loaded a MAP file, then ignore this // marker if it is not present in the MAP file if ( par::dosage_hasMap ) { map::iterator mi = mlocus.find( snp_id ); if ( mlocus.find( snp_id ) == mlocus.end() 
) { skip = true; continue; } if ( f==0 ) { snp_code = mi->second; locus[ snp_code ]->allele1 = a1_id; locus[ snp_code ]->allele2 = a2_id; } } // Are we just in the mode in which we simply count // how many times/files we see this SNP? if ( countOccur ) { map::iterator o = occur.find( tok[ snp_field ] ); if ( o != occur.end() ) (o->second)++; else occur.insert(make_pair( tok[ snp_field ], 1 )); continue; } // Start position for genotype data int j = pre_fields; for (int i=0; i( dose, tok[j++], std::dec ) ) problem=true; if ( dosageScale2 ) { if ( dose < 0 || dose > 2 ) problem = true; dose /= 2.0; } else { if ( dose < 0 || dose > 1 ) problem = true; } } else if ( twoProbs ) { if ( !from_string( d1, tok[j++], std::dec ) ) problem = true; if ( !from_string( d2, tok[j++], std::dec ) ) problem = true; if ( d1 < 0 || d1 > 1 ) problem = true; if ( d2 < 0 || d2 > 1 ) problem = true; if ( d1 + d2 > 1 ) problem = true; dose = d1 + d2/2.0; dose1.insert(make_pair( person , d1 ) ); dose2.insert(make_pair( person , d2 ) ); } else if ( threeProbs ) { if ( !from_string( d1, tok[j++], std::dec ) ) problem = true; if ( !from_string( d2, tok[j++], std::dec ) ) problem = true; if ( !from_string( d3, tok[j++], std::dec ) ) problem = true; if ( d1 < 0 || d1 > 1 ) problem = true; if ( d2 < 0 || d2 > 1 ) problem = true; // skip sanity check on 3rd // if ( d1 + d2 > 1 ) // problem = true; dose = d1 + d2/2.0; dose1.insert(make_pair( person , d1 ) ); dose2.insert(make_pair( person , d2 ) ); } // Do we want to make a hard call now, or store // dosage for analysis? if ( par::dosage_hard_call ) { bool s1 = true; bool s2 = false; if ( oneDose ) { if ( dose < thresh1_AA ) { s1=s2=false; } else if ( dose > thresh1_AB && dose < thresh2_AB ) { s1=false; s2=true; } else if ( dose > thresh1_BB ) { s1=s2=true; } } else { if ( d1 > par::dosage_hard_call_thresh ) { s1=s2=false; } else if ( d2 > par::dosage_hard_call_thresh ) { s1=false; s2=true; } else if ( 1-d1-d2 > par::dosage_hard_call_thresh ) { s1=s2=true; } } if ( s1 && ! s2 ) ++badCall; else ++goodCall; SNP[snp_code]->one[personPosition[f][i]] = s1; SNP[snp_code]->two[personPosition[f][i]] = s2; } if ( problem ) { sample[ personPosition[f][i] ]->missing2 = true; } else { sample[ personPosition[f][i] ]->missing2 = false; sample[ personPosition[f][i] ]->clist[ term ] = dose; } } } // Next dosage file // Are we at the end of a file though? if ( done ) break; if ( skip ) continue; // Do we need to bother processing the genotype dosage data? if ( countOccur ) continue; ++vcount; if ( ! par::silent ) cerr << "Processed " << vcount << " markers \r"; /////////////////////////////////////////////////// // Set up scoring procedure if ( par::score_risk ) { // Does this variant have a score? Var v; v.snp = snp_id; map::iterator i = wt.find( v ); if ( i == wt.end() ) continue; double weight = i->second; // Right allele? 
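// [Editor annotation, not in the original source] The dosage stored in
// clist[term] is on a 0..1 scale (expected count of allele A1 divided
// by 2), so the score contribution below is weight * dose when the
// scored allele matches A1, weight * (1 - dose) when it matches A2
// (swapAllele), and the SNP is skipped entirely if it matches neither
// allele.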
bool swapAllele = false; if ( i->first.a1 == a2_id ) { // need to swap swapAllele = true; } else { if ( i->first.a1 != a1_id ) continue; } if ( par::score_risk_on_qrange ) { map >::iterator i0 = qflag.find(v); if ( i0 == qflag.end() ) continue; vector & inQ = i0->second; for (int q=0; qclist[ term ] ) : weight * sample[i]->clist[ term ]; } } else { for (int i=0; iclist[ term ] ) : weight * sample[i]->clist[ term ]; } } // Skip association test, etc continue; } /////////////////////////////////////////// // tabulate frequency, and info/r^2 score double frq = 0; int cnt = 0; for (int i=0; imissing ) { frq += sample[i]->clist[ term ]; ++cnt; } frq /= (double)cnt; double theoreticalVariance = frq * ( 1 - frq ); double dosageSSQ = 0; for (int i=0; imissing ) { double t1 = sample[i]->clist[ term ] - frq ; t1 *= t1; dosageSSQ += t1; } double empiricalVariance = 2 * ( dosageSSQ / (double)cnt); double rsq = theoreticalVariance > 0 ? empiricalVariance / theoreticalVariance : 0; ////////////////////////////////////////// // Give some output if ( par::dosage_hard_call ) { std::ostringstream s2( std::stringstream::out ); s2 << sw(locus[snp_code]->chr, 4) << sw(snp_id, par::pp_maxsnp) << sw(locus[snp_code]->bp, 12) << sw(a1_id, 4) << sw(a2_id, 4) << sw(frq,4, 8) << sw(rsq,4,8) << sw(goodCall, 8) << sw(badCall, 8) << sw((double)goodCall/(double)(goodCall+badCall),4, 8) << "\n"; detout.write( s2.str() ); // Do we need to write this line back out as dosage info? if ( badCall > par::dosage_hard_call_thresh2 ) { // Write header to dosage output file std::ostringstream s2( std::stringstream::out ); s2 << snp_id << " " << a1_id << " " << a2_id << " "; if ( oneDose ) { for (int i=0; imissing ) s2 << 2 * sample[i]->clist[term] << " "; } else { for (int i=0; imissing ) { double d1 = dose1.find(sample[i])->second; double d2 = dose2.find(sample[i])->second; if ( twoProbs ) s2 << d1 << " " << d2 << " "; if ( threeProbs ) s2 << 1 - d1 - d2 << " "; } } s2 << "\n"; zout.write( s2.str() ); } continue; } if ( par::write_dosage ) { std::ostringstream s2( std::stringstream::out ); s2 << snp_id << " " << a1_id << " " << a2_id << " "; if ( oneDose ) { for (int i=0; imissing ) s2 << 2 * sample[i]->clist[term] << " "; } else { for (int i=0; imissing ) { double d1 = dose1.find(sample[i])->second; double d2 = dose2.find(sample[i])->second; if ( twoProbs ) s2 << d1 << " " << d2 << " "; if ( threeProbs ) s2 << 1 - d1 - d2 << " "; } } s2 << "\n"; zout.write( s2.str() ); // Do not perform association test, go straight to // next marker continue; } /////////////////////////////////////////// // Perform association glmAssoc(false,*pperm); /////////////////////////////////////////// // Report results bool valid = model->isValid(); // Do not output for bad markers // Hard-code for now... if ( frq < 0.01 || frq > 0.99 || rsq < 0.1 || rsq > 2) { valid = false; if ( rsq<0 ) rsq = 0; if ( rsq>2 ) rsq = 2.0; } vector_t b = model->getCoefs(); vector_t pval = model->getPVals(); vector_t var = model->getVar(); // NOTE: b includes intercept; pval doesn't // Note: internal coding of dosage is on 0..1 scale, so // divide beta and SE by 2 here to get per-allele effects double statistic = valid ? model->getStatistic() : 0; double pvalue = pval[ pval.size()-1 ]; double beta = par::bt ? 
exp( b[ b.size()-1 ] / 2.0) : b[ b.size()-1 ] / 2.0 ; double se = sqrt( var[ var.size()-1 ] ) / 2.0; if ( par::dosage_hasMap ) zout << sw(locus[snp_code]->chr , 4) << sw(snp_id , 12) << sw(locus[snp_code]->bp ,12) << sw(a1_id ,4) << sw(a2_id ,4) << sw(frq,4 ,8) << sw(rsq,4 ,8); else zout << sw(snp_id ,12) << sw(a1_id ,4) << sw(a2_id ,4) << sw(frq,4 ,8) << sw(rsq,4 ,8); if ( valid ) { zout << sw(beta,4,8) << sw(se,4,8) << sw(pvalue,-4,8)<< "\n"; } else { zout << sw("NA",8) << sw("NA",8) << sw("NA",8) << "\n"; } delete model; // Next variant(s) } /////////////////////////////////////// // // Finished this batch // /////////////////////////////////////// for (int f = 0 ; f < nFiles; f++) vzin[f]->close(); if ( ! snpBatch ) break; ++bi; if ( bi == batchNameSet.end() ) break; } /////////////////////////////////////// // // Finished all .. output & wrap up // /////////////////////////////////////// if ( ! par::silent ) cerr << "\n"; // Write out occurence info if ( countOccur ) { map::iterator o = occur.begin(); int totCount = 0; int nonF = 0; while ( o != occur.end() ) { zout.write( o->first + " " + int2str( o->second ) + "\n" ); if ( o->second != dosageFilename_all.size() ) ++nonF; totCount += o->second; ++o; } printLOG("Counted unique " + int2str(occur.size() ) + " markers, " + int2str( totCount ) + " across all files\n"); if ( nonF == 0 ) printLOG("All SNPs occured exactly once in all files\n"); else printLOG("Note: " + int2str(nonF) + " SNPs did not occur exactly once per file\n"); } zout.close(); if ( par::dosage_hard_call ) detout.close(); // if ( ! countOccur ) // printLOG("In total, analysed " + int2str(vcount) + " markers\n"); /////////////////////////////////////// // Write scores to file if ( par::score_risk ) { int qq = 0; while (1) { string append = par::score_risk_on_qrange ? ".S" + int2str(qq+1) : ""; ofstream O1( ( par::output_file_name + append + ".profile").c_str() , ios::out ); O1 << setw(par::pp_maxfid) << "FID" << " " << setw(par::pp_maxiid) << "IID" << " " << setw(6) << "PHENO" << " " << setw(8) << "SCORE" << "\n"; for ( int i=0; ifid << " " << setw(par::pp_maxiid) << person->iid << " " << setw(6) << person->phenotype << " " << setw(8) << scores[i][qq] << "\n"; } O1.close(); if ( !par::score_risk_on_qrange ) break; if ( ++qq == qthresh.size() ) break; } } /////////////////////////////////////// // Some final tidying up par::assoc_glm_without_main_snp = OLD_assoc_glm_without_main_snp; par::clist = OLD_clist; --par::clist_number; clistname.resize( par::clist_number ); for (int i=0; iclist.resize( par::clist_number ); return; } void setUpQScoring( map & qscore, vector & qthresh, vector & qlabel) { checkFileExists( par::score_qfile ); checkFileExists( par::score_qrange_file ); PP->printLOG("Reading quantitative scores from [ " + par::score_qfile + " ]\n"); PP->printLOG("Reading score ranges from [ " + par::score_qrange_file + " ]\n"); ifstream Q1( par::score_qfile.c_str() , ios::in ); while ( ! Q1.eof() ) { string snp; string str_score; double score; Q1 >> snp >> str_score; if ( ! from_string( score , str_score , std::dec ) ) continue; if ( snp == "" ) continue; qscore.insert( make_pair( snp , score ) ); } Q1.close(); PP->printLOG("Read q-scores for " + int2str( qscore.size() ) + " SNPs\n"); Q1.open( par::score_qrange_file.c_str() , ios::in ); while ( ! 
Q1.eof() ) { // Expect: name, lower, upper string label; double lower, upper; Q1 >> label >> lower >> upper; if ( label == "" ) continue; double2 d2(lower,upper); qthresh.push_back( d2 ); qlabel.push_back( label ); } Q1.close(); PP->printLOG("Read " + int2str( qthresh.size() ) + " thresholds to apply\n"); }
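// ------------------------------------------------------------------
// [Editor annotation, not part of the original source] From the parsing
// loops in setUpQScoring() above, the two input files are plain
// whitespace-delimited text of the following (illustrative) form:
//
//   par::score_qfile        -- one "SNP q-score" pair per line, e.g.
//        rs1001   0.0004
//        rs1002   0.2100
//
//   par::score_qrange_file  -- one "label lower upper" triple per line, e.g.
//        S1   0.00   0.01
//        S2   0.00   0.05
//
// In processDosageFile(), a SNP whose q-score lies within [lower,upper]
// (inclusive at both ends) contributes to that range's score column.
// ------------------------------------------------------------------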