nsegata-lefse-3ce86f78d59e/.hg_archival.txt
repo: e20290ebdfe008633514e90d97edc6091bf0b21d
node: 3ce86f78d59ecd02eaf168afb6e1bd898bb3facc
branch: default
tag: 1.0.8

nsegata-lefse-3ce86f78d59e/.hgtags
89e1a36a3c257de1ccab84f2318fe53229c8f7d0 1.0.5
f9b6a0dcd6453e6da472d7972b5bddbe8c1abab1 1.0.6
4e95b89d544e42532887f9f3e1a35d9accf5190f 1.0.7

nsegata-lefse-3ce86f78d59e/__init__.py

nsegata-lefse-3ce86f78d59e/example/run.sh

# Download a three-class example (with subclasses and subjects) from huttenhower.sph.harvard.edu
# It is a small subset of the HMP 16S dataset for finding biomarkers characterizing
# different levels of oxygen availability in different body sites
wget http://huttenhower.sph.harvard.edu/webfm_send/129 -O hmp_aerobiosis_small.txt

# Running the LEfSe commands with -h gives the list of available options

# format_input.py converts the input data matrix to the format required by LEfSe.
#
# In this example we use the class information in the first line (-c 1),
# the subclass in the second line (-s 2) and the subject in the third (-u 3).
# If the subclass or the subject is not present in the data you need to set
# the value -1 for it.
# -o 1000000 scales the features so that the sum (within the same taxonomic level)
# is 1M: this is done only to obtain more meaningful values for the LDA score
../format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000

# run_lefse.py performs the actual statistical analysis
#
# Apply LEfSe to the formatted data, producing the results (to be further processed
# for visualization with the other modules). The options available
# can be listed using the -h option
../run_lefse.py hmp_aerobiosis_small.in hmp_aerobiosis_small.res

# plot_res.py visualizes the output
#
# Plot the list of biomarkers with their effect sizes
# Several graphical options are available for personalizing the output
../plot_res.py hmp_aerobiosis_small.res hmp_aerobiosis_small.png

# plot_cladogram.py visualizes the output on a hierarchical tree
#
# Plot the representation of the biomarkers on the hierarchical tree
# specified in the input data (using | in the names of the features)
# In this case we will obtain the RDP taxonomy.
# This is an early implementation of the module. I'm working on an improved version
# that will be released independently from LEfSe
../plot_cladogram.py hmp_aerobiosis_small.res hmp_aerobiosis_small.cladogram.png --format png

# Create a directory for storing the raw-data representation of the discovered biomarkers
mkdir biomarkers_raw_images

# plot_features.py visualizes the raw-data features
#
# The module for exporting the raw-data representation of the features.
# With the default options we will obtain the images for all the features that are
# detected as biomarkers
../plot_features.py hmp_aerobiosis_small.in hmp_aerobiosis_small.res biomarkers_raw_images/
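
# format_input.py can also read biom files directly (see the change log at the top of
# format_input.py); that mode requires the breadcrumbs libraries on the PYTHONPATH.
# The commands below follow the usage examples documented in that change log; they are
# left commented out because this example ships with a txt input, not a biom one.
# ../format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000
# ../format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000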
nsegata-lefse-3ce86f78d59e/format_input.py
#!/usr/bin/env python

import sys,os,argparse,pickle,re,numpy

#***************************************************************************************************************
#*  Log of changes                                                                                             *
#*  January 16, 2014 - George Weingart - george.weingart@gmail.com                                             *
#*                                                                                                             *
#*  biom support                                                                                               *
#*  Modified the program to enable it to accept biom files as input                                            *
#*                                                                                                             *
#*  Added two optional input parameters:                                                                       *
#*  1. biom_c is the name of the biom metadata to be used as class                                             *
#*  2. biom_s is the name of the biom metadata to be used as subclass                                          *
#*  class and subclass are used in the same context as the original                                            *
#*  parameters class and subclass.                                                                             *
#*  These parameters are totally optional: by default the program                                              *
#*  chooses as class the first metadata received from the conversion                                           *
#*  of the biom file into a sequential (pcl) file as generated by                                              *
#*  breadcrumbs, and similarly, the second metadata is selected as                                             *
#*  subclass.                                                                                                  *
#*  The syntax and logic of the original non-biom case were NOT changed.                                       *
#*                                                                                                             *
#*  <*******************  IMPORTANT NOTE  *************************>                                           *
#*  The biom case requires breadcrumbs and therefore there is a                                                *
#*  conditional import of the breadcrumbs modules.                                                             *
#*  If the user provides a biom input and breadcrumbs is not detected,                                         *
#*  the run is abnormally ended.                                                                               *
#*  breadcrumbs itself needs a biom environment, so if the import                                              *
#*  of biom in breadcrumbs fails, the run is also abnormally                                                   *
#*  ended (only if the input file was biom).                                                                   *
#*                                                                                                             *
#*  USAGE EXAMPLES                                                                                             *
#*  --------------                                                                                             *
#*  Case #1: Using a sequential file as input (old version - unchanged):                                       *
#*  ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000               *
#*  Case #2: Using a biom file as input:                                                                       *
#*  ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000                             *
#*  Case #3: Using a biom file as input and overriding the class and subclass:                                 *
#*  ./format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000
#*                                                                                                             *
#***************************************************************************************************************

def read_input_file(inp_file, CommonArea):

    if inp_file.endswith('.biom'):                  #*  If the file format is biom:
        CommonArea = biom_processing(inp_file)      #*  Process in biom format
        return CommonArea                           #*  And return the CommonArea

    with open(inp_file) as inp:
        CommonArea['ReturnedData'] = [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
        return CommonArea

def transpose(data):
    return zip(*data)

def read_params(args):
    parser = argparse.ArgumentParser(description='LEfSe formatting modules')
    parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file; feature hierarchical levels can be specified with | or . and those symbols must not be present for other reasons in the input file.")
    parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the output file containing the data for LEfSe")
    parser.add_argument('--output_table', type=str, required=False, default="", help="the formatted table in txt format")
    parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r", help="set whether the features are on rows (default) or on columns")
    parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1, help="set which feature to use as class (default 1)")
    parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None, help="set which feature to use as subclass (default None meaning no subclass)")
    parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0, help="set the normalization value (default -1.0 meaning no normalization)")
    parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None, help="set which feature to use as subject (default None meaning no subject)")
    parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="f", help="set the policy to adopt with missing values: f removes the features with missing values, s removes samples with missing values (default f)")
    parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10, help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together; if the cardinality is still low, no pairwise comparison will be performed with them)")
    parser.add_argument('-biom_c',dest="biom_class", type=str, help="for biom input files: set which metadata to use as class")
    parser.add_argument('-biom_s',dest="biom_subclass", type=str, help="for biom input files: set which metadata to use as subclass")

    args = parser.parse_args()

    return vars(args)

def remove_missing(data,roc):
    if roc == "c": data = transpose(data)
    max_len = max([len(r) for r in data])
    to_rem = []
    for i,r in enumerate(data):
        if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)
    if len(to_rem):
        # iterate the indices in reverse order so pops do not shift pending indices
        # (list.reverse() returns None and therefore cannot be iterated directly)
        for i in reversed(to_rem):
            data.pop(i)
    if roc == "c": return transpose(data)
    return data

def sort_by_cl(data,n,c,s,u):
    def sort_lines1(a,b): return int(a[c] > b[c])*2-1
    def sort_lines2u(a,b):
        if a[c] != b[c]: return int(a[c] > b[c])*2-1
        return int(a[u] > b[u])*2-1
    def sort_lines2s(a,b):
        if a[c] != b[c]: return int(a[c] > b[c])*2-1
        return int(a[s] > b[s])*2-1
    def sort_lines3(a,b):
        if a[c] != b[c]: return int(a[c] > b[c])*2-1
        if a[s] != b[s]: return int(a[s] > b[s])*2-1
        return int(a[u] > b[u])*2-1
    if n == 3: data.sort(sort_lines3)
    if n == 2:
        if s is None: data.sort(sort_lines2u)
        else: data.sort(sort_lines2s)
    if n == 1: data.sort(sort_lines1)
    return data

def group_small_subclasses(cls,min_subcl):
    last = ""
    n = 0
    repl = []
    dd = [list(cls['class']),list(cls['subclass'])]
    for d in dd:
        if d[1] != last:
            if n < min_subcl and last != "": repl.append(d[1])
            last = d[1]
            n = 1
    for i,d in enumerate(dd):
        if d[1] in repl: dd[i][1] = "other"
        dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
    cls['class'] = dd[0]
    cls['subclass'] = dd[1]
    return cls

def get_class_slices(data):
    previous_class = data[0][0]
    previous_subclass = data[0][1]
    subclass_slices = []
    class_slices = []
    last_cl = 0
    last_subcl = 0
    class_hierarchy = []
    subcls = []
    for i,d in enumerate(data):
        if d[1] != previous_subclass:
            subclass_slices.append((previous_subclass,(last_subcl,i)))
            last_subcl = i
            subcls.append(previous_subclass)
        if d[0] != 
previous_class: class_slices.append((previous_class,(last_cl,i))) class_hierarchy.append((previous_class,subcls)) subcls = [] last_cl = i previous_subclass = d[1] previous_class = d[0] subclass_slices.append((previous_subclass,(last_subcl,i+1))) subcls.append(previous_subclass) class_slices.append((previous_class,(last_cl,i+1))) class_hierarchy.append((previous_class,subcls)) return dict(class_slices), dict(subclass_slices), dict(class_hierarchy) def numerical_values(feats,norm): mm = [] for k,v in feats.items(): feats[k] = [float(val) for val in v] if norm < 0.0: return feats tr = zip(*(feats.values())) mul = [] fk = feats.keys() hie = True if sum([k.count(".") for k in fk]) > len(fk) else False for i in range(len(feats.values()[0])): if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ])) else: mul.append(sum(tr[i])) if hie and sum(mul) == 0: mul = [] for i in range(len(feats.values()[0])): mul.append(sum(tr[i])) for i,m in enumerate(mul): if m == 0: mul[i] = 0.0 else: mul[i] = float(norm) / m for k,v in feats.items(): feats[k] = [val*mul[i] for i,val in enumerate(v)] if numpy.mean(feats[k]) and (numpy.std(feats[k])/numpy.mean(feats[k])) < 1e-10: feats[k] = [ float(round(kv*1e6)/1e6) for kv in feats[k]] return feats def add_missing_levels2(ff): if sum( [f.count(".") for f in ff] ) < 1: return ff dn = {} added = True while added: added = False for f in ff: lev = f.count(".") if lev == 0: continue if lev not in dn: dn[lev] = [f] else: dn[lev].append(f) for fn in sorted(dn,reverse=True): for f in dn[fn]: fc = ".".join(f.split('.')[:-1]) if fc not in ff: ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))] ab =[] for l in [f for f in zip(*ab_all)]: ab.append(sum([float(ll) for ll in l])) ff[fc] = ab added = True if added: break return ff def add_missing_levels(ff): if sum( [f.count(".") for f in ff] ) < 1: return ff clades2leaves = {} for f in ff: fs = f.split(".") if len(fs) < 2: continue for l in range(len(fs)): n = ".".join( fs[:l] ) if n in clades2leaves: clades2leaves[n].append( f ) else: clades2leaves[n] = [f] for k,v in clades2leaves.items(): if k and k not in ff: ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])] return ff def modify_feature_names(fn): ret = fn for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']: ret = [re.sub(v,"",f) for f in ret] for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]', r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']: ret = [re.sub(v,"_",f) for f in ret] for v in ["\|"]: ret = [re.sub(v,".",f) for f in ret] ret2 = [] for r in ret: if r[0] in ['0','1','2','3','4','5','6','7','8','9']: ret2.append("f_"+r) else: ret2.append(r) return ret2 def rename_same_subcl(cl,subcl): toc = [] for sc in set(subcl): if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1: toc.append(sc) new_subcl = [] for i,sc in enumerate(subcl): if sc in toc: new_subcl.append(cl[i]+"_"+sc) else: new_subcl.append(sc) return new_subcl #************************************************************************************* #* Modifications by George Weingart, Jan 15, 2014 * #* If the input file is biom: * #* a. Load an AbundanceTable (Using breadcrumbs) * #* b. Create a sequential file from the AbundanceTable (de-facto - pcl) * #* c. Use that file as input to the rest of the program * #* d. 
Calculate the c,s,and u parameters, either from the values the User entered * #* from the meta data values in the biom file or set up defaults * #* <<<------------- I M P O R T A N T N O T E ------------------->> * #* breadcrumbs src directory must be included in the PYTHONPATH * #* <<<------------- I M P O R T A N T N O T E ------------------->> * #************************************************************************************* def biom_processing(inp_file): CommonArea = dict() #* Set up a dictionary to return CommonArea['abndData'] = AbundanceTable.funcMakeFromFile(inp_file, #* Create AbundanceTable from input biom file cDelimiter = None, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None, strFormat = None) #**************************************************************** #* Building the data element here * #**************************************************************** ResolvedData = list() #This is the Resolved data that will be returned IDMetadataName = CommonArea['abndData'].funcGetIDMetadataName() #* ID Metadataname IDMetadata = [CommonArea['abndData'].funcGetIDMetadataName()] #* The first Row for IDMetadataEntry in CommonArea['abndData'].funcGetMetadataCopy()[IDMetadataName]: #* Loop on all the metadata values IDMetadata.append(IDMetadataEntry) ResolvedData.append(IDMetadata) #Add the IDMetadata with all its values to the resolved area for key, value in CommonArea['abndData'].funcGetMetadataCopy().iteritems(): if key != IDMetadataName: MetadataEntry = list() #* Set it up MetadataEntry.append(key) #* And post it to the area for x in value: MetadataEntry.append(x) #* Append the metadata value name ResolvedData.append(MetadataEntry) for AbundanceDataEntry in CommonArea['abndData'].funcGetAbundanceCopy(): #* The Abundance Data lstAbundanceDataEntry = list(AbundanceDataEntry) #Convert tuple to list ResolvedData.append(lstAbundanceDataEntry) #Append the list to the metadata list CommonArea['ReturnedData'] = ResolvedData #Post the results return CommonArea #******************************************************************************* #* Check the params and override in the case of biom * #******************************************************************************* def check_params_for_biom_case(params, CommonArea): CommonArea['MetadataNames'] = list() #Metadata names params['original_class'] = params['class'] #Save the original class params['original_subclass'] = params['subclass'] #Save the original subclass params['original_subject'] = params['subject'] #Save the original subclass TotalMetadataEntriesAndIDInBiomFile = len(CommonArea['abndData'].funcGetMetadataCopy()) # The number of metadata entries for i in range(0,TotalMetadataEntriesAndIDInBiomFile): #* Populate the meta data names table CommonArea['MetadataNames'].append(CommonArea['ReturnedData'][i][0]) #Add the metadata name #**************************************************** #* Setting the params here * #**************************************************** if TotalMetadataEntriesAndIDInBiomFile > 0: #If there is at least one entry - has to be the subject params['subject'] = 1 if TotalMetadataEntriesAndIDInBiomFile == 2: #If there are 2 - The first is the subject and the second has to be the metadata, and that is the class params['class'] = 2 if TotalMetadataEntriesAndIDInBiomFile == 3: #If there are 3: Set up default that the second entry is the class and the third is the subclass params['class'] = 2 params['subclass'] = 3 FlagError = False #Set up error flag if not params['biom_class'] is None and 
not params['biom_subclass'] is None: #Check if the User passed a valid class and subclass if params['biom_class'] in CommonArea['MetadataNames']: params['class'] = CommonArea['MetadataNames'].index(params['biom_class']) +1 #* Set up the index for that metadata else: FlagError = True if params['biom_subclass'] in CommonArea['MetadataNames']: params['subclass'] = CommonArea['MetadataNames'].index(params['biom_subclass']) +1 #* Set up the index for that metadata else: FlagError = True if FlagError == True: #* If the User passed an invalid class print "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\n" params['class'] = 2 params['subclass'] = 3 return params if __name__ == '__main__': CommonArea = dict() #Build a Common Area to pass variables in the biom case params = read_params(sys.argv) #************************************************************* #* Conditionally import breadcrumbs if file is a biom file * #* If it is and no breadcrumbs found - abnormally exit * #************************************************************* if params['input_file'].endswith('.biom'): try: from lefsebiom.ConstantsBreadCrumbs import * from lefsebiom.AbundanceTable import * except ImportError: sys.stderr.write("************************************************************************************************************ \n") sys.stderr.write("* Error: Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \n") sys.stderr.write("************************************************************************************************************ \n") exit(1) if type(params['subclass']) is int and int(params['subclass']) < 1: params['subclass'] = None if type(params['subject']) is int and int(params['subject']) < 1: params['subject'] = None CommonArea = read_input_file(sys.argv[1], CommonArea) #Pass The CommonArea to the Read data = CommonArea['ReturnedData'] #Select the data if sys.argv[1].endswith('biom'): #* Check if biom: params = check_params_for_biom_case(params, CommonArea) #Check the params for the biom case if params['feats_dir'] == "c": data = transpose(data) ncl = 1 if not params['subclass'] is None: ncl += 1 if not params['subject'] is None: ncl += 1 first_line = zip(*data)[0] first_line = modify_feature_names(list(first_line)) data = zip( first_line, *sort_by_cl(zip(*data)[1:], ncl, params['class']-1, params['subclass']-1 if not params['subclass'] is None else None, params['subject']-1 if not params['subject'] is None else None)) # data.insert(0,first_line) # data = remove_missing(data,params['missing_p']) cls = {} cls_i = [('class',params['class']-1)] if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1)) if params['subject'] > 0: cls_i.append(('subject',params['subject']-1)) cls_i.sort(lambda x, y: -cmp(x[1],y[1])) for v in cls_i: cls[v[0]] = data.pop(v[1])[1:] if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']] cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass']) # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card']) class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values())) feats = dict([(d[0],d[1:]) for d in data]) feats = add_missing_levels(feats) feats = numerical_values(feats,params['norm_v']) out = {} out['feats'] = feats out['norm'] = params['norm_v'] out['cls'] = cls out['class_sl'] = class_sl out['subclass_sl'] = subclass_sl out['class_hierarchy'] = class_hierarchy if 
params['output_table']:
        with open( params['output_table'], "w") as outf:
            if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" )
            if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" )
            if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" )
            for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" )

    with open(params['output_file'], 'wb') as back_file:
        pickle.dump(out,back_file)

nsegata-lefse-3ce86f78d59e/lefse.py
import os,sys,math,pickle
import random as lrand
import rpy2.robjects as robjects
import argparse
import numpy
#import svmutil

def init():
    lrand.seed(1982)
    robjects.r('library(splines)')
    robjects.r('library(stats4)')
    robjects.r('library(survival)')
    robjects.r('library(mvtnorm)')
    robjects.r('library(modeltools)')
    robjects.r('library(coin)')
    robjects.r('library(MASS)')

def get_class_means(class_sl,feats):
    means = {}
    clk = class_sl.keys()
    for fk,f in feats.items():
        means[fk] = [numpy.mean((f[class_sl[k][0]:class_sl[k][1]])) for k in clk]
    return clk,means

def save_res(res,filename):
    with open(filename, 'w') as out:
        for k,v in res['cls_means'].items():
            out.write(k+"\t"+str(math.log(max(max(v),1.0),10.0))+"\t")
            if k in res['lda_res_th']:
                for i,vv in enumerate(v):
                    if vv == max(v):
                        out.write(str(res['cls_means_kord'][i])+"\t")
                        break
                out.write(str(res['lda_res'][k]))
            else: out.write("\t")
            out.write( "\t" + (res['wilcox_res'][k] if 'wilcox_res' in res and k in res['wilcox_res'] else "-")+"\n")

def load_data(input_file, nnorm = False):
    with open(input_file, 'rb') as inputf:
        inp = pickle.load(inputf)
    if nnorm: return inp['feats'],inp['cls'],inp['class_sl'],inp['subclass_sl'],inp['class_hierarchy'],inp['norm']
    else: return inp['feats'],inp['cls'],inp['class_sl'],inp['subclass_sl'],inp['class_hierarchy']

def load_res(input_file):
    with open(input_file, 'rb') as inputf:
        inp = pickle.load(inputf)
    return inp['res'],inp['params'],inp['class_sl'],inp['subclass_sl']

def test_kw_r(cls,feats,p,factors):
    robjects.globalenv["y"] = robjects.FloatVector(feats)
    for i,f in enumerate(factors):
        robjects.globalenv['x'+str(i+1)] = robjects.FactorVector(robjects.StrVector(cls[f]))
    fo = "y~x1"
    #for i,f in enumerate(factors[1:]):
    #    if f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue
    #    if len(set(cls[f])) == len(cls[f]): continue
    #    fo += "+x"+str(i+2)
    kw_res = robjects.r('kruskal.test('+fo+',)$p.value')
    return float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])

def test_rep_wilcoxon_r(sl,cl_hie,feats,th,multiclass_strat,mul_cor,fn,min_c,comp_only_same_subcl,curv=False):
    comp_all_sub = not comp_only_same_subcl
    tot_ok = 0
    alpha_mtc = th
    all_diff = []
    for pair in [(x,y) for x in cl_hie.keys() for y in cl_hie.keys() if x < y]:
        dir_cmp = "not_set"
        l_subcl1, l_subcl2 = (len(cl_hie[pair[0]]), len(cl_hie[pair[1]]))
        if mul_cor != 0:
            alpha_mtc = th*l_subcl1*l_subcl2 if mul_cor == 2 else 1.0-math.pow(1.0-th,l_subcl1*l_subcl2)
        ok = 0
        curv_sign = 0
        first = True
        for i,k1 in enumerate(cl_hie[pair[0]]):
            br = False
            for j,k2 in enumerate(cl_hie[pair[1]]):
                if not comp_all_sub and k1[len(pair[0]):] != k2[len(pair[1]):]:
                    ok += 1
                    continue
                cl1 = feats[sl[k1][0]:sl[k1][1]]
                cl2 = feats[sl[k2][0]:sl[k2][1]]
                med_comp = False
                if len(cl1) < min_c or len(cl2) < min_c:
                    med_comp = True
                sx,sy = numpy.median(cl1),numpy.median(cl2)
                if cl1[0] == cl2[0] and len(set(cl1)) == 1 and len(set(cl2)) == 1:
                    tres, first = False, 
False elif not med_comp: robjects.globalenv["x"] = robjects.FloatVector(cl1+cl2) robjects.globalenv["y"] = robjects.FactorVector(robjects.StrVector(["a" for a in cl1]+["b" for b in cl2])) pv = float(robjects.r('pvalue(wilcox_test(x~y,data=data.frame(x,y)))')[0]) tres = pv < alpha_mtc*2.0 if first: first = False if not curv and ( med_comp or tres ): dir_cmp = sx < sy #if sx == sy: br = True elif curv: dir_cmp = None if med_comp or tres: curv_sign += 1 dir_cmp = sx < sy else: br = True elif not curv and med_comp: if ((sx < sy) != dir_cmp or sx == sy): br = True elif curv: if tres and dir_cmp == None: curv_sign += 1 dir_cmp = sx < sy if tres and dir_cmp != (sx < sy): br = True curv_sign = -1 elif not tres or (sx < sy) != dir_cmp or sx == sy: br = True if br: break ok += 1 if br: break if curv: diff = curv_sign > 0 else: diff = (ok == len(cl_hie[pair[1]])*len(cl_hie[pair[0]])) # or (not comp_all_sub and dir_cmp != "not_set") if diff: tot_ok += 1 if not diff and multiclass_strat: return False if diff and not multiclass_strat: all_diff.append(pair) if not multiclass_strat: tot_k = len(cl_hie.keys()) for k in cl_hie.keys(): nk = 0 for a in all_diff: if k in a: nk += 1 if nk == tot_k-1: return True return False return True def contast_within_classes_or_few_per_class(feats,inds,min_cl,ncl): ff = zip(*[v for n,v in feats.items() if n != 'class']) cols = [ff[i] for i in inds] cls = [feats['class'][i] for i in inds] if len(set(cls)) < ncl: return True for c in set(cls): if cls.count(c) < min_cl: return True cols_cl = [x for i,x in enumerate(cols) if cls[i] == c] for i,col in enumerate(zip(*cols_cl)): if (len(set(col)) <= min_cl and min_cl > 1) or (min_cl == 1 and len(set(col)) <= 1): return True return False def test_lda_r(cls,feats,cl_sl,boots,fract_sample,lda_th,tol_min,nlogs): fk = feats.keys() means = dict([(k,[]) for k in feats.keys()]) feats['class'] = list(cls['class']) clss = list(set(feats['class'])) for uu,k in enumerate(fk): if k == 'class': continue ff = [(feats['class'][i],v) for i,v in enumerate(feats[k])] for c in clss: if len(set([float(v[1]) for v in ff if v[0] == c])) > max(float(feats['class'].count(c))*0.5,4): continue for i,v in enumerate(feats[k]): if feats['class'][i] == c: feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(0.0,max(feats[k][i]*0.05,0.01))) rdict = {} for a,b in feats.items(): if a == 'class' or a == 'subclass' or a == 'subject': rdict[a] = robjects.StrVector(b) else: rdict[a] = robjects.FloatVector(b) robjects.globalenv["d"] = robjects.DataFrame(rdict) lfk = len(feats[fk[0]]) rfk = int(float(len(feats[fk[0]]))*fract_sample) f = "class ~ "+fk[0] for k in fk[1:]: f += " + " + k.strip() ncl = len(set(cls['class'])) min_cl = int(float(min([cls['class'].count(c) for c in set(cls['class'])]))*fract_sample*fract_sample*0.5) min_cl = max(min_cl,1) pairs = [(a,b) for a in set(cls['class']) for b in set(cls['class']) if a > b] for k in fk: for i in range(boots): means[k].append([]) for i in range(boots): for rtmp in range(1000): rand_s = [lrand.randint(0,lfk-1) for v in range(rfk)] if not contast_within_classes_or_few_per_class(feats,rand_s,min_cl,ncl): break rand_s = [r+1 for r in rand_s] means[k][i] = [] for p in pairs: robjects.globalenv["rand_s"] = robjects.IntVector(rand_s) robjects.globalenv["sub_d"] = robjects.r('d[rand_s,]') z = robjects.r('z <- suppressWarnings(lda(as.formula('+f+'),data=sub_d,tol='+str(tol_min)+'))') robjects.r('w <- z$scaling[,1]') robjects.r('w.unit <- w/sqrt(sum(w^2))') robjects.r('ss <- sub_d[,-match("class",colnames(sub_d))]') if 
'subclass' in feats: robjects.r('ss <- ss[,-match("subclass",colnames(ss))]')
                if 'subject' in feats: robjects.r('ss <- ss[,-match("subject",colnames(ss))]')
                robjects.r('xy.matrix <- as.matrix(ss)')
                robjects.r('LD <- xy.matrix%*%w.unit')
                robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))')
                scal = robjects.r('wfinal <- w.unit * effect.size')
                rres = robjects.r('mm <- z$means')
                rowns = list(rres.rownames)
                lenc = len(list(rres.colnames))
                coeff = [abs(float(v)) if not math.isnan(float(v)) else 0.0 for v in scal]
                res = dict([(pp,[float(ff) for ff in rres.rx(pp,True)] if pp in rowns else [0.0]*lenc ) for pp in [p[0],p[1]]])
                for j,k in enumerate(fk):
                    gm = abs(res[p[0]][j] - res[p[1]][j])
                    means[k][i].append((gm+coeff[j])*0.5)
    res = {}
    for k in fk:
        m = max([numpy.mean([means[k][kk][p] for kk in range(boots)]) for p in range(len(pairs))])
        res[k] = math.copysign(1.0,m)*math.log(1.0+math.fabs(m),10)
    return res,dict([(k,x) for k,x in res.items() if math.fabs(x) > lda_th])

def test_svm(cls,feats,cl_sl,boots,fract_sample,lda_th,tol_min,nsvm):
    # SVM scoring is disabled (the svmutil import is commented out above),
    # so return None instead of the undefined name NULL
    return None
    """
    fk = feats.keys()
    clss = list(set(cls['class']))
    y = [clss.index(c)*2-1 for c in list(cls['class'])]
    xx = [feats[f] for f in fk]
    if nsvm:
        maxs = [max(v) for v in xx]
        mins = [min(v) for v in xx]
        x = [ dict([(i+1,(v-mins[i])/(maxs[i]-mins[i])) for i,v in enumerate(f)]) for f in zip(*xx)]
    else: x = [ dict([(i+1,v) for i,v in enumerate(f)]) for f in zip(*xx)]
    lfk = len(feats[fk[0]])
    rfk = int(float(len(feats[fk[0]]))*fract_sample)
    mm = []
    best_c = svmutil.svm_ms(y, x, [pow(2.0,i) for i in range(-5,10)],'-t 0 -q')
    for i in range(boots):
        rand_s = [lrand.randint(0,lfk-1) for v in range(rfk)]
        r = svmutil.svm_w([y[yi] for yi in rand_s], [x[xi] for xi in rand_s], best_c,'-t 0 -q')
        mm.append(r[:len(fk)])
    m = [numpy.mean(v) for v in zip(*mm)]
    res = dict([(v,m[i]) for i,v in enumerate(fk)])
    return res,dict([(k,x) for k,x in res.items() if math.fabs(x) > lda_th])
    """

nsegata-lefse-3ce86f78d59e/lefse2circlader.py
#!/usr/bin/env python

from __future__ import with_statement

import sys
import os
import argparse

def read_params(args):
    parser = argparse.ArgumentParser(description='Convert LEfSe output to '
                                     'Circlader input')
    parser.add_argument('inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
                        help="the input file [stdin if not present]")
    parser.add_argument('out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
                        help="the output file [stdout if not present]")
    parser.add_argument('-l', metavar='levels with label', default=0, type=int)

    return vars(parser.parse_args())

def lefse2circlader(par):
    finp,fout = bool(par['inp_f']), bool(par['out_f'])

    with open(par['inp_f']) if finp else sys.stdin as inpf:
        put_bm = (l.strip().split('\t') for l in inpf.readlines())
    biomarkers = [p for p in put_bm if len(p) > 2]

    circ = [ [ b[0],
               "" if b[0].count('.') > par['l'] else b[0].split('.')[-1],
               b[2],
               b[2]+"_col" ] for b in biomarkers]

    with open(par['out_f'],'w') if fout else sys.stdout as out_file:
        for c in circ:
            out_file.write( "\t".join( c ) + "\n" )

if __name__ == '__main__':
    params = read_params(sys.argv)
    lefse2circlader(params)

nsegata-lefse-3ce86f78d59e/lefsebiom/AbundanceTable.py
"""
Author: Timothy Tickle
Description: Class to abstract an abundance table and methods to run on such a table.
""" ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] __license__ = "MIT" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" import csv import sys import blist from CClade import CClade from ConstantsBreadCrumbs import ConstantsBreadCrumbs import copy from datetime import date import numpy as np import os import re import scipy.stats import string from ValidateData import ValidateData #************************************************************* #* import biom * #* If not found - abnormally exit * #************************************************************* try: from biom.parse import * from biom.table import * except ImportError: sys.stderr.write("************************************************************************************************************ \n") sys.stderr.write("* Error: biom environment required to process biom files - Not found - run abnormally terminated * \n") sys.stderr.write("* See http://http://biom-format.org/ * \n") sys.stderr.write("************************************************************************************************************ \n") exit(1) from biom.parse import * from biom.table import * c_dTarget = 1.0 c_fRound = False c_iSumAllCladeLevels = -1 c_fOutputLeavesOnly = False class RowMetadata: """ Holds the row (feature) metadata and associated functions. """ def __init__(self, dictRowMetadata, iLongestMetadataEntry=None, lsRowMetadataIDs=None): """ Constructor requires a dictionary or row metadata. 
:param dictRowMetadata: The row metadata values with the ids as the keys, must be stable (keep order) :type: {string feature id: {'metadata': {'taxonomy': [list of metadata values]}}} """ self.dictRowMetadata = dictRowMetadata self.iLongestMetadataEntry = iLongestMetadataEntry self.lsRowMetadataIDs = lsRowMetadataIDs self.dictMetadataIDs = {} # Get the ids for the metadata if self.dictRowMetadata: for dictMetadata in self.dictRowMetadata.values(): dictMetadata = dictMetadata.get(ConstantsBreadCrumbs.c_metadata_lowercase, None) if dictMetadata: for key,value in dictMetadata.items(): if self.dictMetadataIDs.get(key, None): self.dictMetadataIDs[key] = max(self.dictMetadataIDs[key],len(dictMetadata[key])) else: self.dictMetadataIDs[key] = len(dictMetadata[key]) def funcMakeIDs(self): """ There should be a one to one mapping of row metadata ids and the values associated here with a feature ID. If not make ids from the key by appending numbers. """ # If there exists a ids list already return (this allows ID order to be given and preserved) # Else make a list of IDs if self.lsRowMetadataIDs: return self.lsRowMetadataIDs lsIDs = [] lsKeys = [] for key, value in self.dictMetadataIDs.items(): lsKeys.append( key ) if value > 1: lsIDs.extend( [ "_".join( [ key, str( iIndex ) ] ) for iIndex in xrange( value ) ] ) else: lsIDs.append( key ) return [ lsIDs, lsKeys ] def funGetFeatureMetadata(self, sFeature, sMetadata): """ Returns a list of values in the order of row metadta ids for a microbial feature given an id. :param sFeature: Feature id to get metadata :type: string :param sMetadata: Metadata to get :type: string :return: list of metadata associated with the metadata """ lsMetadata = [] if self.dictRowMetadata: dictFeature = self.dictRowMetadata.get( sFeature, None ) if dictFeature: dictFeatureMetadata = dictFeature.get(ConstantsBreadCrumbs.c_metadata_lowercase, None) if dictFeatureMetadata: lsMetadata = dictFeatureMetadata.get(sMetadata, None) return lsMetadata class AbundanceTable: """ Represents an abundance table and contains common function to perform on the object. This class is made from an abundance data file. What is expected is a text file delimited by a character (which is given to the object). The first column is expected to be the id column for each of the rows. Metadata is expected before measurement data. Columns are samples and rows are features (bugs). This object is currently not hashable. """ def __init__(self, npaAbundance, dictMetadata, strName, strLastMetadata, rwmtRowMetadata = None, dictFileMetadata = None, lOccurenceFilter = None, cFileDelimiter = ConstantsBreadCrumbs.c_cTab, cFeatureNameDelimiter = "|"): """ Constructor for an abundance table. :param npaAbundance: Structured Array of abundance data (Row=Features, Columns=Samples) :type: Numpy Structured Array abundance data (Row=Features, Columns=Samples) :param dictMetadata: Dictionary of metadata {"String ID":["strValue","strValue","strValue","strValue","strValue"]} :type: Dictionary Dictionary :param npaRowMetdata Structured Array of row (feature) metadata (optional) :type: Numpy Structured Array abundance data (Row=Features, Columns=Feature metadata) :param strName: The name of the metadata that serves as the ID for the columns (For example a sample ID) :type: string :param strLastMetadata: The string last metadata name :type: string :param lOccurenceFilter: List of integers used in an occurence filter. 
[Min abundance, Min sample] :type: List of integers :param cFileDelimiter: Character used as the delimiter of the file that is read in to create the abundance table. Will also be used to write the abudance table file to a file to keep file consistency. :type: Character delimiter for reading the data in (default = TAB) :param cFeatureNameDelimiter: Character used as the delimiter of the feature names (column 1). This is useful if the name are complex, for instance consensus lineages in metagenomics. :type: Character delimiter for feature names (default = |) """ ### File Metadata #Date self.dateCreationDate = dictFileMetadata.get(ConstantsBreadCrumbs.c_strDateKey,None) if dictFileMetadata else None #Indicates if the table has been filtered and how self._strCurrentFilterState = "" #The delimiter from the source file self._cDelimiter = cFileDelimiter #The feature name delimiter self._cFeatureDelimiter = cFeatureNameDelimiter #File type self.strFileFormatType = dictFileMetadata.get(ConstantsBreadCrumbs.c_strFormatKey,None) if dictFileMetadata else None #File generation source self.strFileGenerationSource = dictFileMetadata.get(ConstantsBreadCrumbs.c_strSourceKey,None) if dictFileMetadata else None #File type self.strFileType = dictFileMetadata.get(ConstantsBreadCrumbs.c_strTypekey,None) if dictFileMetadata else None #File url self.strFileURL = dictFileMetadata.get(ConstantsBreadCrumbs.c_strURLKey,None) if dictFileMetadata else None #The id of the file self.strId = dictFileMetadata.get(ConstantsBreadCrumbs.c_strIDKey,None) if dictFileMetadata else None #The lastmetadata name (which should be preserved when writing the file) # Can be a None if biom file is read in. self._strLastMetadataName = strLastMetadata #The original number of features in the table self._iOriginalFeatureCount = -1 #The name of the object relating to the file it was read from or would have been read from if it exists #Keeps tract of changes to the file through the name #Will be used to write out the object to a file as needed self._strOriginalName = strName #The original number of samples in the table self._iOriginalSampleCount = -1 #Data sparsity type self.fSparseMatrix = dictFileMetadata.get(ConstantsBreadCrumbs.c_strSparsityKey,False) if dictFileMetadata else False ### Data metadata #The column (sample) metdata self._dictTableMetadata = dictMetadata #The row (feature) metadata (Row Metadata object) self.rwmtRowMetadata = rwmtRowMetadata ### Data #The abundance data self._npaFeatureAbundance = npaAbundance ### Logistical #Clade prefixes for biological samples self._lsCladePrefixes = ["k__","p__","c__","o__","f__","g__","s__"] #This is not a hashable object self.__hash__ = None ### Prep the object self._fIsNormalized = self._fIsSummed = None #If contents is not a false then set contents to appropriate objects # Checking to see if the data is normalized, summed and if we need to run a filter on it. 
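		# Heuristics used below: the table is treated as normalized when no abundance value
		# exceeds 1, and as summed when the terminal (leaf) features are fewer than the total
		# features (i.e. ancestor clades are present as rows of their own).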
if ( self._npaFeatureAbundance != None ) and self._dictTableMetadata: self._iOriginalFeatureCount = self._npaFeatureAbundance.shape[0] self._iOriginalSampleCount = len(self.funcGetSampleNames()) self._fIsNormalized = ( max( [max( list(a)[1:] or [0] ) for a in self._npaFeatureAbundance] or [0] ) <= 1 ) lsLeaves = AbundanceTable.funcGetTerminalNodesFromList( [a[0] for a in self._npaFeatureAbundance], self._cFeatureDelimiter ) self._fIsSummed = ( len( lsLeaves ) != len( self._npaFeatureAbundance ) ) #Occurence filtering #Removes features that do not have a given level iLowestAbundance in a given amount of samples iLowestSampleOccurence if ( not self._fIsNormalized ) and lOccurenceFilter: iLowestAbundance, iLowestSampleOccurrence = lOccurenceFilter self.funcFilterAbundanceBySequenceOccurence( iLowestAbundance, iLowestSampleOccurrence ) # else: # sys.stderr.write( "Abundance or metadata was None, should be atleast an empty object\n" ) @staticmethod def funcMakeFromFile(xInputFile, cDelimiter = ConstantsBreadCrumbs.c_cTab, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None, lOccurenceFilter = None, cFeatureNameDelimiter="|", xOutputFile = None, strFormat = None): """ Creates an abundance table from a table file. :param xInputFile: Path to input file. :type: String String path. :param cDelimiter: Delimiter for parsing the input file. :type: Character Character :param sMetadataID: String ID that is a metadata row ID (found on the first column) and used as an ID for samples :type: String String ID :param sLastRowMetadata: The id of the last (most right column) row metadata :type: String String ID :param sLastMetadata: The ID of the metadata that is the last metadata before measurement or feature rows. :type: String String ID :param lOccurenceFilter: List of integers used in an occurence filter. [Min abundance, Min sample] :type: List of integers :param cFeatureNameDelimiter: Used to parse feature (bug) names if they are complex. For example if they are consensus lineages and contain parent clade information. :type: Character Delimiting letter :param xOutputFile: File to output the abundance table which was read in. :type: FileStream or String file path :return AbundanceTable: Will return an AbundanceTable object on no error. Returns False on error. 
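
		Example (a minimal sketch; the file name and metadata IDs are illustrative):
			abndTable = AbundanceTable.funcMakeFromFile( "abundance.pcl", cDelimiter = "\t", sMetadataID = "ID", sLastMetadata = "STSite" )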
""" #Get output file and remove if existing outputFile = open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile ################################################################################# # Check if file is a biom file - if so invoke the biom routine # ################################################################################# strFileName = xInputFile if isinstance(xInputFile, str) else xInputFile.name # Determine the file read function by file extension if strFileName.endswith(ConstantsBreadCrumbs.c_strBiomFile) or (strFormat == ConstantsBreadCrumbs.c_strBiomFile): BiomCommonArea = AbundanceTable._funcBiomToStructuredArray(xInputFile) if BiomCommonArea: lContents = [BiomCommonArea[ConstantsBreadCrumbs.c_BiomTaxData], BiomCommonArea[ConstantsBreadCrumbs.c_Metadata], BiomCommonArea[ ConstantsBreadCrumbs.c_dRowsMetadata], BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo] ] # Update last metadata and id if given if not sLastMetadata: strLastMetadata = BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] else: # return false on failure lContents = False elif( strFileName.endswith(ConstantsBreadCrumbs.c_strPCLFile) or (strFormat == ConstantsBreadCrumbs.c_strPCLFile) ): #Read in from text file to create the abundance and metadata structures lContents = AbundanceTable._funcTextToStructuredArray(xInputFile=xInputFile, cDelimiter=cDelimiter, sMetadataID = sMetadataID, sLastMetadataRow = sLastMetadataRow, sLastMetadata = sLastMetadata, ostmOutputFile = outputFile) else: print("I do not understand the format to read and write the data as, please use the correct file extension or indicate a type.") return( false ) #If contents is not a false then set contents to appropriate objects return AbundanceTable(npaAbundance=lContents[0], dictMetadata=lContents[1], strName=str(xInputFile), strLastMetadata=sLastMetadata, rwmtRowMetadata = lContents[2], dictFileMetadata = lContents[3], lOccurenceFilter = lOccurenceFilter, cFileDelimiter=cDelimiter, cFeatureNameDelimiter=cFeatureNameDelimiter) if lContents else False #Testing Status: Light happy path testing @staticmethod def funcCheckRawDataFile(strReadDataFileName, iFirstDataIndex = -1, sLastMetadataName = None, lOccurenceFilter = None, strOutputFileName = "", cDelimiter = ConstantsBreadCrumbs.c_cTab): """ Check the input abundance table. Currently reduces the features that have no occurence. Also inserts a NA for blank metadata and a 0 for blank abundance data. Gives the option to filter features through an occurence filter (a feature must have a level of abundance in a minimal number of samples to be included). Either iFirstDataIndex or sLastMetadataName must be given :param strReadDataFileName: File path of file to read and check. :type: String File path. :param iFirstDataIndex: First (row) index of data not metadata in the abundance file. :type: Integer Index starting at 0. :param sLastMetadataName: The ID of the last metadata in the file. Rows of measurements should follow this metadata. :type: String :param lOccurenceFilter: The lowest number of occurences in the lowest number of samples needed for a feature to be kept :type: List[2] List length 2 [lowest abundance (not normalized), lowest number of samples to occur in] (eg. [2.0,2.0]) :param strOutputFileName: File path of out put file. :type: String File path. :param cDelimiter: Character delimiter for reading and writing files. :type: Character Delimiter. :return Output Path: Output path for written checked file. 
""" #Validate parameters if (iFirstDataIndex == -1) and (sLastMetadataName == None): print "AbundanceTable:checkRawDataFile::Error, either iFirstDataIndex or sLastMetadataNamemust be given." return False #Get output file and remove if existing outputFile = strOutputFileName if not strOutputFileName: outputFile = os.path.splitext(strReadDataFileName)[0]+ConstantsBreadCrumbs.OUTPUT_SUFFIX #Read input file lines #Drop blank lines readData = "" with open(strReadDataFileName,'rU') as f: readData = f.read() readData = filter(None,readData.split(ConstantsBreadCrumbs.c_strEndline)) #Read the length of each line and make sure there is no jagged data #Also hold row count for the metadata iLongestLength = len(readData[0].split(cDelimiter)) iMetadataRow = -1 if not sLastMetadataName: sLastMetadataName = "None" for iIndex, strLine in enumerate(readData): sLineElements = strLine.split(cDelimiter) if sLineElements[0] == sLastMetadataName: iMetadataRow = iIndex iLongestLength = max(iLongestLength, len(sLineElements)) #If not already set, set iFirstDataIndex if iFirstDataIndex < 0: iFirstDataIndex = iMetadataRow + 1 #Used to substitute . to - reSubPeriod = re.compile('\.') #File writer with open(outputFile,'w') as f: #Write metadata #Empty data is changed to a default #Jagged ends are filled with a default for strDataLine in readData[:iFirstDataIndex]: lsLineElements = strDataLine.split(cDelimiter) for iindex, sElement in enumerate(lsLineElements): if not sElement.strip(): lsLineElements[iindex] = ConstantsBreadCrumbs.c_strEmptyDataMetadata if len(lsLineElements) < iLongestLength: lsLineElements = lsLineElements + ([ConstantsBreadCrumbs.c_strEmptyDataMetadata]*(iLongestLength-len(lsLineElements))) f.write(cDelimiter.join(lsLineElements)+ConstantsBreadCrumbs.c_strEndline) #For each data line in the table for line in readData[iFirstDataIndex:]: writeToFile = False cleanLine = list() #Break line into delimited elements lineElements = line.split(cDelimiter) #Clean feature name sCleanFeatureName = reSubPeriod.sub("-",lineElements[0]) #For each element but the first (taxa name) #Element check to see if not == zero #If so add to output for element in lineElements[1:]: if(element.strip() in string.whitespace): cleanLine.append(ConstantsBreadCrumbs.c_strEmptyAbundanceData) #Set abundance of 0 but do not indicate the line should be saved elif(element == "0"): cleanLine.append(element) #If an abundance is found set the line to be saved. else: cleanLine.append(element) writeToFile = True #Occurence filtering #Removes features that do not have a given level iLowestAbundance in a given amount of samples iLowestSampleOccurence if lOccurenceFilter: iLowestAbundance, iLowestSampleOccurence = lOccurenceFilter if iLowestSampleOccurence > sum([1 if float(sEntry) >= iLowestAbundance else 0 for sEntry in cleanLine]): writeToFile = False #Write to file if writeToFile: f.write(sCleanFeatureName+cDelimiter+cDelimiter.join(cleanLine)+ConstantsBreadCrumbs.c_strEndline) return outputFile def __repr__(self): """ Represent or print object. """ return "AbundanceTable" def __str__(self): """ Create a string representation of the Abundance Table. 
""" return "".join(["Sample count:", str(len(self._npaFeatureAbundance.dtype.names[1:])), os.linesep+"Feature count:", str(len(self._npaFeatureAbundance[self._npaFeatureAbundance.dtype.names[0]])), os.linesep+"Id Metadata:", self._npaFeatureAbundance.dtype.names[0], os.linesep+"Metadata ids:", str(self._dictTableMetadata.keys()), os.linesep+"Metadata count:", str(len(self._dictTableMetadata.keys())), os.linesep+"Originating source:",self._strOriginalName, os.linesep+"Original feature count:", str(self._iOriginalFeatureCount), os.linesep+"Original sample count:", str(self._iOriginalSampleCount), os.linesep+"Is normalized:", str(self._fIsNormalized), os.linesep+"Is summed:", str(self._fIsSummed), os.linesep+"Current filtering state:", str(self._strCurrentFilterState), os.linesep+"Feature delimiter:", self._cFeatureDelimiter, os.linesep+"File delimiter:",self._cDelimiter]) def __eq__(self, objOther): """ Check if an object is equivalent in data to this object Check to make sure that objOther is not None Check to make sure objOther is the correct class type Check to make sure self and other internal data are the same (exclusing file name) Check data and make sure the npa arrays are the same Check the metdata to make sure the dicts are the same (will need to sort the keys of the dicts before comparing, they do not guarentee any order. """ # Check for none if objOther is None: return False #Check for object type if isinstance(objOther,AbundanceTable) != True: return False #Check feature delimiter if self.funcGetFeatureDelimiter() != objOther.funcGetFeatureDelimiter(): return False #Check file delimiter if self.funcGetFileDelimiter() != objOther.funcGetFileDelimiter(): return False #************************************************** #* Commented out * #************************************************** #Check name - Commented out by GW on 2013/09/14 because #If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same #Check name #if self.funcGetName() != objOther.funcGetName(): #return False #Check sample metadata #Go through the metadata result1 = self.funcGetMetadataCopy() result2 = objOther.funcGetMetadataCopy() if sorted(result1.keys()) != sorted(result2.keys()): return False for strKey in result1.keys(): if strKey not in result2: return False if result1[strKey] != result2[strKey]: return False #TODO check the row (feature) metadata #TODO check the file metadata #Check the ID if self.funcGetFileDelimiter() != objOther.funcGetFileDelimiter(): return False #Check the date if self.dateCreationDate != objOther.dateCreationDate: return False #Check the format if self.strFileFormatType != objOther.strFileFormatType: return False #************************************************** #* Commented out * #************************************************** #Check source - Commented out by GW on 2013/09/14 because #If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same #Check the source #if self.strFileGenerationSource != objOther.strFileGenerationSource: #return False #Check the type if self.strFileType != objOther.strFileType: return False #Check the URL if self.strFileURL != objOther.strFileURL: return False #Check data #TODO go through the data #TODO also check the data type result1 = self.funcGetAbundanceCopy() result2 = objOther.funcGetAbundanceCopy() if len(result1) != len(result2): return False sorted_result1 = sorted(result1, key=lambda tup: tup[0]) 
sorted_result2 = sorted(result2, key=lambda tup: tup[0]) if sorted_result1 != sorted_result2 : return False #************************************************** #* Commented out * #************************************************** #Check AbundanceTable.__str__(self) - Commented out by GW on 2013/09/14 because #If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same #Check string representation #if AbundanceTable.__str__(self) != AbundanceTable.__str__(objOther): #return False #Check if sample ids are the same and in the same order if self.funcGetSampleNames() != objOther.funcGetSampleNames(): return False return True def __ne__(self, objOther): return not self == objOther #Testing Status: Light happy path testing #TODO: Tim change static to class methods @staticmethod def _funcTextToStructuredArray(xInputFile = None, cDelimiter = ConstantsBreadCrumbs.c_cTab, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None, ostmOutputFile = None): """ Private method Used to read in a file that is samples (column) and taxa (rows) into a structured array. :param xInputFile: File stream or path to input file. :type: String File stream or string path. :param cDelimiter: Delimiter for parsing the input file. :type: Character Character. :param sMetadataID: String ID that is a metadata row ID (found on the first column) and used as an ID for samples. If not given it is assumed to be position 0 :type: String String ID :param sLastMetadataRow: String ID that is the last row metadat id (id of the most right column with row/feature metadata) :type: String String ID :param sLastMetadata: The ID of the metadata that is the last metadata before measurement or feature rows. :type: String String ID :param ostmOutputFile: Output File to write to if needed. None does not write the file. :type: FileStream or String :return [taxData,metadata,rowmetadata]: Numpy Structured Array of abundance data and dictionary of metadata. Metadata is a dictionary as such {"ID", [value,value,values...]} Values are in the order thety are read in (and the order of the sample names). ID is the first column in each metadata row. 
rowmetadata is an optional RowMetadata object (can be None if not made).
		The rowmetadata and taxData row ids should match.
		The last dict is a collection of BIOM file parameters produced when converting from a BIOM file.
		[Numpy structured Array, Dictionary, Numpy structured array, dict]
		"""

		# Open file from a stream or file path
		istmInput = open( xInputFile, 'rU' ) if isinstance(xInputFile, str) else xInputFile

		# Flag that when incremented will switch from metadata parsing to data parsing
		iFirstDataRow = -1

		# Sample id row
		namesRow = None

		# Row metadata names
		lsRowMetadataIDs = None

		# Index of the last row metadata
		iIndexLastMetadataRow = None

		# Holds metadata {ID:[list of values]}
		metadata = dict()

		# Holds the data measurements [(tuple of values)]
		dataMatrix = []

		# Holds row metadata { sID : [ list of values ] }
		dictRowMetadata = {}

		# Positional index
		iIndex = -1

		# File handle
		csvw = None

		# Read in files
		if ostmOutputFile:
			csvw = csv.writer( open(ostmOutputFile,'w') if isinstance(ostmOutputFile, str) else ostmOutputFile, csv.excel_tab, delimiter = cDelimiter )

		# For each line in the file, and assume the tax id is the first element and the data follows
		for lsLineElements in csv.reader( istmInput, dialect = csv.excel_tab, delimiter = cDelimiter ):
			iIndex += 1
			taxId, sampleReads = lsLineElements[0], lsLineElements[1:]

			# Read through data measurements
			# Process them as a list of tuples (needed for structured array)
			if iFirstDataRow > 0:
				try:
					# Parse the sample reads, removing row metadata and storing row metadata if it exists
					if lsRowMetadataIDs:
						# Build expected dict for row metadata dictionary {string feature id: {'metadata': {metadatakey: [list of metadata values]}}}
						dictFeature = dict([ [sID, [sKey]] for sID, sKey in zip( lsRowMetadataIDs, sampleReads[ 0 : iIndexLastMetadataRow ]) ])
						if len( dictFeature ):
							dictRowMetadata[ taxId ] = { ConstantsBreadCrumbs.c_metadata_lowercase: dictFeature }
						dataMatrix.append(tuple([taxId] + [( float(s) if s.strip( ) else 0 ) for s in sampleReads[ iIndexLastMetadataRow: ]]))
					else:
						dataMatrix.append(tuple([taxId] + [( float(s) if s.strip( ) else 0 ) for s in sampleReads]))
				except ValueError:
					sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, non-numerical value on data row. File:" + str(xInputFile) + " Row:" + str(lsLineElements) + "\n" )
					return False
			# Go through study measurements
			else:
				# Read in metadata values, if the entry is blank then give it the default empty metadata value.
				for i, s in enumerate( sampleReads ):
					if not s.strip( ):
						sampleReads[i] = ConstantsBreadCrumbs.c_strEmptyDataMetadata

				# If no id metadata (sample ids) is given then the first row is assumed to be the id row, otherwise look for the id for the metadata.
				# Add the metadata to the containing dict
				if ( ( not sMetadataID ) and ( iIndex == 0 ) ) or ( taxId == sMetadataID ):
					namesRow = lsLineElements

					# Remove the row metadata ids, these names are for the column ID and the samples ids
					if sLastMetadataRow:
						iIndexLastMetadataRow = lsLineElements.index(sLastMetadataRow)
						lsRowMetadataIDs = namesRow[ 1 : iIndexLastMetadataRow + 1 ]
						namesRow = [ namesRow[ 0 ] ] + namesRow[ iIndexLastMetadataRow + 1: ]

						# If the sample metadata dictionary already has entries then remove the row metadata info from it.
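						# (Metadata rows read before this header row still contain the
						# row-metadata columns; slicing at iIndexLastMetadataRow below drops them.)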
if len( metadata ) and len( lsRowMetadataIDs ): for sKey, lsValues in metadata.items(): metadata[ sKey ] = lsValues[ iIndexLastMetadataRow: ] # Set the metadata without row metadata entries metadata[taxId] = sampleReads[ iIndexLastMetadataRow: ] if (lsRowMetadataIDs and len( lsRowMetadataIDs )) else sampleReads # If the last metadata was just processed switch to data processing # If the last metadata name is not given it is assumed that there is only one metadata if ( not sLastMetadata ) or ( taxId == sLastMetadata ): iFirstDataRow = iIndex + 1 # If writing out the data write back out the line read in. # This happens at the end so that the above cleaning is captured and written. if csvw: csvw.writerow( [taxId] + sampleReads ) if sLastMetadata and ( not dataMatrix ): sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, did not find the row for the last metadata ID. File:" + str(xInputFile) + " Identifier:" + sLastMetadata + "\n" ) return False # Make sure the names are found if namesRow == None: sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, did not find the row for the unique sample/column. File:" + str(xInputFile) + " Identifier:" + sMetadataID + "\n" ) return False # Now we know the longest taxId we can define the first column holding the tax id # Gross requirement of Numpy structured arrays, a = ASCII followed by max # of characters (as a string) longestTaxId = max( len(a[0]) for a in dataMatrix ) dataTypeVector = [(namesRow[0],'a' + str(longestTaxId*2))] + [(s, "f4") for s in namesRow[1:]] # Create structured array taxData = np.array(dataMatrix,dtype=np.dtype(dataTypeVector)) # Returns a none currently because the PCL file specification this originally worked on did not have feature metadata # Can be updated in the future. # [Data (structured array), column metadata (dict), row metadata (structured array), file metadata (dict)] return [taxData, metadata, RowMetadata(dictRowMetadata = dictRowMetadata, lsRowMetadataIDs = lsRowMetadataIDs), { ConstantsBreadCrumbs.c_strIDKey:ConstantsBreadCrumbs.c_strDefaultPCLID, ConstantsBreadCrumbs.c_strDateKey:str(date.today()), ConstantsBreadCrumbs.c_strFormatKey:ConstantsBreadCrumbs.c_strDefaultPCLFileFormateType, ConstantsBreadCrumbs.c_strSourceKey:ConstantsBreadCrumbs.c_strDefaultPCLGenerationSource, ConstantsBreadCrumbs.c_strTypekey:ConstantsBreadCrumbs.c_strDefaultPCLFileTpe, ConstantsBreadCrumbs.c_strURLKey:ConstantsBreadCrumbs.c_strDefaultPCLURL, ConstantsBreadCrumbs.c_strSparsityKey:ConstantsBreadCrumbs. c_fDefaultPCLSparsity}] # def funcAdd(self,abndTwo,strFileName=None): # """ # Allows one to add an abundance table to an abundance table. They both must be the same state of normalization or summation # or they will be summed or normalized if one of the two are. 
# # :param abndTwo: AbundanceTable object 2 # :type: AbundanceTable # :return AbudanceTable: # """ # # #Check summation and normalization # if(self.funcIsSummed() or abndTwo.funcIsSummed()): # self.funcSum() # abndTwo.funcSum() # if(self.funcIsNormalized() or abndTwo.funcIsNormalized()): # self.funcNormalize() # abndTwo.funcNormalize() # # #Normalize Feature names # #Get if the abundance tables have clades # fAbndInputHasClades = self.funcHasFeatureHierarchy() # fAbndCompareHasClades = abndTwo.funcHasFeatureHierarchy() # # if(fAbndInputHasClades or fAbndCompareHasClades): # #If feature delimiters do not match, switch # if not self.funcGetFeatureDelimiter() == abndTwo.funcGetFeatureDelimiter(): # abndTwo.funcSetFeatureDelimiter(self.funcGetFeatureDelimiter()) # # #Add prefixes if needed. # self.funcAddCladePrefixToFeatures() # abndTwo.funcAddCladePrefixToFeatures() # # #Get feature Names # lsFeatures1 = self.funcGetFeatureNames() # lsFeatures2 = abndTwo.funcGetFeatureNames() # # #Make one feature name list # lsFeaturesCombined = list(set(lsFeatures1+lsFeature2)) # # #Add samples by features (Use 0.0 for empty data features, use NA for empty metadata features) # # # #Combine metadata # dictMetadata1 = self.funcGetMetadataCopy() # dictMetadata2 = abndTwo.funcGetMetadataCopy() # # #Get first table metadata and add NA for metadata it is missing for the length of the current metadata # lsMetadataOnlyInTwo = list(set(dictMetadata2.keys())-set(dictMetadata1.keys())) # dictCombinedMetadata = dictMetadata1 # lsEmptyMetadata = ["NA" for i in xrange(self.funcGetSampleCount())] # for sKey in lsMetadataOnlyInTwo: # dictCombinedMetadata[sKey]=lsEmptyMetadata # #Add in the other metadata dictionary # lsCombinedKeys = dictCombinedMetadata.keys() # lsEmptyMetadata = ["NA" for i in xrange(abndTwo.funcGetSampleCount())] # for sKey in lsCombinedKeys(): # if sKey in dictMetadata2: # dictCombinedMetadata[sKey] = dictCombinedMetadata[sKey]+dictMetadata2[sKey] # else: # dictCombinedMetadata[sKey] = dictCombinedMetadata[sKey]+lsEmptyMetadata # # #Make Abundance table # return AbundanceTable(npaAbundance=npaAbundance, # dictMetadata = dictCombinedMetadata, # strName = strFileName if strFileName else os.path.splitext(self)[0]+"_combined_"+os.path.splitext(abndTwo)[0], # strLastMetadata = self.funcGetLastMetadataName(), # cFileDelimiter = self.funcGetFileDelimiter(), cFeatureNameDelimiter=self.funcGetFeatureDelimiter()) #TODO This does not adjust for sample ordering, needs to def funcAddDataFeature(self, lsNames, npdData): """ Adds a data or group of data to the underlying table. Names should be in the order of the data Each row is considered a feature (not sample). :param lsNames: Names of the features being added to the data of the table :type: List List of string names :param npdData: Rows of features to add to the table :type: Numpy array accessed by row. 
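        An illustrative usage sketch (the feature name and values below are
        hypothetical, not from the original docs; npdData must supply one value
        per existing sample, here a 3-sample table):

            npdNewRows = np.array([(0.1, 0.2, 0.7)])
            abndTable.funcAddDataFeature(["Bacteria|NewClade"], npdNewRows)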
""" if ( self._npaFeatureAbundance == None ): return False # Check number of input data rows iDataRows = npdData.shape[0] if (len(lsNames) != iDataRows): print "Error:The names and the rows of data features to add must be of equal length" # Grow the array by the neccessary amount and add the new rows iTableRowCount = self.funcGetFeatureCount() iRowElementCount = self.funcGetSampleCount() self._npaFeatureAbundance.resize(iTableRowCount+iDataRows) for iIndexData in xrange(iDataRows): self._npaFeatureAbundance[iTableRowCount+iIndexData] = tuple([lsNames[iIndexData]]+list(npdData[iIndexData])) return True #TODO This does not adjust for sample ordering, needs to def funcAddMetadataFeature(self,lsNames,llsMetadata): """ Adds metadata feature to the underlying table. Names should be in the order of the lists of metadata Each internal list is considered a metadata and paired to a name """ if ( self._dictTableMetadata == None ): return False # Check number of input data rows iMetadataCount = len(llsMetadata) if (len(lsNames) != iMetadataCount): print "Error:The names and the rows of metadata features to add must be of equal length" # Add the metadata for tpleMetadata in zip(lsNames,llsMetadata): self._dictTableMetadata[tpleMetadata[0]]=tpleMetadata[1] return True #2 test Cases def funcSetFeatureDelimiter(self, cDelimiter): """ Changes the feature delimiter to the one provided. Updates the feature names. :param cDelimiter: Character feature delimiter :type: Character :return Boolean: Indicator of success or not (false) """ if ( self._npaFeatureAbundance == None ): return False cDelimiterCurrent = self.funcGetFeatureDelimiter() if ( not cDelimiter or not cDelimiterCurrent): return False #Make new feature names lsNewFeatureNames = [sFeatureName.replace(cDelimiterCurrent,cDelimiter) for sFeatureName in self.funcGetFeatureNames()] #Update new feature names to abundance table if (not self.funcGetIDMetadataName() == None): self._npaFeatureAbundance[self.funcGetIDMetadataName()] = np.array(lsNewFeatureNames) #Update delimiter self._cFeatureDelimiter = cDelimiter return True #Happy path tested def funcGetSampleNames(self): """ Returns the sample names (IDs) contained in the abundance table. :return Sample Name: A List of sample names indicated by the metadata associated with the sMetadataId given in table creation. A list of string names or empty list on error as well as no underlying table. """ return self._npaFeatureAbundance.dtype.names[1:] if ( self._npaFeatureAbundance != None ) else [] #Happy Path Tested def funcGetIDMetadataName(self): """ Returns the metadata id. :return ID: The metadata id (the sample Id). Returns none on error. """ return self._npaFeatureAbundance.dtype.names[0] if ( self._npaFeatureAbundance != None ) else None #Happy path tested def funcGetAbundanceCopy(self): """ Returns a deep copy of the abundance table. :return Numpy Structured Array: The measurement data in the Abundance table. Can use sample names to access each column of measurements. Returns none on error. """ return self._npaFeatureAbundance.copy() if ( self._npaFeatureAbundance != None ) else None #Happy path tested def funcGetAverageAbundancePerSample(self, lsTargetedFeatures): """ Averages feature abundance within a sample. :param lsTargetedFeatures: String names of features to average :type: List of string names of features which are measured :return List: List of lists or boolean (False on error). 
One internal list per sample indicating the sample and the feature's average abudance [[sample,average abundance of selected taxa]] or False on error """ #Sample rank averages [[sample,average abundance of selected taxa]] sampleAbundanceAverages = [] sampleNames = self.funcGetSampleNames() allTaxaNames = self.funcGetFeatureNames() #Get an abundance table compressed to features of interest abndReducedTable = self.funcGetFeatureAbundanceTable(lsTargetedFeatures) if abndReducedTable == None: return False #If the taxa to be selected are not in the list, Return nothing and log lsMissing = [] for sFeature in lsTargetedFeatures: if not sFeature in allTaxaNames: lsMissing.append(sFeature) else: #Check to make sure the taxa of interest is not average abundance of 0 if not abndReducedTable.funcGetFeatureSumAcrossSamples(sFeature): lsMissing.append(sFeature) if len(lsMissing) > 0: sys.stderr.write( "Could not find features for averaging: " + str(lsMissing) ) return False #For each sample name get average abundance for sName in sampleNames: npaFeaturesSample = abndReducedTable.funcGetSample(sName) sampleAbundanceAverages.append([sName,sum(npaFeaturesSample)/float(len(npaFeaturesSample))]) #Sort based on average return sorted(sampleAbundanceAverages, key = lambda sampleData: sampleData[1], reverse = True) #Happy path tested 1 def funcGetAverageSample(self): """ Returns the average sample of the abundance table. This average sample is made of the average of each feature. :return list: A list of averages in the order of the feature names. """ ldAverageSample = [] #If there are no samples then return empty list. if len(self.funcGetSampleNames()) < 1: return ldAverageSample #If there are samples return the average of each feature in the order of the feature names. for sFeature in self._npaFeatureAbundance: npFeaturesAbundance = list(sFeature)[1:] ldAverageSample.append(sum(npFeaturesAbundance)/float(len(npFeaturesAbundance))) return ldAverageSample #Tested 2 cases def funcHasFeatureHierarchy(self): """ Returns an indicator of having a hierarchy in the features indicated by the existance of the feature delimiter. :return Boolean: True (Has a hierarchy) or False (Does not have a hierarchy) """ if ( self._npaFeatureAbundance == None ): return None cDelimiter = self.funcGetFeatureDelimiter() if ( not cDelimiter ): return False #For each feature name, check to see if the delimiter is in the name for sFeature in self.funcGetFeatureNames(): if cDelimiter in sFeature: return True return False def funcGetCladePrefixes(self): """ Returns the list of prefixes to use on biological sample hierarchy :return List: List of strings """ return self._lsCladePrefixes #3 test cases def funcAddCladePrefixToFeatures(self): """ As a standardized clade prefix to indicate biological clade given hierarchy. Will not add a prefix to already prefixes feature names. Will add prefix to feature names that do not have them or clades in a feature name that do not have them while leaving ones that do as is. 
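        A hypothetical illustration (the actual prefixes are whatever
        funcGetCladePrefixes() returns; "k__"/"p__" below are assumptions):

            abndTable.funcAddCladePrefixToFeatures()
            # "Bacteria|Firmicutes"       -> "k__Bacteria|p__Firmicutes"
            # "k__Bacteria|p__Firmicutes" -> unchanged, already prefixed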
:return Boolean: True (Has a hierarchy) or False (Does not have a hierarchy) """ if ( self._npaFeatureAbundance == None ): return None cDelimiter = self.funcGetFeatureDelimiter() lsPrefixes = self.funcGetCladePrefixes() iPrefixLength = len(lsPrefixes) if ( not cDelimiter ): return False #Append prefixes to feature names lsUpdatedFeatureNames = [] lsFeatureNames = self.funcGetFeatureNames() for sFeatureName in lsFeatureNames: lsClades = sFeatureName.split(cDelimiter) #If there are not enough then error if(len(lsClades) > iPrefixLength): print "Error:: Too many clades given to be biologically meaningful" return False lsUpdatedFeatureNames.append(cDelimiter.join([lsPrefixes[iClade]+lsClades[iClade] if not(lsClades[iClade][0:len(lsPrefixes[iClade])]==lsPrefixes[iClade]) else lsClades[iClade] for iClade in xrange(len(lsClades))])) #Update new feature names to abundance table if not self.funcGetIDMetadataName() == None: self._npaFeatureAbundance[self.funcGetIDMetadataName()] = np.array(lsUpdatedFeatureNames) return True #Happy Path Tested def funcGetFeatureAbundanceTable(self, lsFeatures): """ Returns a copy of the current abundance table with the abundance of just the given features. :param lsFeatures: String Feature IDs that are kept in the compressed abundance table. :type: List of strings Feature IDs (found as the first entry of a filter in the input file. :return AbundanceTable: A compressed version of the abundance table. On an error None is returned. """ if ( self._npaFeatureAbundance == None ) or ( lsFeatures == None ): return None #Get a list of boolean indicators that the row is from the features list lfFeatureData = [sRowID in lsFeatures for sRowID in self.funcGetFeatureNames()] #compressed version as an Abundance table lsNamePieces = os.path.splitext(self._strOriginalName) abndFeature = AbundanceTable(npaAbundance=np.compress(lfFeatureData, self._npaFeatureAbundance, axis = 0), dictMetadata = self.funcGetMetadataCopy(), strName = lsNamePieces[0] + "-" + str(len(lsFeatures)) +"-Features"+lsNamePieces[1], strLastMetadata=self.funcGetLastMetadataName(), cFileDelimiter = self.funcGetFileDelimiter(), cFeatureNameDelimiter= self.funcGetFeatureDelimiter()) #Table is no longer normalized abndFeature._fIsNormalized = False return abndFeature #Happy path tested def funcGetFeatureDelimiter(self): """ The delimiter of the feature names (For example to use on concensus lineages). :return Character: Delimiter for the feature name pieces if it is complex. """ return self._cFeatureDelimiter #Happy path tested def funcGetFeatureCount(self): """ Returns the current feature count. :return Count: Returns the int count of features in the abundance table. Returns None on error. """ return self._npaFeatureAbundance.shape[0] if not self._npaFeatureAbundance is None else 0 #Happy path tested def funcGetFeatureSumAcrossSamples(self,sFeatureName): """ Returns float sum of feature values across the samples. :param sFeatureName: The feature ID to get the sum from. :type: String. :return Double: Sum of one feature across samples. """ return sum(self.funcGetFeature(sFeatureName)) def funcGetFeature(self,sFeatureName): """ Returns feature values across the samples. :param sFeatureName: The feature ID to get the sum from. :type: String. :return Double: Feature across samples. """ for sFeature in self._npaFeatureAbundance: if sFeature[0] == sFeatureName: return list(sFeature)[1:] return None #Happy path tested def funcGetFeatureNames(self): """ Return the feature names as a list. 
:return Feature Names: List of feature names (or IDs) as strings. As an error returns empty list. """ if (not self._npaFeatureAbundance == None): return self._npaFeatureAbundance[self.funcGetIDMetadataName()] return [] #Happy path tested def funcGetFileDelimiter(self): """ The delimiter of the file the data was read from and which is also the delimiter which would be used to write the data to a file. :return Character: Delimiter for the parsing and writing the file. """ return self._cDelimiter def funcGetLastMetadataName(self): """ Get the last metadata name that seperates abundance and metadata measurements. :return string: Metadata name """ return self._strLastMetadataName #Happy path tested def funcGetSample(self,sSampleName): """ Return a copy of the feature measurements of a sample. :param sSampleName: Name of sample to return. :type: String :return Sample: Measurements Feature measurements of a sample. Empty numpy array returned on error. """ if (not self._npaFeatureAbundance == None): return self._npaFeatureAbundance[sSampleName].copy() return np.array([]) #Happy path tested def funcGetMetadata(self, strMetadataName): """ Returns a list of metadata that is associated with the given metadata name (id). :param strMetadataName: String metadata ID to be returned :type: String ID :return Metadata: List of metadata """ return copy.deepcopy( self._dictTableMetadata.get(strMetadataName) ) \ if self._dictTableMetadata else None #Happy path tested def funcGetMetadataCopy(self): """ Returns a deep copy of the metadata. :return Metadata copy: {"ID":[value,value...]} """ return copy.deepcopy(self._dictTableMetadata) #Happy path tested def funcGetName(self): """ Returns the name of the object which is the file name that generated it. If the object was generated from an Abundance Table (for instance through stratification) the name is still in the form of a file that could be written to which is informative of the changes that have occurred on the data set. :return string: Name """ return self._strOriginalName #Happy path tested. could do more def funcGetTerminalNodes(self): """ Returns the terminal nodes given the current feature names in the abundance table. The features must contain a consensus lineage or all will be returned. :return List: List of strings of the terminal nodes given the abundance table. """ return AbundanceTable.funcGetTerminalNodesFromList(lsNames=self.funcGetFeatureNames(),cNameDelimiter=self.funcGetFeatureDelimiter()) #Tested 2 test cases @staticmethod def funcGetTerminalNodesFromList(lsNames,cNameDelimiter): """ Returns the terminal nodes given the current feature names in the abundance table. The features must contain a consensus lineage or all will be returned. :param lsNames: The list of string names to parse and filter. :type: List of strings :param cNameDelimiter: The delimiter for the name of the features. :type: Character Delimiter :return list: A list of terminal elements in the list (given only the list). 
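        Illustrative example (hypothetical names, traced from the implementation
        below; the ordering of the returned list is not guaranteed because the
        results are collected from a dict):

            lsNames = ["A", "A|B", "A|B|C", "A|D"]
            AbundanceTable.funcGetTerminalNodesFromList(lsNames, "|")
            # -> ["A|B|C", "A|D"]  ("A" and "A|B" are internal nodes)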
""" #Build hash dictCounts = dict() for strTaxaName in lsNames: #Split into the elements of the clades lsClades = filter(None,strTaxaName.split(cNameDelimiter)) #Count clade levels iCladeLength = len(lsClades) #Evaluate first element sClade = lsClades[0] dictCounts[sClade] = sClade not in dictCounts #Evaluate the rest of the elements if iCladeLength < 2: continue for iIndex in xrange(1,iCladeLength): prevClade = sClade sClade = cNameDelimiter.join([sClade,lsClades[iIndex]]) if sClade in dictCounts: dictCounts[sClade] = dictCounts[prevClade] = False else: dictCounts[sClade] = True dictCounts[prevClade] = False #Return only the elements that were of count 1 return filter( lambda s: dictCounts[s] == True, dictCounts ) #Happy path tested def funcIsNormalized(self): """ Returns if the data has been normalized. :return Boolean: Indicates if the data is normalized. True indicates it the data is normalized. """ return self._fIsNormalized #Happy path tested def funcIsPrimaryIdMetadata(self,sMetadataName): """ Checks the metadata data associated with the sMetadatName and returns if the metadata is unique. This is important to some of the functions in the Abundance Table specifically when translating from one metadata to another. :param sMetadataName: ID of metadata to check for uniqueness. :type: String Metadata ID. :return Boolean: Returns indicator of uniqueness. True indicates unique. """ lMetadata = self.funcGetMetadata(sMetadataName) if not lMetadata: return False return (len(lMetadata) == len(set(lMetadata))) #Happy path tested def funcIsSummed(self): """ Return is the data is summed. :return Boolean: Indicator of being summed. True indicates summed. """ return self._fIsSummed #Happy path tested def funcFilterAbundanceByPercentile(self, dPercentileCutOff = 95.0, dPercentageAbovePercentile=1.0): """ Filter on features. A feature is removed if it's abundance is not found in the top X percentile a certain percentage of the samples. :param dPercentileCutOff: The percentile used for filtering. :type: double A double between 0.0 and 100.0 :param dPercentageAbovePercentile: The percentage above the given percentile (dPercentileCutOff) that must exist to keep the feature. :type: double Between 0.0 and 100.0 :return Boolean: Indicator of filtering occuring without error. True indicates filtering occuring. 
""" #No need to do anything if(dPercentileCutOff==0.0) or (dPercentageAbovePercentile==0.0): return True #Sample names lsSampleNames = self.funcGetSampleNames() #Scale percentage out of 100 dPercentageAbovePercentile = dPercentageAbovePercentile/100.0 #Sample count iSampleCount = len(lsSampleNames) #Get a threshold score of the value at the specified percentile for each sample #In the order of the sample names ldScoreAtPercentile = [scipy.stats.scoreatpercentile(self._npaFeatureAbundance[lsSampleNames[iIndex]],dPercentileCutOff) for iIndex in xrange(iSampleCount)] #Record how many entries for each feature have a value equal to or greater than the dPercentileCutOff #If the percentile of entries passing the criteria are above the dPercentageAbovePercentile put index in list to keep liKeepIndices = [] iSampleCount = float(iSampleCount) for iRowIndex, npaRow in enumerate(self._npaFeatureAbundance): iCountPass = sum([1 if dValue >= ldScoreAtPercentile[iValueIndex] else 0 for iValueIndex, dValue in enumerate(list(npaRow)[1:])]) if (iCountPass / iSampleCount) >= dPercentageAbovePercentile: liKeepIndices.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepIndices,:] #Update filter state self._strCurrentFilterState += ":dPercentileCutOff=" + str(dPercentileCutOff) + ",dPercentageAbovePercentile=" + str(dPercentageAbovePercentile) #Table is no longer normalized self._fIsNormalized = False return True def funcFilterAbundanceByMinValue(self, dMinAbundance = 0.0001, iMinSamples = 3): """ Filter abundance by requiring features to have a minimum relative abundance in a minimum number of samples. Will evaluate greater than or equal to the dMinAbundance and iMinSamples. :param dMinAbundance: Minimum relative abundance. :type: Real Number Less than 1. :param iMinSamples: Minimum samples to have the relative abundnace or greater in. :type: Integer Number greater than 1. :return Boolean: Indicator of the filter running without error. False indicates error. """ #No need to do anything if(dMinAbundance==0) or (iMinSamples==0): return True #This normalization requires the data to be relative abundance if not self._fIsNormalized: #sys.stderr.write( "Could not filter by sequence occurence because the data is already normalized.\n" ) return False #Holds which indexes are kept liKeepFeatures = [] for iRowIndex, dataRow in enumerate( self._npaFeatureAbundance ): #See which rows meet the criteria and keep the index if needed. if len( filter( lambda d: d >= dMinAbundance, list(dataRow)[1:] ) ) >= iMinSamples: liKeepFeatures.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:] #Update filter state self._strCurrentFilterState += ":dMinAbundance=" + str(dMinAbundance) + ",iMinSamples=" + str(iMinSamples) return True #Happy path tested def funcFilterAbundanceBySequenceOccurence(self, iMinSequence = 2, iMinSamples = 2): """ Filter occurence by requiring features to have a minimum sequence occurence in a minimum number of samples. Will evaluate greater than or equal to the iMinSequence and iMinSamples. :param iMinSequence: Minimum sequence to occur. :type: Integer Number Greater than 1. :param iMinSamples: Minimum samples to occur in. :type: Integer Number greater than 1. :return Boolean: Indicator of the filter running without error. False indicates error. 
""" #No need to do anything if(iMinSequence==0) or (iMinSamples==0): return True #This normalization requires the data to be reads if self._fIsNormalized: #sys.stderr.write( "Could not filter by sequence occurence because the data is already normalized.\n" ) return False #Holds which indexes are kept liKeepFeatures = [] for iRowIndex, dataRow in enumerate( self._npaFeatureAbundance ): #See which rows meet the criteria and keep the index if needed. if len( filter( lambda d: d >= iMinSequence, list(dataRow)[1:] ) ) >= iMinSamples: liKeepFeatures.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:] #Update filter state self._strCurrentFilterState += ":iMinSequence=" + str(iMinSequence) + ",iMinSamples=" + str(iMinSamples) return True #1 Happy path test def funcFilterFeatureBySD(self, dMinSDCuttOff = 0.0): """ A feature is removed if it's abundance is not found to have standard deviation more than the given dMinSDCutoff. :param dMinSDCuttOff: Standard deviation threshold. :type: Double A double greater than 0.0. :return Boolean: Indicator of success. False indicates error. """ #No need to do anything if(dMinSDCuttOff==0.0): return True #Holds which indexes are kept liKeepFeatures = [] #Evaluate each sample for iRowIndex, dataRow in enumerate(self._npaFeatureAbundance): if(np.std(list(dataRow)[1:])>=dMinSDCuttOff): liKeepFeatures.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:] #Update filter state self._strCurrentFilterState += ":dMinSDCuttOff=" + str(dMinSDCuttOff) #Table is no longer normalized self._fIsNormalized = False return True #Happy path tested 2 tests def funcGetWithoutOTUs(self): """ Remove features that are terminal otus. Terminal otus are identified as being an integer. """ #Get the feature names lsFeatures = self.funcGetFeatureNames() #Reduce, filter the feature names lsFeatures = [sFeature for sFeature in lsFeatures if not (ValidateData.funcIsValidStringInt(sFeature.split(self.funcGetFeatureDelimiter())[-1]))] return self.funcGetFeatureAbundanceTable(lsFeatures) #Happy path tested def funcNormalize(self): """ Convenience method which will call which ever normalization is approriate on the data. :return Boolean: Indicator of success (true). """ if self._fIsSummed: return self.funcNormalizeColumnsWithSummedClades() else: return self.funcNormalizeColumnsBySum() #Testing Status: Light happy path testing def funcNormalizeColumnsBySum(self): """ Normalize the data in a manner that is approrpiate for NOT summed data. Normalize the columns (samples) of the abundance table. Normalizes as a fraction of the total (number/(sum of all numbers in the column)). Will not act on summed tables. :return Boolean: Indicator of success. False indicates error. """ if self._fIsNormalized: # sys.stderr.write( "This table is already normalized, did not perform new normalization request.\n" ) return False if self._fIsSummed: sys.stderr.write( "This table has clades summed, this normalization is not appropriate. Did not perform.\n" ) return False #Normalize for columnName in self.funcGetSampleNames(): column = self._npaFeatureAbundance[columnName] columnTotal = sum(column) if(columnTotal > 0.0): column = column/columnTotal self._npaFeatureAbundance[columnName] = column #Indicate normalization has occured self._fIsNormalized = True return True #Happy path tested def funcNormalizeColumnsWithSummedClades(self): """ Normalizes a summed Abundance Table. 
If this is called on a dataset which is not summed and not normalized. The data will be summed first and then normalized. If already normalized, the current normalization is kept. :return Boolean: Indicator of success. False indicates error. """ if self._fIsNormalized: # sys.stderr.write( "This table is already normalized, did not perform new summed normalization request.\n" ) return False if not self._fIsSummed: sys.stderr.write( "This table does not have clades summed, this normalization is not appropriate until the clades are summed. The clades are being summed now before normalization.\n" ) self.funcSumClades() #Load a hash table with root data {sKey: npaAbundances} hashRoots = {} for npaRow in self._npaFeatureAbundance: curldAbundance = np.array(list(npaRow)[1:]) curFeatureNameLength = len(npaRow[0].split(self._cFeatureDelimiter)) curlRootData = hashRoots.get(npaRow[0].split(self._cFeatureDelimiter)[0]) if not curlRootData: hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]] = [curFeatureNameLength, curldAbundance] elif curlRootData[0] > curFeatureNameLength: hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]] = [curFeatureNameLength, curldAbundance] #Normalize each feature by thier root feature dataMatrix = list() for npaRow in self._npaFeatureAbundance: curHashRoot = list(hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]][1]) dataMatrix.append(tuple([npaRow[0]]+[npaRow[i+1]/curHashRoot[i] if curHashRoot[i] > 0 else 0 for i in xrange(len(curHashRoot))])) self._npaFeatureAbundance = np.array(dataMatrix,self._npaFeatureAbundance.dtype) #Indicate normalization has occured self._fIsNormalized = True return True def _funcRankAbundanceHelper( self, aaTodo, iRank, lRankAbundance ): """ Helper method for ranking abudance which are tied. :params aaTodo: List of tied ranks to change to a rank. :type: List of Enumerates of samples. :params iRank: Current Rank :type: Integer :params lRankAbundance: Sample of abundance :type: List of integers """ # Subtract one from iRank (each time) to account for next loop iteration # Then average it with itself minus (the length of aaTodo + 1) dRank = ( iRank + iRank - len( aaTodo ) - 1 ) / 2.0 for a in aaTodo: lRankAbundance[a[0]] = dRank #1 Happy path test def funcRankAbundance(self): """ Rank abundances of features with in a sample. :return AbundanceTable: Abundance table data ranked (Features with in samples). None is returned on error. 
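        Worked tie example (hypothetical abundances, traced from the helper
        above): within one sample the values [5, 3, 3, 1] become the ranks
        [0, 1.5, 1.5, 3]; ranks are zero-based positions in descending order
        of abundance, and tied values share the average of the positions they
        span.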
""" if self._npaFeatureAbundance == None: return None lsSampleNames = self.funcGetSampleNames() npRankAbundance = self.funcGetAbundanceCopy() liRanks = [] #For each sample name get the ranks for sName in lsSampleNames: #Enumerate for order and sort abundances lfSample = list(enumerate(npRankAbundance[sName])) lfSample = sorted(lfSample, key = lambda a: a[1], reverse = True) # Accumulate indices until a new value is encountered to detect + handle ties aaTodo = [] for i, a in enumerate( lfSample ): if ( not aaTodo ) or ( a[1] == aaTodo[-1][1] ): aaTodo.append( a ) else: # Make multiple tied ranks = average of first and last self._funcRankAbundanceHelper( aaTodo, i, npRankAbundance[sName] ) aaTodo = [a] self._funcRankAbundanceHelper( aaTodo, i + 1, npRankAbundance[sName] ) abndRanked = AbundanceTable(npaAbundance=npRankAbundance, dictMetadata=self.funcGetMetadataCopy(), strName= self.funcGetName() + "-Ranked", strLastMetadata=self.funcGetLastMetadataName(), cFileDelimiter=self.funcGetFileDelimiter(), cFeatureNameDelimiter=self.funcGetFeatureDelimiter()) #Table is no longer normalized abndRanked._fIsNormalized = False return abndRanked def funcGetSampleCount(self): """ Returns the sample count of the abundance table. """ return len(self.funcGetSampleNames()) #Happy Path Tested def funcReduceFeaturesToCladeLevel(self, iCladeLevel): """ Reduce the current table to a certain clade level. :param iCladeLevel: The level of the clade to trim the features to. :type: Integer The higher the number the more clades are presevered in the consensus lineage contained in the feature name. :return Boolean: Indicator of success. False indicates error. """ if iCladeLevel < 1: return False if not self._npaFeatureAbundance == None: liFeatureKeep = [] [liFeatureKeep.append(tplFeature[0]) if (len(tplFeature[1][0].split(self.funcGetFeatureDelimiter())) <= iCladeLevel) else 0 for tplFeature in enumerate(self._npaFeatureAbundance)] #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liFeatureKeep,:] #Update filter state self._strCurrentFilterState += ":iCladeLevel=" + str(iCladeLevel) return True else: return False #Happy path tested def funcRemoveSamples(self,lsSampleNames): """ Removes the samples given in the list. :param lsSampleNames: A list of string names of samples to remove. :type: List of strings Unique values :return Boolean: Indicator of success (True = success, no error) """ #Samples to remove setSamples = set(lsSampleNames) #Get orignal sample count iOriginalCount = self._iOriginalSampleCount #The samples to keep lsKeepSamples = [sSample for sSample in self.funcGetSampleNames() if not sSample in setSamples] #The sample to keep as boolean flags for compressing the metadata lfKeepSamples = [not sSample in setSamples for sSample in self.funcGetSampleNames()] #Reduce the abundance data and update self._npaFeatureAbundance = self._npaFeatureAbundance[[self.funcGetIDMetadataName()]+lsKeepSamples] #Reduce the metadata and update for sKey in self._dictTableMetadata: self._dictTableMetadata[sKey] = [value for iindex, value in enumerate(self._dictTableMetadata[sKey]) if lfKeepSamples[iindex]] #Update sample number count self._iOriginalSampleCount = len(self.funcGetSampleNames()) return self._iOriginalSampleCount == (iOriginalCount-len(setSamples)) #Happy path tested def funcRemoveSamplesByMetadata(self, sMetadata, lValuesToRemove): """ Removes samples from the abundance table based on values of a metadata. If a metadata has any value given the associated sample is removed. 
:param sMetadata: ID of the metdata to check the given values. :type: String Metadata ID :param lValuesToRemove: A list of values which if equal to a metadata entry indicate to remove the associated sample. :type: List of values: List :return Boolean: Indicator of success (True = success, no error) """ lsSampleNames = self.funcGetSampleNames() return self.funcRemoveSamples([lsSampleNames[iindex] for iindex, sValue in enumerate(self.funcGetMetadata(sMetadata)) if sValue in lValuesToRemove]) #Happy path testing def funcSumClades(self): """ Sums abundance data by clades indicated in the feature name (as consensus lineages). :return Boolean: Indicator of success. False indicates an error. """ if not self.funcIsSummed(): #Read in the data #Find the header column (iCol) assumed to be 1 or 2 depending on the location of "NAME" #Create a list (adSeq) that will eventually hold the sum of the columns of data astrHeaders = iCol = None adSeqs = np.array([0] * len(self.funcGetSampleNames())) pTree = CClade( ) aastrRaw = [] #For each row in the npaAbundance #Get the feature name, feature abundances, and sum up the abudance columns #Keep the sum for later normalization #Give a tree the feature name and abundance for dataRow in self._npaFeatureAbundance: sFeatureName = dataRow[0] ldAbundances = list(dataRow)[1:] #Add to the sum of the columns (samples) adSeqs = adSeqs + np.array(list(dataRow)[1:]) #Build tree pTree.get( sFeatureName.split(self._cFeatureDelimiter) ).set( ldAbundances ) #Create tree of data #Input missing data #Fill hashFeatures with the clade name (key) and a blist of values (value) of the specified level interested. pTree.impute( ) hashFeatures = {} pTree.freeze( hashFeatures, c_iSumAllCladeLevels, c_fOutputLeavesOnly ) setstrFeatures = hashFeatures.keys( ) #Remove parent clades that are identical to child clades for strFeature, adCounts in hashFeatures.items( ): astrFeature = strFeature.strip( ).split( "|" ) while len( astrFeature ) > 1: astrFeature = astrFeature[:-1] strParent = "|".join( astrFeature ) adParent = hashFeatures.get( strParent ) if adParent == adCounts: del hashFeatures[strParent] setstrFeatures.remove( strParent ) #Sort features to be nice astrFeatures = sorted( setstrFeatures ) #Change the hash table to an array dataMatrix = list() for sFeature in astrFeatures: dataMatrix.append(tuple([sFeature]+list(hashFeatures[sFeature]))) self._npaFeatureAbundance=np.array(dataMatrix,self._npaFeatureAbundance.dtype) #Indicate summation has occured self._fIsSummed = True return True #Happy path tested def funcStratifyByMetadata(self, strMetadata, fWriteToFile=False): """ Stratifies the AbundanceTable by the given metadata. Will write each stratified abundance table to file if fWriteToFile is True the object will used it's internally stored name as a file to write to if fWriteToFile is a string then it should be a directory and end with "." This will rebase the file and store it in a different directory but with an otherwise unchanged name. Note: If the metadata used for stratification has NAs, they will be segregated to thier own table and returned. :param strMetadata: Metadata ID to stratify data with. :type: String ID for a metadata. :param fWriteToFile: Indicator to write to file. :type: Boolean True indicates to write to file. :return List: List of AbundanceTables which are deep copies of the original. Empty list on error. 
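        Usage sketch (hypothetical metadata name): if the "oxygen_availability"
        metadata takes three distinct values, three new AbundanceTable objects
        are returned, one per value:

            ltblStrata = abndTable.funcStratifyByMetadata("oxygen_availability")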
""" if self._npaFeatureAbundance is None or self._dictTableMetadata is None: return [] #Get unique metadata values to stratify by lsMetadata = self._dictTableMetadata.get(strMetadata,[]) setValues = set(lsMetadata) #If there is only one metadata value then no need to stratify so return the original in the list (and write if needed) if len(setValues) == 0: return [] retlAbundanceTables = [] dictAbundanceBlocks = dict() #Given here there are multiple metadata values, continue to stratify lsNames = self.funcGetSampleNames() #Get index of values to break up for value in setValues: lfDataIndex = [sData==value for sData in lsMetadata] #Get abundance data for the metadata value #The true is added to keep the first column which should be the feature id npaStratfiedAbundance = self._npaFeatureAbundance[[self.funcGetIDMetadataName()]+list(np.compress(lfDataIndex,lsNames))] #Get metadata for the metadata value dictStratifiedMetadata = dict() for metadataType in self._dictTableMetadata: dictValues = self.funcGetMetadata(metadataType) dictStratifiedMetadata[metadataType] = np.compress(lfDataIndex,dictValues).tolist() #Make abundance table #Add abundance table to the list lsNamePieces = os.path.splitext(self._strOriginalName) objStratifiedAbundanceTable = AbundanceTable(npaAbundance=npaStratfiedAbundance, dictMetadata=dictStratifiedMetadata, strName=lsNamePieces[0] + "-StratBy-" + value+lsNamePieces[1], strLastMetadata=self.funcGetLastMetadataName(), cFeatureNameDelimiter=self._cFeatureDelimiter, cFileDelimiter = self._cDelimiter) if fWriteToFile: objStratifiedAbundanceTable.funcWriteToFile(lsNamePieces[0] + "-StratBy-" + value+lsNamePieces[1]) #Append abundance table to returning list retlAbundanceTables.append(objStratifiedAbundanceTable) return retlAbundanceTables #Happy Path Tested def funcTranslateIntoMetadata(self, lsValues, sMetadataFrom, sMetadataTo, fFromPrimaryIds=True): """ Takes the given data values in one metadata and translates it to values in another metadata of the sample samples holding the values of the first metadata FPrimaryIds, if true the sMetadataFrom are checked for unique values, If FPrimaryIds is not true, duplicate values can stop the preservation of order Or may cause duplication in the "to" group. This is not advised. if the sMetadataFrom has any duplicates the function fails and return false. :param lsValues: Values to translate. :type: List List of values. :param sMetadataFrom: The metadata the lsValues come from. :type: String ID for the metadata. :param sMetadataTo: The metadata the lsValues will be translated into keeping the samples the same. :type: String ID for the metadata. :param fFromPrimaryIds: The metadata that are in the from metadata list must be unique in each sample. :type: Boolean True indicates the metadata list should be unique in each sample. Otherwise a false will return. :return List: List of new values or False on error. """ #Get metadata lFromMetadata = self.funcGetMetadata(sMetadataFrom) if not lFromMetadata: sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. Did not receive lFromMetadata.\n" ) return False lToMetadata = self.funcGetMetadata(sMetadataTo) if not lToMetadata: sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. Did not receive lToMetadata.\n" ) return False #Check to see if the values are unique if indicated to do so if fFromPrimaryIds: if not len(lFromMetadata) == len(set(lFromMetadata)): sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. 
sMetadataFrom did not have unique values.\n" )
                return False

        #Translate over
        if lFromMetadata and lToMetadata:
            return [lToMetadata[iIndex] for iIndex in [lFromMetadata.index(value) for value in lsValues]]
        return False

    #Happy path tested
    def funcToArray(self):
        """
        Returns a numpy array of the current Abundance Table.
        Removes the first ID head column and the numpy array is
        made of lists, not tuples.

        :return Numpy Array: np.array([[float,float,...],[float,float,...],[float,float,...]])
            None is returned on error.
        """
        if self._npaFeatureAbundance is not None:
            return np.array([list(tplRow)[1:] for tplRow in self._npaFeatureAbundance],'float')
        return None

    #Happy Path tested
    def funcWriteToFile(self, xOutputFile, cDelimiter=None, cFileType=ConstantsBreadCrumbs.c_strPCLFile):
        """
        Writes the AbundanceTable to a file strOutputFile.
        Will rewrite over a file as needed.
        Will use the cDelimiter to delimit columns if provided.

        :param xOutputFile: File stream or File path to write the file to.
        :type: String File Path
        :param cDelimiter: Delimiter for the output file.
        :type: Character If cDelimiter is not specified, the internally stored file delimiter is used.
        """
        if not xOutputFile:
            return

        # Check delimiter argument
        if not cDelimiter:
            cDelimiter = self._cDelimiter

        # Check file type: If pcl: Write pcl file; If biom: write biom file; If None - write pcl file
        if cFileType is None:
            cFileType = ConstantsBreadCrumbs.c_strPCLFile

        if cFileType == ConstantsBreadCrumbs.c_strPCLFile:
            # Write as a pcl file
            self._funcWritePCLFile(xOutputFile, cDelimiter=cDelimiter)
        elif cFileType == ConstantsBreadCrumbs.c_strBiomFile:
            # Write as a biom file
            self._funcWriteBiomFile(xOutputFile)
        return

    def _funcWritePCLFile(self, xOutputFile, cDelimiter=None):
        """
        Write an abundance table object as a PCL file.

        :param xOutputFile: File stream or File path to write the file to.
        :type: String File Path
        :param cDelimiter: Delimiter for the output file.
        :type: Character If cDelimiter is not specified, the internally stored file delimiter is used.
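        For orientation, a rough sketch of the layout this method writes
        (hypothetical IDs, tab-delimited): first the sample id row, then one
        row per column metadata, then one row per feature:

            #SampleID            Sample1    Sample2
            oxygen_availability  High_O2    Low_O2
            Bacteria|Firmicutes  0.8        0.1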
""" f = csv.writer(open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile, csv.excel_tab, delimiter=cDelimiter) # Get Row metadata id info (IDs for column header, keys that line up with the ids) lsRowMetadataIDs, lsRowMetadataIDKeys = self.rwmtRowMetadata.funcMakeIDs() if self.rwmtRowMetadata else [[],[]] #Write Ids f.writerows([[self.funcGetIDMetadataName()]+lsRowMetadataIDs+list(self.funcGetSampleNames())]) #Write column metadata lsKeys = list(set(self._dictTableMetadata.keys())-set([self.funcGetIDMetadataName(),self.funcGetLastMetadataName()])) lMetadataIterations = list(set(lsKeys+[self.funcGetLastMetadataName()] )) f.writerows([[sMetaKey]+([ConstantsBreadCrumbs.c_strEmptyDataMetadata]*len(lsRowMetadataIDs))+self.funcGetMetadata(sMetaKey) for sMetaKey in lMetadataIterations if sMetaKey != self.funcGetIDMetadataName() and not sMetaKey is None]) #Write abundance lsOutput = list() curAbundance = self._npaFeatureAbundance.tolist() for curAbundanceRow in curAbundance: # Make feature metadata, padding with NA as needed lsMetadata = [] for sMetadataId in lsRowMetadataIDKeys: lsMetadata = lsMetadata + self.rwmtRowMetadata.funGetFeatureMetadata( curAbundanceRow[0], sMetadataId ) lsMetadata = lsMetadata + ( [ ConstantsBreadCrumbs.c_strEmptyDataMetadata ] * ( self.rwmtRowMetadata.dictMetadataIDs.get( sMetadataId, 0 ) - len( lsMetadata ) ) ) f.writerows([[curAbundanceRow[0]]+lsMetadata+[str(curAbundanceElement) for curAbundanceElement in curAbundanceRow[1:]]]) return def _funcWriteBiomFile(self, xOutputFile): """ Write an abundance table object as a Biom file. :param xOutputFile: File stream or File path to write the file to. :type: String File Path """ #************************** # Get Sample Names * #************************** lSampNames = list(self.funcGetSampleNames()) #************************** # Metadata Names * #************************** dictMetadataCopy = self.funcGetMetadataCopy() lMetaData = list() iKeysCounter = 0 for lMetadataCopyEntry in dictMetadataCopy.iteritems(): iKeysCounter +=1 sMetadataName = lMetadataCopyEntry[0] lMetadataEntries = lMetadataCopyEntry[1] iMetadataEntryCounter = -1 for sMetadataEntry in lMetadataEntries: iMetadataEntryCounter+=1 dictMetadataNames = dict() dictMetadataNames[sMetadataName ] = sMetadataEntry if iKeysCounter == 1: lMetaData.append(dictMetadataNames) else: lMetaData[iMetadataEntryCounter][sMetadataName ] = sMetadataEntry #************************** # Observation Ids * # and row metadata * #************************** bTaxonomyInRowsFlag = False if self.rwmtRowMetadata.dictRowMetadata is not None: bTaxonomyInRowsFlag = True lObservationMetadataTable = list() lObservationIds = list() lFeatureNamesResultArray = self.funcGetFeatureNames() for sFeatureName in lFeatureNamesResultArray: lObservationIds.append(sFeatureName) if self.rwmtRowMetadata and self.rwmtRowMetadata.dictRowMetadata: RowMetadataEntry = self.rwmtRowMetadata.dictRowMetadata[sFeatureName][ConstantsBreadCrumbs.c_metadata_lowercase] lObservationMetadataTable.append( RowMetadataEntry ) #************************** # Data * #************************** lData = list() lAbundanceCopyResultArray = self.funcGetAbundanceCopy() for r in lAbundanceCopyResultArray: lr = list(r) lr.pop(0) #Remove metadata lAbundanceValues = list() for AbundanceEntry in lr: flAbundanceEntry = float(AbundanceEntry) lAbundanceValues.append(flAbundanceEntry) lData.append(lAbundanceValues) arrData = array(lData) #Convert list to array #************************** # Invoke the * # biom table factory * 
#************************** if bTaxonomyInRowsFlag == False: BiomTable = table_factory(arrData, lSampNames, lObservationIds, lMetaData, constructor=SparseOTUTable) else: #There was metadata in the rows BiomTable = table_factory(arrData, lSampNames, lObservationIds, lMetaData, lObservationMetadataTable if len(lObservationMetadataTable) > 0 else None, constructor=SparseOTUTable) #************************** # Generate biom Output * #************************** f = open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile f.write(BiomTable.getBiomFormatJsonString(ConstantsBreadCrumbs.c_biom_file_generated_by)) f.close() return #Testing Status: 1 Happy path test @staticmethod def funcPairTables(strFileOne, strFileTwo, strIdentifier, cDelimiter, strOutFileOne, strOutFileTwo, lsIgnoreValues=None): """ This method will read in two files and abridge both files (saved as new files) to just the samples in common between the two files given a common identifier. ***If the identifier is not unique in each data set, the first sample with the pairing id is taken so make sure the ID is unique. Expects the files to have the sample delimiters. :param strFileOne: Path to file one to be paired. :type: String File path. :param strFileTwo: Path to file two to be paired. :type: String File path. :param strIdentifier: Metadata ID that is used for pairing. :type: String Metadata ID. :param cDelimiter: Character delimiter to read the files. :type: Character Delimiter. :param strOutFileOne: The output file for the paired version of the first file. :type: String File path. :param strOutFileTwo: The output file for the paired version of the second file. :type: String File path. :param lsIgnoreValues: These values are ignored even if common IDs between the two files. :type: List List of strings. :return Boolean: Indicator of no errors. False indicates errors. """ #Validate parameters if(not ValidateData.funcIsValidFileName(strFileOne)): sys.stderr.write( "AbundanceTable:checkRawDataFile::Error, file not valid. File:" + strFileOne + "\n" ) return False #Validate parameters if(not ValidateData.funcIsValidFileName(strFileTwo)): sys.stderr.write( "AbundanceTable:checkRawDataFile::Error, file not valid. 
File:"+ strFileTwo + "\n" ) return False #Make file one #Read in file istm = csv.reader(open(strFileOne,'rU'), csv.excel_tab, delimiter=cDelimiter) lsContentsOne = [lsRow for lsRow in istm] #Get the file identifier for file one fileOneIdentifier = None for sLine in lsContentsOne: if sLine[0] == strIdentifier: fileOneIdentifier = sLine break #Make file two #Read in file istm = csv.reader(open(strFileTwo,'rU'), csv.excel_tab, delimiter=cDelimiter) lsContentsTwo = [lsRow for lsRow in istm] #Get the file identifier for file two fileTwoIdentifier = None for sLine in lsContentsTwo: if sLine[0] == strIdentifier: fileTwoIdentifier = sLine break #Get what is in common between the identifiers #And find which columns to keep in the tables based on the common elements setsCommonIdentifiers = set(fileOneIdentifier) & set(fileTwoIdentifier) if lsIgnoreValues: setsCommonIdentifiers = setsCommonIdentifiers - set(lsIgnoreValues) #Get positions of common identifiers in each data set, if the identifier is not unique in a date set just take the first index lfFileOneIDIndexes = [fileOneIdentifier.index(sCommonID) for sCommonID in setsCommonIdentifiers] lfFileTwoIDIndexes = [fileTwoIdentifier.index(sCommonID) for sCommonID in setsCommonIdentifiers] #Convert index list to list of boolean lfFileOneElements = [iIndex in lfFileOneIDIndexes for iIndex, sIdentifier in enumerate(fileOneIdentifier)] lfFileTwoElements = [iIndex in lfFileTwoIDIndexes for iIndex, sIdentifier in enumerate(fileTwoIdentifier)] #Write out file one ostm = csv.writer(open(strOutFileOne,'w'), csv.excel_tab, delimiter=cDelimiter) (ostm.writerows([np.compress(lfFileOneElements,sLine) for sLine in lsContentsOne])) #Write out file two ostm = csv.writer(open(strOutFileTwo,'w'), csv.excel_tab, delimiter=cDelimiter) (ostm.writerows([np.compress(lfFileTwoElements,sLine) for sLine in lsContentsTwo])) return True #Testing Status: Light happy path testing @staticmethod def funcStratifyAbundanceTableByMetadata(strInputFile = None, strDirectory = "", cDelimiter = ConstantsBreadCrumbs.c_cTab, iStratifyByRow = 1, llsGroupings = []): """ Splits an abundance table into multiple abundance tables stratified by the metadata :param strInputFile: String file path to read in and stratify. :type: String File path. :param strDirectory: Output directory to write stratified files. :type: String Output directory path. :param cDelimiter: The delimiter used in the adundance file. :type: Character Delimiter. :param iStratifyByRow: The row which contains the metadata to use in stratification. :type: Integer Positive integer index. :param llsGroupings: A list of string lists where each string list holds values that are equal and should be grouped together. So for example, if you wanted to group metadata "1", "2", and "3" seperately but "4" and "5" together you would Give the following [["4","5"]]. If you know what "1" and "3" also together you would give [["1","3"],["4","5"]] :type List List of list of strings :return Boolean: Indicator of NO error. False indicates an error. """ #Validate parameters if(not ValidateData.funcIsValidFileName(strInputFile)): sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, file not valid. File:" + strInputFile + "\n" ) return False if(not ValidateData.funcIsValidStringType(cDelimiter)): sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, Delimiter is not a valid string/char type. 
Delimiter =" + cDelimiter + "\n" ) return False if(not ValidateData.funcIsValidPositiveInteger(iStratifyByRow, tempZero = True) and (not ValidateData.funcIsValidString(iStratifyByRow))): sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, Stratify by row is not a positive integer or string keyword. Row =" + str(iStratifyByRow) + ".\n" ) return False #Get the base of the file path #This is dependent on the given output directory and the prefix of the file name of the input file #If no output file is given then the input file directory is used. baseFilePath = strDirectory lsFilePiecesExt = os.path.splitext(strInputFile) if baseFilePath: baseFilePath = baseFilePath + os.path.splitext(os.path.split(strInputFile)[1])[0] else: baseFilePath = lsFilePiecesExt[0] #Read in file istm = csv.reader(open(strInputFile,'rU'), csv.excel_tab, delimiter=cDelimiter) sFileContents = [lsRow for lsRow in istm] #Collect metadata metadataInformation = dict() #If the tempStratifyRow is by key word than find the index if ValidateData.funcIsValidString(iStratifyByRow): for iLineIndex, strLine in enumerate(sFileContents): if strLine[0].strip("\"") == iStratifyByRow: iStratifyByRow = iLineIndex break #Stratify by metadata row #Split metadata row into metadata entries #And put in a dictionary containing {"variable":[1,2,3,4 column index]} stratifyByRow = sFileContents[iStratifyByRow] for metaDataIndex in xrange(1,len(stratifyByRow)): metadata = stratifyByRow[metaDataIndex] #Put all wierd categories, none, whitespace, blank space metadata cases into one bin if not metadata or metadata in string.whitespace: metadata = "Blank" #Remove any extraneous formatting metadata = metadata.strip(string.whitespace) #Store processed metadata with column occurence in dictionary if(not metadata in metadataInformation): metadataInformation[metadata] = [] metadataInformation[metadata].append(metaDataIndex) #For each of the groupings #Use the first value as the primary value which the rest of the values in the list are placed into #Go through the dict holding the indices and extend the list for the primary value with the secondary values #Then set the secondary value list to empty so that it will be ignored. if llsGroupings: for lSKeyGroups in llsGroupings: if len(lSKeyGroups) > 1: for sGroup in lSKeyGroups[1:]: if sGroup in metadataInformation: metadataInformation[lSKeyGroups[0]].extend(metadataInformation[sGroup]) metadataInformation[sGroup] = [] #Stratify data stratifiedAbundanceTables = dict() for tableRow in sFileContents: if(len(tableRow)> 1): for metadata in metadataInformation: #[0] includes the taxa line columns = metadataInformation[metadata] if columns: columns = [0] + columns lineList = list() for column in columns: lineList.append(tableRow[column]) stratifiedAbundanceTables.setdefault(metadata,[]).append(lineList) #Write to file lsFilesWritten = [] for metadata in stratifiedAbundanceTables: sOutputFile = baseFilePath+"-by-"+metadata.strip("\"")+lsFilePiecesExt[1] f = csv.writer(open(sOutputFile,'w'), csv.excel_tab, delimiter = cDelimiter ) f.writerows(stratifiedAbundanceTables[metadata]) lsFilesWritten.append(sOutputFile) return lsFilesWritten #******************************************* #* biom interface functions: * #* 1. _funcBiomToStructuredArray * #* 2. 
_funcDecodeBiomMetadata * #******************************************* @staticmethod def _funcBiomToStructuredArray(xInputFile = None): """ Reads the biom input file and builds a "BiomCommonArea" that contains: 1.BiomCommonArea['sLastMetadata'] - This is the name of the last Metadata (String) 2.BiomCommonArea['BiomTaxData']- dict() - going to be used as lcontents[0]==TaxData 3.BiomCommonArea['Metadata'] - dict() - going to be used as lcontents[1]==MetaData 4.BiomCommonArea['BiomFileInfo'] - dict() - going to be used as lcontents[2]==FileInfo (id, format:eg. Biological Observation Matrix 0.9.1) etc. 5.BiomCommonArea['column_metadata_id'] - This is a string which is the name of the column id :param xInputFile: File path of biom file to read. :type: String File path. :return: BiomCommonArea (See description above) :type: dict() """ #******************************************* #* Build the metadata * #******************************************* try: BiomTable = parse_biom_table(open(xInputFile,'U') if isinstance(xInputFile, str) else xInputFile) #Import the biom file except: print("Failure decoding biom file - please check your input biom file and rerun") BiomCommonArea = None return BiomCommonArea BiomCommonArea = dict() dBugNames = list() #Bug Names Table dRowsMetadata = None #Initialize the np.array of the Rows metadata BiomElements = BiomTable.getBiomFormatObject('') for BiomKey, BiomValue in BiomElements.iteritems(): #**************************************************** #* Checking the different keys: format, * #* rows, columns, date, generated_by * #**************************************************** if (BiomKey == ConstantsBreadCrumbs.c_strFormatKey or BiomKey == ConstantsBreadCrumbs.c_strFormatUrl or BiomKey == ConstantsBreadCrumbs.c_MatrixTtype or BiomKey == ConstantsBreadCrumbs.c_strTypekey or BiomKey == ConstantsBreadCrumbs.c_strIDKey #Same as below or BiomKey == ConstantsBreadCrumbs.c_GeneratedBy #<---Need to follow up with Biom as always BiomValue = "" even though in the file has a value or BiomKey == ConstantsBreadCrumbs.c_strDateKey): #Same as above BiomCommonArea = AbundanceTable._funcInsertKeyToCommonArea(BiomCommonArea, BiomKey, BiomValue) if BiomKey == ConstantsBreadCrumbs.c_rows: iMaxIdLen = 0 for iIndexRowMetaData in range(0, len(BiomValue)): if ConstantsBreadCrumbs.c_id_lowercase in BiomValue[iIndexRowMetaData]: sBugName = BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase] dBugNames.append(sBugName) #Post to the bug table if len(sBugName) > iMaxIdLen: #We are calculating dynamically the length of the ID iMaxIdLen = len(sBugName) if ConstantsBreadCrumbs.c_metadata_lowercase in BiomValue[0] and BiomValue[0][ConstantsBreadCrumbs.c_metadata_lowercase] != None : dRowsMetadata = AbundanceTable._funcBiomBuildRowMetadata(BiomValue, iMaxIdLen ) if BiomKey == ConstantsBreadCrumbs.c_columns: BiomCommonArea = AbundanceTable._funcDecodeBiomMetadata(BiomCommonArea, BiomValue, iMaxIdLen) #Call the subroutine to Build the metadata #******************************************* #* Build the TaxData * #******************************************* BiomTaxDataWork = list() #Initlialize TaxData BiomObservations = BiomTable.iterObservations(conv_to_np=True) #Invoke biom method to fetch data from the biom file for BiomObservationData in BiomObservations: sBugName = str( BiomObservationData[1]) BiomTaxDataEntry = list() BiomTaxDataEntry.append(sBugName) BiomObservationsValues = BiomObservationData[0] for BiomDataValue in BiomObservationsValues: BiomTaxDataEntry.append(BiomDataValue) 
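                    # A hedged sketch of the row shape built here (hypothetical
                    # values): each BIOM observation arrives as a tuple whose
                    # first element is the numpy array of per-sample values and
                    # whose second is the feature id, so the tuple accumulated
                    # below looks like ("Bacteria|Firmicutes", 12.0, 0.0, 3.0, ...),
                    # one numeric entry per sample, matching the dtype assembled
                    # in _funcDecodeBiomMetadata.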
BiomTaxDataWork.append(tuple(BiomTaxDataEntry)) BiomCommonArea[ConstantsBreadCrumbs.c_BiomTaxData] = np.array(BiomTaxDataWork,dtype=np.dtype(BiomCommonArea[ConstantsBreadCrumbs.c_Dtype])) BiomCommonArea[ConstantsBreadCrumbs.c_dRowsMetadata] = RowMetadata(dRowsMetadata) del(BiomCommonArea[ConstantsBreadCrumbs.c_Dtype]) #Not needed anymore return BiomCommonArea @staticmethod def _funcDecodeBiomMetadata(BiomCommonArea, BiomValue = None, iMaxIdLen=0 ): """ Decode the Biom Metadata and build: 1. BiomCommonArea['Metadata'] 2. BiomCommonArea['Dtype'] 3. BiomCommonArea['sLastMetadata'] 4. BiomCommonArea['column_metadata_id'] - This is a string which is the name of the column id These elements will be formatted and passed down the line to build the AbundanceTable :param BiomValue: The "columns" Metadata from the biom file (Contains the Metadata information) :type: dict() :param iMaxIdLen: The maximum length of a row ID :type: Integer :return: BiomCommonArea :type: dict() """ BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] = None #Initialize the LastMetadata element BiomCommonArea['dRowsMetadata'] = None #Initialize for cases that there is no metadata in the rows strLastMetadata = None strIDMetadata = None lenBiomValue = len(BiomValue) BiomMetadata = dict() for cntMetadata in range(0, lenBiomValue): BiomMetadataEntry = BiomValue[cntMetadata] for key, value in BiomMetadataEntry.iteritems(): #Loop on the entries if key == ConstantsBreadCrumbs.c_id_lowercase: #If id - process it strIDMetadata = ConstantsBreadCrumbs.c_ID if ConstantsBreadCrumbs.c_ID not in BiomMetadata: #If ID not in the common area - initalize it BiomMetadata[ConstantsBreadCrumbs.c_ID] = list() #Initialize a list for indx in range(0, lenBiomValue): #And post the values BiomMetadata[ConstantsBreadCrumbs.c_ID].append(None) BiomMetadata[ConstantsBreadCrumbs.c_ID][cntMetadata] = value.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) if key == ConstantsBreadCrumbs.c_metadata_lowercase: #If key = metadata if not value is None: #And value is not empty MetadataDict = value #Initialize a dictionary and post the values for MDkey, MDvalue in MetadataDict.iteritems(): if type(MDkey) == unicode : MDkeyAscii = MDkey.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) else: MDkeyAscii = MDkey if type(MDvalue) == unicode: MDvalueAscii = MDvalue.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) else: MDvalueAscii = MDvalue if len(MDkeyAscii) > 0: #Search for the last metadata if not strIDMetadata: strIDMetadata = MDkeyAscii BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] = MDkeyAscii #Set the last Metadata if MDkeyAscii not in BiomMetadata: BiomMetadata[MDkeyAscii] = list() for indx in range(0, lenBiomValue): BiomMetadata[MDkeyAscii].append(None) BiomMetadata[MDkeyAscii][cntMetadata] = MDvalueAscii BiomCommonArea[ConstantsBreadCrumbs.c_Metadata] = BiomMetadata BiomCommonArea[ConstantsBreadCrumbs.c_MetadataID] = strIDMetadata #********************************************** #* Build dtype * #********************************************** BiomDtype = list() iMaxIdLen+=10 #Increase it by 10 BiomDtypeEntry = list() FirstValue = ConstantsBreadCrumbs.c_ID SecondValue = "a" + str(iMaxIdLen) BiomDtypeEntry.append(FirstValue) BiomDtypeEntry.append(SecondValue) BiomDtype.append(tuple(BiomDtypeEntry)) for a in BiomMetadata[ConstantsBreadCrumbs.c_ID]: BiomDtypeEntry = list() FirstValue = a.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) SecondValue = ConstantsBreadCrumbs.c_f4 
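# The dtype under construction is a list of (field name, format) pairs: one
# 'a<N>' byte-string field for the row ID plus one float field per sample
# column, which numpy uses to type the BiomTaxData structured array above.
# A minimal self-contained sketch of the same pattern (sample names made up):
#   import numpy as np
#   dt = np.dtype([('ID', 'a20'), ('sample1', 'f8'), ('sample2', 'f8')])
#   arr = np.array([('Firmicutes', 10.0, 3.0)], dtype=dt)
#   arr['sample1']   # -> array([ 10.])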
BiomDtypeEntry.append(FirstValue) BiomDtypeEntry.append(SecondValue) BiomDtype.append(tuple(BiomDtypeEntry)) BiomCommonArea[ConstantsBreadCrumbs.c_Dtype] = BiomDtype return BiomCommonArea @staticmethod def _funcBiomBuildRowMetadata( BiomValue, iMaxIdLen ): """ Builds the row metadata from a BIOM value :param BiomValue: BIOM Value from the BIOM JSON parsing :type: Complex dict of string pairs and dicts :param iMaxIdLen: Maximum length of all the IDs :type: int :return: dictRowsMetadata - np Array containing the rows metadata :type: {string feature id: {'metadata': {'taxonomy': [list of metadata values]}}} """ # Build the input dict for RowMetadata from a dict of dicts from a BIOM file dictRowsMetadata = dict() for iIndexRowMetaData in range(0, len(BiomValue)): dictRowsMetadata[str(BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase])] = dict() RowMetadataEntryFromTable = BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_metadata_lowercase] dMetadataTempDict = dict() for key, value in RowMetadataEntryFromTable.iteritems(): dMetadataTempDict[key] = value dictRowsMetadata[str(BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase])][ConstantsBreadCrumbs.c_metadata_lowercase] = dMetadataTempDict return dictRowsMetadata @staticmethod def _funcInsertKeyToCommonArea(BiomCommonArea, BiomKey, BiomValue): """ Inserts the keys into the BiomCommonArea["BiomFileInfo"] :param BiomCommonArea - The common area that has been built before :type: dict() :param BiomKey - The current key (eg. format, date, generated by) :type: str :param BiomValue - The current value of the key (eg. for format: "Biological Observation Matrix 0.9.1") :type: str :return: BiomCommonArea - The updated common area :type: dict() """ if ConstantsBreadCrumbs.c_BiomFileInfo not in BiomCommonArea: BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo] = dict() strInsertKey = BiomKey #Set Default - But it is now always the same... (eg. URL is not: format_url -->url and others) PostBiomValue = BiomValue #The default value to be posted if BiomKey == ConstantsBreadCrumbs.c_strFormatUrl: strInsertKey = ConstantsBreadCrumbs.c_strURLKey if BiomKey == ConstantsBreadCrumbs.c_MatrixTtype: strInsertKey = ConstantsBreadCrumbs.c_strSparsityKey if BiomKey == ConstantsBreadCrumbs.c_GeneratedBy: PostBiomValue = None if BiomKey == ConstantsBreadCrumbs.c_strDateKey: PostBiomValue = None BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo][strInsertKey] = PostBiomValue return BiomCommonArea nsegata-lefse-3ce86f78d59e/lefsebiom/CClade.py0000644000000000000000000001537412750144356017237 0ustar 00000000000000""" Author: Curtis Huttenhower Description: Used to create tree structures to hierarchically normalize abundance tables. """ ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. 
# #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### __author__ = "Curtis Huttenhower" __copyright__ = "Copyright 2012" __credits__ = ["Curtis Huttenhower"] __license__ = "MIT" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" import blist import sys class CClade: def __init__( self ): """ Initialize CClade Dictionary to hold the children nodes from feature consensus lineages. adValues is a list of the abundance values. """ self.m_hashChildren = {} self.m_adValues = None def get( self, astrClade ): """ Recursively travel the length of a tree until you find the terminal node (where astrClade == False, or actually []) or a dict key that matches the clade call. If at any time a clade is given that is not currently known, return a new clade which is set to the current Clade as a child. """ return self.m_hashChildren.setdefault( astrClade[0], CClade( ) ).get( astrClade[1:] ) if astrClade else self def set( self, adValues ): """ Set all the values given as a list in the same order given. """ self.m_adValues = blist.blist( [0] ) * len( adValues ) for i, d in enumerate( adValues ): if d: self.m_adValues[i] = d def impute( self ): """ Recursively impute values for clades without values given their children's counts. Presumably this should be called only once, after all clade abundances have been added. If the m_adValues already exist, return the stored m_adValues (no imputation needed). Otherwise call impute for all children and take the sum of the values from all the children by column: not the sum of a single list, but an element-wise sum over the children's lists. """ #If values do not exist if not self.m_adValues: #Call impute on all children #If the parent clade has no abundance values #Then take a copy of the child's #If they now have a copy of a child's but have other children #Sum their children with their current values for pChild in self.m_hashChildren.values( ): adChild = pChild.impute( ) if self.m_adValues: for i in range( len( adChild or [] ) ): if adChild[i]: self.m_adValues[i] += adChild[i] elif adChild: self.m_adValues = adChild[:] #If values exist return return self.m_adValues def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ): """ Update the given hashValues dict with clade abundances given depth specifications. Return a set of integers indicating the structure of the tree preserved in the dict/hash. When the appropriate level of the tree is reached, hashValues is updated with the clade (including lineage) and the abundance; it looks like {"clade":blist(["0.0","0.1"...])} """ #fHit is true on at least one of the following conditions: #iTarget is 0, indicating no depth restriction. #Only leaves are requested and the target depth for the leaves is reached. #The required depth is reached (when internal clades are requested as well).
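# Summary of the fHit test computed below:
#   iTarget == 0                -> every clade level is emitted
#   iTarget > 0 and fLeaves     -> only clades exactly at depth iTarget
#   iTarget > 0 and not fLeaves -> all clades down to depth iTarget
#   iTarget < 0                 -> fHit is recomputed from setiRet further below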
fHit = ( not iTarget ) or ( ( fLeaves and ( iDepth == iTarget ) ) or ( ( not fLeaves ) and ( iDepth <= iTarget ) ) ) #Increment depth iDepth += 1 #Returns a set setiRet = set() #If there are children build integer set indicating structure of the tree preserved in the dict if self.m_hashChildren: #Union all the results from freeze of all children #Call freeze but append the child clade to the clade in the call. #And give an incremented depth for strChild, pChild in self.m_hashChildren.items( ): setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth, fLeaves ) setiRet = set( ( i + 1 ) for i in setiRet ) else: setiRet.add( 0 ) #Indicate if the correct level is reached if iTarget < 0: if fLeaves: fHit = -( iTarget + 1 ) in setiRet else: fHit = -( iTarget + 1 ) <= max( setiRet ) #if astrClade is not == [] (so you are actually in a clade of the tree) #And the clade has values (should be true, if not impute should have been callded before running this method) #And we are at the correct level of the tree then #Add to the dict the clade and the abundance values if astrClade and self.m_adValues and fHit: hashValues["|".join( astrClade )] = self.m_adValues return setiRet def freeze( self, hashValues, iTarget, fLeaves ): """ Call helper function setting the clade and depth to defaults (start positions) The important result of this method is hashValues is updated with clade and abundance information """ self._freeze( hashValues, iTarget, [], 0, fLeaves ) def _repr( self, strClade ): """ Represent tree clade for debugging. Helper function for recursive repr. """ strRet = "<" if strClade: strRet += "%s %s" % (strClade, self.m_adValues) if self.m_hashChildren: strRet += " " if self.m_hashChildren: strRet += " ".join( p._repr( s ) for (s, p) in self.m_hashChildren.items( ) ) return ( strRet + ">" ) def __repr__( self ): """ Represent tree clade for debugging. """ return self._repr( "" ) """ pTree = CClade( ) pTree.get( ("A", "B") ).set( [1, 2, 3] ) pTree.get( ("A", "C") ).set( [4, 5, 6] ) pTree.get( ("D", "E") ).set( [7, 8, 9] ) iTaxa = 0 if iTaxa: pTree.impute( ) hashFeatures = {} pTree.freeze( hashFeatures, iTaxa ) print( pTree ) print( hashFeatures ) sys.exit( 0 ) #""" nsegata-lefse-3ce86f78d59e/lefsebiom/ConstantsBreadCrumbs.py0000755000000000000000000001171712750144356022212 0ustar 00000000000000""" Author: Timothy Tickle Description: Project constants. """ ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] __license__ = "MIT" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" ## #Used to test the FileIO class class ConstantsBreadCrumbs(): """ Class to hold project constants. """ #Character Constants c_strComma = ',' c_strColon = ':' c_strConfigFileHeaderChar = '[' c_strConfigFileCommentChar = '#' c_strEndline = '\n' c_strExtDelim = '.' c_cFastaIDLineStart = '>' c_strPathDelim = '/' c_cPipe = '|' c_cQuote = '\"' c_cTab = '\t' c_strWhiteSpace = ' ' c_matrixFileDelim = '\t' c_strBreadCrumbsSVMSpace = c_strWhiteSpace #Default values for missing data in the Abundance Table c_strEmptyAbundanceData = "0" c_strEmptyDataMetadata = "NA" c_strSVMNoSample = "-" lNAs = list(set(["NA","na","Na","nA",c_strEmptyDataMetadata])) #TODO remove #Reference to circlader c_strCircladerScript = "circlader/circlader.py" #AbundanceTable #Suffix given to a file that is check with the checkRawDataFile method OUTPUT_SUFFIX = "-checked.pcl" #BIOM related #PCL File metadata defaults (many of these come from biom file requirements #ID c_strIDKey = "id" c_strDefaultPCLID = None #File date c_strDateKey = "date" #File format type c_strFormatKey = "format" c_strDefaultPCLFileFormateType = "PCL" #File generation source c_strSourceKey = "source" c_strDefaultPCLGenerationSource = None #File type c_strTypekey = "type" c_strDefaultPCLFileTpe = None #Allowable file types for biom files c_strOTUType = "OTU" c_strOTUBIOMType = "OTU table" c_strPathwayType = "Pathway" c_strPathwayBIOMType = "Pathway table" c_strFunctionType = "Function" c_strFunctionBIOMType = "Function table" c_strOrthologType = "Ortholog" c_strOrthologBIOMType = "Ortholog table" c_strGeneType = "Gene" c_strGeneBIOMType = "Gene table" c_strMetaboliteType = "Metabolite" c_strMetaboliteBIOMType = "Metabolite table" c_strTaxonType = "Taxon" c_strTaxonBIOMType = "Taxon table" c_dictFileType = {c_strOTUType:c_strOTUBIOMType, c_strPathwayType:c_strPathwayBIOMType, c_strFunctionType:c_strFunctionBIOMType, c_strOrthologType:c_strOrthologBIOMType, c_strGeneType:c_strGeneBIOMType, c_strMetaboliteType:c_strMetaboliteBIOMType, c_strTaxonType:c_strTaxonType} #File URL c_strURLKey = "url" c_strDefaultPCLURL = None c_strFormatUrl = "format_url" #File sparse matrix c_strSparsityKey = "sparsity" c_fDefaultPCLSparsity = False # BIOM related Data # Data shape c_strDataShapeKey = "shape" ###################################################################### # Constants related to biom import and export files # ###################################################################### # Biom file extension c_strBiomFile = "biom" c_BiomTaxData = "BiomTaxData" c_MetadataID = "column_metadata_id" c_Metadata = "Metadata" c_metadata_lowercase = "metadata" c_sLastMetadata = "sLastMetadata" c_columns = "columns" c_rows = "rows" c_ascii = "ascii" c_ignore = "ignore" c_Dtype = "Dtype" c_ID = "ID" c_id_lowercase = "id" c_f4 = "f8" c_biom_file_generated_by = "BreadCrumbs" c_strPCLFile = "pcl" c_taxonomy = "taxonomy" c_dRowsMetadata = "dRowsMetadata" c_BiomFileInfo = "BiomFileInfo" c_MatrixTtype = "matrix_type" 
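# The constants in this block mirror the top-level keys of the BIOM 1.0 JSON
# layout that AbundanceTable._funcBiomToStructuredArray and
# _funcInsertKeyToCommonArea (above) dispatch on; a truncated, purely
# illustrative biom file looks roughly like:
#   { "id": null, "format": "Biological Observation Matrix 0.9.1",
#     "format_url": "http://biom-format.org", "type": "OTU table",
#     "generated_by": "...", "date": "...", "matrix_type": "sparse",
#     "shape": [2, 3], "rows": [...], "columns": [...], "data": [...] }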
c_GeneratedBy = "generated_by" c_MetadataEntriesTotal = "MetadataEntriesTotal" c_MaximumLength = "MaximumLength" def __init__(self): pass nsegata-lefse-3ce86f78d59e/lefsebiom/ValidateData.py0000755000000000000000000004522712750144356020452 0ustar 00000000000000""" Author: Timothy Tickle Description: Validate Data containing methods for testing variables. """ ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] __license__ = "MIT" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" #Import local code from types import * import decimal import os import re import string class ValidateData: #Tested 5 @staticmethod def funcIsValidBoolean(parameterValue): """ Validates a parameter as a valid boolean. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid boolean. :type Boolean """ #Check to make sure it is not null if parameterValue == None: return False #Check to make sure it is a string if not type(parameterValue) is BooleanType: return False return True #Tested 6 @staticmethod def funcIsTrue(parameterValue): """ Validates a parameter as true. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is True. :type Boolean """ if(ValidateData.funcIsValidBoolean(parameterValue)): if(parameterValue == True): return True return False #Tested 6 @staticmethod def funcIsFalse(parameterValue): """ Validates a parameter as false. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is False. :type Boolean """ if(ValidateData.funcIsValidBoolean(parameterValue)): if(parameterValue == False): return True return False #Tested 5 @staticmethod def funcIsValidInteger(parameterValue): """ Validates a parameter as an integer. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is an integer. 
:type Boolean """ #Check to make sure it is not null if (parameterValue == None): return False #Check to make sure it is an integer if not type(parameterValue) is IntType: return False return True #Tested 5 @staticmethod def funcIsValidPositiveInteger(parameterValue, tempZero = False): """ Validates a parameter as a positive integer. :param parameterValue: Value to be evaluated. :type Unknown :param tempZero: Allows one to set what the value for zero should return. :type Boolean The return value for zero. :return Boolean: True indicates the parameter is a positive integer. :type Boolean """ #Check to make sure it is a valid integer if not ValidateData.funcIsValidInteger(parameterValue): return False #Check to see it is positive if (parameterValue < 0): return False #Check for zero value if(parameterValue == 0): return tempZero return True #Tested 14 @staticmethod def funcIsValidNumeric(parameterValue): """ Validates a parameter as numeric. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a numeric. :type Boolean """ #Check to make sure it is not null if (parameterValue == None): return False #Check to make sure it is a numeric type but not a boolean if((type(parameterValue) == IntType)or(type(parameterValue) == LongType)or(type(parameterValue) == FloatType)or(type(parameterValue) == ComplexType)or(str(type(parameterValue)) == "<type 'numpy.float64'>")): if(not type(parameterValue) == BooleanType): return True return False #Tested 5 @staticmethod def funcIsValidStringType(parameterValue): """ Validates a parameter as a string. This allows the string to be blank or empty. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a string type. :type Boolean """ #Check to make sure it is not null if parameterValue == None: return False #Check to make sure it is a string if not type(parameterValue) is StringType: return False return True #Tested 5 @staticmethod def funcIsValidString(parameterValue): """ Validates a parameter as a string. Does NOT allow the string to be blank or empty. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a string. :type Boolean """ #Type check if not ValidateData.funcIsValidStringType(parameterValue): return False #Check to see it is not blank if parameterValue.strip() == "": return False return True @staticmethod def funcIsValidStringInt(parameterValue): """ Validates a string parameter as one formatted as an integer. :param parameterValue: Value to be evaluated. :type Unknown """ #Type string check if not ValidateData.funcIsValidStringType(parameterValue): return False #Check to see if the string can be converted to an integer try: int(parameterValue) except: return False return True @staticmethod def funcIsValidStringFloat(parameterValue): """ Validates a string parameter as one formatted as a float. :param parameterValue: Value to be evaluated. :type Unknown """ #Type string check if not ValidateData.funcIsValidStringType(parameterValue): return False #Check to see if the string can be converted to a double try: float(parameterValue) except: return False return True #Tested 6 @staticmethod def funcIsValidFormatString(parameterValue): """ Validates a parameter as a valid format string. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid value.
:type Boolean """ lettersValid = False if ValidateData.funcIsValidString(parameterValue): validChars = "BbcdfHhIiLlPpsx0123456789" for letter in parameterValue: lettersValid = letter in validChars if(not lettersValid): break return lettersValid #Tested 5 @staticmethod def funcIsValidChar(parameterValue): """ Validates a parameter as a valid character. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid value. :type Boolean """ return ValidateData.funcIsValidString(parameterValue) #Tested 13 @staticmethod def funcIsValidPositiveNumberChar(parameterValue): """ Validates a parameter as a valid character representing a number. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid value. :type Boolean """ #Check to make sure is a valid string if not ValidateData.funcIsValidString(parameterValue): return False #Try to convert to decimal try: decimalConversion = decimal.Decimal(parameterValue) if decimalConversion < 0: return False except: return False return True #Tested 9 @staticmethod def funcIsValidFlagChar(parameterValue): """ Validates a parameter as a valid character representing a boolean. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid value. :type Boolean """ if parameterValue == '0' or parameterValue == "0" or parameterValue == '1' or parameterValue == "1": return True return False #Tested 15 @staticmethod def funcIsValidBoundedIntegerChar(parameterValue, iValueOne, iValueTwo): """ Validates a parameter as a valid characater that represents an integer inclusively bounded by two given values. :param parameterValue: Value to be evaluated. :type Unknown :param iValueOne: One bound for the value. :type Integer :param iValueTwo: The other bound for the data. :type Integer :return Boolean: True indicates the parameter is a valid value. :type Boolean """ #Check to make sure is a valid string if not ValidateData.funcIsValidString(parameterValue): return False #Check to make sure is a valid integer if not ValidateData.funcIsValidInteger(iValueOne): return False #Check to make sure is a valid integer if not ValidateData.funcIsValidInteger(iValueTwo): return False #Try to convert to decimal try: intConversion = int(parameterValue) if(iValueOne < iValueTwo): if ((intConversion >= iValueOne) and (intConversion <= iValueTwo)): return True return False if(iValueTwo < iValueOne): if ((intConversion >= iValueTwo) and (intConversion <= iValueOne)): return True return False if(iValueOne == iValueTwo): if (intConversion == iValueOne): return True return False except: return False #Tested 9 @staticmethod def funcIsValidList(parameterValue): """ Validates a parameter as a list. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a list :type Boolean """ #Check to make sure it is not null if parameterValue == None: return False #Check to make sure it is a list if not type(parameterValue) is ListType: return False #Check elements listSize = len(parameterValue) for i in range(0,listSize): if parameterValue[i] == None: return False if type(parameterValue[i]) is ListType: if ValidateData.funcIsValidList(parameterValue[i]) == False: return False return True #Tested 9 @staticmethod def funcIsValidTuple(parameterValue): """ Validates a parameter as a tuple. :param parameterValue: Value to be evaluated. 
:type Unknown :return Boolean: True indicates the parameter is a tuple :type Boolean """ #Check to make sure it is not null if parameterValue == None: return False #Check to make sure it is a tuple if not type(parameterValue) is TupleType: return False #Check elements tupleSize = len(parameterValue) for i in range(0,tupleSize): if parameterValue[i] == None: return False if type(parameterValue[i]) is TupleType: if ValidateData.funcIsValidTuple(parameterValue[i]) == False: return False return True #Tested 7 @staticmethod def funcIsValidNumericList(parameterValue): """ Validates a parameter as a list of numeric values. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a list of numeric values. :type Boolean """ #Check is valid list if(not ValidateData.funcIsValidList(parameterValue)): return False #Check elements listSize = len(parameterValue) for i in xrange(0,listSize): if(not ValidateData.funcIsValidNumeric(parameterValue[i])): return False return True #Tested 7 @staticmethod def funcIsValidStringList(parameterValue): """ Validates a parameter as a list of string values. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a list of string values. :type Boolean """ #Check is valid list if(not ValidateData.funcIsValidList(parameterValue)): return False #Check elements listSize = len(parameterValue) for i in xrange(0,listSize): if(not ValidateData.funcIsValidString(parameterValue[i])): return False return True #Tested 4 @staticmethod def funcIsValidNPArray(parameterValue): """ Validates a parameter as a numpy array. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a numpy array. :type Boolean """ #Check to make sure it is not null if parameterValue == None: return False #Check to make sure it is a structured array if not str(type(parameterValue)) == "<type 'numpy.ndarray'>": return False return True #Tested 9 @staticmethod def funcIsValidDictionary(parameterValue): """ Validates a parameter as a dictionary. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a dictionary. :type Boolean """ #Check to make sure it is not null if parameterValue == None: return False #Check to make sure it is a dictionary if not type(parameterValue) is DictType: return False #Check key elements keyList = parameterValue.keys() keyListSize = len(keyList) for i in range(0,keyListSize): if keyList[i] == None: return False if type(keyList[i]) is ListType: if ValidateData.funcIsValidList(keyList[i]) == False: return False #Check value elements itemList = parameterValue.values() itemListSize = len(itemList) for i in range(0,itemListSize): if itemList[i] == None: return False if type(itemList[i]) is ListType: if ValidateData.funcIsValidList(itemList[i]) == False: return False return True #Tested 18 @staticmethod def funcIsValidDNASequence(parameterValue): """ Validates a parameter as a valid DNA sequence. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid value. :type Boolean """ if ValidateData.funcIsValidString(parameterValue): expression = re.compile(r'[^atcgATCG]') if not None == expression.search(parameterValue): return False return True return False #Tested 15 @staticmethod def funcIsValidNucleotideBase(parameterValue): """ Validates a parameter as a character which is a valid nucleotide representation. :param parameterValue: Value to be evaluated.
:type Unknown :return Boolean: True indicates the parameter is a valid value. :type Boolean """ if (ValidateData.funcIsValidDNASequence(parameterValue) or (parameterValue == 'u') or (parameterValue == "U")): if (len(parameterValue) == 1): return True return False #Testing 4 @staticmethod def funcIsValidFileName(parameterValue): """ Validates a parameter as a valid file name. :param parameterValue: Value to be evaluated. :type Unknown :return Boolean: True indicates the parameter is a valid file path. :type Boolean """ if parameterValue is None: return False elif(ValidateData.funcIsValidString(parameterValue)): return os.path.exists(parameterValue) return False #Tested 5 @staticmethod def funcIsValidClass(parameterValue, strCorrectName): """ Validates a parameter as a valid class (of a specifc type given by name). :param parameterValue: Value to be evaluated. :type Unknown :param strCorrectName: Name of te class the parameter should be. :type Unknown :return Boolean: True indicates the parameter is a valid value. :type Boolean """ if(parameterValue==None): return False if not ValidateData.funcIsValidString(strCorrectName): return False classType = type(parameterValue).__name__ if(classType == strCorrectName): return True if(classType == 'instance'): if(parameterValue.__class__.__name__==strCorrectName): return True else: return False return False nsegata-lefse-3ce86f78d59e/lefsebiom/__init__.py0000644000000000000000000000000012750144356017637 0ustar 00000000000000nsegata-lefse-3ce86f78d59e/license.txt0000644000000000000000000000207112750144356015756 0ustar 00000000000000Copyright (c) 2012, Nicola Segata and Curtis Huttenhower Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
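# A few illustrative calls to the ValidateData helpers defined above
# (values are made up; the checks assume Python 2 types):
#   from lefsebiom.ValidateData import ValidateData
#   ValidateData.funcIsValidString("oxygen_availability")    # True
#   ValidateData.funcIsValidString("   ")                    # False: blank
#   ValidateData.funcIsValidBoundedIntegerChar("5", 1, 10)   # True
#   ValidateData.funcIsValidFlagChar("2")                    # False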
nsegata-lefse-3ce86f78d59e/plot_cladogram.py0000755000000000000000000003453312750144356017147 0ustar 00000000000000#!/usr/bin/env python import os,sys,matplotlib,argparse,string matplotlib.use('Agg') from pylab import * from lefse import * import numpy as np colors = ['r','g','b','m','c',[1.0,0.5,0.0],[0.0,1.0,0.0],[0.33,0.125,0.0],[0.75,0.75,0.75],'k'] dark_colors = [[0.4,0.0,0.0],[0.0,0.2,0.0],[0.0,0.0,0.4],'m','c',[1.0,0.5,0.0],[0.0,1.0,0.0],[0.33,0.125,0.0],[0.75,0.75,0.75],'k'] class CladeNode: def __init__(self, name, abundance, viz = True): self.id = name self.name = name.split(".") self.last_name = self.name[-1] self.abundance = abundance self.pos = (-1.0,-1.0) self.children = {} self.isleaf = True self.color = 'y' self.next_leaf = -1 self.prev_leaf = -1 self.viz = viz def __repr__(self): return self.last_name def add_child(self,node): self.isleaf = False self.children[node.__repr__()] = node def get_children(self): ck = sorted(self.children.keys()) return [self.children[k] for k in ck] def get_color(self): return self.color def set_color(self,c): self.color = c def set_pos(self,pos): self.pos = pos def read_params(args): parser = argparse.ArgumentParser(description='Cladoplot') parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="tab delimited input file") parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output image") parser.add_argument('--clade_sep',dest="clade_sep", type=float, default=1.5) parser.add_argument('--max_lev',dest="max_lev", type=int, default=-1) parser.add_argument('--max_point_size',dest="max_point_size", type=float, default=6.0) parser.add_argument('--min_point_size',dest="min_point_size", type=float, default=1) parser.add_argument('--point_edge_width',dest="markeredgewidth", type=float, default=.25) parser.add_argument('--siblings_connector_width',dest="siblings_connector_width", type=float, default=2) parser.add_argument('--parents_connector_width',dest="parents_connector_width", type=float, default=0.75) parser.add_argument('--radial_start_lev',dest="radial_start_lev", type=int, default=1) parser.add_argument('--labeled_start_lev',dest="labeled_start_lev", type=int, default=2) parser.add_argument('--labeled_stop_lev',dest="labeled_stop_lev", type=int, default=5) parser.add_argument('--abrv_start_lev',dest="abrv_start_lev", type=int, default=3) parser.add_argument('--abrv_stop_lev',dest="abrv_stop_lev", type=int, default=5) parser.add_argument('--expand_void_lev',dest="expand_void_lev", type=int, default=1) parser.add_argument('--class_legend_vis',dest="class_legend_vis", type=int, default=1) parser.add_argument('--colored_connector',dest="colored_connectors", type=int, default=1) parser.add_argument('--alpha',dest="alpha", type=float, default=0.2) parser.add_argument('--title',dest="title", type=str, default="Cladogram") parser.add_argument('--sub_clade',dest="sub_clade", type=str, default="") parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="14") parser.add_argument('--right_space_prop',dest="r_prop", type=float, default=0.1) parser.add_argument('--left_space_prop',dest="l_prop", type=float, default=0.1) parser.add_argument('--label_font_size',dest="label_font_size", type=str, default="6") parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background") parser.add_argument('--colored_labels',dest="col_lab", type=int, choices=[0,1], default=1, help="draw the label with class color (1) or in 
black (0)") parser.add_argument('--class_legend_font_size',dest="class_legend_font_size", type=str, default="10") parser.add_argument('--dpi',dest="dpi", type=int, default=72) parser.add_argument('--format', dest="format", choices=["png","svg","pdf"], default="svg", type=str, help="the format for the output file") parser.add_argument('--all_feats', dest="all_feats", type=str, default="") args = parser.parse_args() return vars(args) def cmp_names(la,lb): if len(la) != len(lb): return False for p in [(a,b) for i,a in enumerate(la) for j,b in enumerate(lb) if i == j]: if p[0] != p[1]: return False return True def build_tree(father,all_nodes,l,depth,viz): cc = [n for n in all_nodes if len(n.name) > len(father.name) and cmp_names(father.name,n.name[:len(father.name)])] children = [n for n in cc if len(n.name) == len(father.name)+1] if len(children) == 0 and l < depth -1: # !!! nc = CladeNode(father.id+"."+father.id.split(".")[-1],1.0,viz) father.add_child(nc) children.append(nc) for child in children: build_tree(child,cc,l+1,depth,viz) father.add_child(child) def get_all_nodes(father): ret = [father] children = father.get_children() for c in children: ret += get_all_nodes(c) return ret def read_data(input_file,params): with open(input_file, 'r') as inp: if params['sub_clade'] == "": rows = [line.strip().split()[:-1] for line in inp.readlines() if params['max_lev'] < 1 or line.split()[0].count(".") < params['max_lev']] else: rows = [line.split(params['sub_clade']+".")[1].strip().split()[:-1] for line in inp.readlines() if ( params['max_lev'] < 1 or line.split()[0].count(".") < params['max_lev'] ) and line.startswith(params['sub_clade']+".")] all_names = [lin[0] for lin in rows] to_add = [] abundances = [float(v) for v in zip(*rows)[1] if v >= 0.0] tree = {} tree['classes'] = list(set([v[2] for v in rows if len(v)>2])) tree['classes'].sort() all_nodes = [CladeNode("root."+row[0],float(row[1])) for row in rows] depth = max([len(n.name) for n in all_nodes]) n2 = ["_".join(nn.name) for nn in all_nodes] for i,nn in enumerate(all_nodes): n = nn while "_".join(n.name[:-1]) not in n2 and len(n.name) > 1: n = CladeNode(".".join(n.name[:-1]),n.abundance) all_nodes.append(n) n2.append("_".join(n.name)) cls2 = [] if params['all_feats'] != "": cls2 = sorted(params['all_feats'].split(":")) for i,v in enumerate(rows): if len(v)>2: if len(cls2) > 0: all_nodes[i].set_color(colors[cls2.index(v[2])%len(colors)]) else: if v[2].count('rgbcol') > 0: ccc = [float(tt) for tt in v[2].split('_')[1:]] all_nodes[i].set_color(ccc) else: all_nodes[i].set_color(colors[sorted(tree['classes']).index(v[2])%len(colors)]) root = CladeNode("root",-1.0) root.set_pos((0.0,0.0)) build_tree(root,all_nodes,0,depth,params['expand_void_lev']==1) all_nodes = get_all_nodes(root) tree['root'] = root tree['max_abs'] = max(abundances) tree['min_abs'] = min(abundances) levs = [] for i in range(depth): depthi = [n for n in all_nodes if len(n.name) == i+1] levs.append(len(depthi)) tree['nlev'] = levs return tree def add_all_pos(father,n,distn,seps,tsep,mlev,last_leaf=-1,nc=1): children = father.get_children() leaves = True if children[0].isleaf else False for i,child in enumerate(children): if leaves: n += 1.0 men = 0.5 if len(children) == 1 else 0.0 child.set_pos((n*distn-men*float(distn)+tsep,(len(father.name))/float(mlev-1))) if last_leaf != -1: child.prev_leaf = last_leaf last_leaf.next_leaf = child last_leaf = child else: ln = n ltsep = tsep n,tsep,last_leaf = add_all_pos(child,n,distn,seps,tsep,mlev,last_leaf,len(children)) nn = (ln + 
n)*0.5*distn ssep = (ltsep + tsep)*0.5 if n-ln == 1: ssep = ltsep child.set_pos((nn+ssep,(len(father.name))/float(mlev-1))) tsep += seps[len(father.name)-1] return n,tsep,last_leaf def plot_points(father,params,pt_scale,ax): children = father.get_children() children.sort(key = lambda a: -int(a.get_color() == 'y')*a.abundance) x,r = father.pos[0], father.pos[1] for i,child in enumerate(children): xc,rc = plot_points(child,params,pt_scale,ax) if not father.viz: return x,r ps = pt_scale[0]+father.abundance/pt_scale[1]+pt_scale[0] col = father.get_color() pw = params['markeredgewidth'] if col == 'y' else params['markeredgewidth']*3.0 if x==0 and r==0: ax.plot(x,r, 'o',markersize=ps,color=col,markeredgewidth=0.01,markeredgecolor=params['fore_color']) else: ax.plot(x,r, 'o',markersize=ps,color=col,markeredgewidth=pw,markeredgecolor=params['fore_color']) return x,r def plot_lines(father,params,depth,ax,xf): children = father.get_children() x,r = father.pos[0], father.pos[1] for i,child in enumerate(children): xc,rc = plot_lines(child,params,depth,ax,x) if i == 0: x_first, r_first = xc, rc if len(father.name) >= depth-params['radial_start_lev']: col = params['fore_color'] lw=params['parents_connector_width'] if not child.viz: continue if father.get_color() != 'y' and father.get_color() == child.get_color() and params['colored_connectors']: col = child.get_color() lw *=2.5 if col != params['fore_color']: ax.plot([x,xc],[r,rc],"-",color=params['fore_color'],lw=lw*1.5) ax.plot([x,xc],[r,rc],"-",color=col,lw=lw) if not father.viz or (len(children) == 1 and not children[0].viz): return x,r if len(father.name) < depth-params['radial_start_lev']: col = params['fore_color'] lw=params['parents_connector_width'] if father.get_color() != 'y': f =True for child in children: if child.get_color() != father.get_color() or not params['colored_connectors']: f = False break if f: col = father.get_color() lw *= 2.5 if not (x==0 and r==0): xx = xc if len(children) > 0 else x if len(children) == 0: rc = r xt = x if len(children)>1 else xx if col != params['fore_color']: ax.plot([x,xt],[r,rc],"-",color=params['fore_color'],lw=lw*1.5) ax.plot([x,xt],[r,rc],"-",color=col,lw=lw) if len(children) > 0 and 1 < len(father.name) < depth-params['radial_start_lev']: xs = arange(x_first,xc,0.01) ys = [rc for t in xs] ax.plot(xs,ys,"-",color=col,lw=params['siblings_connector_width'],markeredgecolor=params['fore_color']) return x,r def uniqueid(): for l in string.lowercase: yield l for l in string.lowercase: for i in range(10): yield l+str(i) i = 0 while True: yield str(i) i += 1 def plot_names(father,params,depth,ax,u_i,seps): children = father.get_children() l = len(father.name) if len(children)==0: if father.prev_leaf == -1 or father.next_leaf == -1: fr_0, fr_1 = father.pos[0], father.pos[0] else: fr_0, fr_1 = (father.pos[0]+father.prev_leaf.pos[0])*0.5, (father.pos[0]+father.next_leaf.pos[0])*0.5 for i,child in enumerate(children): fr,to = plot_names(child,params,depth,ax,u_i,seps) if i == 0: fr_0 = fr fr_1 = to if father.get_color() != 'y' and params['labeled_start_lev'] < l <= params['labeled_stop_lev']+1: col = father.get_color() dd = params['labeled_stop_lev'] - params['labeled_start_lev'] + 1 de = depth - 1 dim = 1.0/float(de) perc_ext = 0.65 if dim > 0.1 else 1.0 clto = (de-l+1)*dim+dim*(dd+1-(l-dd-1))*perc_ext clto = (de-l+1)*dim+dim*(dd-(l-params['labeled_start_lev'])+1)*perc_ext des = float(180.0*(fr_0+fr_1)/np.pi)*0.5-90 lab = "" txt = father.last_name if params['abrv_start_lev'] < l <= params['abrv_stop_lev'] + 1: 
ide = u_i.next() lab = str(ide)+": "+father.last_name txt = str(ide) # ax.bar(fr_0, clto, width = fr_1-fr_0, bottom = float(l-1)/float(depth-1), alpha = params['alpha'], color=col, edgecolor=col) ax.bar(fr_0, clto, width = fr_1-fr_0, bottom = float(l-1)/float(de), alpha = params['alpha'], color=col, edgecolor=col) ax.bar(0.0, 0.0, width = 0.0, bottom = 0.0, alpha = 1.0, color=col, edgecolor=params['fore_color'], label=lab) if l <= params['abrv_stop_lev'] + 1: if not params['col_lab']: col = params['fore_color'] else: if col not in colors: col = params['fore_color'] else: col = dark_colors[colors.index(col)%len(dark_colors)] ax.text((fr_0+fr_1)*0.5, clto+float(l-1)/float(de)-dim*perc_ext/2.0, txt, size = params['label_font_size'], rotation=des, ha ="center", va="center", color=col) return fr_0, fr_1 def draw_tree(out_file,tree,params): plt_size = 7 nlev = tree['nlev'] pt_scale = (params['min_point_size'],max(1.0,((tree['max_abs']-tree['min_abs']))/(params['max_point_size']-params['min_point_size']))) depth = len(nlev) sep = (2.0*np.pi)/float(nlev[-1]) seps = [params['clade_sep']*sep/float(depth-i+1) for i in range(1,len(tree['nlev'])+1)] totseps = sum([s*nlev[i] for i,s in enumerate(seps[:-1])]) clade_sep_err = True if totseps > np.pi else False while totseps > np.pi: params['clade_sep'] *= 0.75 seps = [params['clade_sep']*sep/(float(depth-i+1)*0.25) for i in range(1,len(tree['nlev'])+1)] totseps = sum([s*nlev[i] for i,s in enumerate(seps[:-1])]) if clade_sep_err: print 'clade_sep parameter too large, lowered to',params['clade_sep'] fig = plt.figure(edgecolor=params['back_color'],facecolor=params['back_color']) ax = fig.add_subplot(111, polar=True, frame_on=False, axis_bgcolor=params['back_color'] ) plt.subplots_adjust(right=1.0-params['r_prop'],left=params['l_prop']) ax.grid(False) xticks([]) yticks([]) ds = (2.0*np.pi-totseps)/float(nlev[-1]) add_all_pos(tree['root'],0.0,ds,seps,0.0,depth) plot_lines(tree['root'],params,depth,ax,0) plot_points(tree['root'],params,pt_scale,ax) plot_names(tree['root'],params,depth,ax,uniqueid(),seps) r = np.arange(0, 3.0, 0.01) theta = 2*np.pi*r def get_col_attr(x): return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor') h, l = ax.get_legend_handles_labels() if len(l) > 0: leg = ax.legend(bbox_to_anchor=(1.05, 1), frameon=False, loc=2, borderaxespad=0.,prop={'size':params['label_font_size']}) if leg != None: gca().add_artist(leg) for o in leg.findobj(get_col_attr): o.set_color(params['fore_color']) cll = sorted(tree['classes']) if params['all_feats'] == "" else sorted(params['all_feats'].split(":")) nll = [ax.bar(0.0, 0.0, width = 0.0, bottom = 0.0, color=colors[i%len(colors)], label=c) for i,c in enumerate(cll) if c in tree['classes']] cl = [c for c in cll if c in tree['classes']] ax.set_title(params['title'],size=params['title_font_size'],color=params['fore_color']) if params['class_legend_vis']: l2 = legend(nll, cl, loc=2, prop={'size':params['class_legend_font_size']}, frameon=False) if l2 != None: for o in l2.findobj(get_col_attr): o.set_color(params['fore_color']) plt.savefig(out_file,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi']) plt.close() if __name__ == '__main__': params = read_params(sys.argv) params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k' clad_tree = read_data(params['input_file'],params) draw_tree(params['output_file'],clad_tree,params) nsegata-lefse-3ce86f78d59e/plot_features.py0000755000000000000000000001710012750144356017023 0ustar 
00000000000000#!/usr/bin/env python import os,sys,matplotlib,zipfile,argparse,string matplotlib.use('Agg') from pylab import * from lefse import * import random as rand colors = ['r','g','b','m','c'] def read_params(args): parser = argparse.ArgumentParser(description='Cladoplot') parser.add_argument('input_file_1', metavar='INPUT_FILE', type=str, help="dataset files") parser.add_argument('input_file_2', metavar='INPUT_FILE', type=str, help="LEfSe output file") parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output (the zip file if an archive is required, the output directory otherwise)") parser.add_argument('--width',dest="width", type=float, default=10.0 ) parser.add_argument('--height',dest="height", type=float, default=4.0) parser.add_argument('--top',dest="top", type=float, default=-1.0, help="set maximum y limit (-1.0 means automatic limit)") parser.add_argument('--bot',dest="bot", type=float, default=0.0, help="set minimum y limit (default 0.0, -1.0 means automatic limit)") parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="14") parser.add_argument('--class_font_size',dest="class_font_size", type=str, default="14") parser.add_argument('--class_label_pos',dest="class_label_pos", type=str, choices=["up","down"], default="up") parser.add_argument('--subcl_mean',dest="subcl_mean", type=str, choices=["y","n"], default="y") parser.add_argument('--subcl_median',dest="subcl_median", type=str, choices=["y","n"], default="y") parser.add_argument('--font_size',dest="font_size", type=str, default="10") parser.add_argument('-n',dest="unused", metavar="flt", type=float, default=-1.0,help="unused") parser.add_argument('--format', dest="format", default="png", choices=["png","pdf","svg"], type=str, help="the format for the output file") parser.add_argument('-f', dest="f", default="diff", choices=["all","diff","one"], type=str, help="wheter to plot all features (all), only those differentially abundant according to LEfSe or only one (the one given with --feature_name) ") parser.add_argument('--feature_name', dest="feature_name", default="", type=str, help="The name of the feature to plot (levels separated by .) 
") parser.add_argument('--feature_num', dest="feature_num", default="-1", type=int, help="The number of the feature to plot ") parser.add_argument('--archive', dest="archive", default="none", choices=["zip","none"], type=str, help="") parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background") parser.add_argument('--dpi',dest="dpi", type=int, default=72) args = parser.parse_args() return vars(args) def read_data(file_data,file_feats,params): with open(file_feats, 'r') as features: feats_to_plot = [(f.split()[:-1],len(f.split()) == 5) for f in features.readlines()] if not feats_to_plot: print "No features to plot\n" sys.exit(0) feats,cls,class_sl,subclass_sl,class_hierarchy,params['norm_v'] = load_data(file_data, True) if params['feature_num'] > 0: params['feature_name'] = [line.split()[0] for line in open(params['input_file_2'])][params['feature_num']-1] features = {} for f in feats_to_plot: if params['f'] == "diff" and not f[1]: continue if params['f'] == "one" and f[0][0] != params['feature_name']: continue features[f[0][0]] = {'dim':float(f[0][1]), 'abundances':feats[f[0][0]], 'sig':f[1], 'cls':cls, 'class_sl':class_sl, 'subclass_sl':subclass_sl, 'class_hierarchy':class_hierarchy} if not features: print "No features to plot\n" sys.exit(0) return features def plot(name,k_n,feat,params): fig = plt.figure(figsize=(params['width'], params['height']),edgecolor=params['fore_color'],facecolor=params['back_color']) ax = fig.add_subplot(111,axis_bgcolor=params['back_color']) subplots_adjust(bottom=0.15) max_m = 0.0 norm = 1.0 if float(params['norm_v']) < 0.0 else float(params['norm_v']) for v in feat['subclass_sl'].values(): fr,to = v[0], v[1] median = numpy.mean(feat['abundances'][fr:to]) if median > max_m: max_m = median max_m /= norm max_v = max_m*3 if max_m*3 < max(feat['abundances'])*1.1/norm else max(feat['abundances'])/norm min_v = max(0.0,min(feat['abundances'])*0.9/norm) if params['top'] > 0.0: max_v = params['top'] if params['bot'] >= 0.0: min_v = params['bot'] if max_v == 0.0: max_v = 0.0001 if max_v == min_v: max_v = min_v*1.1 cl_sep = max(int(sum([vv[1]/norm - vv[0]/norm for vv in feat['class_sl'].values()])/150.0),1) seps = [] xtics = [] x2tics = [] last_fr = 0.0 for i,cl in enumerate(sorted(feat['class_hierarchy'].keys())): for j,subcl in enumerate(feat['class_hierarchy'][cl]): fr = feat['subclass_sl'][subcl][0] to = feat['subclass_sl'][subcl][1] val = feat['abundances'][fr:to] fr += cl_sep*i to += cl_sep*i pos = arange(fr,to) max_x = to col = colors[j%len(colors)] vv = [v1/norm for v1 in val] median = numpy.median(vv) mean = numpy.mean(vv) valv = [max(min(v/norm,max_v),min_v) for v in val] ax.bar(pos,valv, align='center', color=col, edgecolor=col, linewidth=0.1 ) if params['subcl_median'] == 'y': ax.plot([fr,to-1],[median,median],"k--",linewidth=1,color=params['fore_color']) if params['subcl_mean'] == 'y': ax.plot([fr,to-1],[mean,mean],"-",linewidth=1,color=params['fore_color']) nna = subcl if subcl.count("_") == 0 or not subcl.startswith(cl) else "_".join(subcl.split(cl)[1:]) if nna == "subcl" or nna == "_subcl": nna = " " xtics.append(((fr+(to-fr)/2),nna)) seps.append(float(to)) x2tics.append(((last_fr+(to-last_fr)/2),cl)) last_fr = to + float(cl_sep) for s in seps[:-1]: ax.plot([s,s],[min_v,max_v],"-",linewidth=5,color=params['fore_color']) ax.set_title(k_n, size=params['title_font_size']) xticks([x[0] for x in xtics],[x[1] for x in xtics],rotation=-30, ha = 'left', fontsize=params['font_size'], 
color=params['fore_color']) yticks(fontsize=params['font_size']) ylabel('Relative abundance') ax.set_ylim((min_v,max_v)) a,b = ax.get_xlim() ax.set_xlim((0-float(last_fr)/float(b-a),max_x)) ax.yaxis.grid(True) def get_col_attr(x): return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor') def get_edgecol_attr(x): return hasattr(x, 'set_edgecolor') for o in fig.findobj(get_col_attr): o.set_color(params['fore_color']) for o in fig.findobj(get_edgecol_attr): if o.get_edgecolor() == params['back_color']: o.set_edgecolor(params['fore_color']) for t in x2tics: m = ax.get_ylim()[1]*0.97 if params['class_label_pos']=='up' else 0.07 plt.text(t[0],m, "class: "+t[1], ha ="center", size=params['class_font_size'], va="top", bbox = dict(boxstyle="round", ec='k', fc='y')) plt.savefig(name,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi']) plt.close() return name if __name__ == '__main__': params = read_params(sys.argv) params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k' features = read_data(params['input_file_1'],params['input_file_2'],params) if params['archive'] == "zip": file = zipfile.ZipFile(params['output_file'], "w") for k,f in features.items(): print "Exporting ", k if params['archive'] == "zip": of = plot("/tmp/"+str(int(f['sig']))+"_"+"-".join(k.split("."))+"."+params['format'],k,f,params) file.write(of, os.path.basename(of), zipfile.ZIP_DEFLATED) else: if params['f'] == 'one': plot(params['output_file'],k,f,params) else: plot(params['output_file']+str(int(f['sig']))+"_"+"-".join(k.split("."))+"."+params['format'],k,f,params) if params['archive'] == "zip": file.close() nsegata-lefse-3ce86f78d59e/plot_res.py0000755000000000000000000002317112750144356016003 0ustar 00000000000000#!/usr/bin/env python import os,sys import matplotlib matplotlib.use('Agg') from pylab import * from collections import defaultdict from lefse import * import argparse colors = ['r','g','b','m','c','y','k','w'] def read_params(args): parser = argparse.ArgumentParser(description='Plot results') parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="tab delimited input file") parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output image") parser.add_argument('--feature_font_size', dest="feature_font_size", type=int, default=7, help="the file for the output image") parser.add_argument('--format', dest="format", choices=["png","svg","pdf"], default='png', type=str, help="the format for the output file") parser.add_argument('--dpi',dest="dpi", type=int, default=72) parser.add_argument('--title',dest="title", type=str, default="") parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="12") parser.add_argument('--class_legend_font_size',dest="class_legend_font_size", type=str, default="10") parser.add_argument('--width',dest="width", type=float, default=7.0 ) parser.add_argument('--height',dest="height", type=float, default=4.0, help="only for vertical histograms") parser.add_argument('--left_space',dest="ls", type=float, default=0.2 ) parser.add_argument('--right_space',dest="rs", type=float, default=0.1 ) parser.add_argument('--orientation',dest="orientation", type=str, choices=["h","v"], default="h" ) parser.add_argument('--autoscale',dest="autoscale", type=int, choices=[0,1], default=1 ) parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background") parser.add_argument('--subclades', 
dest="n_scl", type=int, default=1, help="number of label levels to be dislayed (starting from the leaves, -1 means all the levels, 1 is default )") parser.add_argument('--max_feature_len', dest="max_feature_len", type=int, default=60, help="Maximum length of feature strings (def 60)") parser.add_argument('--all_feats', dest="all_feats", type=str, default="") parser.add_argument('--otu_only', dest="otu_only", default=False, action='store_true', help="Plot only species resolved OTUs (as opposed to all levels)") parser.add_argument('--report_features', dest="report_features", default=False, action='store_true', help="Report important features to STDOUT") args = parser.parse_args() return vars(args) def read_data(input_file,output_file,otu_only): with open(input_file, 'r') as inp: if not otu_only: rows = [line.strip().split()[:-1] for line in inp.readlines() if len(line.strip().split())>3] else: rows = [line.strip().split()[:-1] for line in inp.readlines() if len(line.strip().split())>3 and len(line.strip().split()[0].split('.'))==8] # a feature with length 8 will have an OTU id associated with it classes = list(set([v[2] for v in rows if len(v)>2])) if len(classes) < 1: print "No differentially abundant features found in "+input_file os.system("touch "+output_file) sys.exit() data = {} data['rows'] = rows data['cls'] = classes return data def plot_histo_hor(path,params,data,bcl,report_features): cls2 = [] if params['all_feats'] != "": cls2 = sorted(params['all_feats'].split(":")) cls = sorted(data['cls']) if bcl: data['rows'].sort(key=lambda ab: fabs(float(ab[3]))*(cls.index(ab[2])*2-1)) else: mmax = max([fabs(float(a)) for a in zip(*data['rows'])[3]]) data['rows'].sort(key=lambda ab: fabs(float(ab[3]))/mmax+(cls.index(ab[2])+1)) pos = arange(len(data['rows'])) head = 0.75 tail = 0.5 ht = head + tail ints = max(len(pos)*0.2,1.5) fig = plt.figure(figsize=(params['width'], ints + ht), edgecolor=params['back_color'],facecolor=params['back_color']) ax = fig.add_subplot(111,frame_on=False,axis_bgcolor=params['back_color']) ls, rs = params['ls'], 1.0-params['rs'] plt.subplots_adjust(left=ls,right=rs,top=1-head*(1.0-ints/(ints+ht)), bottom=tail*(1.0-ints/(ints+ht))) fig.canvas.set_window_title('LDA results') l_align = {'horizontalalignment':'left', 'verticalalignment':'baseline'} r_align = {'horizontalalignment':'right', 'verticalalignment':'baseline'} added = [] m = 1 if data['rows'][0][2] == cls[0] else -1 out_data = defaultdict(list) # keep track of which OTUs result in the plot for i,v in enumerate(data['rows']): if report_features: otu = v[0].split('.')[7].replace('_','.') # string replace retains format New.ReferenceOTUxxx score = v[3] otu_class = v[2] out_data[otu] = [score, otu_class] indcl = cls.index(v[2]) lab = str(v[2]) if str(v[2]) not in added else "" added.append(str(v[2])) col = colors[indcl%len(colors)] if len(cls2) > 0: col = colors[cls2.index(v[2])%len(colors)] vv = fabs(float(v[3])) * (m*(indcl*2-1)) if bcl else fabs(float(v[3])) ax.barh(pos[i],vv, align='center', color=col, label=lab, height=0.8, edgecolor=params['fore_color']) mv = max([abs(float(v[3])) for v in data['rows']]) if report_features: print 'OTU\tLDA_score\tCLass' for i in out_data: print '%s\t%s\t%s' %(i, out_data[i][0], out_data[i][1]) for i,r in enumerate(data['rows']): indcl = cls.index(data['rows'][i][2]) if params['n_scl'] < 0: rr = r[0] else: rr = ".".join(r[0].split(".")[-params['n_scl']:]) if len(rr) > params['max_feature_len']: rr = rr[:params['max_feature_len']/2-2]+" 
[..]"+rr[-params['max_feature_len']/2+2:] if m*(indcl*2-1) < 0 and bcl: ax.text(mv/40.0,float(i)-0.3,rr, l_align, size=params['feature_font_size'],color=params['fore_color']) else: ax.text(-mv/40.0,float(i)-0.3,rr, r_align, size=params['feature_font_size'],color=params['fore_color']) ax.set_title(params['title'],size=params['title_font_size'],y=1.0+head*(1.0-ints/(ints+ht))*0.8,color=params['fore_color']) ax.set_yticks([]) ax.set_xlabel("LDA SCORE (log 10)") ax.xaxis.grid(True) xlim = ax.get_xlim() if params['autoscale']: ran = arange(0.0001,round(round((abs(xlim[0])+abs(xlim[1]))/10,4)*100,0)/100) if len(ran) > 1 and len(ran) < 100: ax.set_xticks(arange(xlim[0],xlim[1]+0.0001,min(xlim[1]+0.0001,round(round((abs(xlim[0])+abs(xlim[1]))/10,4)*100,0)/100))) ax.set_ylim((pos[0]-1,pos[-1]+1)) leg = ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=5, borderaxespad=0., frameon=False,prop={'size':params['class_legend_font_size']}) def get_col_attr(x): return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor') for o in leg.findobj(get_col_attr): o.set_color(params['fore_color']) for o in ax.findobj(get_col_attr): o.set_color(params['fore_color']) plt.savefig(path,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi']) plt.close() def plot_histo_ver(path,params,data,report_features): cls = data['cls'] mmax = max([fabs(float(a)) for a in zip(*data['rows'])[1]]) data['rows'].sort(key=lambda ab: fabs(float(ab[3]))/mmax+(cls.index(ab[2])+1)) pos = arange(len(data['rows'])) if params['n_scl'] < 0: nam = [d[0] for d in data['rows']] else: nam = [d[0].split(".")[-min(d[0].count("."),params['n_scl'])] for d in data['rows']] fig = plt.figure(edgecolor=params['back_color'],facecolor=params['back_color'],figsize=(params['width'], params['height'])) ax = fig.add_subplot(111,axis_bgcolor=params['back_color']) plt.subplots_adjust(top=0.9, left=params['ls'], right=params['rs'], bottom=0.3) fig.canvas.set_window_title('LDA results') l_align = {'horizontalalignment':'left', 'verticalalignment':'baseline'} r_align = {'horizontalalignment':'right', 'verticalalignment':'baseline'} added = [] out_data = defaultdict(list) # keep track of which OTUs result in the plot for i,v in enumerate(data['rows']): if report_features: otu = v[0].split('.')[7].replace('_','.') # string replace retains format New.ReferenceOTUxxx score = v[3] otu_class = v[2] out_data[otu] = [score, otu_class] indcl = data['cls'].index(v[2]) lab = str(v[2]) if str(v[2]) not in added else "" added.append(str(v[2])) col = colors[indcl%len(colors)] vv = fabs(float(v[3])) ax.bar(pos[i],vv, align='center', color=col, label=lab) if report_features: print 'OTU\tLDA_score\tCLass' for i in out_data: print '%s\t%s\t%s' %(i, out_data[i][0], out_data[i][1]) xticks(pos,nam,rotation=-20, ha = 'left',size=params['feature_font_size']) ax.set_title(params['title'],size=params['title_font_size']) ax.set_ylabel("LDA SCORE (log 10)") ax.yaxis.grid(True) a,b = ax.get_xlim() dx = float(len(pos))/float((b-a)) ax.set_xlim((0-dx,max(pos)+dx)) plt.savefig(path,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi']) plt.close() if __name__ == '__main__': params = read_params(sys.argv) params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k' data = read_data(params['input_file'],params['output_file'],params['otu_only']) if params['orientation'] == 'v': plot_histo_ver(params['output_file'],params,data,params['report_features']) else: 
nsegata-lefse-3ce86f78d59e/qiime2lefse.py0000755000000000000000000000633112750144356016360 0ustar 00000000000000#!/usr/bin/env python

import sys

def read_params(args):
    import argparse as ap
    import textwrap

    p = ap.ArgumentParser( description= '''
        Script to convert a QIIME TSV BIOM table for use with LEfSe.
        It is imperative that this table has taxa metadata associated with it
        named 'Consensus Lineage'; this can be done with e.g. the following
        biom convert command:
        ----
        biom convert -i otu.biom -o otu.txt --to-tsv --header-key Taxonomy --output-metadata-id 'Consensus Lineage'
        ''')
    p.add_argument( '--in', metavar='INPUT_FILE', type=str, nargs='?', default=sys.stdin,
            help="the QIIME OTU table file [ stdin if not present ]" )
    p.add_argument( '--md', metavar='METADATA_FILE', type=str, nargs='?', default=None,
            help="the QIIME sample metadata file [ only the OTU table without metadata is used if not present ]" )
    p.add_argument( '--out', metavar='OUTPUT_FILE', type=str, nargs='?', default=sys.stdout,
            help="the output file [ stdout if not present ]")
    p.add_argument( '-c', metavar="class attribute", type=str,
            help="the attribute to use as class" )
    p.add_argument( '-s', metavar="subclass attribute", type=str,
            help="the attribute to use as subclass" )
    p.add_argument( '-u', metavar="subject attribute", type=str,
            help="the attribute to use as subject" )
    return vars(p.parse_args())

def qiime2lefse(fin, fmd, fout, all_md, sel_md):
    with (fin if fin == sys.stdin else open(fin)) as inpf:
        lines = [list(ll) for ll in (zip(*[l.strip().split('\t') for l in inpf.readlines()[1:]]))]
    # append the OTU id to the taxonomy string of each feature column
    for i, (l1, l2) in enumerate(zip(lines[0], lines[-1])):
        if not l2 == 'Consensus Lineage':
            lines[-1][i] = l2+"|"+l1
    data = dict([(l[0], l[1:]) for l in lines[1:]])
    md = {}
    if fmd:
        with open(fmd) as inpf:
            mdlines = [l.strip().split('\t') for l in inpf.readlines()]
        mdf = mdlines[0][1:]
        for l in mdlines:
            mdd = dict(zip(mdf, l[1:]))
            md[l[0]] = mdd
    selected_md = md.values()[0].keys() if md else []
    if not all_md:
        selected_md = [s for s in sel_md if s]
    out_m = [ selected_md + list([d.replace(";","|").replace("\"","") for d in data['Consensus Lineage']]) ]
    for k, v in data.items():
        if k == 'Consensus Lineage':
            continue
        out_m.append( [md[k][kmd] for kmd in selected_md] + list(v) )
    with (fout if fout == sys.stdout else open(fout, "w")) as outf:
        for l in zip(*out_m):
            outf.write( "\t".join(l) + "\n" )

if __name__ == '__main__':
    pars = read_params( sys.argv )
    qiime2lefse( fin = pars['in'],
                 fmd = pars['md'],
                 fout = pars['out'],
                 all_md = not pars['c'] and not pars['s'] and not pars['u'],
                 sel_md = [pars['c'], pars['s'], pars['u']] )
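# Example invocation (a sketch; file names and the 'Treatment' metadata column are
# placeholders): convert a QIIME OTU table and its sample metadata into a LEfSe
# input matrix, ready to be passed to format_input.py:
#
#   qiime2lefse.py --in otu_table.txt --md sample_metadata.txt --out lefse_input.txt -c Treatment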
nsegata-lefse-3ce86f78d59e/requirements.txt0000644000000000000000000000025512750144356017061 0ustar 00000000000000
- R
- R libraries: splines, stats4, survival, mvtnorm, modeltools, coin, MASS
- python libraries: rpy2 (v. 2.1 or higher), numpy, matplotlib (v. 1.0 or higher), argparse
nsegata-lefse-3ce86f78d59e/run_lefse.py0000755000000000000000000001442412750144356016137 0ustar 00000000000000#!/usr/bin/env python

import os,sys,math,pickle
from lefse import *

def read_params(args):
    parser = argparse.ArgumentParser(description='LEfSe 1.0')
    parser.add_argument('input_file', metavar='INPUT_FILE', type=str,
            help="the input file")
    parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
            help="the output file containing the data for the visualization module")
    parser.add_argument('-o', dest="out_text_file", metavar='str', type=str, default="",
            help="set the file for exporting the result (only concise textual form)")
    parser.add_argument('-a', dest="anova_alpha", metavar='float', type=float, default=0.05,
            help="set the alpha value for the Anova test (default 0.05)")
    parser.add_argument('-w', dest="wilcoxon_alpha", metavar='float', type=float, default=0.05,
            help="set the alpha value for the Wilcoxon test (default 0.05)")
    parser.add_argument('-l', dest="lda_abs_th", metavar='float', type=float, default=2.0,
            help="set the threshold on the absolute value of the logarithmic LDA score (default 2.0)")
    parser.add_argument('--nlogs', dest="nlogs", metavar='int', type=int, default=3,
            help="max log influence of the LDA coefficients")
    parser.add_argument('--verbose', dest="verbose", metavar='int', choices=[0,1], type=int, default=0,
            help="verbose execution (default 0)")
    parser.add_argument('--wilc', dest="wilc", metavar='int', choices=[0,1], type=int, default=1,
            help="whether to perform the Wilcoxon step (default 1)")
    parser.add_argument('-r', dest="rank_tec", metavar='str', choices=['lda','svm'], type=str, default='lda',
            help="select LDA or SVM for effect size (default LDA)")
    parser.add_argument('--svm_norm', dest="svm_norm", metavar='int', choices=[0,1], type=int, default=1,
            help="whether to normalize the data in [0,1] for SVM feature weighting (default 1, strongly suggested)")
    parser.add_argument('-b', dest="n_boots", metavar='int', type=int, default=30,
            help="set the number of bootstrap iterations for LDA (default 30)")
    parser.add_argument('-e', dest="only_same_subcl", metavar='int', type=int, default=0,
            help="set whether to perform the Wilcoxon test only among subclasses with the same name (default 0)")
    parser.add_argument('-c', dest="curv", metavar='int', type=int, default=0,
            help="set whether to perform the Wilcoxon test using the Curtis approach [BETA VERSION] (default 0)")
    parser.add_argument('-f', dest="f_boots", metavar='float', type=float, default=0.67,
            help="set the subsampling fraction value for each bootstrap iteration (default 0.67)")
    parser.add_argument('-s', dest="strict", choices=[0,1,2], type=int, default=0,
            help="set the multiple testing correction options: 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for dependent comparisons")
    # parser.add_argument('-m', dest="m_boots", type=int, default=5,
    #         help="minimum cardinality of classes in each bootstrapping iteration (default 5)")
    parser.add_argument('--min_c', dest="min_c", metavar='int', type=int, default=10,
            help="minimum number of samples per subclass for performing the Wilcoxon test (default 10)")
    parser.add_argument('-t', dest="title", metavar='str', type=str, default="",
            help="set the title of the analysis (default: input file name without extension)")
    parser.add_argument('-y', dest="multiclass_strat", choices=[0,1], type=int, default=0,
            help="(for multiclass tasks) set whether the test is performed in a one-against-one (1 - more strict!) or in a one-against-all setting (0 - less strict) (default 0)")
    args = parser.parse_args()
    params = vars(args)
    if params['title'] == "": params['title'] = params['input_file'].split("/")[-1].split('.')[0]
    return params

if __name__ == '__main__':
    init()
    params = read_params(sys.argv)
    feats, cls, class_sl, subclass_sl, class_hierarchy = load_data(params['input_file'])
    kord, cls_means = get_class_means(class_sl, feats)
    wilcoxon_res = {}
    kw_n_ok = 0
    nf = 0
    # first step: Kruskal-Wallis test among classes; features passing it are then
    # (optionally) checked with the pairwise Wilcoxon test among subclasses
    for feat_name, feat_values in feats.items():
        if params['verbose']:
            print "Testing feature", str(nf), ": ", feat_name,
        nf += 1
        kw_ok, pv = test_kw_r(cls, feat_values, params['anova_alpha'], sorted(cls.keys()))
        if not kw_ok:
            if params['verbose']: print "\tkw ko"
            del feats[feat_name]
            wilcoxon_res[feat_name] = "-"
            continue
        if params['verbose']: print "\tkw ok\t",
        if not params['wilc']: continue
        kw_n_ok += 1
        res_wilcoxon_rep = test_rep_wilcoxon_r(subclass_sl, class_hierarchy, feat_values, params['wilcoxon_alpha'], params['multiclass_strat'], params['strict'], feat_name, params['min_c'], params['only_same_subcl'], params['curv'])
        wilcoxon_res[feat_name] = str(pv) if res_wilcoxon_rep else "-"
        if not res_wilcoxon_rep:
            if params['verbose']: print "wilc ko"
            del feats[feat_name]
        elif params['verbose']: print "wilc ok\t"

    if len(feats) > 0:
        print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
        if params['lda_abs_th'] < 0.0:
            lda_res, lda_res_th = dict([(k, 0.0) for k, v in feats.items()]), dict([(k, v) for k, v in feats.items()])
        else:
            if params['rank_tec'] == 'lda':
                lda_res, lda_res_th = test_lda_r(cls, feats, class_sl, params['n_boots'], params['f_boots'], params['lda_abs_th'], 0.0000000001, params['nlogs'])
            elif params['rank_tec'] == 'svm':
                lda_res, lda_res_th = test_svm(cls, feats, class_sl, params['n_boots'], params['f_boots'], params['lda_abs_th'], 0.0, params['svm_norm'])
            else:
                lda_res, lda_res_th = dict([(k, 0.0) for k, v in feats.items()]), dict([(k, v) for k, v in feats.items()])
    else:
        print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
        print "No features with significant differences between the two classes"
        lda_res, lda_res_th = {}, {}
    outres = {}
    outres['lda_res_th'] = lda_res_th
    outres['lda_res'] = lda_res
    outres['cls_means'] = cls_means
    outres['cls_means_kord'] = kord
    outres['wilcox_res'] = wilcoxon_res
    print "Number of discriminative features with abs LDA score >", params['lda_abs_th'], ":", len(lda_res_th)
    save_res(outres, params["output_file"])
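# Example invocation (a sketch with placeholder file names): a more stringent
# analysis using 100 bootstrap iterations for the LDA and a higher effect-size
# threshold than the defaults:
#
#   run_lefse.py data.in data.res -b 100 -l 3.0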