MACS-2.0.9/0000755000175000017500000000000011657057615013076 5ustar taoliutaoliu00000000000000MACS-2.0.9/MANIFEST0000644000175000017500000000114611657057615014231 0ustar taoliutaoliu00000000000000# file GENERATED by distutils, do NOT edit COPYING ChangeLog INSTALL MANIFEST README setup.py MACS2/Constants.py MACS2/OptValidator.py MACS2/OutputWriter.py MACS2/PeakModel.py MACS2/__init__.py MACS2/cPeakDetect.pyx MACS2/cPileup.pyx MACS2/cProb.pxd MACS2/cProb.pyx MACS2/IO/BinKeeper.py MACS2/IO/WiggleIO.py MACS2/IO/__init__.py MACS2/IO/bedGraphIO.py MACS2/IO/cBedGraph.pyx MACS2/IO/cCompositeScoreTrack.pyx MACS2/IO/cFixWidthTrack.pyx MACS2/IO/cParser.pyx MACS2/IO/cPeakIO.pyx MACS2/IO/cScoreTrack.pyx bin/bdgbroadcall bin/bdgcmp bin/bdgdiff bin/bdgpeakcall bin/filterdup bin/macs2 bin/macs2diff bin/randsample MACS-2.0.9/MACS2/0000755000175000017500000000000011657057615013703 5ustar taoliutaoliu00000000000000MACS-2.0.9/MACS2/cPileup.pyx0000644000175000017500000001221511630217211016024 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-06-19 16:14:57 Tao Liu> """Module Description: For pileup functions. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ from array import array from MACS2.IO.cBedGraph import bedGraphTrackI from MACS2.Constants import * # ------------------------------------ # constants # ------------------------------------ # to determine the byte size def pileup_bdg (trackI, d, baseline_value = 0, directional=True, halfextension=True): """Pileup tags into bedGraphTrackI object with extension. Tag will be extended towards 3' side with size of d if directional is Ture, or both sides with d/2 if directional is False. A tag is a single genomic location. trackI : A FWTrackII object with raw plus and minus 5' end positions d : tag will be extended to this value to 3' direction, unless directional is False. baseline_value : a value to be filled for missing values. directional: if False, the strand or direction of tag will be ignored, so that extenstion will be both sides with d/2. halfextension: only make a fragment of d/2 size centered at fragment center Return a bedGraphTrackI object. """ #step = 10000000 + 2*d # step to cache data points. ret = bedGraphTrackI(baseline_value=baseline_value) # bedGraphTrackI object to be returned. chrs = trackI.get_chr_names() if directional: # only extend to 3' side if halfextension: five_shift = int(d*-0.25) # five shift is used to move cursor towards 5' direction to find the start of fragment three_shift = int(d*0.75) # three shift is used to move cursor towards 3' direction to find the end of fragment else: five_shift = 0 three_shift = d else: # both sides if halfextension: five_shift = int(d*0.25) three_shift = five_shift else: five_shift = int(d/2) three_shift = d - five_shift for chrom in chrs: (plus_tags,minus_tags) = trackI.get_locations_by_chr(chrom) l = len(plus_tags)+len(minus_tags) start_poss = array(BYTE4,[]) # store all start positions end_poss = array(BYTE4,[]) # store all end positions # for plus tags for i in xrange(len(plus_tags)): # shift to get start positions. To 5' side. 
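# Worked example of the shift arithmetic above (values illustrative):
# for a plus tag at 1000 with d=200, directional=True and
# halfextension=False give five_shift=0 and three_shift=200, so the
# tag is piled up over [1000,1200); with halfextension=True they
# become -50 and 150, giving [1050,1150), i.e. a d/2 window centered
# on the fragment midpoint at 1100.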
#start_poss.append(max(0,plus_tags[i]-five_shift)) # prevent coordinates < 0 start_poss.append(plus_tags[i]-five_shift) # shift to get end positions by extending to d. To 3' side. #end_poss.append(max(0,plus_tags[i]+three_shift)) end_poss.append(plus_tags[i]+three_shift) # for minus tags for i in xrange(len(minus_tags)): # shift to get start positions by extending to d. To 3' side. #start_poss.append(max(0,minus_tags[i]-three_shift)) # prevent coordinates < 0 start_poss.append(minus_tags[i]-three_shift) # shift to get end positions. To 5' side. #end_poss.append(max(0,minus_tags[i]+five_shift)) end_poss.append(minus_tags[i]+five_shift) # sort start_poss = sorted(start_poss) end_poss = sorted(end_poss) # Pileup by go through start positions and end positions, # while seeing start position, pileup ++ # while seeing end position, pileup -- # i_s = 0 # index of start_poss i_e = 0 # index of end_poss pileup = 0 pre_p = min(start_poss[0],end_poss[0]) while i_s < l and i_e < l: if start_poss[i_s] < end_poss[i_e]: p = start_poss[i_s] if p != pre_p: ret.add_loc(chrom,pre_p,p,pileup) pre_p = p pileup += 1 i_s += 1 elif start_poss[i_s] > end_poss[i_e]: p = end_poss[i_e] if p != pre_p: ret.add_loc(chrom,pre_p,p,pileup) pre_p = p pileup -= 1 i_e += 1 else: i_s += 1 i_e += 1 if i_e < l: # add rest of end positions for p in end_poss[i_e:]: if p != pre_p: ret.add_loc(chrom,pre_p,p,pileup) pre_p = p pileup -= 1 if i_s < l: # add rest of start positions ( I don't think this will happen ) raise Exception("start positions can't be the only things left!") for p in start_poss[i_s:]: if p != pre_p: ret.add_loc(chrom,pre_p,p,pileup) pre_p = p pileup += 1 return ret MACS-2.0.9/MACS2/PeakModel.py0000644000175000017500000003156511630217211016105 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-08-17 17:27:18 Tao Liu> """Module Description Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ import sys, time, random def median (nums): """Calculate Median. Parameters: nums: list of numbers Return Value: median value """ p = sorted(nums) l = len(p) if l%2 == 0: return (p[l/2]+p[l/2-1])/2 else: return p[l/2] class NotEnoughPairsException(Exception): def __init__ (self,value): self.value = value def __str__ (self): return repr(self.value) class PeakModel: """Peak Model class. 
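
    Given a treatment FWTrackII object, this class scans each strand for
    candidate peaks, pairs nearby plus/minus peaks, and estimates the
    fragment size d from the offset between the paired strand profiles.
    A minimal usage sketch (assuming `treatment` is an already-built
    FWTrackII; the numeric settings below are illustrative only):

        model = PeakModel(treatment=treatment, gz=2700000000,
                          umfold=30, lmfold=10, bw=200, ts=25, quiet=True)
        print model.d            # estimated fragment size
        print model.scan_window  # scan window derived from d

    NotEnoughPairsException is raised when too few paired peaks are found.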
""" def __init__ (self, opt=None, treatment=None, max_pairnum=500, gz = 0, umfold=30, lmfold=10, bw=200, ts = 25, bg=0, quiet=False): self.treatment = treatment if opt: self.gz = opt.gsize self.umfold = opt.umfold self.lmfold = opt.lmfold self.tsize = opt.tsize self.bw = opt.bw self.info = opt.info self.debug = opt.debug self.warn = opt.warn self.error = opt.warn else: self.gz = gz self.umfold = umfold self.lmfold = lmfold self.tsize = ts self.bg = bg self.bw = bw self.info = lambda x: sys.stderr.write(x+"\n") self.debug = lambda x: sys.stderr.write(x+"\n") self.warn = lambda x: sys.stderr.write(x+"\n") self.error = lambda x: sys.stderr.write(x+"\n") if quiet: self.info = lambda x: None self.debug = lambda x: None self.warn = lambda x: None self.error = lambda x: None self.max_pairnum = max_pairnum self.summary = "" self.plus_line = None self.minus_line = None self.shifted_line = None self.d = None self.scan_window = None self.min_tags = None self.peaksize = None self.build() def build (self): """Build the model. prepare self.d, self.scan_window, self.plus_line, self.minus_line and self.shifted_line to use. """ self.peaksize = 2*self.bw self.min_tags = float(self.treatment.total) * self.lmfold * self.peaksize / self.gz /2 # mininum unique hits on single strand self.max_tags = float(self.treatment.total) * self.umfold * self.peaksize / self.gz /2 # maximum unique hits on single strand #print self.min_tags #print self.max_tags # use treatment data to build model paired_peakpos = self.__paired_peaks () # select up to 1000 pairs of peaks to build model num_paired_peakpos = 0 num_paired_peakpos_remained = self.max_pairnum num_paired_peakpos_picked = 0 for c in paired_peakpos.keys(): num_paired_peakpos +=len(paired_peakpos[c]) if num_paired_peakpos_remained == 0: paired_peakpos.pop(c) else: paired_peakpos[c] = paired_peakpos[c][:num_paired_peakpos_remained] num_paired_peakpos_remained -= len(paired_peakpos[c]) num_paired_peakpos_picked += len(paired_peakpos[c]) self.info("#2 number of paired peaks: %d" % (num_paired_peakpos)) if num_paired_peakpos < 100: self.error("Too few paired peaks (%d) so I can not build the model! Broader your MFOLD range parameter may erase this error. If it still can't build the model, please use --nomodel and --shiftsize 100 instead." % (num_paired_peakpos)) self.error("Process for pairing-model is terminated!") raise NotEnoughPairsException("No enough pairs to build model") elif num_paired_peakpos < self.max_pairnum: self.warn("Fewer paired peaks (%d) than %d! Model may not be build well! Lower your MFOLD parameter may erase this warning. Now I will use %d pairs to build model!" % (num_paired_peakpos,self.max_pairnum,num_paired_peakpos_picked)) self.debug("Use %d pairs to build the model." % (num_paired_peakpos_picked)) self.__paired_peak_model(paired_peakpos) def __str__ (self): """For debug... """ return """ Summary of Peak Model: Baseline: %d Upperline: %d Fragment size: %d Scan window size: %d """ % (self.min_tags,self.max_tags,self.d,self.scan_window) def __paired_peak_model (self, paired_peakpos): """Use paired peak positions and treatment tag positions to build the model. Modify self.(d, model_shift size and scan_window size. and extra, plus_line, minus_line and shifted_line for plotting). 
""" window_size = 1+2*self.peaksize self.plus_line = [0]*window_size self.minus_line = [0]*window_size for chrom in paired_peakpos.keys(): paired_peakpos_chrom = paired_peakpos[chrom] tags = self.treatment.get_locations_by_chr(chrom) tags_plus = tags[0] tags_minus = tags[1] # every paired peak has plus line and minus line # add plus_line self.plus_line = self.__model_add_line (paired_peakpos_chrom, tags_plus,self.plus_line) # add minus_line self.minus_line = self.__model_add_line (paired_peakpos_chrom, tags_minus,self.minus_line) # find top plus_tops = [] minus_tops = [] plus_max = max(self.plus_line) minus_max = max(self.minus_line) for i in range(window_size): if self.plus_line[i] == plus_max: plus_tops.append(i) if self.minus_line[i] == minus_max: minus_tops.append(i) self.d = minus_tops[len(minus_tops)/2] - plus_tops[len(plus_tops)/2] + 1 shift_size = self.d/2 # find the median point #plus_median = median(self.plus_line) #minus_median = median(self.minus_line) self.scan_window = max(self.d,self.tsize)*2 # a shifted model self.shifted_line = [0]*window_size plus_shifted = [0]*shift_size plus_shifted.extend(self.plus_line[:-1*shift_size]) minus_shifted = self.minus_line[shift_size:] minus_shifted.extend([0]*shift_size) #print "d:",self.d,"shift_size:",shift_size #print len(self.plus_line),len(self.minus_line),len(plus_shifted),len(minus_shifted),len(self.shifted_line) for i in range(window_size): self.shifted_line[i]=minus_shifted[i]+plus_shifted[i] return True def __model_add_line (self, pos1, pos2, line): """Project each pos in pos2 which is included in [pos1-self.peaksize,pos1+self.peaksize] to the line. """ i1 = 0 # index for pos1 i2 = 0 # index for pos2 i2_prev = 0 # index for pos2 in previous pos1 # [pos1-self.peaksize,pos1+self.peaksize] # region i1_max = len(pos1) i2_max = len(pos2) last_p2 = -1 flag_find_overlap = False while i1 p2: # move pos2 i2 += 1 elif p1+self.peaksize < p2: # move pos1 i1 += 1 i2 = i2_prev # search minus peaks from previous index flag_find_overlap = False else: # overlap! if not flag_find_overlap: flag_find_overlap = True i2_prev = i2 # only the first index is recorded # project for i in range(p2-p1+self.peaksize-self.tsize/2,p2-p1+self.peaksize+self.tsize/2): if i>=0 and i mp: # move minus im += 1 elif pp+self.peaksize < mp: # move plus ip += 1 im = im_prev # search minus peaks from previous index flag_find_overlap = False else: # overlap! if not flag_find_overlap: flag_find_overlap = True im_prev = im # only the first index is recorded if float(pn)/mn < 2 and float(pn)/mn > 0.5: # number tags in plus and minus peak region are comparable... if pp < mp: pair_centers.append((pp+mp)/2) #self.debug ( "distance: %d, minus: %d, plus: %d" % (mp-pp,mp,pp)) im += 1 return pair_centers def __naive_find_peaks (self, taglist ): """Naively call peaks based on tags counting. Return peak positions and the tag number in peak region by a tuple list [(pos,num)]. """ peak_info = [] # store peak pos in every peak region and # unique tag number in every peak region if len(taglist)<2: return peak_info pos = taglist[0] current_tag_list = [pos] # list to find peak pos for i in range(1,len(taglist)): pos = taglist[i] if (pos-current_tag_list[0]+1) > self.peaksize: # call peak in current_tag_list # a peak will be called if tag number is ge min tags. 
if len(current_tag_list) >= self.min_tags and len(current_tag_list) <= self.max_tags: peak_info.append((self.__naive_peak_pos(current_tag_list),len(current_tag_list))) current_tag_list = [] # reset current_tag_list current_tag_list.append(pos) # add pos while 1. no # need to call peak; # 2. current_tag_list is [] return peak_info def __naive_peak_pos (self, pos_list ): """Naively calculate the position of peak. return the highest peak summit position. """ peak_length = pos_list[-1]+1-pos_list[0]+self.tsize start = pos_list[0] -self.tsize/2 horizon_line = [0]*peak_length # the line for tags to be projected for pos in pos_list: for pp in range(int(pos-start-self.tsize/2),int(pos-start+self.tsize/2)): # projected point horizon_line[pp] += 1 top_pos = [] # to record the top positions. Maybe > 1 top_p_num = 0 # the maximum number of projected points for pp in range(peak_length): # find the peak posistion as the highest point if horizon_line[pp] > top_p_num: top_p_num = horizon_line[pp] top_pos = [pp] elif horizon_line[pp] == top_p_num: top_pos.append(pp) return (top_pos[int(len(top_pos)/2)]+start) MACS-2.0.9/MACS2/__init__.py0000644000175000017500000000002111630217211015762 0ustar taoliutaoliu00000000000000__all__ = ["IO"] MACS-2.0.9/MACS2/OutputWriter.py0000644000175000017500000002475711630217211016746 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-05-17 16:06:34 Tao Liu> """Module Description Copyright (c) 2008,2009,2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys from array import array from MACS2.Constants import * # ------------------------------------ # constants # ------------------------------------ # to determine the byte size if array('h',[1]).itemsize == 2: BYTE2 = 'h' else: raise Exception("BYTE2 type cannot be determined!") if array('i',[1]).itemsize == 4: BYTE4 = 'i' elif array('l',[1]).itemsize == 4: BYTE4 = 'l' else: raise Exception("BYTE4 type cannot be determined!") if array('f',[1]).itemsize == 4: FBYTE4 = 'f' elif array('d',[1]).itemsize == 4: FBYTE4 = 'd' else: raise Exception("FBYTE4 type cannot be determined!") # ------------------------------------ # Misc functions # ------------------------------------ def zwig_write (trackI, subdir, fileprefix, d, log=None,space=10, single=False): """Write shifted tags information in wiggle file in a given step. Then compress it using 'gzip' program. 
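
    One variableStep wiggle file is written per chromosome (or a single
    "<fileprefix>_all.wig" when single is True), each starting with a
    UCSC track line.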
trackI: shifted tags from PeakDetect object subdir: directory where to put the wiggle file fileprefix: wiggle file prefix d : d length log : logging function, default is sys.stderr.write space : space to write tag number on spots, default 10 """ if not log: log = lambda x: sys.stderr.write(x+"\n") chrs = trackI.get_chr_names() os.makedirs (subdir) step = 10000000 + 2*d if single: log("write to a wiggle file") f = os.path.join(subdir,fileprefix+"_all"+".wig") wigfhd = open(f,"w") wigfhd.write("track type=wiggle_0 name=\"%s_all\" description=\"Extended tag pileup from MACS version %s for every %d bp\"\n" % (fileprefix.replace('_afterfiting',''), MACS_VERSION, space)) # data type line for chrom in chrs: if not single: f = os.path.join(subdir,fileprefix+"_"+chrom+".wig") log("write to "+f+" for chromosome "+chrom) wigfhd = open(f,"w") # suggested by dawe wigfhd.write("track type=wiggle_0 name=\"%s_%s\" description=\"Extended tag pileup from MACS version %s for every %d bp\"\n" % ( fileprefix.replace('_afterfiting',''), chrom, MACS_VERSION, space)) # data type line else: log("write data for chromosome "+chrom) wigfhd.write("variableStep chrom=%s span=%d\n" % (chrom,space)) tags = trackI.get_locations_by_chr(chrom)[0] l = len(tags) window_counts = array(BYTE4,[0]*step) startp = -1*d endp = startp+step index_tag = 0 while index_tag """Module Description Copyright (c) 2008,2009,2010,2011 Hyunjin Shin, Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Hyunjin Gene Shin, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ from libc.math cimport exp,log,log10 #,fabs,log1p from math import fabs from math import log1p #as py_log1p from cpython cimport bool # ------------------------------------ # constants # ------------------------------------ cdef int LSTEP = 200 cdef double EXPTHRES = exp(LSTEP) cdef double EXPSTEP = exp(-1*LSTEP) # ------------------------------------ # Misc functions # ------------------------------------ cpdef factorial ( unsigned int n ): """Calculate N!. """ cdef double fact = 1 cdef unsigned long i if n < 0: return 0 for i in xrange( 2,n+1 ): fact = fact * i return fact cpdef poisson_cdf ( unsigned int n, double lam, bool lower=False, bool log10=False ): """Poisson CDF evaluater. This is a more stable CDF function. It can tolerate large lambda value. While the lambda is larger than 700, the function will be a little slower. Parameters: n : your observation lam : lambda of poisson distribution lower : if lower is False, calculate the upper tail CDF, otherwise, to calculate lower tail; Default is False. log10 : if log10 is True, calculation will be in log space. Default is False. """ assert lam > 0.0, "Lambda must > 0, however we got %d" % lam if log10: if lower: # lower tail return log10_poisson_cdf_P_large_lambda(n, lam) else: # upper tail return log10_poisson_cdf_Q_large_lambda(n, lam) if lower: if lam > 700: return __poisson_cdf_large_lambda (n, lam) else: return __poisson_cdf(n,lam) else: # upper tail if lam > 700: return __poisson_cdf_Q_large_lambda (n, lam) else: return __poisson_cdf_Q(n,lam) cdef __poisson_cdf ( unsigned int k, double a ): """Poisson CDF For small lambda. If a > 745, this will return incorrect result. 
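
    (The first series term, exp(-a), underflows IEEE double precision
    once a exceeds roughly 745, making every term zero; larger lambdas
    are handled by the rescaled __poisson_cdf_large_lambda.)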
Parameters: k : observation a : lambda """ if k < 0: return 0 # special cases cdef double nextcdf = exp( -1 * a ) cdef double cdf = nextcdf cdef unsigned int i cdef double lastcdf for i in xrange(1,k+1): lastcdf = nextcdf nextcdf = lastcdf * a / i cdf = cdf + nextcdf if cdf > 1: return 1 else: return cdf cdef __poisson_cdf_large_lambda ( unsigned int k, double a ): """Slower poisson cdf for large lambda ( > 700 ) Parameters: k : observation a : lambda """ assert a > 700 if k < 0: return 0 # special cases cdef int num_parts = int(a/LSTEP) cdef double lastexp = exp(-1 * (a % LSTEP) ) cdef double nextcdf = EXPSTEP num_parts -= 1 cdef double cdf = nextcdf cdef unsigned int i cdef double lastcdf for i in xrange(1,k+1): lastcdf = nextcdf nextcdf = lastcdf * a / i cdf = cdf + nextcdf if nextcdf > EXPTHRES or cdf > EXPTHRES: if num_parts>=1: cdf *= EXPSTEP nextcdf *= EXPSTEP num_parts -= 1 else: cdf *= lastexp lastexp = 1 for i in xrange(num_parts): cdf *= EXPSTEP cdf *= lastexp return cdf cdef __poisson_cdf_Q ( unsigned int k, double a ): """internal Poisson CDF evaluater for upper tail with small lambda. Parameters: k : observation a : lambda """ cdef unsigned int i if k < 0: return 1 # special cases cdef double nextcdf nextcdf = exp( -1 * a ) cdef double lastcdf for i in xrange(1,k+1): lastcdf = nextcdf nextcdf = lastcdf * a / i cdef double cdf = 0 i = k+1 while nextcdf >0: lastcdf = nextcdf nextcdf = lastcdf * a / i cdf += nextcdf i+=1 return cdf cdef __poisson_cdf_Q_large_lambda ( unsigned int k, double a ): """Slower internal Poisson CDF evaluater for upper tail with large lambda. Parameters: k : observation a : lambda """ assert a > 700 if k < 0: return 1 # special cases cdef unsigned int num_parts = int(a/LSTEP) cdef double lastexp = exp(-1 * (a % LSTEP) ) cdef double nextcdf = EXPSTEP cdef unsigned int i cdef double lastcdf num_parts -= 1 for i in xrange(1,k+1): lastcdf = nextcdf nextcdf = lastcdf * a / i if nextcdf > EXPTHRES: if num_parts>=1: nextcdf *= EXPSTEP num_parts -= 1 else: # simply raise an error raise Exception("Unexpected error") #cdf *= lastexp #lastexp = 1 cdef double cdf = 0 i = k+1 while nextcdf >0: lastcdf = nextcdf nextcdf = lastcdf * a / i cdf += nextcdf i+=1 if nextcdf > EXPTHRES or cdf > EXPTHRES: if num_parts>=1: cdf *= EXPSTEP nextcdf *= EXPSTEP num_parts -= 1 else: cdf *= lastexp lastexp = 1 for i in xrange(num_parts): cdf *= EXPSTEP cdf *= lastexp return cdf cdef double log10_poisson_cdf_P_large_lambda ( unsigned int k, double lbd ): """Slower Poisson CDF evaluater for lower tail which allow calculation in log space. Better for the pvalue < 10^-310. Parameters: k : observation lbd : lambda ret = -lambda + \ln( \sum_{i=k+1}^{\inf} {lambda^i/i!} = -lambda + \ln( sum{ exp{ln(F)} } ), where F=lambda^m/m! \ln{F(m)} = m*ln{lambda} - \sum_{x=1}^{m}\ln(x) Calculate \ln( sum{exp{N} ) by logspace_add function Return the log10(pvalue) """ cdef double residue = 0 cdef double logx = 0 cdef double ln_lbd = log(lbd) # first residue cdef int m = k cdef double sum_ln_m = 0 cdef int i = 0 for i in range(1,m+1): sum_ln_m += log(i) logx = m*ln_lbd - sum_ln_m residue = logx while m > 1: m -= 1 logy = logx-ln_lbd+log(m) pre_residue = residue residue = logspace_add(pre_residue,logy) if fabs(pre_residue-residue) < 1e-10: break logx = logy return round((residue-lbd)/log(10),2) cdef double log10_poisson_cdf_Q_large_lambda ( unsigned int k, double lbd ): """Slower Poisson CDF evaluater for upper tail which allow calculation in log space. Better for the pvalue < 10^-310. 
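
    (Doubles underflow near 1e-308, so p-values smaller than that can
    only be carried around as the log10 value returned here.)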
Parameters: k : observation lbd : lambda ret = -lambda + \ln( \sum_{i=k+1}^{\inf} {lambda^i/i!} = -lambda + \ln( sum{ exp{ln(F)} } ), where F=lambda^m/m! \ln{F(m)} = m*ln{lambda} - \sum_{x=1}^{m}\ln(x) Calculate \ln( sum{exp{N} ) by logspace_add function Return the log10(pvalue) """ cdef double residue = 0 cdef double logx = 0 cdef double ln_lbd = log(lbd) # first residue cdef int m = k+1 cdef double sum_ln_m = 0 cdef int i = 0 for i in range(1,m+1): sum_ln_m += log(i) logx = m*ln_lbd - sum_ln_m residue = logx while True: m += 1 logy = logx+ln_lbd-log(m) pre_residue = residue residue = logspace_add(pre_residue,logy) if fabs(pre_residue-residue) < 1e-5: break logx = logy return round((residue-lbd)/log(10),2) cdef double logspace_add ( double logx, double logy ): return max (logx, logy) + log1p (exp (-fabs (logx - logy))); cpdef poisson_cdf_inv ( double cdf, double lam, int maximum=1000 ): """inverse poisson distribution. cdf : the CDF lam : the lambda of poisson distribution note: maxmimum return value is 1000 and lambda must be smaller than 740. """ assert lam < 740 if cdf < 0 or cdf > 1: raise Exception ("CDF must >= 0 and <= 1") elif cdf == 0: return 0 cdef double sum2 = 0 cdef double newval = exp( -1*lam ) sum2 = newval cdef int i cdef double sumold cdef double lastval for i in xrange(1,maximum+1): sumold = sum2 lastval = newval newval = lastval * lam / i sum2 = sum2 + newval if sumold <= cdf and cdf <= sum2: return i return maximum cpdef poisson_cdf_Q_inv ( double cdf, double lam, int maximum=1000 ): """inverse poisson distribution. cdf : the CDF lam : the lambda of poisson distribution note: maxmimum return value is 1000 and lambda must be smaller than 740. """ assert lam < 740 if cdf < 0 or cdf > 1: raise Exception ("CDF must >= 0 and <= 1") elif cdf == 0: return 0 cdef double sum2 = 0 cdef double newval = exp( -1 * lam ) sum2 = newval cdef int i cdef double lastval cdef double sumold for i in xrange(1,maximum+1): sumold = sum2 lastval = newval newval = lastval * lam / i sum2 = sum2 + newval if sumold <= cdf and cdf <= sum2: return i return maximum cpdef poisson_pdf ( unsigned int k, double a ): """Poisson PDF. PDF(K,A) is the probability that the number of events observed in a unit time period will be K, given the expected number of events in a unit time as A. """ if a <= 0: return 0 return exp(-a) * pow (a, k, None) / factorial (k) cdef binomial_coef ( long n, long k ): """BINOMIAL_COEF computes the Binomial coefficient C(N,K) n,k are integers. """ cdef long mn = min (k, n-k) cdef long mx cdef double cnk cdef long i if mn < 0: return 0 elif mn == 0: return 1 else: mx = max(k,n-k) cnk = float(mx+1) for i in xrange(2,mn+1): cnk = cnk * (mx+i) / i return cnk cpdef binomial_cdf ( long x, long a, double b, bool lower=True ): """ BINOMIAL_CDF compute the binomial CDF. CDF(x)(A,B) is the probability of at most X successes in A trials, given that the probability of success on a single trial is B. """ if lower: return _binomial_cdf_f (x,a,b) else: return _binomial_cdf_r (x,a,b) cdef _binomial_cdf_r ( long x, long a, double b ): """ Binomial CDF for upper tail. 
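
    The sum is seeded at the distribution mode, argmax = int(a*b), and
    the pdf is updated term-to-term with the ratio (a-i)*b/((1-b)*(i+1))
    while walking away from the mode in both directions; this keeps
    intermediate values in floating-point range and lets each loop stop
    as soon as the terms underflow to zero.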
""" if x < 0: return 1 elif a < x: return 0 elif b == 0: return 0 elif b == 1: return 1 cdef long argmax=int(a*b) cdef double seedpdf cdef double cdf cdef double pdf cdef long i if xargmax: seedpdf=binomial_pdf(argmax,a,b) pdf=seedpdf cdf = pdf for i in xrange(argmax-1,-1,-1): pdf/=(a-i)*b/(1-b)/(i+1) if pdf==0.0: break cdf += pdf pdf = seedpdf for i in xrange(argmax,x): pdf*=(a-i)*b/(1-b)/(i+1) if pdf==0.0: break cdf+=pdf cdf=min(1,cdf) cdf = float("%.10e" %cdf) return cdf else: pdf=binomial_pdf(x,a,b) cdf = pdf for i in xrange(x-1,-1,-1): pdf/=(a-i)*b/(1-b)/(i+1) if pdf==0.0: break cdf += pdf cdf=min(1,cdf) cdf = float("%.10e" %cdf) return cdf cpdef binomial_cdf_inv ( double cdf, long a, double b ): """BINOMIAL_CDF_INV inverts the binomial CDF. For lower tail only! """ if cdf < 0 or cdf >1: raise Exception("CDF must >= 0 or <= 1") cdef double cdf2 = 0 cdef long x for x in xrange(0,a+1): pdf = binomial_pdf (x,a,b) cdf2 = cdf2 + pdf if cdf < cdf2: return x return a cpdef binomial_pdf( long x, long a, double b ): """binomial PDF by H. Gene Shin """ if a<1: return 0 elif x<0 or aa-x: p=1-b mn=a-x mx=x else: p=b mn=x mx=a-x pdf=1 t = 0 for q in xrange(1,mn+1): pdf*=(a-q+1)*p/(mn-q+1) if pdf < 1e-100: while pdf < 1e-3: pdf /= 1-p t-=1 if pdf > 1e+100: while pdf > 1e+3 and t """Module for FWTrack classes. Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import logging from array import array from random import sample as random_sample from sys import stdout as sysstdout from MACS2.Constants import * # ------------------------------------ # constants # ------------------------------------ __version__ = "FixWidthTrack $Revision$" __author__ = "Tao Liu " __doc__ = "FWTrackII class" # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class FWTrackII: """Fixed Width Locations Track class II along the whole genome (commonly with the same annotation type), which are stored in a dict. Locations are stored and organized by sequence names (chr names) in a dict. They can be sorted by calling self.sort() function. """ def __init__ (self,fw=0,anno=""): """fw is the fixed-width for all locations. """ self.fw = fw self.__locations = {} # locations self.__sorted = False self.total = 0 # total tags self.annotation = anno # need to be figured out def add_loc (self, char * chromosome, long fiveendpos, int strand): """Add a location to the list according to the sequence name. chromosome -- mostly the chromosome name fiveendpos -- 5' end pos, left for plus strand, right for neg strand strand -- 0: plus, 1: minus """ if not self.__locations.has_key(chromosome): self.__locations[chromosome] = [array(BYTE4,[]),array(BYTE4,[])] # for (+strand, -strand) #self.__locations[chromosome] = [ plus , minus] # for (+strand, -strand) self.__locations[chromosome][strand].append(fiveendpos) self.total+=1 def get_locations_by_chr (self, chromosome): """Return a tuple of two lists of locations for certain chromosome. 
""" if self.__locations.has_key(chromosome): return self.__locations[chromosome] else: raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) def get_chr_names (self): """Return all the chromosome names stored in this track object. """ l = self.__locations.keys() l.sort() return l def length (self): """Total sequenced length = total number of tags * width of tag """ return self.total*self.fw def sort (self): """Naive sorting for locations. """ for k in self.__locations.keys(): (tmparrayplus,tmparrayminus) = self.get_locations_by_chr(k) self.__locations[k][0] = sorted(tmparrayplus) if len(tmparrayplus) < 1: logging.warning("NO records for chromosome %s, plus strand!" % (k)) self.__locations[k][1] = sorted(tmparrayminus) if len(tmparrayminus) < 1: logging.warning("NO records for chromosome %s, minus strand!" % (k)) self.__sorted = True def filter_dup (self,maxnum): """Filter the duplicated reads. Run it right after you add all data into this object. """ if not self.__sorted: self.sort() self.total = 0 for k in self.__locations.keys(): # for each chromosome # + strand plus = self.__locations[k][0] if len(plus) <1: new_plus = [] else: new_plus = array(BYTE4,[plus[0]]) pappend = new_plus.append n = 1 # the number of tags in the current location current_loc = plus[0] for p in plus[1:]: if p == current_loc: n += 1 if n <= maxnum: pappend(p) else: logging.debug("Duplicate reads found at %s:%d at + strand" % (k,p) ) else: current_loc = p pappend(p) n = 1 self.total += len(new_plus) # - strand minus = self.__locations[k][1] if len(minus) <1: new_minus = [] else: new_minus = array(BYTE4,[minus[0]]) mappend = new_minus.append n = 1 # the number of tags in the current location current_loc = minus[0] for p in minus[1:]: if p == current_loc: n += 1 if n <= maxnum: mappend(p) else: logging.debug("Duplicate reads found at %s:%d at - strand" % (k,p) ) else: current_loc = p mappend(p) n = 1 self.total += len(new_minus) self.__locations[k]=[new_plus,new_minus] def merge_plus_minus_locations_naive (self): """Merge plus and minus strand locations """ for chrom in self.__locations.keys(): #(plus_tags,minus_tags) = self.__locations[chrom] self.__locations[chrom][0].extend(self.__locations[chrom][1]) self.__locations[chrom][0] = sorted(self.__locations[chrom][0]) self.__locations[chrom][1] = [] def merge_plus_minus_locations (self): """Merge plus and minus strand locations. Tao: Amazingly, this function for merging two sorted lists is slower than merge_plus_minus_locations_naive which only concatenate the two lists then sort it again! I am so discouraged! """ if not self.__sorted: self.sort() for chrom in self.__locations.keys(): (plus_tags,minus_tags) = self.__locations[chrom] new_plus_tags = array(BYTE4,[]) ip = 0 im = 0 lenp = len(plus_tags) lenm = len(minus_tags) while ip < lenp and im < lenm: if plus_tags[ip] < minus_tags[im]: new_plus_tags.append(plus_tags[ip]) ip += 1 else: new_plus_tags.append(minus_tags[im]) im += 1 if im < lenm: # add rest of minus tags new_plus_tags.extend(minus_tags[im:]) if ip < lenp: # add rest of plus tags new_plus_tags.extend(plus_tags[ip:]) self.__locations[chrom] = [new_plus_tags,[]] self.total += len(new_plus_tags) def sample_percent (self, percent): """Sample the tags for a given percentage. Warning: the current object is changed! 
""" self.total = 0 for key in self.__locations.keys(): num = int(len(self.__locations[key][0])*percent) self.__locations[key][0]=array(BYTE4,sorted(random_sample(self.__locations[key][0],num))) num = int(len(self.__locations[key][1])*percent) self.__locations[key][1]=array(BYTE4,sorted(random_sample(self.__locations[key][1],num))) self.total += len(self.__locations[key][0]) + len(self.__locations[key][1]) def sample_num (self, num): """Sample the tags for a given percentage. Warning: the current object is changed! """ percent = float(num)/self.total self.total = 0 for key in self.__locations.keys(): num = int(len(self.__locations[key][0])*percent) self.__locations[key][0]=array(BYTE4,sorted(random_sample(self.__locations[key][0],num))) num = int(len(self.__locations[key][1])*percent) self.__locations[key][1]=array(BYTE4,sorted(random_sample(self.__locations[key][1],num))) self.total += len(self.__locations[key][0]) + len(self.__locations[key][1]) def __str__ (self): return self.__to_wiggle() def __to_wiggle (self): """Use a lot of memory! """ t = "track type=wiggle_0 name=\"tag list\" description=\"%s\"\n" % (self.annotation) for k in self.__locations.keys(): if self.__locations[k][0]: t += "variableStep chrom=%s span=%d strand=0\n" % (k,self.fw) for i in self.__locations[k][0]: t += "%d\t1\n" % i if self.__locations[k][1]: t += "variableStep chrom=%s span=%d strand=1\n" % (k,self.fw) for i in self.__locations[k][1]: t += "%d\t1\n" % i return t def print_to_bed (self, fhd=None): """Output FWTrackII to BED format files. If fhd is given, write to a file, otherwise, output to standard output. """ if not fhd: fhd = sysstdout assert isinstance(fhd, file) assert self.fw > 0, "FWTrackII object .fw should be set larger than 0!" for k in self.__locations.keys(): if self.__locations[k][0]: for i in self.__locations[k][0]: fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k,i,int(i+self.fw),"+") ) if self.__locations[k][1]: for i in self.__locations[k][1]: fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k,int(i-self.fw),i,"-") ) return MACS-2.0.9/MACS2/IO/cScoreTrack.pyx0000644000175000017500000004560511654316302017155 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-10-21 01:58:53 Tao Liu> """Module for Feature IO classes. Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import numpy as np from numpy import int64,int32,float32 from libc.math cimport sqrt,log10 from MACS2.Constants import * from MACS2.cProb cimport poisson_cdf from MACS2.IO.cPeakIO import PeakIO, BroadPeakIO #from MACS2.IO.cBedGraph import bedGraphTrackI # ------------------------------------ # constants # ------------------------------------ __version__ = "scoreTrackI $Revision$" __author__ = "Tao Liu " __doc__ = "scoreTrackI classes" # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class scoreTrackI: """Class for scoreGraph type data. Modified from bedGraphTrackI. The only difference is that we store pvalue score, qvalue score and foldchange together. In bedGraph, data are represented as continuous non-overlapping regions in the whole genome. 
I keep this assumption in all the functions. If data has overlaps, some functions will definitely give incorrect results. 1. Continuous: the next region should be after the previous one unless they are on different chromosomes; 2. Non-overlapping: the next region should never have overlaps with preceding region. The way to memorize bedGraph data is to remember the transition points together with values of their preceding regions. The last data point may exceed chromosome end, unless a chromosome dictionary is given. Remember the coordinations in bedGraph and this class is 0-indexed and right-open. """ def __init__ (self): """Different with bedGraphTrackI, missing values are simply replaced with 0. """ self.data = {} self.pointer = {} def add_chromosome ( self, chrom, chrom_max_len ): if not self.data.has_key(chrom): self.data[chrom] = np.zeros(chrom_max_len,dtype=[('pos','int64'), ('sample','float32'), ('control','float32'), ('-100logp','int32'), ('-100logq','int32')]) self.pointer[chrom] = 0 def add (self,chromosome,endpos,sample,control): """Add a chr-endpos-sample-control block into data dictionary. At the mean time, calculate pvalues. """ c = self.data[chromosome] i = self.pointer[chromosome] # get the preceding region c[i] = (endpos,sample,control,int(-100*poisson_cdf(sample,control,False,True)),0) self.pointer[chromosome] += 1 def get_data_by_chr (self, chromosome): """Return array of counts by chromosome. The return value is a tuple: ([end pos],[value]) """ if self.data.has_key(chromosome): return self.data[chromosome] else: return None def get_chr_names (self): """Return all the chromosome names stored. """ l = set(self.data.keys()) return l def write_bedGraph (self, fhd, name, description, colname): """Write all data to fhd in Wiggle Format. fhd: a filehandler to save bedGraph. name/description: the name and description in track line. colname: can be 'sample','control','-100logp','-100logq' """ if colname not in ['sample','control','-100logp','-100logq']: raise Exception("%s not supported!" % colname) if colname in ['-100logp', '-100logq']: flag100 = True # for pvalue or qvalue, divide them by 100 while writing to bedGraph file else: flag100 = False chrs = self.get_chr_names() for chrom in chrs: d = self.data[chrom] l = self.pointer[chrom] pre = 0 pos = d['pos'] if flag100: value = d[colname]/100.0 else: value = d[colname] for i in xrange( l ): fhd.write("%s\t%d\t%d\t%.2f\n" % (chrom,pre,pos[i],value[i])) pre = pos[i] return True def __calculate_fold_change ( self, chrom, index ): """From 'sample' and 'control' columns, calculate foldchanges. chrom: chromosome name index: index in data[chrom] """ return self.data[chrom]['sample'][index]/self.data[chrom]['control'][index] def make_pq_table ( self ): """Make pvalue-qvalue table. Step1: get all pvalue and length of block with this pvalue Step2: Sort them Step3: Apply AFDR method to adjust pvalue and get qvalue for each pvalue Return a dictionary of {-100log10pvalue:(-100log10qvalue,rank)} relationships. """ n = self.total() value_list = np.empty( n, dtype = [('v', '= cutoff: peak_content = [ ( pre_p, p, v, summit_v, x ), ] # remember the index too... 
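# each peak_content element is (start, end, score, summit_value, index):
# the region boundaries, the score column tested against the cutoff,
# the sample pileup later used to pick the summit, and the index back
# into the per-chromosome arrays.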
pre_p = p break # found the first range above cutoff else: pre_p = p for i in xrange( x, chrom_pointer ): # continue scan the rest regions p = chrom_pos[ i ] v = chrom_score[ i ] summit_v = chrom_sample[ i ] # I will use pileup height to find summit instead of other kinds of scores if v < cutoff: # But score is still used to find boundaries pre_p = p continue # for points above cutoff # if the gap is allowed if pre_p - peak_content[ -1 ][ 1 ] <= max_gap: peak_content.append( ( pre_p, p, v, summit_v, i ) ) # put chunks above cutoff in a temporary list else: # when the gap is not allowed, close this peak peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it tmpsummit = [] summit_pos = None summit_value = None for (tmpstart,tmpend,tmpvalue,tmpsummitvalue, tmpindex) in peak_content: if not summit_value or summit_value < tmpsummitvalue: tmpsummit = [ int(( tmpend+tmpstart )/2), ] tmpsummit_index = [ tmpindex, ] summit_value = tmpsummitvalue elif summit_value == tmpsummitvalue: # remember continuous summit values tmpsummit.append( int( (tmpend+tmpstart)/2 ) ) tmpsummit_index.append( tmpindex ) middle_summit = int( ( len(tmpsummit)+1 )/2 )-1 # the middle of all highest points in peak region is defined as summit summit_pos = tmpsummit[ middle_summit ] summit_index = tmpsummit_index[ middle_summit ] # char * chromosome, long start, long end, long summit = 0, # double peak_height=0, int pileup=0, # double pvalue=0, double fold_change=0, double qvalue=0 peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = summit_pos, peak_score = chrom_score [ summit_index ], pileup = chrom_sample[ summit_index ], # should be the same as summit_value pscore = chrom_pvalue[ summit_index ]/100.0, fold_change = chrom_sample[ summit_index ]/chrom_control[ summit_index ], qscore = chrom_qvalue[ summit_index ]/100.0, ) # start a new peak peak_content = [ ( pre_p, p, v, summit_v, i ), ] pre_p = p # save the last peak if not peak_content: continue peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it tmpsummit = [] summit_pos = None summit_value = None for (tmpstart,tmpend,tmpvalue,tmpsummitvalue, tmpindex) in peak_content: if not summit_value or summit_value < tmpsummitvalue: tmpsummit = [ int(( tmpend+tmpstart )/2), ] tmpsummit_index = [ tmpindex, ] summit_value = tmpsummitvalue elif summit_value == tmpsummitvalue: # remember continuous summit values tmpsummit.append( int( (tmpend+tmpstart)/2 ) ) tmpsummit_index.append( tmpindex ) middle_summit = int( ( len(tmpsummit)+1 )/2 )-1 summit_pos = tmpsummit[ middle_summit ] summit_index = tmpsummit_index[ middle_summit ] peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = summit_pos, peak_score = chrom_score [ summit_index ], pileup = chrom_sample[ summit_index ], # should be the same as summit_value pscore = chrom_pvalue[ summit_index ]/100.0, fold_change = chrom_sample[ summit_index ]/chrom_control[ summit_index ], qscore = chrom_qvalue[ summit_index ]/100.0, ) return peaks def call_broadpeaks (self, lvl1_cutoff=500, lvl2_cutoff=100, min_length=200, lvl1_max_gap=50, lvl2_max_gap=400, colname='-100logq'): """This function try to find enriched regions within which, scores are continuously higher than a given cutoff for level 1, and link them using the gap above level 2 cutoff with a maximum length of lvl2_max_gap. lvl1_cutoff: cutoff of value at enriched regions, default 500. 
lvl2_cutoff: cutoff of value at linkage regions, default 100. min_length : minimum peak length, default 200. lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. lvl2_max_gap : maximum length of linkage regions, default 400. colname: can be 'sample','control','-100logp','-100logq'. Cutoff will be applied to the specified column. Return both general PeakIO object for highly enriched regions and gapped broad regions in BroadPeakIO. """ assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2." assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1." lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap, colname=colname) lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap, colname=colname) chrs = lvl1_peaks.peaks.keys() broadpeaks = BroadPeakIO() # use lvl2_peaks as linking regions between lvl1_peaks for chrom in chrs: lvl1peakschrom = lvl1_peaks.peaks[chrom] lvl2peakschrom = lvl2_peaks.peaks[chrom] lvl1peakschrom_next = iter(lvl1peakschrom).next tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region # our assumption is lvl1 regions should be included in lvl2 regions try: lvl1 = lvl1peakschrom_next() except StopIteration: break for lvl2 in lvl2peakschrom: # for each lvl2 peak, find all lvl1 peaks inside try: while True: if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: tmppeakset.append(lvl1) else: if tmppeakset: self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) tmppeakset = [] break lvl1 = lvl1peakschrom_next() except StopIteration: if tmppeakset: self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) break return lvl1_peaks, broadpeaks def __add_broadpeak (self, bpeaks, chrom, lvl2peak, lvl1peakset): """Internal function to create broad peak. """ start = lvl2peak["start"] end = lvl2peak["end"] thickStart = lvl1peakset[0]["start"] thickEnd = lvl1peakset[-1]["end"] blockNum = len(lvl1peakset) blockSizes = ",".join( map(lambda x:str(x["length"]),lvl1peakset) ) blockStarts = ",".join( map(lambda x:str(x["start"]-start),lvl1peakset) ) if lvl2peak["start"] != thickStart: # add 1bp mark for the start of lvl2 peak blockNum += 1 blockSizes = "1,"+blockSizes blockStarts = "0,"+blockStarts if lvl2peak["end"] != thickEnd: # add 1bp mark for the end of lvl2 peak blockNum += 1 blockSizes = blockSizes+",1" blockStarts = blockStarts+","+str(end-start-1) bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts) return bpeaks def total ( self ): """Return the number of regions in this object. """ t = 0 for chrom in self.data.keys(): t += self.pointer[chrom] return t MACS-2.0.9/MACS2/IO/cParser.pyx0000644000175000017500000006310511630217211016335 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-06-19 16:12:55 Tao Liu> """Module for all MACS Parser classes for input. Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). 
@status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import logging import struct import gzip from MACS2.Constants import * from MACS2.IO.cFixWidthTrack import FWTrackII # ------------------------------------ # constants # ------------------------------------ __version__ = "Parser $Revision$" __author__ = "Tao Liu " __doc__ = "All Parser classes" # ------------------------------------ # Misc functions # ------------------------------------ def guess_parser ( fhd ): parser_dict = {"BED":BEDParser, "ELAND":ELANDResultParser, "ELANDMULTI":ELANDMultiParser, "ELANDEXPORT":ELANDExportParser, "SAM":SAMParser, "BAM":BAMParser, "BOWTIE":BowtieParser } order_list = ("BAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "SAM", "BOWTIE", ) for f in order_list: p = parser_dict[f](fhd) s = p.sniff() if s: logging.info("Detected format is: %s" % (f) ) return p raise Exception("Can't detect format!") # ------------------------------------ # Classes # ------------------------------------ class StrandFormatError(Exception): """Exception about strand format error. Example: raise StrandFormatError('Must be F or R','X') """ def __init__ (self, string, strand): self.strand = strand self.string = string def __str__ (self): return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % (self.string,self.strand) ) class GenericParser: """Generic Parser class. Inherit this to write your own parser. """ def __init__ (self, fhd): self.fhd = fhd return def tsize(self): return def build_fwtrack (self): return def __fw_parse_line (self, thisline ): return def sniff (self): try: t = self.tsize() except: self.fhd.seek(0) return False else: if t<=10 or t>=10000: self.fhd.seek(0) return False else: self.fhd.seek(0) return t class BEDParser(GenericParser): """File Parser Class for tabular File. """ def __init__ (self,fhd): self.fhd = fhd def tsize (self): s = 0 n = 0 m = 0 while n<10 and m<1000: m += 1 thisline = self.fhd.readline() (chromosome,fpos,strand) = self.__fw_parse_line(thisline) if not fpos or not chromosome: continue thisline = thisline.rstrip() thisfields = thisline.split('\t') s += int(thisfields[2])-int(thisfields[1]) n += 1 self.fhd.seek(0) return int(s/n) def build_fwtrack (self): """Build FWTrackII from all lines, return a FWTrackII object. Note: All locations will be merged (exclude the same location) then sorted after the track is built. If both_strand is True, it will store strand information in FWTrackII object. if do_merge is False, it will not merge the same location after the track is built. 
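
    Typical use (assuming fhd is an open BED file handle):

        parser = BEDParser(fhd)
        fwtrack = parser.build_fwtrack()

    Progress is logged every million parsed lines; comment, track and
    browser lines are skipped.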
""" fwtrack = FWTrackII() i = 0 m = 0 for thisline in self.fhd: (chromosome,fpos,strand) = self.__fw_parse_line(thisline) i+=1 if i == 1000000: m += 1 logging.info(" %d" % (m*1000000)) i=0 if not fpos or not chromosome: continue fwtrack.add_loc(chromosome,fpos,strand) return fwtrack def __fw_parse_line (self, thisline ): thisline = thisline.rstrip() if not thisline or thisline[:5]=="track" or thisline[:7]=="browser" or thisline[0]=="#": return ("comment line",None,None) thisfields = thisline.split('\t') chromname = thisfields[0] try: chromname = chromname[:chromname.rindex(".fa")] except ValueError: pass if len(thisfields) < 6 : # default pos strand if no strand # info can be found return (chromname, int(thisfields[1]), 0) else: if thisfields[5] == "+": return (chromname, int(thisfields[1]), 0) elif thisfields[5] == "-": return (chromname, int(thisfields[2]), 1) else: raise StrandFormatError(thisline,thisfields[5]) class ELANDResultParser(GenericParser): """File Parser Class for tabular File. """ def __init__ (self,fhd): """ """ self.fhd = fhd def tsize (self): s = 0 n = 0 m = 0 while n<10 and m<1000: m += 1 thisline = self.fhd.readline() (chromosome,fpos,strand) = self.__fw_parse_line(thisline) if not fpos or not chromosome: continue thisline = thisline.rstrip() thisfields = thisline.split('\t') s += len(thisfields[1]) n += 1 self.fhd.seek(0) return int(s/n) def build_fwtrack (self): """Build FWTrackII from all lines, return a FWTrackII object. """ fwtrack = FWTrackII() i = 0 m = 0 for thisline in self.fhd: (chromosome,fpos,strand) = self.__fw_parse_line(thisline) i+=1 if i == 1000000: m += 1 logging.info(" %d" % (m*1000000)) i=0 if not fpos or not chromosome: continue fwtrack.add_loc(chromosome,fpos,strand) return fwtrack def __fw_parse_line (self, thisline ): #if thisline.startswith("#") or thisline.startswith("track") or thisline.startswith("browser"): return ("comment line",None,None) # comment line is skipped thisline = thisline.rstrip() if not thisline: return ("blank",None,None) thisfields = thisline.split('\t') thistaglength = len(thisfields[1]) if len(thisfields) <= 6: return ("blank",None,None) try: chromname = thisfields[6] chromname = chromname[:chromname.rindex(".fa")] except ValueError: pass if thisfields[2] == "U0" or thisfields[2]=="U1" or thisfields[2]=="U2": strand = thisfields[8] if strand == "F": return (chromname, int(thisfields[7])-1, 0) elif strand == "R": return (chromname, int(thisfields[7])+thistaglength-1, 1) else: raise StrandFormatError(thisline,strand) else: return (None,None,None) class ELANDMultiParser(GenericParser): """File Parser Class for ELAND multi File. Note this parser can only work for s_N_eland_multi.txt format. Each line of the output file contains the following fields: 1. Sequence name 2. Sequence 3. Either NM, QC, RM (as described above) or the following: 4. x:y:z where x, y, and z are the number of exact, single-error, and 2-error matches found 5. Blank, if no matches found or if too many matches found, or the following: BAC_plus_vector.fa:163022R1,170128F2,E_coli.fa:3909847R1 This says there are two matches to BAC_plus_vector.fa: one in the reverse direction starting at position 160322 with one error, one in the forward direction starting at position 170128 with two errors. There is also a single-error match to E_coli.fa. 
""" def __init__ (self,fhd): """ """ self.fhd = fhd def tsize (self, strict = False): s = 0 n = 0 m = 0 while n<10 and m<1000: m += 1 thisline = self.fhd.readline() (chromosome,fpos,strand) = self.__fw_parse_line(thisline) if not fpos or not chromosome: continue thisline = thisline.rstrip() thisfields = thisline.split('\t') s += len(thisfields[1]) n += 1 self.fhd.seek(0) return int(s/n) def build_fwtrack (self): """Build FWTrackII from all lines, return a FWTrackII object. Note only the unique match for a tag is kept. """ fwtrack = FWTrackII() i = 0 m = 0 for thisline in self.fhd: (chromosome,fpos,strand) = self.__fw_parse_line(thisline) i+=1 if i == 1000000: m += 1 logging.info(" %d" % (m*1000000)) i=0 if not fpos or not chromosome: continue fwtrack.add_loc(chromosome,fpos,strand) return fwtrack def __fw_parse_line (self, thisline ): if not thisline: return (None,None,None) thisline = thisline.rstrip() if not thisline: return ("blank",None,None) #if thisline[0] == "#": return ("comment line",None,None) # comment line is skipped thisfields = thisline.split('\t') thistagname = thisfields[0] # name of tag thistaglength = len(thisfields[1]) # length of tag if len(thisfields) < 4: return (None,None,None) else: thistaghits = sum(map(int,thisfields[2].split(':'))) if thistaghits > 1: # multiple hits return (None,None,None) else: (chromname,pos) = thisfields[3].split(':') try: chromname = chromname[:chromname.rindex(".fa")] except ValueError: pass strand = pos[-2] if strand == "F": return (chromname, int(pos[:-2])-1, 0) elif strand == "R": return (chromname, int(pos[:-2])+thistaglength-1, 1) else: raise StrandFormatError(thisline,strand) class ELANDExportParser(GenericParser): """File Parser Class for ELAND Export File. """ def __init__ (self,fhd): self.fhd = fhd def tsize (self): s = 0 n = 0 m = 0 while n<10 and m<1000: m += 1 thisline = self.fhd.readline() (chromosome,fpos,strand) = self.__fw_parse_line(thisline) if not fpos or not chromosome: continue thisline = thisline.rstrip() thisfields = thisline.split("\t") s += len(thisfields[8]) n += 1 self.fhd.seek(0) return int(s/n) def build_fwtrack (self): """Build FWTrackII from all lines, return a FWTrackII object. Note only the unique match for a tag is kept. """ fwtrack = FWTrackII() i = 0 m = 0 for thisline in self.fhd: (chromosome,fpos,strand) = self.__fw_parse_line(thisline) i+=1 if i == 1000000: m += 1 logging.info(" %d" % (m*1000000)) i=0 if not fpos or not chromosome: continue fwtrack.add_loc(chromosome,fpos,strand) return fwtrack def __fw_parse_line (self, thisline ): #if thisline.startswith("#") : return ("comment line",None,None) # comment line is skipped thisline = thisline.rstrip() if not thisline: return ("blank",None,None) thisfields = thisline.split("\t") if len(thisfields) > 12 and thisfields[12]: thisname = ":".join(thisfields[0:6]) thistaglength = len(thisfields[8]) strand = thisfields[13] if strand == "F": return (thisfields[10],int(thisfields[12])-1,0) elif strand == "R": return (thisfields[10],int(thisfields[12])+thistaglength-1,1) else: raise StrandFormatError(thisline,strand) else: return (None,None,None) ### Contributed by Davide, modified by Tao class SAMParser(GenericParser): """File Parser Class for SAM File. Each line of the output file contains at least: 1. Sequence name 2. Bitwise flag 3. Reference name 4. 1-based leftmost position fo clipped alignment 5. Mapping quality 6. CIGAR string 7. Mate Reference Name 8. 1-based leftmost Mate Position 9. Inferred insert size 10. 
Query sequence on the same strand as the reference 11. Query quality The bitwise flag is made like this: dec meaning --- ------- 1 paired read 2 proper pair 4 query unmapped 8 mate unmapped 16 strand of the query (1 -> reverse) 32 strand of the mate 64 first read in pair 128 second read in pair 256 alignment is not primary 512 does not pass quality check 1024 PCR or optical duplicate """ def __init__ (self,fhd): """ """ self.fhd = fhd def tsize (self): s = 0 n = 0 m = 0 while n<10 and m<1000: m += 1 thisline = self.fhd.readline() (chromosome,fpos,strand) = self.__fw_parse_line(thisline) if not fpos or not chromosome: continue thisline = thisline.rstrip() thisfields = thisline.split("\t") s += len(thisfields[9]) n += 1 self.fhd.seek(0) return int(s/n) def build_fwtrack (self): """Build FWTrackII from all lines, return a FWTrackII object. Note only the unique match for a tag is kept. """ fwtrack = FWTrackII() i = 0 m = 0 for thisline in self.fhd: (chromosome,fpos,strand) = self.__fw_parse_line(thisline) i+=1 if i == 1000000: m += 1 logging.info(" %d" % (m*1000000)) i=0 if not fpos or not chromosome: continue fwtrack.add_loc(chromosome,fpos,strand) return fwtrack def __fw_parse_line (self, thisline ): thisline = thisline.rstrip() if not thisline: return ("blank",None,None) if thisline[0]=="@": return ("comment line",None,None) # header line started with '@' is skipped thisfields = thisline.split('\t') thistagname = thisfields[0] # name of tag thisref = thisfields[2] bwflag = int(thisfields[1]) if bwflag & 4 or bwflag & 512 or bwflag & 1024: return (None, None, None) #unmapped sequence or bad sequence if bwflag & 1: # paired read. We should only keep sequence if the mate is mapped # and if this is the left mate, all is within the flag! if not bwflag & 2: return (None, None, None) # not a proper pair if bwflag & 8: return (None, None, None) # the mate is unmapped p1pos = int(thisfields[3]) - 1 p2pos = int(thisfields[7]) - 1 if p1pos > p2pos: # this pair is the farthest one, skip it return (None, None, None) # In case of paired-end we have now skipped all possible "bad" pairs # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes # we can treat it as a single read, so just check the strand and calculate its # start position... hope I'm right! if bwflag & 16: thisstrand = 1 thisstart = int(thisfields[3]) - 1 + len(thisfields[9]) #reverse strand should be shifted len(query) bp else: thisstrand = 0 thisstart = int(thisfields[3]) - 1 try: thisref = thisref[:thisref.rindex(".fa")] except ValueError: pass return (thisref, thisstart, thisstrand) class BAMParser(GenericParser): """File Parser Class for BAM File. File is gzip-compatible and binary. Information available is the same that is in SAM format. 
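
    The file handle is expected to already decompress the stream (BAM is
    a BGZF container, which the gzip module can read), so sniff() only
    has to check for the leading "BAM" magic bytes.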
The bitwise flag is made like this: dec meaning --- ------- 1 paired read 2 proper pair 4 query unmapped 8 mate unmapped 16 strand of the query (1 -> reverse) 32 strand of the mate 64 first read in pair 128 second read in pair 256 alignment is not primary 512 does not pass quality check 1024 PCR or optical duplicate """ def __init__ (self,fhd): """ """ self.fhd = fhd def sniff(self): if self.fhd.read(3) == "BAM": return True else: return False def tsize(self): fseek = self.fhd.seek fread = self.fhd.read ftell = self.fhd.tell # move to pos 4, there starts something fseek(4) header_len = struct.unpack('= 0: fwtrack.add_loc(references[chrid],fpos,strand) self.fhd.close() return fwtrack def __fw_binary_parse (self, data ): # we skip lot of the available information in data (i.e. tag name, quality etc etc) if not data: return (None,-1,None) thisref = struct.unpack(' p2pos: # this pair is the farthest one, skip it return (None, -1, None) # In case of paired-end we have now skipped all possible "bad" pairs # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes # we can treat it as a single read, so just check the strand and calculate its # start position... hope I'm right! if bwflag & 16: thisstrand = 1 thisstart = thisstart + struct.unpack('read-base. The offset is expressed as a 0-based offset from the high-quality (5') end of the read. """ thisline = thisline.rstrip() if not thisline: return ("blank",None,None) if thisline[0]=="#": return ("comment line",None,None) # comment line is skipped thisfields = thisline.split('\t') # I hope it will never bring me more trouble chromname = thisfields[2] try: chromname = chromname[:chromname.rindex(".fa")] except ValueError: pass if thisfields[1] == "+": return (chromname, int(thisfields[3]), 0) elif thisfields[1] == "-": return (chromname, int(thisfields[3])+len(thisfields[4]), 1) else: raise StrandFormatError(thisline,thisfields[1]) MACS-2.0.9/MACS2/IO/__init__.py0000644000175000017500000000000011630217211016266 0ustar taoliutaoliu00000000000000MACS-2.0.9/MACS2/IO/WiggleIO.py0000644000175000017500000001676411630217211016225 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-05-17 16:11:19 Tao Liu> """Module Description Copyright (c) 2008 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import re import shutil from MACS2.IO.cFeatIO import WigTrackI from MACS2.IO.BinKeeper import BinKeeperI import time # ------------------------------------ # constants # ------------------------------------ # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class WiggleIO: """File Parser Class for Wiggle File. Note: Only can be used with the wiggle file generated by pMA2C or MACS. This module can not be univerally used. Note2: The positions in Wiggle File must be sorted for every chromosome. Example: >>> from Cistrome.CoreLib.Parser import WiggleIO >>> w = WiggleIO('sample.wig') >>> bk = w.build_binKeeper() >>> wtrack = w.build_wigtrack() """ def __init__ (self,f): """f must be a filename or a file handler. 
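Either form works; a short usage sketch (the file name and the
        chromosome length below are made up):

        >>> w = WiggleIO('sample.wig')            # pass a filename...
        >>> w = WiggleIO(open('sample.wig'))      # ...or an open handler
        >>> bkdict = w.build_binKeeper(chromLenDict={'chr1': 247249719},
        ...                            binsize=200)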
""" if type(f) == str: self.fhd = open(f,"r") elif type(f) == file: self.fhd = f else: raise Exception("f must be a filename or a file handler.") def build_wigtrack (self): """Use this function to return a WigTrackI. """ data = WigTrackI() add_func = data.add_loc chrom = "Unknown" span = 0 pos_fixed = 0 # pos for fixedStep data 0: variableStep, 1: fixedStep for i in self.fhd: if i.startswith("track"): continue elif i.startswith("#"): continue elif i.startswith("browse"): continue elif i.startswith("variableStep"): # define line pos_fixed = 0 chromi = i.rfind("chrom=") # where the 'chrom=' is spani = i.rfind("span=") # where the 'span=' is if chromi != -1: chrom = i[chromi+6:].strip().split()[0] else: chrom = "Unknown" if spani != -1: span = int(i[spani+5:].strip().split()[0]) else: span = 0 elif i.startswith("fixedStep"): chromi = i.rfind("chrom=") # where the 'chrom=' is starti = i.rfind("start=") # where the 'chrom=' is stepi = i.rfind("step=") # where the 'chrom=' is spani = i.rfind("span=") # where the 'span=' is if chromi != -1: chrom = i[chromi+6:].strip().split()[0] else: raise Exception("fixedStep line must define chrom=XX") if spani != -1: span = int(i[spani+5:].strip().split()[0]) else: span = 0 if starti != -1: pos_fixed = int(i[starti+6:].strip().split()[0]) if pos_fixed < 1: raise Exception("fixedStep start must be bigger than 0!") else: raise Exception("fixedStep line must define start=XX") if stepi != -1: step = int(i[stepi+5:].strip().split()[0]) else: raise Exception("fixedStep line must define step=XX!") else: # read data value if pos_fixed: # fixedStep value = i.strip() add_func(chrom,int(pos_fixed),float(value)) pos_fixed += step else: # variableStep try: (pos,value) = i.split() except ValueError: print i,pos_fixed add_func(chrom,int(pos),float(value)) data.span = span self.fhd.seek(0) return data def build_binKeeper (self,chromLenDict={},binsize=200): """Use this function to return a dictionary of BinKeeper objects. chromLenDict is a dictionary for chromosome length like {'chr1':100000,'chr2':200000} bin is in bps. for detail, check BinKeeper. 
""" data = {} chrom = "Unknown" pos_fixed = 0 for i in self.fhd: if i.startswith("track"): continue elif i.startswith("browse"): continue elif i.startswith("#"): continue elif i.startswith("variableStep"): # define line pos_fixed = 0 chromi = i.rfind("chrom=") # where the 'chrom=' is spani = i.rfind("span=") # where the 'span=' is if chromi != -1: chrom = i[chromi+6:].strip().split()[0] else: chrom = "Unknown" if spani != -1: span = int(i[spani+5:].strip().split()[0]) else: span = 0 chrlength = chromLenDict.setdefault(chrom,250000000) + 10000000 data.setdefault(chrom,BinKeeperI(binsize=binsize,chromosomesize=chrlength)) add = data[chrom].add elif i.startswith("fixedStep"): chromi = i.rfind("chrom=") # where the 'chrom=' is starti = i.rfind("start=") # where the 'chrom=' is stepi = i.rfind("step=") # where the 'chrom=' is spani = i.rfind("span=") # where the 'span=' is if chromi != -1: chrom = i[chromi+6:].strip().split()[0] else: raise Exception("fixedStep line must define chrom=XX") if spani != -1: span = int(i[spani+5:].strip().split()[0]) else: span = 0 if starti != -1: pos_fixed = int(i[starti+6:].strip().split()[0]) if pos_fixed < 1: raise Exception("fixedStep start must be bigger than 0!") else: raise Exception("fixedStep line must define start=XX") if stepi != -1: step = int(i[stepi+5:].strip().split()[0]) else: raise Exception("fixedStep line must define step=XX!") chrlength = chromLenDict.setdefault(chrom,250000000) + 10000000 data.setdefault(chrom,BinKeeperI(binsize=binsize,chromosomesize=chrlength)) add = data[chrom].add else: # read data value if pos_fixed: # fixedStep value = i.strip() add(int(pos_fixed),float(value)) pos_fixed += step else: # variableStep try: (pos,value) = i.split() except ValueError: print i,pos_fixed add(int(pos),float(value)) self.fhd.seek(0) return data MACS-2.0.9/MACS2/IO/cCompositeScoreTrack.pyx0000644000175000017500000007250511654316302021037 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-10-22 11:40:53 Tao Liu> """Module for Composite Score Track IO classes. Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import numpy as np from numpy import int64,int32,float32 from libc.math cimport sqrt,log10 from MACS2.Constants import * from MACS2.cProb cimport poisson_cdf from MACS2.IO.cPeakIO import PeakIO from MACS2.IO.cBedGraph import bedGraphTrackI # ------------------------------------ # constants # ------------------------------------ __version__ = "scoreTrackI $Revision$" __author__ = "Tao Liu " __doc__ = "scoreTrackI classes" # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class compositeScoreTrackI: """Class for composite scoreGraph type data for two conditions. Modified from bedGraphTrackI. The only difference is that we store pvalue score, qvalue score and foldchange together. In bedGraph, data are represented as continuous non-overlapping regions in the whole genome. I keep this assumption in all the functions. If data has overlaps, some functions will definitely give incorrect results. 1. 
Continuous: the next region should be after the previous one unless they are on different chromosomes; 2. Non-overlapping: the next region should never have overlaps with preceding region. The way to memorize bedGraph data is to remember the transition points together with values of their preceding regions. The last data point may exceed chromosome end, unless a chromosome dictionary is given. Remember the coordinations in bedGraph and this class is 0-indexed and right-open. """ def __init__ (self): """Different with bedGraphTrackI, missing values are simply replaced with 0. """ self.data = {} self.pointer = {} def add_chromosome ( self, chrom, chrom_max_len ): if not self.data.has_key(chrom): self.data[chrom] = np.zeros(chrom_max_len,dtype=[('pos','int64'), ('sc1i1','float32'), ('sc2i2','float32'), ('sc1c2','float32'), ('sc2c1','float32')]) self.pointer[chrom] = 0 def add (self,chromosome,endpos,sc1i1,sc2i2,sc1c2,sc2c1): """Add a chr-endpos-score-score-score-score block into data dictionary. """ c = self.data[chromosome] i = self.pointer[chromosome] # get the preceding region c[i] = (endpos,sc1i1,sc2i2,sc1c2,sc2c1) self.pointer[chromosome] += 1 def get_data_by_chr (self, chromosome): """Return array of counts by chromosome. The return value is a tuple: ([end pos],[value]) """ if self.data.has_key(chromosome): return self.data[chromosome] else: return None def get_chr_names (self): """Return all the chromosome names stored. """ l = set(self.data.keys()) return l def call_consistent (self, cutoff=50, min_length=200, max_gap=50): """This function try to find regions within which, scores are continuously higher than a given cutoff. Consistent peaks are those met all the following criteria 1. sc1i1 >= cutoff 2. sc2i2 >= cutoff 3. sc1c2 <= cutoff 4. sc2c1 <= cutoff """ chrs = self.get_chr_names() peaks = PeakIO() # dictionary to save peaks #condition1_unique_peaks = PeakIO() # dictionary to save peaks #condition2_unique_peaks = PeakIO() # dictionary to save peaks for chrom in chrs: chrom_pointer = self.pointer[chrom] chrom_d = self.get_data_by_chr( chrom ) # arrays for position and values chrom_pos = chrom_d[ 'pos' ] chrom_sc1i1 = chrom_d[ 'sc1i1' ] chrom_sc2i2 = chrom_d[ 'sc2i2' ] chrom_sc1c2 = chrom_d[ 'sc1c2' ] chrom_sc2c1 = chrom_d[ 'sc2c1' ] x = 0 # index in compositeScoreTrackI pre_p = 0 # remember previous position peak_content = None # to store points above cutoff while True and x < chrom_pointer: # find the first region above cutoff # try to read the first data range for this chrom p = chrom_pos[ x ] vc1i1 = chrom_sc1i1[ x ] vc2i2 = chrom_sc2i2[ x ] vc1c2 = chrom_sc1c2[ x ] vc2c1 = chrom_sc2c1[ x ] x += 1 # index for the next point if vc1i1 >= cutoff and vc2i2 >= cutoff and vc1c2 <= cutoff and vc2c1 <= cutoff: peak_content = [ ( pre_p, p, 0, x ), ] # remember the index too... 
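# ----------------------------------------------------------------
# Aside (illustrative sketch, not part of MACS itself): the scan in
# this method follows a generic pattern -- walk position-sorted
# points that passed the cutoffs, bridge gaps of at most max_gap,
# and keep merged stretches of at least min_length.  A stand-alone
# version of the same idea:
def _scan_regions(points, max_gap, min_length):
    # points: (start, end) tuples sorted by position, all of which
    # already passed the score cutoffs
    regions, cur = [], None
    for start, end in points:
        if cur and start - cur[1] <= max_gap:
            cur[1] = end                  # bridge a small gap
        else:
            if cur and cur[1] - cur[0] >= min_length:
                regions.append(tuple(cur))
            cur = [start, end]            # open a new candidate
    if cur and cur[1] - cur[0] >= min_length:
        regions.append(tuple(cur))
    return regions
# _scan_regions([(0, 100), (120, 300), (900, 1000)], 50, 200)
# -> [(0, 300)]   (the last stretch is shorter than min_length)
# ----------------------------------------------------------------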
pre_p = p break # found the first range above cutoff else: pre_p = p for i in xrange( x, chrom_pointer ): # continue scan the rest regions p = chrom_pos[ i ] vc1i1 = chrom_sc1i1[ i ] vc2i2 = chrom_sc2i2[ i ] vc1c2 = chrom_sc1c2[ i ] vc2c1 = chrom_sc2c1[ i ] if vc1i1 < cutoff or vc2i2 < cutoff or vc1c2 > cutoff or vc2c1 > cutoff: pre_p = p continue # for points met all criteria # if the gap is allowed if pre_p - peak_content[ -1 ][ 1 ] <= max_gap: peak_content.append( ( pre_p, p, 0, i ) ) else: # when the gap is not allowed, close this peak peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = 0, peak_score = 0, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) # start a new peak peak_content = [ ( pre_p, p, 0, i ), ] pre_p = p # save the last peak if not peak_content: continue peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = 0, peak_score = 0, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) return peaks def call_condition1_unique (self, cutoff=50, min_length=200, max_gap=50): """This function try to find regions within which, scores are continuously higher than a given cutoff. Condition 1 unique peaks are those met all the following criteria 1. sc1i1 >= cutoff 2. sc1c2 >= cutoff """ chrs = self.get_chr_names() peaks = PeakIO() # dictionary to save peaks for chrom in chrs: chrom_pointer = self.pointer[chrom] chrom_d = self.get_data_by_chr( chrom ) # arrays for position and values chrom_pos = chrom_d[ 'pos' ] chrom_sc1i1 = chrom_d[ 'sc1i1' ] chrom_sc1c2 = chrom_d[ 'sc1c2' ] x = 0 # index in compositeScoreTrackI pre_p = 0 # remember previous position peak_content = None # to store points above cutoff while True and x < chrom_pointer: # find the first region above cutoff # try to read the first data range for this chrom p = chrom_pos[ x ] vc1i1 = chrom_sc1i1[ x ] vc1c2 = chrom_sc1c2[ x ] x += 1 # index for the next point if vc1i1 >= cutoff and vc1c2 >= cutoff: peak_content = [ ( pre_p, p, 0, x ), ] # remember the index too... 
pre_p = p break # found the first range above cutoff else: pre_p = p for i in xrange( x, chrom_pointer ): # continue scan the rest regions p = chrom_pos[ i ] vc1i1 = chrom_sc1i1[ i ] vc1c2 = chrom_sc1c2[ i ] if vc1i1 < cutoff or vc1c2 < cutoff: pre_p = p continue # for points met all criteria # if the gap is allowed if pre_p - peak_content[ -1 ][ 1 ] <= max_gap: peak_content.append( ( pre_p, p, 0, i ) ) else: # when the gap is not allowed, close this peak peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = 0, peak_score = 0, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) # start a new peak peak_content = [ ( pre_p, p, 0, i ), ] pre_p = p # save the last peak if not peak_content: continue peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = 0, peak_score = 0, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) return peaks def call_condition2_unique (self, cutoff=50, min_length=200, max_gap=50): """This function try to find regions within which, scores are continuously higher than a given cutoff. Condition 2 unique peaks are those met all the following criteria 1. sc2i2 >= cutoff 2. sc2c1 >= cutoff """ chrs = self.get_chr_names() peaks = PeakIO() # dictionary to save peaks for chrom in chrs: chrom_pointer = self.pointer[chrom] chrom_d = self.get_data_by_chr( chrom ) # arrays for position and values chrom_pos = chrom_d[ 'pos' ] chrom_sc2i2 = chrom_d[ 'sc2i2' ] chrom_sc2c1 = chrom_d[ 'sc2c1' ] x = 0 # index in compositeScoreTrackI pre_p = 0 # remember previous position peak_content = None # to store points above cutoff while True and x < chrom_pointer: # find the first region above cutoff # try to read the first data range for this chrom p = chrom_pos[ x ] vc2i2 = chrom_sc2i2[ x ] vc2c1 = chrom_sc2c1[ x ] x += 1 # index for the next point if vc2i2 >= cutoff and vc2c1 >= cutoff: peak_content = [ ( pre_p, p, 0, x ), ] # remember the index too... pre_p = p break # found the first range above cutoff else: pre_p = p for i in xrange( x, chrom_pointer ): # continue scan the rest regions p = chrom_pos[ i ] vc2i2 = chrom_sc2i2[ i ] vc2c1 = chrom_sc2c1[ i ] if vc2i2 < cutoff or vc2c1 < cutoff: pre_p = p continue # for points met all criteria # if the gap is allowed if pre_p - peak_content[ -1 ][ 1 ] <= max_gap: peak_content.append( ( pre_p, p, 0, i ) ) else: # when the gap is not allowed, close this peak peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = 0, peak_score = 0, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) # start a new peak peak_content = [ ( pre_p, p, 0, i ), ] pre_p = p # save the last peak if not peak_content: continue peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = 0, peak_score = 0, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) return peaks def call_diff_regions (self, cutoff=50, min_length=200, max_gap=50): """A function to call differential regions and common regions together. 
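Each score point is routed by the same four comparisons used in the
        loop below; a compact sketch of the decision (cutoff semantics
        identical to this method's arguments):

            def classify(vc1i1, vc2i2, vc1c2, vc2c1, cutoff):
                # consistent: both conditions enriched over their
                # controls, neither enriched over the other
                if vc1i1 >= cutoff and vc2i2 >= cutoff and \
                   vc1c2 <= cutoff and vc2c1 <= cutoff:
                    return "consistent"
                if vc1i1 >= cutoff and vc1c2 >= cutoff:
                    return "condition 1 unique"
                if vc2i2 >= cutoff and vc2c1 >= cutoff:
                    return "condition 2 unique"
                return "background"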
Return: common_regions, condition1_unique, and condition2_unique regions """ chrs = self.get_chr_names() consistent_peaks = PeakIO() # dictionary to save peaks condition1_peaks = PeakIO() # dictionary to save peaks condition2_peaks = PeakIO() # dictionary to save peaks for chrom in chrs: chrom_pointer = self.pointer[chrom] chrom_d = self.get_data_by_chr( chrom ) # arrays for position and values chrom_pos = chrom_d[ 'pos' ] chrom_sc1i1 = chrom_d[ 'sc1i1' ] chrom_sc2i2 = chrom_d[ 'sc2i2' ] chrom_sc1c2 = chrom_d[ 'sc1c2' ] chrom_sc2c1 = chrom_d[ 'sc2c1' ] x = 0 # index in compositeScoreTrackI pre_p = 0 # remember previous position consistent_peak_content = None # to store points above cutoff condition1_peak_content = None # to store points above cutoff condition2_peak_content = None # to store points above cutoff for i in xrange( x, chrom_pointer ): # continue scan the rest regions p = chrom_pos[ i ] vc1i1 = chrom_sc1i1[ i ] vc2i2 = chrom_sc2i2[ i ] vc1c2 = chrom_sc1c2[ i ] vc2c1 = chrom_sc2c1[ i ] if vc1i1 >= cutoff and vc2i2 >= cutoff and vc1c2 <= cutoff and vc2c1 <= cutoff: # for points met all criteria # if the gap is allowed if not consistent_peak_content: consistent_peak_content = [ ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ), ] # for consistent region, summit is decided by condition 1 if pre_p - consistent_peak_content[ -1 ][ 1 ] <= max_gap: consistent_peak_content.append( ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ) ) else: # when the gap is not allowed, close this peak # this is common region. peak_length = consistent_peak_content[ -1 ][ 1 ] - consistent_peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it # the absolute of peak score or the diff # score is the maximum of max[vc1c2,] and # max[vc2c1,]. If max[vc1c2,] is bigger, # the sign for peak score is # '+'. Otherwise, the sign is '-' m_c1c2 = max([x[4] for x in consistent_peak_content ]) m_c2c1 = max([x[5] for x in consistent_peak_content ]) if m_c1c2 >= m_c2c1: diff_score = m_c1c2 else: diff_score = -1* m_c2c1 consistent_peaks.add( chrom, consistent_peak_content[0][0], consistent_peak_content[-1][1], summit = 0, peak_score = diff_score, pileup = 0, pscore = 0, fold_change = 0, qscore = 0, ) # start a new peak consistent_peak_content = [ ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ), ] elif vc1i1 >= cutoff and vc1c2 >= cutoff: if not condition1_peak_content: condition1_peak_content = [ ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ), ] if pre_p - condition1_peak_content[ -1 ][ 1 ] <= max_gap: condition1_peak_content.append( ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ) ) else: # when the gap is not allowed, close this peak # this is condition1 unique region peak_length = condition1_peak_content[ -1 ][ 1 ] - condition1_peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it # the absolute of peak score or the diff # score is the maximum of max[vc1c2,] and # max[vc2c1,]. If max[vc1c2,] is bigger, # the sign for peak score is # '+'. 
Otherwise, the sign is '-'
                            diff_score = max([x[4] for x in condition1_peak_content ])
                            #m_c2c1 = max([x[5] in condition2_peak_content ])
                            #if m_c1c2 >= m_c2c1:
                            ## diff_score = m_c1c2
                            #else:
                            # diff_score = -1* m_c2c1
                            condition1_peaks.add( chrom,
                                                  condition1_peak_content[0][0],
                                                  condition1_peak_content[-1][1],
                                                  summit = 0,
                                                  peak_score = diff_score,
                                                  pileup = 0,
                                                  pscore = 0,
                                                  fold_change = 0,
                                                  qscore = 0,
                                                  )
                        # start a new peak
                        condition1_peak_content = [ ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ), ]
                elif vc2i2 >= cutoff and vc2c1 >= cutoff:
                    if not condition2_peak_content:
                        condition2_peak_content = [ ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ), ]
                    if pre_p - condition2_peak_content[ -1 ][ 1 ] <= max_gap:
                        condition2_peak_content.append( ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ) )
                    else:
                        # when the gap is not allowed, close this peak
                        # condition 2 unique peaks
                        peak_length = condition2_peak_content[ -1 ][ 1 ] - condition2_peak_content[ 0 ][ 0 ]
                        if peak_length >= min_length: # if the peak is too small, reject it
                            diff_score = -1 * max([x[5] for x in condition2_peak_content ])
                            condition2_peaks.add( chrom,
                                                  condition2_peak_content[0][0],
                                                  condition2_peak_content[-1][1],
                                                  summit = 0,
                                                  peak_score = diff_score,
                                                  pileup = 0,
                                                  pscore = 0,
                                                  fold_change = 0,
                                                  qscore = 0,
                                                  )
                        # start a new peak
                        condition2_peak_content = [ ( pre_p, p, vc1i1, vc2i2, vc1c2, vc2c1, i ), ]
                pre_p = p

            # save the last regions
            if consistent_peak_content:
                peak_length = consistent_peak_content[ -1 ][ 1 ] - consistent_peak_content[ 0 ][ 0 ]
                if peak_length >= min_length: # if the peak is too small, reject it
                    m_c1c2 = max([x[4] for x in consistent_peak_content ])
                    m_c2c1 = max([x[5] for x in consistent_peak_content ])
                    if m_c1c2 >= m_c2c1:
                        diff_score = m_c1c2
                    else:
                        diff_score = -1* m_c2c1
                    consistent_peaks.add( chrom,
                                          consistent_peak_content[0][0],
                                          consistent_peak_content[-1][1],
                                          summit = 0,
                                          peak_score = diff_score,
                                          pileup = 0,
                                          pscore = 0,
                                          fold_change = 0,
                                          qscore = 0,
                                          )
            elif condition1_peak_content:
                peak_length = condition1_peak_content[ -1 ][ 1 ] - condition1_peak_content[ 0 ][ 0 ]
                if peak_length >= min_length: # if the peak is too small, reject it
                    diff_score = max([x[4] for x in condition1_peak_content ])
                    condition1_peaks.add( chrom,
                                          condition1_peak_content[0][0],
                                          condition1_peak_content[-1][1],
                                          summit = 0,
                                          peak_score = diff_score,
                                          pileup = 0,
                                          pscore = 0,
                                          fold_change = 0,
                                          qscore = 0,
                                          )
            elif condition2_peak_content:
                peak_length = condition2_peak_content[ -1 ][ 1 ] - condition2_peak_content[ 0 ][ 0 ]
                if peak_length >= min_length: # if the peak is too small, reject it
                    diff_score = -1 * max([x[5] for x in condition2_peak_content ])
                    condition2_peaks.add( chrom,
                                          condition2_peak_content[0][0],
                                          condition2_peak_content[-1][1],
                                          summit = 0,
                                          peak_score = diff_score,
                                          pileup = 0,
                                          pscore = 0,
                                          fold_change = 0,
                                          qscore = 0,
                                          )
        return ( consistent_peaks, condition1_peaks, condition2_peaks )

    def total ( self ):
        """Return the number of regions in this object.

        """
        t = 0
        for chrom in self.data.keys():
            t += self.pointer[chrom]
        return t

    #def dump ( self ):
    #
    #

def make_compositeScoreTrack (bdgTrack1, bdgTrack2, bdgTrack3, bdgTrack4 ):
    """A modified overlie function for MACS DIFF.
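The four bedGraph inputs are expected in the same order as the score
    fields of compositeScoreTrackI (sc1i1, sc2i2, sc1c2, sc2c1); a
    hypothetical call would look like (variable names below are
    placeholders for the four score tracks):

        composite = make_compositeScoreTrack(cond1_vs_input1,
                                             cond2_vs_input2,
                                             cond1_vs_cond2,
                                             cond2_vs_cond1)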
""" assert isinstance(bdgTrack1,bedGraphTrackI), "bdgTrack1 is not a bedGraphTrackI object" assert isinstance(bdgTrack2,bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" assert isinstance(bdgTrack3,bedGraphTrackI), "bdgTrack3 is not a bedGraphTrackI object" assert isinstance(bdgTrack4,bedGraphTrackI), "bdgTrack4 is not a bedGraphTrackI object" ret = compositeScoreTrackI() retadd = ret.add chr1 = set(bdgTrack1.get_chr_names()) chr2 = set(bdgTrack2.get_chr_names()) chr3 = set(bdgTrack3.get_chr_names()) chr4 = set(bdgTrack4.get_chr_names()) common_chr = chr1.intersection(chr2).intersection(chr3).intersection(chr4) for chrom in common_chr: (p1s,v1s) = bdgTrack1.get_data_by_chr(chrom) # arrays for position and values p1n = iter(p1s).next # assign the next function to a viable to speed up v1n = iter(v1s).next (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values p2n = iter(p2s).next # assign the next function to a viable to speed up v2n = iter(v2s).next (p3s,v3s) = bdgTrack3.get_data_by_chr(chrom) # arrays for position and values p3n = iter(p3s).next # assign the next function to a viable to speed up v3n = iter(v3s).next (p4s,v4s) = bdgTrack4.get_data_by_chr(chrom) # arrays for position and values p4n = iter(p4s).next # assign the next function to a viable to speed up v4n = iter(v4s).next chrom_max_len = len(p1s)+len(p2s)+len(p3s)+len(p4s) # this is the maximum number of locations needed to be recorded in scoreTrackI for this chromosome. ret.add_chromosome(chrom,chrom_max_len) pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret try: p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() p3 = p3n() v3 = v3n() p4 = p4n() v4 = v4n() while True: min_p = min( p1, p2, p3, p4 ) retadd( chrom, min_p, v1, v2, v3, v4 ) pre_p = min_p if p1 == min_p: p1 = p1n() v1 = v1n() if p2 == min_p: p2 = p2n() v2 = v2n() if p3 == min_p: p3 = p3n() v3 = v3n() if p4 == min_p: p4 = p4n() v4 = v4n() except StopIteration: # meet the end of either bedGraphTrackI, simply exit pass return ret MACS-2.0.9/MACS2/IO/cPeakIO.pyx0000644000175000017500000006606111654316302016224 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-10-21 00:43:25 Tao Liu> """Module for PeakIO IO classes. Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ from MACS2.Constants import * # ------------------------------------ # constants # ------------------------------------ __version__ = "PeakIO $Revision$" __author__ = "Tao Liu " __doc__ = "PeakIO class" # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class PeakIO: """IO for peak information. 
""" def __init__ (self): self.peaks = {} def add (self, char * chromosome, long start, long end, long summit = 0, double peak_score=0, int pileup=0, double pscore=0, double fold_change=0, double qscore=0): """items: start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pscore, fc:fold_change, qscore:qscore """ if not self.peaks.has_key(chromosome): self.peaks[chromosome]=[] self.peaks[chromosome].append({"start":start, "end":end, "length":end-start, "summit":summit, "score":peak_score, "pileup":pileup, "pscore":pscore, "fc":fold_change, "qscore":qscore}) def filter_pscore (self, double pscore_cut ): peaks = self.peaks new_peaks = {} chrs = sorted(peaks.keys()) for chrom in chrs: new_peaks[chrom]=[p for p in peaks[chrom] if p["pscore"] >= pscore_cut] self.peaks = new_peaks def filter_qscore (self, double qscore_cut ): peaks = self.peaks new_peaks = {} chrs = sorted(peaks.keys()) for chrom in chrs: new_peaks[chrom]=[p for p in peaks[chrom] if p["qscore"] >= qscore_cut] self.peaks = new_peaks def filter_fc (self, fc_low, fc_up=None ): """Filter peaks in a given fc range. If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up) """ peaks = self.peaks new_peaks = {} chrs = peaks.keys() chrs.sort() if fc_up: for chrom in chrs: new_peaks[chrom]=[p for p in peaks[chrom] if p["fc"] >= fc_low and p["fc"]= fc_low] self.peaks = new_peaks def total (self): peaks = self.peaks chrs = peaks.keys() chrs.sort() x = 0 for chrom in chrs: x += len(peaks[chrom]) return x def tobed (self): """Print out peaks in BED5 format. Five columns are chromosome, peak start, peak end, peak name, and peak height. start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pvalue, fc:fold_change, qscore:qvalue """ text = "" chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 text+= "%s\t%d\t%d\tpeak_%d\t%.2f\n" % (chrom,peak["start"],peak["end"],n_peak,peak["score"]) return text def to_summits_bed (self): """Print out peak summits in BED5 format. Five columns are chromosome, summit start, summit end, peak name, and peak height. """ text = "" chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 summit_p = peak["summit"] text+= "%s\t%d\t%d\tpeak_%d\t%.2f\n" % (chrom,summit_p,summit_p+1,n_peak,peak["score"]) return text def write_to_bed (self, fhd, name_prefix="peak_", score_column="score"): """Write peaks in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting name_prefix together with an ascending number. Five columns are chromosome, peak start, peak end, peak name, and peak score. items in peak hash object: start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pvalue, fc:fold_change, qscore:qvalue """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 fhd.write( "%s\t%d\t%d\t%s%d\t%.2f\n" % (chrom,peak["start"],peak["end"],name_prefix,n_peak,peak[score_column]) ) def write_to_summit_bed (self, fhd, name_prefix="peak_", score_column="score"): """Write peak summits in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting name_prefix together with an ascending number. 
Five columns are chromosome, summit start, summit end, peak name, and peak score. items in peak object: start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pvalue, fc:fold_change, qscore:qvalue """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 summit_p = peak["summit"] fhd.write( "%s\t%d\t%d\t%s%d\t%.2f\n" % (chrom,summit_p,summit_p+1,name_prefix,n_peak,peak[score_column]) ) def write_to_narrowPeak (self, fhd, name_prefix="peak_", score_column="score"): """Print out peaks in narrowPeak format. This format is designed for ENCODE project, and basically a BED6+4 format. +-----------+------+----------------------------------------+ |field |type |description | +-----------+------+----------------------------------------+ |chrom |string|Name of the chromosome | +-----------+------+----------------------------------------+ |chromStart |int |The starting position of the feature in | | | |the chromosome. The first base in a | | | |chromosome is numbered 0. | +-----------+------+----------------------------------------+ |chromEnd |int |The ending position of the feature in | | | |the chromosome or scaffold. The chromEnd| | | |base is not included in the display of | | | |the feature. For example, the first 100| | | |bases of a chromosome are defined as | | | |chromStart=0, chromEnd=100, and span the| | | |bases numbered 0-99. | +-----------+------+----------------------------------------+ |name |string|Name given to a region (preferably | | | |unique). Use '.' if no name is assigned.| +-----------+------+----------------------------------------+ |score |int |Indicates how dark the peak will be | |(-logpvalue| |displayed in the browser (1-1000). If | |in MACS2 * | |'0', the DCC will assign this based on | |10) | |signal value. Ideally average | | | |signalValue per base spread between | | | |100-1000. | +-----------+------+----------------------------------------+ |strand |char |+/- to denote strand or orientation | |(always .) | |(whenever applicable). Use '.' if no | | | |orientation is assigned. | +-----------+------+----------------------------------------+ |signalValue|float |Measurement of overall (usually, | |(fc) | |average) enrichment for the region. | +-----------+------+----------------------------------------+ |pValue |float |Measurement of statistical signficance | | | |(-log10). Use -1 if no pValue is | | | |assigned. | +-----------+------+----------------------------------------+ |qValue |float |Measurement of statistical significance | | | |using false discovery rate. Use -1 if no| | | |qValue is assigned. | +-----------+------+----------------------------------------+ |peak |int |Point-source called for this peak; | | | |0-based offset from chromStart. Use -1 | | | |if no point-source called. | +-----------+------+----------------------------------------+ """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 fhd.write("track type=narrowPeak nextItemButton=on\n") for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 # items in peak: (peak start,peak end, peak length, # peak summit, peak height, number of tags in peak # region, peak pvalue, peak fold_enrichment, qvalue) fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%.2f\t%.2f\t%.2f\t%d\n" % (chrom,peak["start"],peak["end"],name_prefix,n_peak,int(10*peak[score_column]), peak["fc"],peak["pscore"],peak["qscore"],peak["summit"]-peak["start"]) ) ### class DiffPeakIO: """IO for differential peak information. 
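Scores are stored with a sign convention: positive values mean the
    region is stronger in condition A (A vs B wins), negative values mean
    it is stronger in condition B. A sketch of that rule as applied by
    the callers:

        def signed_score(score_a_vs_b, score_b_vs_a):
            # keep the larger comparison; flip the sign when B vs A wins
            if score_a_vs_b >= score_b_vs_a:
                return score_a_vs_b
            return -1 * score_b_vs_a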
""" def __init__ (self): self.peaks = {} def add (self, char * chromosome, long start, long end, long summit = 0, double diff_score=0, int pileup=0, double pscore=0, double fold_change=0, double qscore=0): """items: start:start end:end, length:end-start, summit:summit, # summit is where the highest pileup is in a differetnial region, or the common region for only condition A. score:diff_score, # diff score is the maximum of qscore in diff/common region, for A vs B and B vs A. If B vs A is bigger, put minus sign later. pileup:pileup, # the highest pileup the summit. pscore:pscore, # pscore is the maximum of pscore in diff/common region, for A vs B and B vs A. If B vs A is bigger, put minus sign later. fc:fold_change, # fc is the maximum of foldchange in diff/common region, for A vs B and B vs A. If B vs A is bigger, put minus sign later. qscore:qscore # qscore is the maximum of qscore in diff/common region, for A vs B and B vs A. If B vs A is bigger, put minus sign later. """ if not self.peaks.has_key(chromosome): self.peaks[chromosome]=[] self.peaks[chromosome].append({"start":start, "end":end, "length":end-start, "summit":summit, "score":diff_score, "pileup":pileup, "pscore":pscore, "fc":fold_change, "qscore":qscore}) def filter_pscore (self, double pscore_cut ): peaks = self.peaks new_peaks = {} chrs = sorted(peaks.keys()) for chrom in chrs: new_peaks[chrom]=[p for p in peaks[chrom] if p["pscore"] >= pscore_cut] self.peaks = new_peaks def filter_qscore (self, double qscore_cut ): peaks = self.peaks new_peaks = {} chrs = sorted(peaks.keys()) for chrom in chrs: new_peaks[chrom]=[p for p in peaks[chrom] if p["qscore"] >= qscore_cut] self.peaks = new_peaks def filter_fc (self, fc_low, fc_up=None ): """Filter peaks in a given fc range. If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up) """ peaks = self.peaks new_peaks = {} chrs = peaks.keys() chrs.sort() if fc_up: for chrom in chrs: new_peaks[chrom]=[p for p in peaks[chrom] if p["fc"] >= fc_low and p["fc"]= fc_low] self.peaks = new_peaks def total (self): peaks = self.peaks chrs = peaks.keys() chrs.sort() x = 0 for chrom in chrs: x += len(peaks[chrom]) return x def tobed (self): """Print out peaks in BED5 format. Five columns are chromosome, peak start, peak end, peak name, and peak height. start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pvalue, fc:fold_change, qscore:qvalue """ text = "" chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 text+= "%s\t%d\t%d\tpeak_%d\t%.2f\n" % (chrom,peak["start"],peak["end"],n_peak,peak["score"]) return text def to_summits_bed (self): """Print out peak summits in BED5 format. Five columns are chromosome, summit start, summit end, peak name, and peak height. """ text = "" chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 summit_p = peak["summit"] text+= "%s\t%d\t%d\tpeak_%d\t%.2f\n" % (chrom,summit_p,summit_p+1,n_peak,peak["score"]) return text def write_to_bed (self, fhd, name_prefix="peak_", score_column="score"): """Write peaks in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting name_prefix together with an ascending number. Five columns are chromosome, peak start, peak end, peak name, and peak score. 
items in peak hash object: start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pvalue, fc:fold_change, qscore:qvalue """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 fhd.write( "%s\t%d\t%d\t%s%d\t%.2f\n" % (chrom,peak["start"],peak["end"],name_prefix,n_peak,peak[score_column]) ) def write_to_summit_bed (self, fhd, name_prefix="peak_", score_column="score"): """Write peak summits in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting name_prefix together with an ascending number. Five columns are chromosome, summit start, summit end, peak name, and peak score. items in peak object: start:start end:end, length:end-start, summit:summit, score:peak_score, pileup:pileup, pscore:pvalue, fc:fold_change, qscore:qvalue """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 summit_p = peak["summit"] fhd.write( "%s\t%d\t%d\t%s%d\t%.2f\n" % (chrom,summit_p,summit_p+1,name_prefix,n_peak,peak[score_column]) ) def write_to_narrowPeak (self, fhd, name_prefix="peak_", score_column="score"): """Print out peaks in narrowPeak format. This format is designed for ENCODE project, and basically a BED6+4 format. +-----------+------+----------------------------------------+ |field |type |description | +-----------+------+----------------------------------------+ |chrom |string|Name of the chromosome | +-----------+------+----------------------------------------+ |chromStart |int |The starting position of the feature in | | | |the chromosome. The first base in a | | | |chromosome is numbered 0. | +-----------+------+----------------------------------------+ |chromEnd |int |The ending position of the feature in | | | |the chromosome or scaffold. The chromEnd| | | |base is not included in the display of | | | |the feature. For example, the first 100| | | |bases of a chromosome are defined as | | | |chromStart=0, chromEnd=100, and span the| | | |bases numbered 0-99. | +-----------+------+----------------------------------------+ |name |string|Name given to a region (preferably | | | |unique). Use '.' if no name is assigned.| +-----------+------+----------------------------------------+ |score |int |Indicates how dark the peak will be | |(-logpvalue| |displayed in the browser (1-1000). If | |in MACS2 * | |'0', the DCC will assign this based on | |10) | |signal value. Ideally average | | | |signalValue per base spread between | | | |100-1000. | +-----------+------+----------------------------------------+ |strand |char |+/- to denote strand or orientation | |(always .) | |(whenever applicable). Use '.' if no | | | |orientation is assigned. | +-----------+------+----------------------------------------+ |signalValue|float |Measurement of overall (usually, | |(fc) | |average) enrichment for the region. | +-----------+------+----------------------------------------+ |pValue |float |Measurement of statistical signficance | | | |(-log10). Use -1 if no pValue is | | | |assigned. | +-----------+------+----------------------------------------+ |qValue |float |Measurement of statistical significance | | | |using false discovery rate. Use -1 if no| | | |qValue is assigned. | +-----------+------+----------------------------------------+ |peak |int |Point-source called for this peak; | | | |0-based offset from chromStart. Use -1 | | | |if no point-source called. 
| +-----------+------+----------------------------------------+ """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 fhd.write("track type=narrowPeak nextItemButton=on\n") for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 # items in peak: (peak start,peak end, peak length, # peak summit, peak height, number of tags in peak # region, peak pvalue, peak fold_enrichment, qvalue) fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%.2f\t%.2f\t%.2f\t%d\n" % (chrom,peak["start"],peak["end"],name_prefix,n_peak,int(10*peak[score_column]), peak["fc"],peak["pscore"],peak["qscore"],peak["summit"]-peak["start"]) ) # class BroadPeakIO: """IO for broad peak information. """ def __init__ (self): self.peaks = {} def add (self, char * chromosome, long start, long end, long score = 0, long thickStart=0, long thickEnd=0, long blockNum=0, char *blockSizes="", char * blockStarts="" ): """items chromosome : chromosome name, start : broad region start, end : broad region end, score : average score in all blocks, thickStart : start of highly enriched region, thickEnd : end of highly enriched region, blockNum : number of blocks, blockSizes : sizes of blocks, blockStarts: starts of blocks """ if not self.peaks.has_key(chromosome): self.peaks[chromosome]=[] self.peaks[chromosome].append({"start":start, "end":end, "score":score, "thickStart":thickStart, "thickEnd":thickEnd, "blockNum":blockNum, "blockSizes":blockSizes, "blockStarts":blockStarts, } ) def total (self): peaks = self.peaks chrs = peaks.keys() chrs.sort() x = 0 for chrom in chrs: x += len(peaks[chrom]) return x def write_to_gappedPeak (self, fhd, name_prefix="peak_", name="peak", description="peak description"): """Print out peaks in bed12 format. This format is basically a BED12 format. +--------------+------+----------------------------------------+ |field |type |description | +--------------+------+----------------------------------------+ |chrom |string|Name of the chromosome | +--------------+------+----------------------------------------+ |chromStart |int |The starting position of the feature in | | | |the chromosome. The first base in a | | | |chromosome is numbered 0. | +--------------+------+----------------------------------------+ |chromEnd |int |The ending position of the feature in | | | |the chromosome or scaffold. The chromEnd| | | |base is not included in the display of | | | |the feature. For example, the first 100| | | |bases of a chromosome are defined as | | | |chromStart=0, chromEnd=100, and span the| | | |bases numbered 0-99. | +--------------+------+----------------------------------------+ |name |string|Name given to a region (preferably | | | |unique). Use '.' if no name is assigned.| +--------------+------+----------------------------------------+ |score |int |Indicates how dark the peak will be | |(always use | |displayed in the browser (1-1000). If | |1000 for | |'0', the DCC will assign this based on | |the | |signal value. Ideally average | |thickest | |signalValue per base spread between | |color) | |100-1000. | +--------------+------+----------------------------------------+ |strand |char |+/- to denote strand or orientation | |(always .) | |(whenever applicable). Use '.' if no | | | |orientation is assigned. | +--------------+------+----------------------------------------+ |thickStart |int | The starting position at which the | | | |feature is drawn thickly. Mark the start| | | |of highly enriched regions. 
| | | | | +--------------+------+----------------------------------------+ |thickEnd |int | The ending position at which the | | | |feature is drawn thickly. Mark the end | | | |of highly enriched regions. | +--------------+------+----------------------------------------+ |itemRGB |string| Not used. Set it as 0. | +--------------+------+----------------------------------------+ |blockCounts |int | The number of blocks (exons) in the BED| | | |line. | +--------------+------+----------------------------------------+ |blockSizes |string| A comma-separated list of the block | | | |sizes. | +--------------+------+----------------------------------------+ |blockStarts |string| A comma-separated list of block starts.| +--------------+------+----------------------------------------+ """ chrs = self.peaks.keys() chrs.sort() n_peak = 0 fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name, description) ) for chrom in chrs: for peak in self.peaks[chrom]: n_peak += 1 fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%d\t%d\t0\t%d\t%s\t%s\n" % (chrom,peak["start"],peak["end"],name_prefix,n_peak,int(peak["score"]), peak["thickStart"],peak["thickEnd"], peak["blockNum"],peak["blockSizes"],peak["blockStarts"] ) ) MACS-2.0.9/MACS2/IO/bedGraphIO.py0000644000175000017500000000465511654316302016526 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-09-07 22:28:15 Tao Liu> """Module Description: IO Module for bedGraph file Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ from MACS2.IO.cBedGraph import bedGraphTrackI # ------------------------------------ # constants # ------------------------------------ # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class bedGraphIO: """File Parser Class for bedGraph File. There are two assumptions in my bedGraphTrackI object: 1. Continuous: the next region should be after the previous one unless they are on different chromosomes; 2. Non-overlapping: the next region should never have overlaps with preceding region. If any of the above two criteria is violated, parsering will fail. """ def __init__ (self,f): """f must be a filename or a file handler. """ if type(f) == str: self.fhd = open(f,"r") elif type(f) == file: self.fhd = f else: raise Exception("f must be a filename or a file handler.") def build_bdgtrack (self, baseline_value=0): """Use this function to return a bedGraphTrackI object. baseline_value is the value to fill in the regions not defined in bedGraph. For example, if the bedGraph is like: chr1 100 200 1 chr1 250 350 2 Then the region chr1:200..250 should be filled with baseline_value. Default of baseline_value is 0. 
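Concretely, with baseline_value=0 the two-line example above is stored
        as four transition points on chr1 (a sketch of the internal
        (end position, value) pairs kept by bedGraphTrackI):

            (100, 0.0)   # implicit leading region 0..100 at the baseline
            (200, 1.0)   # chr1 100 200 1
            (250, 0.0)   # the gap 200..250, filled with baseline_value
            (350, 2.0)   # chr1 250 350 2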
""" data = bedGraphTrackI(baseline_value=baseline_value) add_func = data.add_loc for i in self.fhd: if i.startswith("track"): continue elif i.startswith("#"): continue elif i.startswith("browse"): continue else: (chrom,startpos,endpos,value)=i.split() add_func(chrom,int(startpos),int(endpos),float(value)) self.fhd.seek(0) return data MACS-2.0.9/MACS2/IO/cBedGraph.pyx0000644000175000017500000006421111654316302016563 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-09-11 23:47:59 Tao Liu> """Module for Feature IO classes. Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import logging from array import array import numpy as np from libc.math cimport sqrt from MACS2.Constants import * from MACS2.cProb import poisson_cdf from MACS2.IO.cScoreTrack import scoreTrackI from MACS2.IO.cPeakIO import PeakIO, BroadPeakIO # ------------------------------------ # constants # ------------------------------------ __version__ = "BedGraph $Revision$" __author__ = "Tao Liu " __doc__ = "bedGraphTrackI class" # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class bedGraphTrackI: """Class for bedGraph type data. In bedGraph, data are represented as continuous non-overlapping regions in the whole genome. I keep this assumption in all the functions. If data has overlaps, some functions will definitely give incorrect results. 1. Continuous: the next region should be after the previous one unless they are on different chromosomes; 2. Non-overlapping: the next region should never have overlaps with preceding region. The way to memorize bedGraph data is to remember the transition points together with values of their preceding regions. The last data point may exceed chromosome end, unless a chromosome dictionary is given. Remember the coordinations in bedGraph and this class is 0-indexed and right-open. """ def __init__ (self, baseline_value=0): """ baseline_value is the value to fill in the regions not defined in bedGraph. For example, if the bedGraph is like: chr1 100 200 1 chr1 250 350 2 Then the region chr1:200..250 should be filled with baseline_value. """ self.__data = {} self.maxvalue =-10000 self.minvalue = 10000 self.baseline_value = baseline_value def add_loc (self,chromosome,startpos,endpos,value): """Add a chr-start-end-value block into __data dictionary. """ # basic assumption, end pos should > start pos assert endpos > startpos, "endpos %d can't be smaller than start pos %d" % (endpos,startpos) if endpos <= 0: return if startpos < 0: startpos = 0 if not self.__data.has_key(chromosome): self.__data[chromosome] = [array(BYTE4,[]),array(FBYTE4,[])] # for (endpos,value) c = self.__data[chromosome] if startpos: # start pos is not 0, then add two blocks, the first # with "baseline_value"; the second with "value" c[0].append(startpos) c[1].append(self.baseline_value) c[0].append(endpos) c[1].append(value) else: c = self.__data[chromosome] # get the preceding region pre_pos = c[0][-1] pre_v = c[1][-1] # to check 1. continuity; 2. non-overlapping assert pre_pos < endpos , "bedGraph regions are not continuous." 
assert pre_pos <= startpos , "bedGraph regions have overlappings." if startpos != pre_pos: # there is a gap, so fill it with baseline_value c[0].append(startpos) c[1].append(self.baseline_value) # then add this region c[0].append(endpos) c[1].append(value) else: # if this region is next to the previous one. if pre_v == value: # if value is the same, simply extend it. c[0][-1] = endpos else: # otherwise, add a new region c[0].append(endpos) c[1].append(value) if value > self.maxvalue: self.maxvalue = value if value < self.minvalue: self.minvalue = value def get_data_by_chr (self, chromosome): """Return array of counts by chromosome. The return value is a tuple: ([end pos],[value]) """ if self.__data.has_key(chromosome): return self.__data[chromosome] else: return None def get_chr_names (self): """Return all the chromosome names stored. """ l = set(self.__data.keys()) return l def write_bedGraph (self, fhd, name, description): """Write all data to fhd in Wiggle Format. fhd: a filehandler to save bedGraph. name/description: the name and description in track line. shift will be used to shift the coordinates. default: 0 """ #fhd.write("track type=bedGraph name=\"%s\" description=\"%s\"\n" % (name,description)) chrs = self.get_chr_names() for chrom in chrs: (p,v) = self.__data[chrom] pnext = iter(p).next vnext = iter(v).next pre = 0 for i in xrange(len(p)): pos = pnext() value = vnext() #if value != self.baseline_value: # never write baseline_value fhd.write("%s\t%d\t%d\t%.2f\n" % (chrom,pre,pos,value)) pre = pos def reset_baseline (self, baseline_value): """Reset baseline value to baseline_value. So any region between self.baseline_value and baseline_value will be set to baseline_value. """ self.baseline_value = baseline_value self.filter_score(cutoff=baseline_value) self.merge_regions() def merge_regions (self): """Merge nearby regions with the same value. """ chrs = set(self.__data.keys()) for chrom in chrs: (p,v) = self.__data[chrom] pnext = iter(p).next vnext = iter(v).next # new arrays new_pos = array(BYTE4,[pnext(),]) new_value = array(FBYTE4,[vnext(),]) newpa = new_pos.append newva = new_value.append new_pre_pos = new_pos[0] new_pre_value = new_value[0] for i in xrange(1,len(p)): pos = pnext() value = vnext() if value == new_pre_value: new_pos[-1] = pos else: # add new region newpa(pos) newva(value) new_pre_pos = pos new_pre_value = value self.__data[chrom] = [new_pos,new_value] return True def filter_score (self, cutoff=0): """Filter using a score cutoff. Any region lower than score cutoff will be set to self.baseline_value. Self will be modified. """ chrs = set(self.__data.keys()) for chrom in chrs: (p,v) = self.__data[chrom] pnext = iter(p).next vnext = iter(v).next # new arrays new_pos = array(BYTE4,[]) new_value = array(FBYTE4,[]) new_pre_pos = 0 new_pre_value = None for i in xrange(len(p)): pos = pnext() value = vnext() if value < cutoff: # this region will be set to baseline_value if new_pre_value == self.baseline_value: # if preceding region is at baseline, extend it new_pos[-1] = pos else: # else add a new baseline region new_pos.append(pos) new_value.append(self.baseline_value) else: # put it into new arrays new_pos.append(pos) new_value.append(value) new_pre_pos = new_pos[-1] new_pre_value = new_value[-1] self.__data[chrom]=[new_pos,new_value] return True def summary (self): """Calculate the sum, max, min, mean, and std. Return a tuple for (sum, max, min, mean, std). 
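All four statistics are weighted by region length; the mean, for
        instance, reduces to the following stand-alone computation over
        one chromosome's (end position, value) arrays (illustrative
        sketch, not part of this class):

            def weighted_mean(pos, val):
                total = 0.0
                n = 0
                pre_p = 0
                for p, v in zip(pos, val):
                    total += v * (p - pre_p)   # value times region length
                    n += p - pre_p
                    pre_p = p
                return total / n

            # weighted_mean([100, 200], [1.0, 3.0]) -> 2.0
            # (regions 0..100 at 1.0 and 100..200 at 3.0)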
""" n_v = 0 sum_v = 0 max_v = -100000 min_v = 100000 for (p,v) in self.__data.values(): # for each chromosome pre_p = 0 for i in range(len(p)): # for each region l = p[i]-pre_p sum_v += v[i]*l n_v += l pre_p = p[i] max_v = max(max(v),max_v) min_v = min(min(v),min_v) mean_v = float(sum_v)/n_v variance = 0.0 for (p,v) in self.__data.values(): for i in range(len(p)): # for each region tmp = v[i]-mean_v l = p[i]-pre_p variance += tmp*tmp*l pre_p = p[i] variance /= float(n_v-1) std_v = sqrt(variance) return (sum_v, max_v, min_v, mean_v, std_v) def call_peaks (self, cutoff=1, up_limit=1e310, min_length=200, max_gap=50): """This function try to find regions within which, scores are continuously higher than a given cutoff. This function is NOT using sliding-windows. Instead, any regions in bedGraph above certain cutoff will be detected, then merged if the gap between nearby two regions are below max_gap. After this, peak is reported if its length is above min_length. cutoff: cutoff of value, default 1. up_limit: the highest acceptable value. Default 10^{310} * so only allow peak with value >=cutoff and <=up_limit min_length : minimum peak length, default 200. gap : maximum gap to merge nearby peaks, default 50. """ chrs = self.get_chr_names() peaks = PeakIO() # dictionary to save peaks for chrom in chrs: (ps,vs) = self.get_data_by_chr(chrom) # arrays for position and values psn = iter(ps).next # assign the next function to a viable to speed up vsn = iter(vs).next x = 0 pre_p = 0 # remember previous position while True: # find the first region above cutoff try: # try to read the first data range for this chrom p = psn() v = vsn() except: break x += 1 # index for the next point if v >= cutoff and v <= up_limit: peak_content = [(pre_p,p,v),] pre_p = p break # found the first range above cutoff else: pre_p = p for i in range(x,len(ps)): # continue scan the rest regions p = psn() v = vsn() if v < cutoff or v > up_limit: # not be detected as 'peak' pre_p = p continue # for points above cutoff # if the gap is allowed if pre_p - peak_content[-1][1] <= max_gap: peak_content.append((pre_p,p,v)) else: # when the gap is not allowed, close this peak peak_length = peak_content[-1][1]-peak_content[0][0] if peak_length >= min_length: # if the peak is too small, reject it tsummit = [] summit = None summit_value = None for (tstart,tend,tvalue) in peak_content: if not summit_value or summit_value < tvalue: tsummit = [int((tend+tstart)/2),] summit_value = tvalue elif summit_value == tvalue: tsummit.append( int((tend+tstart)/2) ) summit = tsummit[int((len(tsummit)+1)/2)-1 ] peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = summit, peak_score = summit_value, pileup = 0, pscore = 0, fold_change = 0, qscore = 0 ) # start a new peak peak_content = [(pre_p,p,v),] pre_p = p # save the last peak if not peak_content: continue if peak_length >= min_length: # if the peak is too small, reject it summit = None summit_value = None for (tstart,tend,tvalue) in peak_content: if not summit_value or summit_value < tvalue: summit = int((tend+tstart)/2) summit_value = tvalue peaks.add( chrom, peak_content[0][0], peak_content[-1][1], summit = summit, peak_score = summit_value, pileup = 0, pscore = 0, fold_change = 0, qscore = 0 ) return peaks def call_broadpeaks (self, lvl1_cutoff=500, lvl2_cutoff=100, min_length=200, lvl1_max_gap=50, lvl2_max_gap=400): """This function try to find enriched regions within which, scores are continuously higher than a given cutoff for level 1, and link them using the gap above level 2 
cutoff with a maximum length of lvl2_max_gap. lvl1_cutoff: cutoff of value at enriched regions, default 500. lvl2_cutoff: cutoff of value at linkage regions, default 100. min_length : minimum peak length, default 200. lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. lvl2_max_gap : maximum length of linkage regions, default 400. colname: can be 'sample','control','-100logp','-100logq'. Cutoff will be applied to the specified column. Return both general PeakIO object for highly enriched regions and gapped broad regions in BroadPeakIO. """ assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2." assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1." lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap) lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap) chrs = lvl1_peaks.peaks.keys() broadpeaks = BroadPeakIO() # use lvl2_peaks as linking regions between lvl1_peaks for chrom in chrs: lvl1peakschrom = lvl1_peaks.peaks[chrom] lvl2peakschrom = lvl2_peaks.peaks[chrom] lvl1peakschrom_next = iter(lvl1peakschrom).next tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region # our assumption is lvl1 regions should be included in lvl2 regions try: lvl1 = lvl1peakschrom_next() except StopIteration: break for lvl2 in lvl2peakschrom: # for each lvl2 peak, find all lvl1 peaks inside try: while True: if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: tmppeakset.append(lvl1) else: if tmppeakset: self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) tmppeakset = [] break lvl1 = lvl1peakschrom_next() except StopIteration: if tmppeakset: self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) break return lvl1_peaks, broadpeaks def __add_broadpeak (self, bpeaks, chrom, lvl2peak, lvl1peakset): """Internal function to create broad peak. """ start = lvl2peak["start"] end = lvl2peak["end"] thickStart = lvl1peakset[0]["start"] thickEnd = lvl1peakset[-1]["end"] blockNum = len(lvl1peakset) blockSizes = ",".join( map(lambda x:str(x["length"]),lvl1peakset) ) blockStarts = ",".join( map(lambda x:str(x["start"]-start),lvl1peakset) ) if lvl2peak["start"] != thickStart: # add 1bp mark for the start of lvl2 peak blockNum += 1 blockSizes = "1,"+blockSizes blockStarts = "0,"+blockStarts if lvl2peak["end"] != thickEnd: # add 1bp mark for the end of lvl2 peak blockNum += 1 blockSizes = blockSizes+",1" blockStarts = blockStarts+","+str(end-start-1) bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts) return bpeaks def total (self): """Return the number of regions in this object. """ t = 0 for (p,s) in self.__data.values(): t += len(p) return t def set_single_value (self, new_value): """Change all the values in bedGraph to the same new_value, return a new bedGraphTrackI. """ ret = bedGraphTrackI() chroms = set(self.get_chr_names()) for chrom in chroms: (p1,v1) = self.get_data_by_chr(chrom) # arrays for position and values # maximum p max_p = max(p1) # add a region from 0 to max_p ret.add_loc(chrom,0,max_p,new_value) return ret def overlie (self, bdgTrack2, func=max ): """Calculate two bedGraphTrackI objects by letting self overlying bdgTrack2, with user-defined functions. Transition positions from both bedGraphTrackI objects will be considered and combined. 
For example: #1 bedGraph (self) | #2 bedGraph ----------------------------------------------- chr1 0 100 0 | chr1 0 150 1 chr1 100 200 3 | chr1 150 250 2 chr1 200 300 4 | chr1 250 300 4 these two bedGraphs will be combined to have five transition points: 100, 150, 200, 250, and 300. So in order to calculate two bedGraphs, I pair values within the following regions like: chr s e (#1,#2) applied_func_max ----------------------------------------------- chr1 0 100 (0,1) 1 chr1 100 150 (3,1) 3 chr1 150 200 (3,2) 3 chr1 200 250 (4,2) 4 chr1 250 300 (4,4) 4 Then the given 'func' will be applied on each 2-tuple as func(#1,#2) Return value is a bedGraphTrackI object. """ assert isinstance(bdgTrack2,bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" ret = bedGraphTrackI() retadd = ret.add_loc chr1 = set(self.get_chr_names()) chr2 = set(bdgTrack2.get_chr_names()) common_chr = chr1.intersection(chr2) for chrom in common_chr: (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values p1n = iter(p1s).next # assign the next function to a viable to speed up v1n = iter(v1s).next (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values p2n = iter(p2s).next # assign the next function to a viable to speed up v2n = iter(v2s).next pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret try: p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() while True: if p1 < p2: # clip a region from pre_p to p1, then set pre_p as p1. retadd(chrom,pre_p,p1,func(v1,v2)) pre_p = p1 # call for the next p1 and v1 p1 = p1n() v1 = v1n() elif p2 < p1: # clip a region from pre_p to p2, then set pre_p as p2. retadd(chrom,pre_p,p2,func(v1,v2)) pre_p = p2 # call for the next p2 and v2 p2 = p2n() v2 = v2n() elif p1 == p2: # from pre_p to p1 or p2, then set pre_p as p1 or p2. retadd(chrom,pre_p,p1,func(v1,v2)) pre_p = p1 # call for the next p1, v1, p2, v2. p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() except StopIteration: # meet the end of either bedGraphTrackI, simply exit pass ret.merge_regions() return ret def apply_func ( self, func ): """Apply function 'func' to every value in this bedGraphTrackI object. *Two adjacent regions with same value after applying func will not be merged. """ t = 0 for (p,s) in self.__data.values(): for i in xrange(len(s)): s[i] = func(s[i]) self.maxvalue = func(self.maxvalue) self.minvalue = func(self.minvalue) return True def make_scoreTrack_for_macs (self, bdgTrack2 ): """A modified overlie function for MACS v2. Return value is a bedGraphTrackI object. """ assert isinstance(bdgTrack2,bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" ret = scoreTrackI() retadd = ret.add chr1 = set(self.get_chr_names()) chr2 = set(bdgTrack2.get_chr_names()) common_chr = chr1.intersection(chr2) for chrom in common_chr: (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values p1n = iter(p1s).next # assign the next function to a viable to speed up v1n = iter(v1s).next (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values p2n = iter(p2s).next # assign the next function to a viable to speed up v2n = iter(v2s).next chrom_max_len = len(p1s)+len(p2s) # this is the maximum number of locations needed to be recorded in scoreTrackI for this chromosome. ret.add_chromosome(chrom,chrom_max_len) pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret try: p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() while True: if p1 < p2: # clip a region from pre_p to p1, then set pre_p as p1. 
retadd( chrom, p1, v1, v2 ) pre_p = p1 # call for the next p1 and v1 p1 = p1n() v1 = v1n() elif p2 < p1: # clip a region from pre_p to p2, then set pre_p as p2. retadd( chrom, p2, v1, v2 ) pre_p = p2 # call for the next p2 and v2 p2 = p2n() v2 = v2n() elif p1 == p2: # from pre_p to p1 or p2, then set pre_p as p1 or p2. retadd( chrom, p1, v1, v2 ) pre_p = p1 # call for the next p1, v1, p2, v2. p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() except StopIteration: # meet the end of either bedGraphTrackI, simply exit pass #ret.merge_regions() return ret def scoreTracktoBedGraph (scoretrack, colname): """Produce a bedGraphTrackI object with certain column as scores. colname: can be 'sample','control','-100logp','-100logq' """ bdgtrack = bedGraphTrackI( baseline_value = 0 ) if colname not in ['sample','control','-100logp','-100logq']: raise Exception("%s not supported!" % colname) if colname in ['-100logp', '-100logq']: flag100 = True # for pvalue or qvalue, divide them by 100 while writing to bedGraph file else: flag100 = False chrs = scoretrack.get_chr_names() for chrom in chrs: d = scoretrack.data[chrom] l = scoretrack.pointer[chrom] pre = 0 pos = d['pos'] if flag100: value = d[colname]/100.0 else: value = d[colname] for i in xrange( l ): bdgtrack.add_loc( chrom, pre, pos[i] ,value[i] ) pre = pos[i] return bdgtrack MACS-2.0.9/MACS2/IO/BinKeeper.py0000644000175000017500000002551611630217211016416 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-03-14 17:52:00 Tao Liu> """Module Description: BinKeeper for Wiggle-like tracks. Copyright (c) 2008 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import sys import re from bisect import insort,bisect_left,bisect_right,insort_right from array import array # ------------------------------------ # constants # ------------------------------------ # to determine the byte size if array('H',[1]).itemsize == 2: BYTE2 = 'H' else: raise Exception("BYTE2 type cannot be determined!") if array('I',[1]).itemsize == 4: BYTE4 = 'I' elif array('L',[1]).itemsize == 4: BYTE4 = 'L' else: raise Exception("BYTE4 type cannot be determined!") if array('f',[1]).itemsize == 4: FBYTE4 = 'f' elif array('d',[1]).itemsize == 4: FBYTE4 = 'd' else: raise Exception("BYTE4 type cannot be determined!") # ------------------------------------ # Misc functions # ------------------------------------ # ------------------------------------ # Classes # ------------------------------------ class BinKeeperI: """BinKeeper keeps point data from a chromosome in a bin list. Example: >>> from taolib.CoreLib.Parser import WiggleIO >>> w = WiggleIO('sample.wig') >>> bk = w.build_binKeeper() >>> bk['chrI'].pp2v(1000,2000) # to extract values in chrI:1000..2000 """ def __init__ (self,binsize=8000,chromosomesize=1e9): """Initializer. Parameters: binsize : size of bin in Basepair chromosomesize : size of chromosome, default is 1G """ self.binsize = binsize self.binnumber = int(chromosomesize/self.binsize)+1 self.cage = [] a = self.cage.append for i in xrange(self.binnumber): a([array(BYTE4,[]),array(FBYTE4,[])]) def add ( self, p, value ): """Add a position into BinKeeper. Note: position must be sorted before adding. Otherwise, pp2v and pp2p will not work. 
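A minimal sketch of adding sorted points and querying them back (positions and values are illustrative only):

>>> bk = BinKeeperI(binsize=8000)
>>> for (p, v) in [(100, 1.5), (5000, 2.5), (9000, 0.5)]:
...     bk.add(p, v)
>>> bk.pp2v(0, 8000)    # values whose positions fall in 0..8000
array('f', [1.5, 2.5])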
""" bin = p/self.binsize self.cage[bin][0].append(p) self.cage[bin][1].append(value) def p2bin (self, p ): """Return the bin index for a position. """ return p/self.binsize def p2cage (self, p): """Return the bin containing the position. """ return self.cage[p/self.binsize] def __pp2cages (self, p1, p2): assert p1<=p2 bin1 = self.p2bin(p1) bin2 = self.p2bin(p2)+1 t = [array(BYTE4,[]),array(FBYTE4,[])] for i in xrange(bin1,bin2): t[0].extend(self.cage[i][0]) t[1].extend(self.cage[i][1]) return t def pp2p (self, p1, p2): """Give the position list between two given positions. Parameters: p1 : start position p2 : end position Return Value: list of positions between p1 and p2. """ (ps,vs) = self.__pp2cages(p1,p2) p1_in_cages = bisect_left(ps,p1) p2_in_cages = bisect_right(ps,p2) return ps[p1_in_cages:p2_in_cages] def pp2v (self, p1, p2): """Give the value list between two given positions. Parameters: p1 : start position p2 : end position Return Value: list of values whose positions are between p1 and p2. """ (ps,vs) = self.__pp2cages(p1,p2) p1_in_cages = bisect_left(ps,p1) p2_in_cages = bisect_right(ps,p2) return vs[p1_in_cages:p2_in_cages] def pp2pv (self, p1, p2): """Give the (position,value) list between two given positions. Parameters: p1 : start position p2 : end position Return Value: list of (position,value) between p1 and p2. """ (ps,vs) = self.__pp2cages(p1,p2) p1_in_cages = bisect_left(ps,p1) p2_in_cages = bisect_right(ps,p2) return zip(ps[p1_in_cages:p2_in_cages],vs[p1_in_cages:p2_in_cages]) class BinKeeperII: """BinKeeperII keeps non-overlapping interval data from a chromosome in a bin list. This is especially designed for bedGraph type data. """ def __init__ (self,binsize=8000,chromosomesize=1e9): """Initializer. Parameters: binsize : size of bin in Basepair chromosomesize : size of chromosome, default is 1G """ self.binsize = binsize self.binnumber = int(chromosomesize/self.binsize)+1 self.cage = [] a = self.cage.append for i in xrange(self.binnumber): a([array(BYTE4,[]),array(BYTE4,[]),array(FBYTE4,[])]) def add ( self, startp, endp, value ): """Add an interval data into BinKeeper. Note: position must be sorted before adding. Otherwise, pp2v and pp2p will not work. """ startbin = startp/self.binsize endbin = endp/self.binsize if startbin == endbin: # some intervals may only be within a bin j = bisect.bisect_left(self.cage[startbin][0],startp) self.cage[startbin][0].insert(j,startp) self.cage[startbin][1].insert(j,endp) self.cage[startbin][2].insert(j,value) else: # some intervals may cover the end of bins # first bin j = bisect.bisect_left(self.cage[startbin][0],startp) self.cage[startbin][0].insert(j,startp) self.cage[startbin][1].insert(j,(startbin+1)*self.binsize) self.cage[startbin][2].insert(j,value) # other bins fully covered for i in xrange(startbin+1,endbin): p = i*self.binsize j = bisect.bisect_left(self.cage[startbin][0],p) self.cage[startbin][0].insert(j,p) self.cage[startbin][1].insert(j,(i+1)*self.binsize) self.cage[startbin][2].insert(j,value) insort_right(self.cage[i][0],i*self.binsize) insort_right(self.cage[i][1],(i+1)*self.binsize) insort_right(self.cage[i][2],value) # last bin -- the start of this bin should be covered insort_right(self.cage[endbin][0],endbin*self.binsize) insort_right(self.cage[endbin][1],endp) insort_right(self.cage[endbin][2],value) def p2bin (self, p ): """Given a position, return the bin index for a position. """ return p/self.binsize def p2cage (self, p): """Given a position, return the bin containing the position. 
""" return self.cage[p/self.binsize] def pp2cages (self, p1, p2): """Given an interval, return the bins containing this interval. """ assert p1<=p2 bin1 = self.p2bin(p1) bin2 = self.p2bin(p2) t = [array(BYTE4,[]),array(BYTE4,[]),array(FBYTE4,[])] for i in xrange(bin1,bin2+1): t[0].extend(self.cage[i][0]) t[1].extend(self.cage[i][1]) t[2].extend(self.cage[i][2]) return t def pp2intervals (self, p1, p2): """Given an interval, return the intervals list between two given positions. Parameters: p1 : start position p2 : end position Return Value: A list of intervals start and end positions (tuple) between p1 and p2. * Remember, I assume all intervals saved in this BinKeeperII are not overlapping, so if there is some overlap, this function will not work as expected. """ (startposs,endposs,vs) = self.pp2cages(p1,p2) p1_in_cages = bisect_left(startposs,p1) p2_in_cages = bisect_right(endposs,p2) output_startpos_list = startposs[p1_in_cages:p2_in_cages] output_endpos_list = endposs[p1_in_cages:p2_in_cages] # check if the bin (p1_in_cages-1) covers p1 if p1 < endposs[p1_in_cages-1]: # add this interval output_startpos_list = array(BYTE4,[p1,])+output_startpos_list output_endpos_list = array(BYTE4,[endposs[p1_in_cages-1],])+output_endpos_list # check if the bin (p2_in_cages+1) covers p2 if p2 > startposs[p2_in_cages+1]: # add this interval output_startpos_list = array(BYTE4,[startposs[p2_in_cages+1],])+output_startpos_list output_endpos_list = array(BYTE4,[p2,])+output_endpos_list return zip(output_startpos_list,output_endpos_list) def pp2pvs (self, p1, p2): """Given an interval, return the values list between two given positions. Parameters: p1 : start position p2 : end position Return Value: A list of start, end positions, values (tuple) between p1 and p2. Each value represents the value in an interval. Remember the interval length and positions are lost in the output. * Remember, I assume all intervals saved in this BinKeeperII are not overlapping, so if there is some overlap, this function will not work as expected. 
""" (startposs,endposs,vs) = self.pp2cages(p1,p2) p1_in_cages = bisect_left(startposs,p1) p2_in_cages = bisect_right(endposs,p2) output_startpos_list = startposs[p1_in_cages:p2_in_cages] output_endpos_list = endposs[p1_in_cages:p2_in_cages] output_value_list = vs[p1_in_cages:p2_in_cages] # print p1_in_cages,p2_in_cages # print vs print output_startpos_list print output_endpos_list print output_value_list # check if the bin (p1_in_cages-1) covers p1 if p1_in_cages-1 >= 0 and p1 < self.cage[p1_in_cages-1][1]: # add this interval output_startpos_list = array(BYTE4,[p1,])+output_startpos_list output_endpos_list = array(BYTE4,[self.cage[p1_in_cages-1][1],])+output_endpos_list output_value_list = array(BYTE4,[self.cage[p1_in_cages-1][2],])+output_value_list # check if the bin (p2_in_cages+1) covers p2 #print p2_in_cages+1,len(self.cage) #print p2, self.cage[p2_in_cages+1][0] if p2_in_cages+1 < len(self.cage) and p2 > self.cage[p2_in_cages+1][0]: # add this interval output_startpos_list = output_startpos_list+array(BYTE4,[self.cage[p2_in_cages+1][0],]) output_endpos_list = output_endpos_list+array(BYTE4,[p2,]) output_value_list = output_value_list+array(BYTE4,[self.cage[p2_in_cages+1][2],]) print output_startpos_list print output_endpos_list print output_value_list return zip(output_startpos_list,output_endpos_list,output_value_list) MACS-2.0.9/MACS2/OptValidator.py0000644000175000017500000004015011654316302016651 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-11-02 14:21:20 Tao Liu> """Module Description Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import sys import os import re import logging from subprocess import Popen, PIPE from math import log from MACS2.IO.cParser import BEDParser, ELANDResultParser, ELANDMultiParser, ELANDExportParser, SAMParser, BAMParser, BowtieParser, guess_parser # ------------------------------------ # constants # ------------------------------------ efgsize = {"hs":2.7e9, "mm":1.87e9, "ce":9e7, "dm":1.2e8} # ------------------------------------ # Misc functions # ------------------------------------ def opt_validate ( optparser ): """Validate options from a OptParser object. Ret: Validated options object. 
""" (options,args) = optparser.parse_args() # gsize try: options.gsize = efgsize[options.gsize] except: try: options.gsize = float(options.gsize) except: logging.error("Error when interpreting --gsize option: %s" % options.gsize) logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys())) sys.exit(1) # treatment file if not options.tfile: # only required argument optparser.print_help() sys.exit(1) # format options.gzip_flag = False # if the input is gzip file options.format = options.format.upper() if options.format == "ELAND": options.parser = ELANDResultParser elif options.format == "BED": options.parser = BEDParser elif options.format == "ELANDMULTI": options.parser = ELANDMultiParser elif options.format == "ELANDEXPORT": options.parser = ELANDExportParser elif options.format == "SAM": options.parser = SAMParser elif options.format == "BAM": options.parser = BAMParser options.gzip_flag = True elif options.format == "BOWTIE": options.parser = BowtieParser elif options.format == "AUTO": options.parser = guess_parser else: logging.error("Format \"%s\" cannot be recognized!" % (options.format)) sys.exit(1) # duplicate reads if options.keepduplicates != "auto" and options.keepduplicates != "all": if not options.keepduplicates.isdigit(): logging.error("--keep-dup should be 'auto', 'all' or an integer!") sys.exit(1) # shiftsize>0 if options.shiftsize <=0 : logging.error("--shiftsize must > 0!") sys.exit(1) if options.pvalue: # if set, ignore qvalue cutoff options.log_qvalue = None options.log_pvalue = log(options.pvalue,10)*-1 else: options.log_qvalue = log(options.qvalue,10)*-1 options.log_pvalue = None if options.broad: options.log_broadcutoff = log(options.broadcutoff,10)*-1 # uppercase the format string options.format = options.format.upper() # upper and lower mfold try: (options.lmfold,options.umfold) = map(int, options.mfold.split(",")) except: logging.error("mfold format error! Your input is '%s'. It should be like '10,30'." 
% options.mfold) sys.exit(1) # output filenames options.peakxls = options.name+"_peaks.xls" options.peakbed = options.name+"_peaks.bed" options.peakNarrowPeak = options.name+"_peaks.encodePeak" options.peakBroadPeak = options.name+"_broad_peaks.bed" options.summitbed = options.name+"_summits.bed" options.zwig_tr = options.name+"_treat" options.zwig_ctl= options.name+"_control" #options.negxls = options.name+"_negative_peaks.xls" #options.diagxls = options.name+"_diag.xls" options.modelR = options.name+"_model.r" options.pqtable = options.name+"_pq_table.txt" # logging object logging.basicConfig(level=(4-options.verbose)*10, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) options.error = logging.critical # function alias options.warn = logging.warning options.debug = logging.debug options.info = logging.info options.argtxt = "\n".join(( "# ARGUMENTS LIST:",\ "# name = %s" % (options.name),\ "# format = %s" % (options.format),\ "# ChIP-seq file = %s" % (options.tfile),\ "# control file = %s" % (options.cfile),\ "# effective genome size = %.2e" % (options.gsize),\ #"# tag size = %d" % (options.tsize),\ "# band width = %d" % (options.bw),\ "# model fold = %s\n" % (options.mfold),\ )) if options.pvalue: options.argtxt += "# pvalue cutoff = %.2e\n" % (options.pvalue) else: options.argtxt += "# qvalue cutoff = %.2e\n" % (options.qvalue) if options.downsample: options.argtxt += "# Larger dataset will be randomly sampled towards smaller dataset.\n" else: if options.tolarge: options.argtxt += "# Smaller dataset will be scaled towards larger dataset.\n" else: options.argtxt += "# Larger dataset will be scaled towards smaller dataset.\n" if options.cfile: options.argtxt += "# Range for calculating regional lambda is: %d bps and %d bps\n" % (options.smalllocal,options.largelocal) else: options.argtxt += "# Range for calculating regional lambda is: %d bps\n" % (options.largelocal) if options.broad: options.argtxt += "# Broad region calling is on\n" else: options.argtxt += "# Broad region calling is off\n" if options.halfext: options.argtxt += "# MACS will make 1/2d size fragments\n" return options def opt_validate_diff ( optparser ): """Validate options from a OptParser object. This parser is for macsdiffrun. Ret: Validated options object. """ (options,args) = optparser.parse_args() # gsize try: options.gsize = efgsize[options.gsize] except: try: options.gsize = float(options.gsize) except: logging.error("Error when interpreting --gsize option: %s" % options.gsize) logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys())) sys.exit(1) # treatment file if not options.tfile1 or not options.tfile2: # only required argument logging.error("--t1 and --t2 are required!") optparser.print_help() sys.exit(1) # control file if not options.cfile1 and not options.cfile2: logging.error("At least, either --c1 or --c2 should be set!") optparser.print_help() sys.exit(1) if not options.cfile1 and options.cfile2: options.cfile1 = options.cfile2 elif options.cfile1 and not options.cfile2: options.cfile2 = options.cfile1 # Check file assessibility. 
flag = True for fn in (options.tfile1, options.tfile2, options.cfile1, options.cfile2): if os.path.isfile(fn): pass else: logging.error("Can't access file: %s" % fn) flag = False if not flag: sys.exit(1) # format options.gzip_flag = False # if the input is gzip file options.format = options.format.upper() if options.format == "ELAND": options.parser = ELANDResultParser elif options.format == "BED": options.parser = BEDParser elif options.format == "ELANDMULTI": options.parser = ELANDMultiParser elif options.format == "ELANDEXPORT": options.parser = ELANDExportParser elif options.format == "SAM": options.parser = SAMParser elif options.format == "BAM": options.parser = BAMParser options.gzip_flag = True elif options.format == "BOWTIE": options.parser = BowtieParser elif options.format == "AUTO": options.parser = guess_parser else: logging.error("Format \"%s\" cannot be recognized!" % (options.format)) sys.exit(1) # duplicate reads if options.keepduplicates != "auto" and options.keepduplicates != "all": if not options.keepduplicates.isdigit(): logging.error("--keep-dup should be 'auto', 'all' or an integer!") sys.exit(1) # shiftsize>0 if options.shiftsize <=0 : logging.error("--shiftsize must > 0!") sys.exit(1) if options.pvalue: # if set, ignore qvalue cutoff options.log_qvalue = None options.log_pvalue = log(options.pvalue,10)*-1 else: options.log_qvalue = log(options.qvalue,10)*-1 options.log_pvalue = None # uppercase the format string options.format = options.format.upper() # upper and lower mfold try: (options.lmfold,options.umfold) = map(int, options.mfold.split(",")) except: logging.error("mfold format error! Your input is '%s'. It should be like '10,30'." % options.mfold) sys.exit(1) # output filenames options.condition1_peakbed = options.name+"_condition1_unique_peaks.bed" options.condition2_peakbed = options.name+"_condition2_unique_peaks.bed" options.consistent_peakbed = options.name+"_consistent_peaks.bed" options.zbdg_tr = options.name+"_treat" options.zbdg_ctl= options.name+"_control" # logging object logging.basicConfig(level=(4-options.verbose)*10, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) options.error = logging.critical # function alias options.warn = logging.warning options.debug = logging.debug options.info = logging.info options.argtxt = "\n".join(( "# ARGUMENTS LIST:",\ "# name = %s" % (options.name),\ "# format = %s" % (options.format),\ "# ChIP-seq file for condition 1 = %s" % (options.tfile1),\ "# ChIP-seq file for condition 2 = %s" % (options.tfile2),\ "# control file for condition 1 = %s" % (options.cfile1),\ "# control file for condition 2 = %s" % (options.cfile2),\ "# effective genome size = %.2e" % (options.gsize),\ "# band width = %d" % (options.bw),\ "# model fold = %s\n" % (options.mfold),\ )) if options.pvalue: options.argtxt += "# pvalue cutoff = %.2e\n" % (options.pvalue) else: options.argtxt += "# qvalue cutoff = %.2e\n" % (options.qvalue) # if options.tolarge: # options.argtxt += "# Smaller dataset will be scaled towards larger dataset.\n" # else: # options.argtxt += "# Larger dataset will be scaled towards smaller dataset.\n" if options.cfile1 or options.cfile2: options.argtxt += "# Range for calculating regional lambda is: %d bps and %d bps\n" % (options.smalllocal,options.largelocal) else: options.argtxt += "# Range for calculating regional lambda is: %d bps\n" % (options.largelocal) return options def opt_validate_filterdup ( optparser ): """Validate options from a 
OptParser object. Ret: Validated options object. """ (options,args) = optparser.parse_args() # gsize try: options.gsize = efgsize[options.gsize] except: try: options.gsize = float(options.gsize) except: logging.error("Error when interpreting --gsize option: %s" % options.gsize) logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys())) sys.exit(1) # treatment file if not options.tfile: # only required argument optparser.print_help() sys.exit(1) # format options.gzip_flag = False # if the input is gzip file options.format = options.format.upper() if options.format == "ELAND": options.parser = ELANDResultParser elif options.format == "BED": options.parser = BEDParser elif options.format == "ELANDMULTI": options.parser = ELANDMultiParser elif options.format == "ELANDEXPORT": options.parser = ELANDExportParser elif options.format == "SAM": options.parser = SAMParser elif options.format == "BAM": options.parser = BAMParser options.gzip_flag = True elif options.format == "BOWTIE": options.parser = BowtieParser elif options.format == "AUTO": options.parser = guess_parser else: logging.error("Format \"%s\" cannot be recognized!" % (options.format)) sys.exit(1) # duplicate reads if options.keepduplicates != "auto" and options.keepduplicates != "all": if not options.keepduplicates.isdigit(): logging.error("--keep-dup should be 'auto', 'all' or an integer!") sys.exit(1) # uppercase the format string options.format = options.format.upper() # logging object logging.basicConfig(level=(4-options.verbose)*10, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) options.error = logging.critical # function alias options.warn = logging.warning options.debug = logging.debug options.info = logging.info return options def opt_validate_randsample ( optparser ): """Validate options from a OptParser object. Ret: Validated options object. """ (options,args) = optparser.parse_args() # treatment file if not options.tfile: # only required argument optparser.print_help() sys.exit(1) # format options.gzip_flag = False # if the input is gzip file options.format = options.format.upper() if options.format == "ELAND": options.parser = ELANDResultParser elif options.format == "BED": options.parser = BEDParser elif options.format == "ELANDMULTI": options.parser = ELANDMultiParser elif options.format == "ELANDEXPORT": options.parser = ELANDExportParser elif options.format == "SAM": options.parser = SAMParser elif options.format == "BAM": options.parser = BAMParser options.gzip_flag = True elif options.format == "BOWTIE": options.parser = BowtieParser elif options.format == "AUTO": options.parser = guess_parser else: logging.error("Format \"%s\" cannot be recognized!" % (options.format)) sys.exit(1) # uppercase the format string options.format = options.format.upper() # percentage or number if options.percentage and options.number: logging.error("Can't specify -p and -n at the same time! Please check your options and retry!") sys.exit(1) else: if options.percentage: if options.percentage > 100.0: logging.error("Percentage can't be bigger than 100.0. Please check your options and retry!") sys.exit(1) elif options.number: if options.number <= 0: logging.error("Number of tags can't be smaller than or equal to 0. 
Please check your options and retry!") sys.exit(1) # logging object logging.basicConfig(level=(4-options.verbose)*10, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) options.error = logging.critical # function alias options.warn = logging.warning options.debug = logging.debug options.info = logging.info return options MACS-2.0.9/MACS2/Constants.py0000644000175000017500000000146611654316302016224 0ustar taoliutaoliu00000000000000MACS_VERSION = "2.0.9 20111102 (tag:alpha)" MACSDIFF_VERSION = "1.0.2 20111021 (tag:alpha)" FILTERDUP_VERSION = "1.0.0 20110906" RANDSAMPLE_VERSION = "1.0.0 20111102" MAX_PAIRNUM = 1000 MAX_LAMBDA = 100000 FESTEP = 20 from array import array if array('h',[1]).itemsize == 2: BYTE2 = 'h' else: raise Exception("BYTE2 type cannot be determined!") if array('H',[1]).itemsize == 2: UBYTE2 = 'H' else: raise Exception("UBYTE2 (unsigned short) type cannot be determined!") if array('i',[1]).itemsize == 4: BYTE4 = 'i' elif array('l',[1]).itemsize == 4: BYTE4 = 'l' else: raise Exception("BYTE4 type cannot be determined!") if array('f',[1]).itemsize == 4: FBYTE4 = 'f' elif array('d',[1]).itemsize == 4: FBYTE4 = 'd' else: raise Exception("FBYTE4 type cannot be determined!") MACS-2.0.9/MACS2/cPeakDetect.pyx0000644000175000017500000006725511654316302016624 0ustar taoliutaoliu00000000000000# Time-stamp: <2011-10-20 23:50:30 Tao Liu> """Module Description Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ import os from array import array from copy import deepcopy import gc # use garbage collectior from MACS2.IO.cPeakIO import PeakIO from MACS2.Constants import * from MACS2.cPileup import pileup_bdg from libc.math cimport log10 def compare_treatment_vs_control ( treat, control, fragment_size, gsize, halfext=False, slocal=0, llocal=0, tocontrol=False, shiftcontrol=False ): """To compare treatment vs control tags tracks with tag extension ,local poisson test, and Benjamini-Hochberg adjustment. Return scoreTrackI object. While calculating pvalue: First, t and c will be adjusted by the ratio between total reads in treatment and total reads in control, depending on --to-control option. Then, t and c will be multiplied by the smallest peak size -- self.d. Next, a poisson CDF is applied to calculate one-side pvalue for enrichment. Finally, BH process will be applied to adjust pvalue to qvalue. """ treat_total = treat.total control_total = control.total ratio_treat2control = float(treat_total)/control_total # Now pileup FWTrackII to form a bedGraphTrackI treat_btrack = pileup_bdg(treat,fragment_size,halfextension=halfext) if tocontrol: # if user want to scale everything to control data lambda_bg = float(fragment_size)*treat_total/gsize/ratio_treat2control treat_btrack.apply_func(lambda x:float(x)/ratio_treat2control) else: lambda_bg = float(fragment_size)*treat_total/gsize # control data needs multiple steps of calculation # I need to shift them by 500 bps, then 5000 bps if slocal: assert fragment_size <= slocal, "slocal can't be smaller than d!" if llocal: assert fragment_size <= llocal , "llocal can't be smaller than d!" assert slocal <= llocal , "llocal can't be smaller than slocal!" 
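# Worked example for the lambda_bg formula above (numbers are
# illustrative only): with fragment_size=200, treat_total=2e7 tags and
# gsize=2.7e9 ("hs"), lambda_bg = 200.0*2e7/2.7e9, i.e. about 1.48
# expected tags in any d-sized window under the background model.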
# d-size local # Now pileup FWTrackII to form a bedGraphTrackI c_tmp_btrack = pileup_bdg(control, fragment_size, directional=shiftcontrol, halfextension=halfext) if not tocontrol: # if user want to scale everything to ChIP data tmp_v = ratio_treat2control else: tmp_v = 1 c_tmp_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack = c_tmp_btrack # slocal size local if slocal: # Now pileup FWTrackII to form a bedGraphTrackI c_tmp_btrack = pileup_bdg(control, slocal, directional=shiftcontrol, halfextension=halfext) if not tocontrol: # if user want to scale everything to ChIP data tmp_v = float(fragment_size)/slocal*ratio_treat2control else: tmp_v = float(fragment_size)/slocal c_tmp_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack = control_btrack.overlie(c_tmp_btrack,func=max) # llocal size local if llocal and llocal > slocal: # Now pileup FWTrackII to form a bedGraphTrackI c_tmp_btrack = pileup_bdg(control, llocal, directional=shiftcontrol, halfextension=halfext) if not tocontrol: # if user want to scale everything to ChIP data tmp_v = float(fragment_size)/llocal*ratio_treat2control else: tmp_v = float(fragment_size)/llocal c_tmp_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack = control_btrack.overlie(c_tmp_btrack,func=max) control_btrack.reset_baseline(lambda_bg) # set the baseline as lambda_bg # calculate pvalue scores score_btrack = treat_btrack.make_scoreTrack_for_macs(control_btrack) treat_btrack = None # clean them control_btrack = None gc.collect() # full collect garbage # calculate and assign qvalues pqtable = score_btrack.make_pq_table() score_btrack.assign_qvalue( pqtable ) return score_btrack class PeakDetect: """Class to do the peak calling. e.g: >>> from MACS2.cPeakDetect import cPeakDetect >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, gsize=3000000000) >>> pd.call_peaks() """ def __init__ (self,opt = None,treat = None, control = None, d = None, slocal = None, llocal = None, shiftcontrol = None): """Initialize the PeakDetect object. """ self.opt = opt self.info = opt.info self.debug = opt.debug self.warn = opt.warn self.treat = treat self.control = control self.ratio_treat2control = None self.peaks = None self.final_peaks = None #self.femax = opt.femax #self.femin = opt.femin #self.festep = opt.festep self.log_pvalue = opt.log_pvalue # -log10pvalue self.log_qvalue = opt.log_qvalue # -log10qvalue if d != None: self.d = d else: self.d = self.opt.d self.shift_size = self.d/2 self.gsize = opt.gsize self.nolambda = opt.nolambda if slocal != None: self.sregion = slocal else: self.sregion = opt.smalllocal if llocal != None: self.lregion = llocal else: self.lregion = opt.largelocal if shiftcontrol != None: self.shiftcontrol = shiftcontrol else: self.shiftcontrol = opt.shiftcontrol if (self.nolambda): self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!") #self.diag = opt.diag #self.save_score = opt.store_score self.zwig_tr = opt.zwig_tr self.zwig_ctl= opt.zwig_ctl def call_peaks (self): """Call peaks function. Scan the whole genome for peaks. RESULTS WILL BE SAVED IN self.final_peaks and self.final_negative_peaks. """ if self.control: # w/ control if self.opt.broad: (self.peaks,self.broadpeaks) = self.__call_peaks_w_control() else: self.peaks = self.__call_peaks_w_control () else: # w/o control if self.opt.broad: (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control() else: self.peaks = self.__call_peaks_wo_control () return self.peaks # def diag_result (self): # """Run the diagnosis process on sequencing saturation. 
# """ # if not self.diag: # return None # if self.control: # w/ control # return self.__diag_w_control() # else: # w/o control # return self.__diag_wo_control() def toxls (self): """Save the peak results in a tab-delimited plain text file with suffix .xls. """ text = "" if self.peaks: text += "\t".join(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)"))+"\n" #text += "\t".join(("chr","start", "end", "length", "abs_summit", "-log10(pvalue)","-log10(qvalue)"))+"\n" else: return None peaks = self.peaks.peaks chrs = peaks.keys() chrs.sort() for chrom in chrs: for peak in peaks[chrom]: #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] text += "%s\t%d\t%d\t%d" % (chrom,peak["start"]+1,peak["end"],peak["length"]) text += "\t%d" % (peak["summit"]+1) # summit position text += "\t%.2f" % (peak["pileup"]) # pileup height at summit text += "\t%.2f" % (peak["pscore"]) # -log10pvalue at summit text += "\t%.2f" % (peak["fc"]) # fold change at summit text += "\t%.2f" % (peak["qscore"]) # -log10qvalue at summit text+= "\n" return text def __call_peaks_w_control (self): """To call peaks with control data. A peak info type is a: dictionary key value: chromosome items: (peak start,peak end, peak length, peak summit, peak height, number of tags in peak region, peak pvalue, peak fold_enrichment) <-- tuple type While calculating pvalue: First, t and c will be adjusted by the ratio between total reads in treatment and total reads in control, depending on --to-small option. Then, t and c will be multiplied by the smallest peak size -- self.d. Finally, a poisson CDF is applied to calculate one-side pvalue for enrichment. """ treat_total = self.treat.total control_total = self.control.total self.ratio_treat2control = float(treat_total)/control_total # Now pileup FWTrackII to form a bedGraphTrackI self.info("#3 pileup treatment data by extending tags towards 3' to %d length" % self.d) treat_btrack = pileup_bdg(self.treat,self.d,halfextension=self.opt.halfext) if self.opt.tocontrol: # if user want to scale everything to control data lambda_bg = float(self.d)*treat_total/self.gsize/self.ratio_treat2control treat_btrack.apply_func(lambda x:float(x)/self.ratio_treat2control) else: lambda_bg = float(self.d)*treat_total/self.gsize # control data needs multiple steps of calculation # I need to shift them by 500 bps, then 5000 bps if self.sregion: assert self.d <= self.sregion, "slocal can't be smaller than d!" if self.lregion: assert self.d <= self.lregion , "llocal can't be smaller than d!" assert self.sregion <= self.lregion , "llocal can't be smaller than slocal!" 
# d-size local self.info("#3 calculate d local lambda for control data") # Now pileup FWTrackII to form a bedGraphTrackI c_tmp_btrack = pileup_bdg(self.control,self.d,directional=self.shiftcontrol,halfextension=self.opt.halfext) if not self.opt.tocontrol: # if user want to scale everything to ChIP data tmp_v = self.ratio_treat2control else: tmp_v = 1 c_tmp_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack = c_tmp_btrack # slocal size local if self.sregion: self.info("#3 calculate small local lambda for control data") # Now pileup FWTrackII to form a bedGraphTrackI c_tmp_btrack = pileup_bdg(self.control,self.sregion,directional=self.shiftcontrol,halfextension=self.opt.halfext) if not self.opt.tocontrol: # if user want to scale everything to ChIP data tmp_v = float(self.d)/self.sregion*self.ratio_treat2control else: tmp_v = float(self.d)/self.sregion c_tmp_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack = control_btrack.overlie(c_tmp_btrack,func=max) # llocal size local if self.lregion and self.lregion > self.sregion: self.info("#3 calculate large local lambda for control data") # Now pileup FWTrackII to form a bedGraphTrackI c_tmp_btrack = pileup_bdg(self.control,self.lregion,directional=self.shiftcontrol,halfextension=self.opt.halfext) if not self.opt.tocontrol: # if user want to scale everything to ChIP data tmp_v = float(self.d)/self.lregion*self.ratio_treat2control else: tmp_v = float(self.d)/self.lregion c_tmp_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack = control_btrack.overlie(c_tmp_btrack,func=max) control_btrack.reset_baseline(lambda_bg) # set the baseline as lambda_bg # calculate pvalue scores self.info("#3 Build score track ...") score_btrack = treat_btrack.make_scoreTrack_for_macs(control_btrack) treat_btrack = None # clean them control_btrack = None gc.collect() # full collect garbage self.info("#3 Calculate qvalues ...") pqtable = score_btrack.make_pq_table() self.info("#3 Saving p-value to q-value table ...") pqfhd = open(self.opt.pqtable,"w") pqfhd.write( "-log10pvalue\t-log10qvalue\trank\tbasepairs\n" ) for p in sorted(pqtable.keys(),reverse=True): q = pqtable[p] pqfhd.write("%.2f\t%.2f\t%d\t%d\n" % (p/100.0,q[0]/100.0,q[1],q[2])) pqfhd.close() self.info("#3 Assign qvalues ...") score_btrack.assign_qvalue( pqtable ) # call peaks if self.log_pvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.2f, %.2f..." % (self.log_pvalue,self.opt.log_broadcutoff) ) peaks = score_btrack.call_broadpeaks(lvl1_cutoff=self.log_pvalue*100,lvl2_cutoff=self.opt.log_broadcutoff*100,min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4,colname='-100logp') else: self.info("#3 Call peaks with given -log10pvalue cutoff: %.2f ..." % self.log_pvalue) peaks = score_btrack.call_peaks(cutoff=self.log_pvalue*100,min_length=self.d,max_gap=self.opt.tsize,colname='-100logp') elif self.log_qvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) ) peaks = score_btrack.call_broadpeaks(lvl1_cutoff=self.log_qvalue*100,lvl2_cutoff=self.opt.log_broadcutoff*100,min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4,colname='-100logq') else: self.info("#3 Call peaks with given -log10qvalue cutoff: %.2f ..." 
% self.log_qvalue) peaks = score_btrack.call_peaks(cutoff=self.log_qvalue*100,min_length=self.d,max_gap=self.opt.tsize,colname='-100logq') if self.opt.store_bdg: self.info("#3 save tag pileup into bedGraph file...") bdgfhd = open(self.zwig_tr + "_pileup.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_tr, "Fragment pileup at each bp from MACS version %s" % MACS_VERSION, "sample" ) self.info("#3 save local lambda into bedGraph file...") bdgfhd = open(self.zwig_ctl + "_lambda.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_ctl, "Maximum local lambda at each bp from MACS version %s" % MACS_VERSION, "control" ) self.info("#3 save the -log10pvalue score track into bedGraph file...") bdgfhd = open(self.zwig_tr + "_pvalue.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_tr+"_-log10pvalue", "-log10 pvalue scores at each bp from MACS version %s" % MACS_VERSION, "-100logp") self.info("#3 save the -log10qvalue score track into bedGraph file...") bdgfhd = open(self.zwig_tr + "_qvalue.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_tr+"_-log10qvalue", "-log10 qvalue scores at each bp from MACS version %s" % MACS_VERSION, "-100logq") return peaks def __call_peaks_wo_control (self): """To call peaks without control data. A peak info type is a: dictionary key value: chromosome items: (peak start,peak end, peak length, peak summit, peak height, number of tags in peak region, peak pvalue, peak fold_enrichment) <-- tuple type While calculating pvalue: First, t and c will be adjusted by the ratio between total reads in treatment and total reads in control, depending on --to-small option. Then, t and c will be multiplied by the smallest peak size -- self.d. Finally, a poisson CDF is applied to calculate one-side pvalue for enrichment. """ # global lambda treat_total = self.treat.total lambda_bg = float(self.d)*treat_total/self.gsize # Now pileup FWTrackII to form a bedGraphTrackI self.info("#3 pileup treatment data by extending tags towards 3' to %d length" % self.d) treat_btrack = pileup_bdg(self.treat,self.d,halfextension=self.opt.halfext) # llocal size local self.info("#3 calculate d local lambda from treatment data") if self.lregion: self.info("#3 calculate large local lambda from treatment data") # Now pileup FWTrackII to form a bedGraphTrackI control_btrack = pileup_bdg(self.treat,self.lregion,directional=self.shiftcontrol,halfextension=self.opt.halfext) tmp_v = float(self.d)/self.lregion control_btrack.apply_func(lambda x:float(x)*tmp_v) control_btrack.reset_baseline(lambda_bg) # set the baseline as lambda_bg else: # I need to fake a control_btrack control_btrack = treat_btrack.set_single_value(lambda_bg) # calculate pvalue scores self.info("#3 Build score track ...") score_btrack = treat_btrack.make_scoreTrack_for_macs(control_btrack) treat_btrack = None # clean them control_btrack = None gc.collect() # full collect garbage self.info("#3 Calculate qvalues ...") pqtable = score_btrack.make_pq_table() #self.info("#3 Saving p-value to q-value table ...") #pqfhd = open(self.opt.pqtable,"w") #pqfhd.write( "-log10pvalue\t-log10qvalue\trank\tbasepairs\n" ) #for p in sorted(pqtable.keys(),reverse=True): # q = pqtable[p] # pqfhd.write("%.2f\t%.2f\t%d\t%d\n" % (p/100.0,q[0]/100.0,q[1],q[2])) #pqfhd.close() self.info("#3 Assign qvalues ...") score_btrack.assign_qvalue( pqtable ) # call peaks if self.log_pvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.2f, %.2f..." 
% (self.log_pvalue,self.opt.log_broadcutoff) ) peaks = score_btrack.call_broadpeaks(lvl1_cutoff=self.log_pvalue*100,lvl2_cutoff=self.opt.log_broadcutoff*100,min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4,colname='-100logp') else: self.info("#3 Call peaks with given -log10pvalue cutoff: %.2f ..." % self.log_pvalue) peaks = score_btrack.call_peaks(cutoff=self.log_pvalue*100,min_length=self.d,max_gap=self.opt.tsize,colname='-100logp') elif self.log_qvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %.2f, %.2f..." % (self.log_qvalue,self.opt.log_broadcutoff) ) peaks = score_btrack.call_broadpeaks(lvl1_cutoff=self.log_qvalue*100,lvl2_cutoff=self.opt.log_broadcutoff*100,min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4,colname='-100logq') else: self.info("#3 Call peaks with given -log10qvalue cutoff: %.2f ..." % self.log_qvalue) peaks = score_btrack.call_peaks(cutoff=self.log_qvalue*100,min_length=self.d,max_gap=self.opt.tsize,colname='-100logq') if self.opt.store_bdg: self.info("#3 save tag pileup into bedGraph file...") bdgfhd = open(self.zwig_tr + "_pileup.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_tr, "Fragment pileup at each bp from MACS version %s" % MACS_VERSION, "sample" ) if self.lregion: self.info("#3 save local lambda into bedGraph file...") bdgfhd = open(self.zwig_ctl + "_lambda.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_ctl, "Maximum local lambda at each bp from MACS version %s" % MACS_VERSION, "control" ) self.info("#3 save the -log10pvalue score track into bedGraph file...") bdgfhd = open(self.zwig_tr + "_pvalue.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_tr+"_-log10pvalue", "-log10 pvalue scores at each bp from MACS version %s" % MACS_VERSION, "-100logp") self.info("#3 save the -log10qvalue score track into bedGraph file...") bdgfhd = open(self.zwig_tr + "_qvalue.bdg", "w") score_btrack.write_bedGraph( bdgfhd, self.zwig_tr+"_-log10qvalue", "-log10 qvalue scores at each bp from MACS version %s" % MACS_VERSION, "-100logq") return peaks # def __diag_w_control (self): # # sample # sample_peaks = {} # for i in xrange(90,10,-10): # self.info("#3 diag: sample %d%%" % i) # sample_peaks[i]=self.__diag_peakfinding_w_control_sample(float(i)/(i+10)) # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10) # def __diag_peakfinding_w_control_sample (self, percent): # self.treat.sample(percent) # because sampling is after # # shifting, track.total is used # # now. # self.control.sample(percent) # ratio_treat2control = float(self.treat.total)/self.control.total # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed... 
# #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1 # self.debug("#3 diag: after shift and merging, treat: %d, control: %d" % (self.treat.total,self.control.total)) # self.info("#3 diag: call peak candidates") # peak_candidates = self.__call_peaks_from_trackI (self.treat) # self.info("#3 diag: call negative peak candidates") # negative_peak_candidates = self.__call_peaks_from_trackI (self.control) # self.info("#3 diag: use control data to filter peak candidates...") # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.control, ratio_treat2control) # return final_peaks_percent # def __diag_wo_control (self): # # sample # sample_peaks = {} # for i in xrange(90,10,-10): # self.info("#3 diag: sample %d%%" % i) # sample_peaks[i]=self.__diag_peakfinding_wo_control_sample(float(i)/(i+10)) # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10) # def __diag_peakfinding_wo_control_sample (self, percent): # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed... # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1 # self.treat.sample(percent) # self.debug("#3 diag: after shift and merging, tags: %d" % (self.treat.total)) # self.info("#3 diag: call peak candidates") # peak_candidates = self.__call_peaks_from_trackI (self.treat) # self.info("#3 diag: use self to calculate local lambda and filter peak candidates...") # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.treat,pass_sregion=True) # bug fixed... # return final_peaks_percent # def __overlap (self, gold_peaks, sample_peaks, top=90,bottom=10,step=-10): # """Calculate the overlap between several fe range for the # golden peaks set and results from sampled data. # """ # gp = PeakIO() # gp.init_from_dict(gold_peaks) # if self.femax: # femax = min(self.femax, (int(gp.max_fold_enrichment())//self.festep+1)*self.festep) # else: # femax = (int(gp.max_fold_enrichment())//self.festep+1)*self.festep # femin = self.femin # diag_result = [] # for f in xrange(femin, femax, self.festep): # fe_low = f # fe_up = f + self.festep # self.debug("#3 diag: fe range = %d -- %d" % (fe_low, fe_up)) # r = self.__overlap_fe(gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step) # if r: # diag_result.append(r) # return diag_result # def __overlap_fe (self, gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step): # ret = ["%d-%d" % (fe_low,fe_up)] # gp = PeakIO() # gp.init_from_dict(gold_peaks) # gp.filter_fc(fe_low,fe_up) # gptotal = gp.total() # if gptotal <= 0: # return None # ret.append(gptotal) # for i in xrange(top,bottom,step): # p = PeakIO() # p.init_from_dict(sample_peaks[i]) # percent = 100.0*gp.overlap_with_other_peaks(p)/gptotal # ret.append(percent) # del p # return ret # def __remove_overlapping_peaks (self, peaks ): # """peak_candidates[chrom] = [(peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags)...] 
# """ # new_peaks = {} # chrs = peaks.keys() # chrs.sort() # for chrom in chrs: # new_peaks[chrom]=[] # n_append = new_peaks[chrom].append # prev_peak = None # peaks_chr = peaks[chrom] # for i in xrange(len(peaks_chr)): # if not prev_peak: # prev_peak = peaks_chr[i] # continue # else: # if peaks_chr[i][0] <= prev_peak[1]: # s_new_peak = prev_peak[0] # e_new_peak = peaks_chr[i][1] # l_new_peak = e_new_peak-s_new_peak # if peaks_chr[i][4] > prev_peak[4]: # summit_new_peak = peaks_chr[i][3] # h_new_peak = peaks_chr[i][4] # else: # summit_new_peak = prev_peak[3] # h_new_peak = prev_peak[4] # prev_peak = (s_new_peak,e_new_peak,l_new_peak,summit_new_peak,h_new_peak,peaks_chr[i][5]+prev_peak[5]) # else: # n_append(prev_peak) # prev_peak = peaks_chr[i] # if prev_peak: # n_append(prev_peak) # return new_peaks MACS-2.0.9/INSTALL0000644000175000017500000000671411654316302014123 0ustar taoliutaoliu00000000000000INSTALL Guide For MACS Time-stamp: <2011-11-02 15:22:30 Tao Liu> Please check the following instructions to complete your installation. * Prerequisite Python version must be equal to 2.6 or 2.7 to run MACS. We recommend using the version 2.6.5. Cython (>=0.14.1) is required to run MACS v2. * Install under Debian or Ubuntu Linux system The most convenient way to install MACS is through Debian APT system, so that it can be perfectly integrated in the Python environment of your operation system. You can easily manage the package, and the uninstall is much easier. Download the deb package from MACS download page, and type this in the commend line: $ dpkg -i macs_2.0.9.deb To uninstall, type: $ dpkg -r macs_2.0.9 This is tested only in Ubuntu 10.10 and 10.04 LTS. * Install from source MACS uses Python's distutils tools for source installations. To install a source distribution of MACS, unpack the distribution tarball and open up a command terminal. Go to the directory where you unpacked MACS, and simply run the install script : $ python setup.py install By default, the script will install python library and executable codes globally, which means you need to be root or administrator of the machine so as to complete the installation. Please contact the administrator of that machine if you want their help. If you need to provide a nonstandard install prefix, or any other nonstandard options, you can provide many command line options to the install script. Use the –help option to see a brief list of available options: $ python setup.py --help For example, if I want to install everything under my own HOME directory, use this command: $ python setup.py install --prefix /home/taoliu/ * Configure enviroment variables After running the setup script, you might need to add the install location to your PYTHONPATH and PATH environment variables. The process for doing this varies on each platform, but the general concept is the same across platforms. ** PYTHONPATH To set up your PYTHONPATH environment variable, you'll need to add the value PREFIX/lib/pythonX.Y/site-packages to your existing PYTHONPATH. In this value, X.Y stands for the major–minor version of Python you are using (such as 2.6 or 2.7 ; you can find this with sys.version[:3] from a Python command line). PREFIX is the install prefix where you installed MACS. If you did not specify a prefix on the command line, MACS will be installed using Python's sys.prefix value. 
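For example, you can print this major-minor version from the shell (the output below assumes Python 2.6; yours may differ):

$ python -c "import sys; print sys.version[:3]"
2.6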
On Linux, using bash, I include the new value in my PYTHONPATH by adding this line to my ~/.bashrc : $ export PYTHONPATH=/home/taoliu/lib/python2.6/site-packages:$PYTHONPATH Using Windows, you need to open up the system properties dialog, and locate the tab labeled Environment. Add your value to the PYTHONPATH variable, or create a new PYTHONPATH variable if there isn't one already. ** PATH Just like your PYTHONPATH, you'll also need to add a new value to your PATH environment variable so that you can use the MACS command line directly. Unlike the PYTHONPATH value, however, this time you'll need to add PREFIX/bin to your PATH environment variable. The process for updating this is the same as described above for the PYTHONPATH variable. $ export PATH=/home/taoliu/bin:$PATH * About PeakSplitter Please go to it's main site to download. * About GSL package From MACS version 1.2, I have removed GSL -- GNU Scientific Library, from the distribution. -- Tao Liu MACS-2.0.9/PKG-INFO0000644000175000017500000000123011657057615014167 0ustar taoliutaoliu00000000000000Metadata-Version: 1.0 Name: MACS Version: 2.0.9 Summary: Model Based Analysis for ChIP-Seq data Home-page: http://liulab.dfci.harvard.edu/MACS/ Author: Yong Zhang; Tao (Foo) Liu Author-email: zy@jimmy.harvard.edu; taoliu@jimmy.harvard.edu License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN Classifier: Development Status :: 4 - experimental Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: Artistic License Classifier: Operating System :: MacOS :: MacOS X Classifier: Operating System :: Microsoft :: Windows Classifier: Operating System :: POSIX Classifier: Programming Language :: Python MACS-2.0.9/README0000644000175000017500000006600011654316302013744 0ustar taoliutaoliu00000000000000README for MACS (2.0.9) Time-stamp: <2011-11-02 15:21:38 Tao Liu> * Introduction With the improvement of sequencing techniques, chromatin immunoprecipitation followed by high throughput sequencing (ChIP-Seq) is getting popular to study genome-wide protein-DNA interactions. To address the lack of powerful ChIP-Seq analysis method, we present a novel algorithm, named Model-based Analysis of ChIP-Seq (MACS), for identifying transcript factor binding sites. MACS captures the influence of genome complexity to evaluate the significance of enriched ChIP regions, and MACS improves the spatial resolution of binding sites through combining the information of both sequencing tag position and orientation. MACS can be easily used for ChIP-Seq data alone, or with control sample with the increase of specificity. * Install Please check the file 'INSTALL' in the distribution. * Usage of macs2 Usage: macs2 <-t tfile> [-n name] [-g genomesize] [options] Example: macs2 -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01 or example for broad peak calling: macs2 -t ChIP.bam -c Control.bam --broad -g hs macs2 -- Model-based Analysis for ChIP-Sequencing Options: --version show program's version number and exit -h, --help show this help message and exit. -t TFILE, --treatment=TFILE ChIP-seq treatment file. REQUIRED. -c CFILE, --control=CFILE Control file. -n NAME, --name=NAME Experiment name, which will be used to generate output file names. DEFAULT: "NA" -f FORMAT, --format=FORMAT Format of tag file, "AUTO", "BED" or "ELAND" or "ELANDMULTI" or "ELANDEXPORT" or "SAM" or "BAM" or "BOWTIE". The default AUTO option will let MACS decide which format the file is. 
Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: "AUTO" -g GSIZE, --gsize=GSIZE Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts: 'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default: hs -s TSIZE, --tsize=TSIZE Tag size. This will override the auto detected tag size. DEFAULT: Not set --bw=BW Band width. This value is only used while building the shifting model. DEFAULT: 300 -q QVALUE, --qvalue=QVALUE Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.01 -p PVALUE, --pvalue=PVALUE Pvalue cutoff for peak detection. When set (e.g. -p 0.05 or -p 1e-5), the qvalue cutoff will be ignored. Default is not set. -m MFOLD, --mfold=MFOLD Select the regions within MFOLD range of high- confidence enrichment ratio against background to build model. The regions must be lower than the upper limit, and higher than the lower limit. DEFAULT: 10,30 --nolambda If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to potential chromatin structure. --slocal=SMALLLOCAL The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 1000 --llocal=LARGELOCAL The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surrounding bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 10000. --auto-bimodal Whether to turn on the auto paired-peak model process. If set, when MACS fails to build a paired model, it will use the nomodel settings, the '--shiftsize' parameter, to shift and extend each tag. Not using this automatic fixation is now the default behavior. DEFAULT: False --nomodel Whether or not to build the shifting model. If True, MACS will not build the model. By default it means shifting size = 100; try to set shiftsize to change it. DEFAULT: False --shiftsize=SHIFTSIZE The arbitrary shift size in bp. When nomodel is true, MACS will use this value as 1/2 of the fragment size. DEFAULT: 100 --keep-dup=KEEPDUPLICATES It controls the MACS behavior towards duplicate tags at the exact same location -- the same coordinate and the same strand. The default 'auto' option makes MACS calculate the maximum tags at the exact same location based on the binomial distribution using 1e-5 as the pvalue cutoff; and the 'all' option keeps every tag. If an integer is given, at most this number of tags will be kept at the same location. Default: auto --to-large When set, scale the smaller sample up to the bigger sample. By default, the bigger dataset will be scaled down towards the smaller dataset, which will lead to smaller p/qvalues and more specific results. Keep in mind that scaling down will bring down background noise more. DEFAULT: False --down-sample When set, a random sampling method will scale down the bigger sample. By default, MACS uses linear scaling.
Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. DEFAULT: False --shift-control When set, control tags will be shifted just as ChIP tags according to their strand before the extension of d, slocal and llocal. By default, control tags are extended centered at their current positions regardless of strand. You may consider to turn this option on while comparing two ChIP datasets of different condition but the same factor. DEFAULT: False --half-ext When set, MACS extends 1/2 d size for each fragment centered at its middle point. DEFAULT: False -B, --bdg Whether or not to save extended fragment pileup, local lambda and score tracks at every bp into a bedGraph file. DEFAULT: False --broad If set, MACS will try to call broad peaks by linking nearby highly enriched regions. The linking region is controlled by another cutoff through --linking-cutoff. The maximum linking region length is 4 times of d from MACS. DEFAULT: False --broad-cutoff=BROADCUTOFF Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. DEFAULT: 0.1 --verbose=VERBOSE Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2 ** Parameters: *** -t/--treatment FILENAME This is the only REQUIRED parameter for MACS. *** -c/--control The control or mock data file in either BED format or any ELAND output format specified by --format option. Please follow the same direction as for -t/--treatment. *** -n/--name The name string of the experiment. MACS will use this string NAME to create output files like 'NAME_peaks.xls', 'NAME_negative_peaks.xls', 'NAME_peaks.bed' , 'NAME_summits.bed', 'NAME_model.r' and so on. So please avoid any confliction between these filenames and your existing files. *** -f/--format FORMAT Format of tag file, can be "ELAND", "BED", "ELANDMULTI", "ELANDEXPORT", "ELANDMULTIPET" (for pair-end tags), "SAM", "BAM" or "BOWTIE". Default is "AUTO" which will allow MACS to decide the format automatically. Please use "AUTO" only when you combine different formats of files. The BED format is defined in "http://genome.ucsc.edu/FAQ/FAQformat#format1". If the format is ELAND, the file must be ELAND result output file, each line MUST represents only ONE tag, with fields of: 1. Sequence name (derived from file name and line number if format is not Fasta) 2. Sequence 3. Type of match: NM - no match found. QC - no matching done: QC failure (too many Ns basically). RM - no matching done: repeat masked (may be seen if repeatFile.txt was specified). U0 - Best match found was a unique exact match. U1 - Best match found was a unique 1-error match. U2 - Best match found was a unique 2-error match. R0 - Multiple exact matches found. R1 - Multiple 1-error matches found, no exact matches. R2 - Multiple 2-error matches found, no exact or 1-error matches. 4. Number of exact matches found. 5. Number of 1-error matches found. 6. Number of 2-error matches found. Rest of fields are only seen if a unique best match was found (i.e. the match code in field 3 begins with "U"). 7. Genome file in which match was found. 8. Position of match (bases in file are numbered starting at 1). 9. Direction of match (F=forward strand, R=reverse). 10. How N characters in read were interpreted: ("."=not applicable, "D"=deletion, "I"=insertion). 
Rest of fields are only seen in the case of a unique inexact match (i.e. the match code was U1 or U2). 11. Position and type of first substitution error (e.g. 12A: base 12 was A, not whatever it was in the read). 12. Position and type of second substitution error, as above. If the format is ELANDMULTI, the file must be an ELAND output file from multiple-match mode, and each line MUST represent only ONE tag, with fields of: 1. Sequence name 2. Sequence 3. Either NM, QC, RM (as described above) or the following: 4. x:y:z where x, y, and z are the number of exact, single-error, and 2-error matches found 5. Blank, if no matches found or if too many matches found, or the following: BAC_plus_vector.fa:163022R1,170128F2,E_coli.fa:3909847R1 This says there are two matches to BAC_plus_vector.fa: one in the reverse direction starting at position 163022 with one error, one in the forward direction starting at position 170128 with two errors. There is also a single-error match to E_coli.fa. If the data is from pair-end sequencing, you can specify the format as ELANDMULTIPET (stands for ELAND Multiple-match Pair-End Tags); then the --treat (and --control if needed) parameter must be two file names separated by a comma. Each file must be in the ELAND multiple-match format described above. e.g. macs2 --format ELANDMULTIPET -t s_1_1_eland_multi.txt,s_2_1_eland_multi.txt ... If you use ELANDMULTIPET, you may need to modify the --petdist parameter. If the format is BAM/SAM, please check the definition in (http://samtools.sourceforge.net/samtools.shtml). Pair-end mapping results can be saved in a single BAM file; if so, MACS will automatically keep the left mate (5' end) tag. If the format is BOWTIE, you need to provide the ASCII bowtie output file with the suffix '.map'. Please note that you need to make sure that in the bowtie output, you only keep one location for one read. Check the bowtie manual at (http://bowtie-bio.sourceforge.net/manual.shtml) for details. Here is the definition for Bowtie output in ASCII characters I copied from the above webpage: 1. Name of read that aligned 2. Orientation of read in the alignment, - for reverse complement, + otherwise 3. Name of reference sequence where alignment occurs, or ordinal ID if no name was provided 4. 0-based offset into the forward reference strand where leftmost character of the alignment occurs 5. Read sequence (reverse-complemented if orientation is -) 6. ASCII-encoded read qualities (reversed if orientation is -). The encoded quality values are on the Phred scale and the encoding is ASCII-offset by 33 (ASCII char !). 7. Number of other instances where the same read aligns against the same reference characters as were aligned against in this alignment. This is not the number of other places the read aligns with the same number of mismatches. The number in this column is generally not a good proxy for that number (e.g., the number in this column may be '0' while the number of other alignments with the same number of mismatches might be large). This column was previously described as "Reserved". 8. Comma-separated list of mismatch descriptors. If there are no mismatches in the alignment, this field is empty. A single descriptor has the format offset:reference-base>read-base. The offset is expressed as a 0-based offset from the high-quality (5') end of the read. Notes: 1) For BED format, the 6th column of strand information is required by MACS.
And please pay attention that the coordinates in BED format are zero-based and half-open (http://genome.ucsc.edu/FAQ/FAQtracks#tracks1). 2) For plain ELAND format, only matches with match type U0, U1 or U2 are accepted by MACS, i.e. only the unique match for a sequence with fewer than 3 errors is involved in the calculation. If multiple hits of a single tag are included in your raw ELAND file, please remove the redundancy to keep the best hit for that sequencing tag. 3) For experiments with several replicates, it is recommended to concatenate the ChIP-seq treatment files into a single file. To do this, under Unix/Mac or Cygwin (for Windows OS), type: $ cat replicate1.bed replicate2.bed replicate3.bed > all_replicates.bed For BAM or SAM files, samtools can be used to combine replicates. 4) ELAND export format support sometimes may not work on your datasets, because people may mislabel the 11th and 12th columns. MACS uses the 11th column as the sequence name, which should be the chromosome name. ** --petdist=PETDIST Best distance between Pair-End Tags. Only available when the format is 'ELANDMULTIPET'. Default is 200bps. When MACS reads mapped positions for a 5' tag and a 3' tag, it will decide the best pairing for them using this best distance parameter. A simple scoring system is used as follows: score = abs(abs(p5-p3)-200)+e5+e3 where p5 is one of the mapped positions of the 5' tag, and e5 is the mismatch/error count for that mapped position of the 5' tag; p3 and e3 are the same for the 3' tag. Then the lowest-scored pairing is regarded as the best pairing. The 5' tag position of the pair is kept in model building and peak calling. *** -g/--gsize PLEASE assign this parameter to fit your needs! It's the mappable genome size or effective genome size, which is defined as the genome size which can be sequenced. Because of the repetitive features on the chromosomes, the actual mappable genome size will be smaller than the original size, about 90% or 70% of the genome size. The default hs -- 2.7e9 is recommended for the UCSC human hg18 assembly. Here are all precompiled parameters for effective genome size: -g hs = -g 2.7e9 -g mm = -g 1.87e9 -g ce = -g 9e7 -g dm = -g 1.2e8 *** -s/--tsize The size of sequencing tags. If you don't specify it, MACS will try to use the first 10 sequences from your input treatment file to determine the tag size. Specifying it will override the automatically determined tag size. *** --bw The band width which is used to scan the genome ONLY for model building. You can set this parameter as the sonication fragment size expected from the wet experiment. The previous side effect on the peak detection process has been removed, so this parameter only affects the model building. *** -q/--qvalue The qvalue (minimum FDR) cutoff to call significant regions. Default is 0.01. For broad marks, you can try 0.05 as the cutoff. Q-values are calculated from p-values using the Benjamini-Hochberg procedure. *** -p/--pvalue The pvalue cutoff. If -p is specified, MACS2 will use the pvalue instead of the qvalue. *** -m/--mfold This parameter is used to select the regions within MFOLD range of high-confidence enrichment ratio against background to build the model. The regions must be lower than the upper limit, and higher than the lower limit of fold enrichment. DEFAULT: 10,30 means using all regions not too low (>10) and not too high (<30) to build the paired-peaks model. If MACS cannot find more than 100 regions to build the model, it will use the --shiftsize parameter to continue the peak detection. Check the related *--off-auto* and *--shiftsize* options for detail.
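As an illustration only (these cutoffs are hypothetical and should be tuned for your own data), if the default 10,30 range is too strict and MACS cannot find enough paired peaks to build the model, you can loosen the range on the command line:

$ macs2 -t ChIP.bam -c Control.bam -f BAM -g hs -n test -m 5,50

This asks MACS to build the paired-peak model from regions with fold enrichment between 5 and 50 against the background.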
** --nolambda With this flag on, MACS will use the background lambda as local lambda. This means MACS will not consider the local bias at peak candidate regions. ** --slocal, --llocal These two parameters control which two levels of regions will be checked around the peak regions to calculate the maximum lambda as local lambda. By default, MACS considers 1000bp for small local region(--slocal), and 10000bps for large local region(--llocal) which captures the bias from a long range effect like an open chromatin domain. You can tweak these according to your project. Remember that if the region is set too small, a sharp spike in the input data may kill the significant peak. ** --off-auto Whether turn off the auto paired-peak model process. If not set, when MACS failed to build paired model, it will use the nomodel settings, the '--shiftsize' parameter to shift and extend each tags. If set, MACS will be terminated if paried-peak model is failed. ** --nomodel While on, MACS will bypass building the shifting model. ** --shiftsize While '--nomodel' is set, MACS uses this parameter to shift tags to their midpoint. For example, if the size of binding region for your transcription factor is 200 bp, and you want to bypass the model building by MACS, this parameter can be set as 100. This option is only valid when --nomodel is set or when MACS fails to build paired-peak model. ** --keep-dup It controls the MACS behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The default 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Default: auto ** --broad When this flag is on, MACS will try to composite broad regions in BED12 ( a gene-model-like format ) by putting nearby highly enriched regions into a broad region with loose cutoff. The broad region is controlled by another cutoff through --broad-cutoff. The maximum length of broad region length is 4 times of d from MACS. DEFAULT: False ** --broad-cutoff Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. DEFAULT: 0.1 ** --to-large When set, linearly scale the smaller dataset to the same depth as larger dataset, by default, the smaller dataset will be scaled towards the larger dataset. Beware, to scale up small data would cause more false positives. ** --down-sample When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. This option will make the results unstable and irreproducible since each time, random reads would be selected, especially the numbers (pileup, pvalue, qvalue) would change. Consider to use 'randsample' script before MACS2 runs instead. ** -B/--bdg If this flag is on, MACS will store the fragment pileup, control lambda, -log10pvalue and -log10qvalue scores in bedGraph files. 
The bedGraph files will be stored in current directory named NAME+'_treat_pileup.bdg' for treatment data, NAME+'_control_lambda.bdg' for local lambda values from control, NAME+'_treat_pvalue.bdg' for Poisson pvalue scores (in -log10(pvalue) form), and NAME+'_treat_qvalue.bdg' for q-value scores from Benjamini–Hochberg–Yekutieli procedure ** --half-ext (experimental option) When this flag is on, MACS will only extend each tag with 1/2 d (predicted ChIP fragment size) instead of full d. ** -w/--wig is obsolete. ** -S/--single-profile is obsolete. ** --space=SPACE is obsolete since we don't generate wiggle file. ** --call-subpeaks is currently not functional. If set, MACS will invoke Mali Salmon's PeakSplitter software through system call. If PeakSplitter can't be found, an instruction will be shown for downloading and installing the PeakSplitter package. The PeakSplitter can refine the MACS peaks and split the wide peaks into smaller subpeaks. For more information, please check the following URL: http://www.ebi.ac.uk/bertone/software/PeakSplitter_Cpp_usage.txt Note this option doesn't work if -B/--bdg is on. *** --verbose If you don't want to see any message during the running of MACS, set it to 0. But the CRITICAL messages will never be hidden. If you want to see rich information like how many peaks are called for every chromosome, you can set it to 3 or larger than 3. ** --diag is currently not functional. A diagnosis report can be generated through this option. This report can help you get an assumption about the sequencing saturation. This funtion is only in beta stage. ** --fe-min, --fe-max & --fe-step are currently not functional. For diagnostics, FEMIN and FEMAX are the minimum and maximum fold enrichment to consider, and FESTEP is the interval of fold enrichment. For example, "--fe-min 0 --fe-max 40 --fe-step 10" will let MACS choose the following fold enrichment ranges to consider: [0,10), [10,20), [20,30) and [30,40). * Output files 1. NAME_peaks.xls is a tabular file which contains information about called peaks. You can open it in excel and sort/filter using excel functions. Information include: chromosome name, start position of peak, end position of peak, length of peak region, absolute peak summit position, pileup height at peak summit, -log10(pvalue) for the peak summit (e.g. pvalue =1e-10, then this value should be 10), fold enrichment for this peak summit against random Poisson distribution with local lambda, -log10(qvalue) at peak summit. Coordinates in XLS is 1-based which is different with BED format. 2. NAME_peaks.bed is BED format file which contains the peak locations. You can load it to UCSC genome browser or Affymetrix IGB software. The file can be loaded directly to UCSC genome browser. Remove the beginning track line if you want to analyze it by other tools. 3. NAME_peaks.encodePeak is BED6+4 format file which contains the peak locations together with peak summit, pvalue and qvalue. You can load it to UCSC genome browser. Definition of some specific columns are: 5th: -log10pvalue*10, 7th: fold-change, 8th: -log10pvalue, 9th: -log10qvalue, 10th: relative summit position to peak start. The file can be loaded directly to UCSC genome browser. Remove the beginning track line if you want to analyze it by other tools. 4. NAME_summits.bed is in BED format, which contains the peak summits locations for every peaks. The 5th column in this file is -log10pvalue the same as NAME_peaks.bed. If you want to find the motifs at the binding sites, this file is recommended. 
The file can be loaded directly to UCSC genome browser. Remove the beginning track line if you want to analyze it by other tools. 5. NAME_broad_peaks.bed is in BED12 format which contains both the broad region and narrow peaks. The 5th column is 100*-log10pvalue, to be more compatible to UCSC standard. Tht 7th is the start of the first narrow peak in the region, and the 8th column is the end. The 9th column should be RGB color key, however, we keep 0 here to use the default color, so change it if you want. The 10th column tells how many blocks including the starting 1bp and ending 1bp of broad regions. The 11th column shows the length of each blocks, and 12th for the starts of each blocks. The file can be loaded directly to UCSC genome browser. Remove the beginning track line if you want to analyze it by other tools. 6. NAME_model.r is an R script which you can use to produce a PDF image about the model based on your data. Load it to R by: $ R --vanilla < NAME_model.r Then a pdf file NAME_model.pdf will be generated in your current directory. Note, R is required to draw this figure. 7. The .bdg files are in bedGraph format which can be imported to UCSC genome browser or be converted into even smaller bigWig files. Four kinds of bdg files include treat_pileup, control_lambda, treat_pvalue, and treat_qvalue. 8. NAME_pqtable.txt store the -log10pvalue, -log10qvalue, rank of this pvalue, and number of bp having this pvalue. * Usage of macs2diff Will update it later... * Other useful links Cistrome web server for ChIP-chip/seq analysis: http://cistrome.org/ap/ bedTools -- a super useful toolkits for genome annotation files: http://code.google.com/p/bedtools/ UCSC toolkits: http://hgdownload.cse.ucsc.edu/admin/exe/ * Tips of fine-tuning peak calling Check the three scripts within MACSv2 package: 1. bdgcmp can be used on *_treat_pileup.bdg and *_control_lambda.bdg or bedGraph files from other resources to calculate score track; 2. bdgpeakcall can be used on *_treat_pvalue.bdg or the file generated from bdgcmp or bedGraph file from other resources to call peaks with given cutoff, maximum-gap between nearby mergable peaks and minimum length of peak. bdgbroadcall works similarly to bdgpeakcall, however it will output _broad_peaks.bed in BED12 format. 3. Differential calling tool -- bdgdiff, can be used on 4 bedgraph files which are scores between treatment 1 and control 1, treatment 2 and control 2, treatment 1 and treatment 2, treatment 2 and treatment 1. It will output the consistent and unique sites according to parameter settings for minimum length, maximum gap and cutoff.MACS-2.0.9/ChangeLog0000644000175000017500000006171511654316302014646 0ustar taoliutaoliu000000000000002011-11-2 Tao Liu MACS version 2.0.9 (tag:alpha) * Auto fixation on predicted d is turned off by default! Previous --off-auto is now default. MACS will not automatically fix d less than 2 times of tag size according to --shiftsize. While tag size is getting longer nowadays, it would be easier to have d less than 2 times of tag size, however d may still be meaningful and useful. Please judge it using your own wisdom. * Scaling issue Now, the default scaling while treatment and input are unbalanced has been adjusted. By default, larger sample will be scaled down linearly to match the smaller sample. In this way, background noise will be reduced more than real signals, so we expect to have more specific results than the other way around (i.e. --to-large is set). 
Also, an alternative option to randomly sample larger data (--down-sample) is provided to replace default linear scaling. However, this option will cause results irresproducible, so be careful. * randsample script A new script 'randsample' is added, which can randomly sample certain percentage or number of tags. * Peak summit Now, MACS will decide peak summits according to pileup height instead of qvalue scores. In this way, the summit may be more accurate. * Diff score MACS calculate qvalue scores as differential scores. When compare two conditions (saying A and B), the maximum qscore for comparing A to B -- maxqscore_a2b, and for comparing B to A --maxqscore_b2a will be computed. If maxqscore_a2b is bigger, the diff score is +maxqscore_a2b, otherwise, diff score is -1*maxqscore_b2a. 2011-09-15 Tao Liu MACS version 2.0.8 (tag:alpha) * bin/macs2, bin/bdgbroadcall, MACS2/IO/cScoreTrack.pyx, MACS2/IO/cBedGraph.pyx New script bdgbroadcall and the extra option '--broad' for macs2 script, can be used to call broad regions with a loose cutoff to link nearby significant regions. The output is represented as BED12 format. * MACS2/IO/cScoreTrack.pyx Fix q-value calculation to generate forcefully monotonic values. * bin/eland*2bed, bin/sam2bed and bin/filterdup They are combined to one more powerful script called "filterdup". The script filterdup can filter duplicated reads according to sequencing depth and genome size. The script can also convert any format supported by MACS to BED format. 2011-08-21 Tao Liu MACS version 2.0.7 (tag:alpha) * bin/macsdiff renamed to bin/bdgdiff Now this script will work as a low-level finetuning tool as bdgcmp and bdgpeakcall. * bin/macs2diff A new script to take treatment and control files from two condition, calculate fragment size, use local poisson to get pvalues and BH process to get qvalues, then combine 4-ways result to call differential sites. This script can use upto 4 cpus to speed up 4-ways calculation. ( I am trying multiprocessing in python. ) * MACS2/Constants.py, MACS2/IO/cBedGraph.pyx, MACS2/IO/cScoreTrack.pyx, MACS2/OptValidator.py, MACS2/PeakModel.py, MACS2/cPeakDetect.pyx All above files are modified for the new macs2diff script. * bin/macs2, bin/macs2diff, MACS2/OptValidator.py Now q-value 0.01 is the default cutoff. If -p is specified, p-value cutoff will be used instead. 2011-07-25 Tao Liu MACS version 2.0.6 (tag:alpha) * bin/macsdiff A script to call differential regions. A naive way is introduced to find the regions where: 1. signal from condition 1 is larger than input 1 and condition 2 -- unique region in condition 1; 2. signal from condition 2 is larger than input 2 and condition 1 -- unique region in condition 2; 3. signal from condition 1 is larger than input 1, signal from condition 2 is larger than input 2, however either signal from condition 1 or 2 is not larger than the other. Here 'larger' means the pvalue or qvalue from a Poisson test is under certain cutoff. (I will make another script to wrap up mulitple scripts for differential calling) 2011-07-07 Tao Liu MACS version 2.0.5 (tag:alpha) * bin/macs2, MACS2/cPeakDetect.py, MACS2/IO/cScoreTrack.pyx, MACS2/IO/cPeakIO.pyx Use hash to store peak information. Add back the feature to deal with data without control. Fix bug which incorrectly allows small peaks at the end of chromosomes. * bin/bdgpeakcall, bin/bdgcmp Fix bugs. bdgpeakcall can output encodePeak format. 
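The diff score rule described in the 2.0.9 entry above can be written down in a few lines of Python. This is only a sketch of the stated rule, with hypothetical names, not the actual macs2diff implementation:

def diff_score(maxqscore_a2b, maxqscore_b2a):
    # Keep the larger of the two maximum q-scores; the sign records the
    # direction: positive means A is enriched over B, negative means
    # B is enriched over A.
    if maxqscore_a2b > maxqscore_b2a:
        return maxqscore_a2b
    else:
        return -1*maxqscore_b2a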
2011-06-22 Tao Liu MACS version 2.0.4 (tag:alpha) * cPeakDetect.py Fix a bug, correctly assign lambda_bg while --to-small is set. Thanks Junya Seo! Add rank and num of bp columns to pvalue-qvalue table. * cScoreTrack.py Fix bugs to correctly deal with peakless chromosomes. Thanks Vaibhav Jain! Use AFDR for independent tests instead. * encodePeak Now MACS can output peak coordinates together with pvalue, qvalue, summit positions in a single encodePeak format (designed for ENCODE project) file. This file can be loaded to UCSC browser. Definition of some specific columns are: 5th: int(-log10pvalue*10), 7th: fold-change, 8th: -log10pvalue, 9th: -log10qvalue, 10th: relative summit position to peak start. 2011-06-19 Tao Liu MACS version 2.0.3 (tag:alpha) * Rich output with qvalue, fold enrichment, and pileup height Calculate q-values using a refined Benjamini–Hochberg–Yekutieli procedure: http://en.wikipedia.org/wiki/False_discovery_rate#Dependent_tests Now we have a similiar xls output file as before. The differences from previous file are: 1. Summit now is absolute summit, instead of relative summit position; 2. 'Pileup' is previous 'tag' column. It's the extended fragment pileup at the peak summit; 3. We now use '-log10(pvalue)' instead of '-10log10(pvalue)', so 5.00 means 1e-5, simple and less confusing. 4. FDR column becomes '-log10(qvalue)' column. 5. The pileup, -log10pvalue, fold_enrichment and -log10qvalue are the values at the peak summit. * Extra output files NAME_pqtable.txt contains pvalue and qvalue relationships. NAME_treat_pvalue.bdg and NAME_treat_qvalue.bdg store -log10pvalue and -log10qvalue scores in BedGraph format. Nearby regions with the same value are not merged. * Separation of FeatIO.py Its content has been divided into cPeakIO.pyx, cBedGraph.pyx, and cFixWidthTrack.pyx. A modified bedGraphTrackI class was implemented to store pileup, local lambda, pvalue, and qvalue alltogether in cScoreTrack.pyx. * Experimental option --half-ext Suggested by NPS algorithm, I added an experimental option --half-ext to let MACS only extends ChIP fragment around its middle point for only 1/2 d. 2011-06-12 Tao Liu MACS version 2.0.2 (tag:alpha) * macs2 Add an error check to see if there is no common chromosome names from treatment file and control file * cPeakDetect.pyx, cFeatIO.pyx, cPileup.pyx Reduce memory usage by removing deepcopy() calls. * Modify README documents and others. 2011-05-19 Tao Liu MACS Version 2.0.1 (tag:alpha) * cPileup.pyx, cPeakDetect.pyx and peak calling process Jie suggested me a brilliant simple method to pileup fragments into bedGraph track. It works extremely faster than the previous function, i.e, faster than MACS1.3 or MACS1.4. So I can include large local lambda calculation in MACSv2 now. Now I generate three bedGraphs for d-size local bias, slocal-size and llocal-size local bias, and calculate the maximum local bias as local lambda bedGraph track. Minor: add_loc in bedGraphTrackI now can correctly merge the region with its preceding region if their value are the same. * macs2 Add an option to shift control tags before extension. By default, control tags will be extended to both sides regardless of strand information. 2011-05-17 Tao Liu MACS Version 2.0.0 (tag:alpha) * Use bedGraph type to store data internally and externally. We can have theoretically one-basepair resolution profiles. 10 times smaller in filesize and even smaller after converting to bigWig for visualization. * Peak calling process modified. Better peak boundary detection. 
Extend ChIP tag to d, and pileup to have a ChIP bedGraph. Extend Control tag to d and 1,000bp, and pileup to two bedGraphs. (1000bp one will be averaged to d size) Then calculate the maximum value of these two tracks and a global background, to have a local-lambda bedGraph. Use -10log10poisson_pvalue as scores to generate a score track before peak calling. A general peak calling based on a score cutoff, min length of peak and max gap between nearby peaks. * Option changes. Wiggle file output is removed. Now we only support bedGraph output. The generation of bedGraph is highly recommended since it will not cost extra time. In other words, bedGraph generation is internally run even you don't want to save bedGraphs on disk, due to the peak calling algorithm in MACS v2. * cProb.pyx We now can calculate poisson pvalue in log space so that the score (-10*log10pvalue) will not have a upper limit of 3100 due to precision of float number. * Cython is adopted to speed up Python code. 2011-02-28 Tao Liu Small fixes * Replaced with a newest WigTrackI class and fixed the wignorm script. 2011-02-21 Tao Liu Version 1.4.0rc2 (Valentine) * --single-wig option is renamed to --single-profile * BedGraph output with --bdg or -B option. The BedGraph output provides 1bp resolution fragment pileup profile. File size is smaller than wig file. This option can be combined with --single-profile option to produce a bedgraph file for the whole genome. This option can also make --space, --call-subpeaks invalid. * Fix the description of --shiftsize to correctly state that the value is 1/2 d (fragment size). * Fix a bug in the call to __filter_w_control_tags when control is not available. * Fix a bug on --to-small option. Now it works as expected. * Fix a bug while counting the tags in candidate peak region, an extra tag may be included. (Thanks to Jake Biesinger!) * Fix the bug for the peaks extended outside of chromosome start. If the minus strand tag goes outside of chromosome start after extension of d, it will be thrown out. * Post-process script for a combined wig file: The "wignorm" command can be called after a full run of MACS14 as a postprocess. wignorm can calculate the local background from the control wig file from MACS14, then use either foldchange, -10*log10(pvalue) from possion test, or difference after asinh transformation as the score to build a single wig track to represent the binding strength. This script will take a significant long time to process. * --wigextend has been obsoleted. 2010-09-21 Tao Liu Version 1.4.0rc1 (Starry Sky) * Duplicate reads option --keep-dup behavior is changed. Now user can specify how many reads he/she wants to keep at the same genomic location. 'auto' to let MACS decide the number based on binomial distribution, 'all' to let MACS keep all reads. * pvalue and FDR fixes (Thanks to Prof. Zhiping Weng) By default, MACS will now scale the smaller dataset to the bigger dataset. For instance, if IP has 10 million reads, and Input has 5 million, MACS will double the lambda value calculated from Input reads while calling BOTH the positive peaks and negative peaks. This will address the issue caused by unbalanced numbers of reads from IP and Input. If --to-small is turned on, MACS will scale the larger dataset to the smaller one. So from now on, if d is fixed, then the peaks from a MACS call for A vs B should be identical to the negative peaks from a B vs A. 
2010-09-01 Tao Liu Version 1.4.0beta (summer wishes) * New features ** Model building The default behavior in the model building step is slightly changed. When MACS can't find enough pairs to build model (implemented in alpha version) or the modeled fragment length is less than 2 times of tag length (implemented in beta version), MACS will use 2 times of --shiftsize value as fragment length in the later analysis. --off-auto can turn off this default behavior. ** Redundant tag filtering The IO module is rewritten. The redundant tag filtering process becomes simpler and works as promise. The maximum allowed number of tags at the exact same location is calculated from the sequencing depth and genome size using a binomial distribution, for both TREAMENT and CONTROL separately. ( previously only TREATMENT is considered ) The exact same location means the same coordination and the same strand. Then MACS will only keep at most this number of tags at the exact same location in the following analysis. An option --keep-dup can let MACS skip the filtering and keep all the tags. However this may bring in a lot of sequencing bias, so you may get many false positive peaks. ** Single wiggle mode First thing to mention, this is not the score track that I described before. By default, MACS generates wiggle files for fragment pileup for every chromosomes separately. When you use --single-wig option, MACS will generate a single wiggle file for all the chromosomes so you will get a wig.gz for TREATMENT and another wig.gz for CONTROL if available. ** Sniff -- automatic format detection Now, by default or "-f AUTO", MACS will decide the input file format automatically. Technically, it will try to read at most 1000 records for the first 10 non-comment lines. If it succeeds, the format is decided. I recommend not to use AUTO and specify the right format for your input files, unless you combine different formats in a single MACS run. * Options changes --single-wig and --keep-dup are added. Check previous section in ChangeLog for detail. -f (--format) AUTO is now the default option. --slocal default: 1000 --llocal default: 10000 * Bug fixed Setup script will stop the installation if python version is not python2.6 or python2.7. Local lambda calculation has been changed back. MACS will check peak_region, slocal( default 1K) and llocal (default 10K) for the local bias. The previous 200bps default will cause MACS misses some peaks where the input bias is very sharp. sam2bed.py script is corrected. Relative pos in xls output is fixed. Parser for ELAND_export is fixed to pass some of the no match lines. And elandexport2bed.py is fixed too. ( however I can't guarantee that it works on any eland_export files. ) 2010-06-04 Tao Liu Version 1.4.0alpha2 (be smarter) * Options changes --gsize now provides shortcuts for common genomes, including human, mouse, C. elegans and fruitfly. --llocal now will be 5000 bps if there is no input file, so that local lambda doesn't overkill enriched binding sites. 2010-06-02 Tao Liu Version 1.4alpha (be smarter) * Options changes --tsize option is redesigned. MACS will use the first 10 lines of the input to decide the tag size. If user specifies --tsize, it will override the auto decided tsize. --lambdaset is replaced by --slocal and --llocal which mean the small local region and large local region. --bw has no effect on the scan-window size now. It only affects the paired-peaks model process. 
* Model building During the model building, MACS will pick out the enriched regions which are not too high and not too low to build the paired-peak model. Default the region is from fold 10 to fold 30. If MACS fails to build the model, by default it will use the nomodel settings, like shiftsize=100bps, to shift and extend each tags. This behavior can be turned off by '--off-auto'. * Output files An extra file including all the summit positions are saved in *_summits.bed file. An option '--call-subpeaks' will invoke PeakSplitter developed by Mali Salmon to split wide peaks into smaller subpeaks. * Sniff ( will in beta ) Automatically recognize the input file format, so use can combine different format in one MACS run. Not implemented features/TODO: * Algorithms ( in near future? ) MACS will try to refine the peak boundaries by calculating the scores for every point in the candidate peak regions. The score will be the -10*log(10,pvalue) on a local poisson distribution. A cutoff specified by users (--pvalue) will be applied to find the precise sub-peaks in the original candidate peak region. Peak boudaries and peak summits positions will be saved in separate BED files. * Single wiggle track ( in near future? ) A single wiggle track will be generated to save the scores within candidate peak regions in the 10bps resolution. The wiggle file is in fixedStep format. 2009-10-16 Tao Liu Version 1.3.7.1 (Oktoberfest, bug fixed #1) * bin/Constants.py Fixed typo. FCSTEP -> FESTEP * lib/PeakDetect.py The 'femax' attribute bug is fixed 2009-10-02 Tao Liu Version 1.3.7 (Oktoberfest) * bin/macs, lib/PeakDetect.py, lib/IO/__init__.py, lib/OptValidator.py Enhancements by Peter Chines: 1. gzip files are supported. 2. when --diag is on, user can set the increment and endpoint for fold enrichment analysis by setting --fe-step and --fe-max. Enhancements by Davide Cittaro: 1. BAM and SAM formats are supported. 2. small changes in the header lines of wiggle output. Enhancements by Me: 1. I added --fe-min option; 2. Bowtie ascii output with suffix ".map" is supported. Bug fixed: 1. --nolambda bug is fixed. ( reported by Martin in JHU ) 2. --diag bug is fixed. ( reported by Bogdan Tanasa ) 3. Function to remove suffix '.fa' is fixed. ( reported by Jeff Johnston ) 4. Some "fold change" have been changed to "fold enrichment". 2009-06-10 Tao Liu Version 1.3.6.1 (default parameter change) * bin/macs, lib/PeakDetect.py "--oldfdr" is removed. The 'oldfdr' behaviour becomes default. "--futurefdr" is added which can turn on the 'new' method introduced in 1.3.6. By default it's off. * lib/PeakDetect.py Fixed a bug. p-value is corrected a little bit. 2009-05-11 Tao Liu Version 1.3.6 (Birthday cake) * bin/macs "track name" is added to the header of BED output file. Now the default peak detection method is to consider 5k and 10k nearby regions in treatment data and peak location, 1k, 5k, and 10k regions in control data to calculate local bias. The old method can be called through '--old' option. Information about how many total/unique tags in treatment or control will be saved in final .xls output. * lib/IO/__init__.py ".fa" will be removed from input tag alignment so only the chromosome names are kept. WigTrackI class is added for Wiggle like data structure. (not used now) The parser for ELAND multi PET files has been fixed. Now the 5' tag position for a pair will be kept, whereas in the previous version, the middle points are kept. 
* lib/IO/BinKeeper.py BinKeeperI class is inspired by Jim Kent's library for UCSC genome browser, which can quickly access certain region for values in a large wiggle like data file. (not used now) * lib/OptValidator.py typo fixed. * lib/PeakDetect.py Now the default peak detection method is to consider 5k and 10k nearby regions in treatment data and peak location, 1k, 5k, and 10k regions in control data to calculate local bias. The old method can be called through '--old' option. Two columns have beed added to BED output file. 4th column: peak name; 5th column: peak score using -10log(10,pvalue) as score. * setup.py Add support to build a Mac App through 'setup.py py2app', or a Windows executable through 'setup.py py2exe'. You need to install py2app or py2exe package in order to use these functions. 2009-02-12 Tao Liu Version 1.3.5 (local lambda fixed, typo fixed, model figure improved) * PeakDetect.py Now, besides 1k, 5k, 10k, MACS will also consider peak size region in control data to calculate local lambda for each peak. Peak calling results will be slightly different with previous version, beware! * OptValidator.py Typo fixed, ELANDParser -> ELANDResultParser * OutputWriter.py Now, modeled d value will be shown on the model figure. 2009-01-06 Tao Liu Version 1.3.4 (Happy New Year Version, bug fixed, ELAND multi/PET support) * macs, IO/__init__.py, PeakDetect.py Add support for ELAND multi format. Add support for Pair-End experiment, in this case, 5'end and 3'end ELAND multi format files are required for treatment or control data. See 00README file for detail. Add wigextend option. Add petdist option for Pair-End Tag experiment, which is the best distance between 5' and 3' tags. * PeakDetect.py Fixed a bug which cause the end positions of every peak region incorrectly added by 1 bp. ( Thanks Mali Salmon!) * OutputWriter.py Fix bugs while generating wiggle files. The start position of wiggle file is set to 1 instead of 0. Fix a bug that every 10M bps, signals in the first 'd' range are lower than actual. ( Thanks Mali Salmon!) 2008-12-03 Tao Liu Version 1.3.3 (wiggle bugs fixed) * OutputWriter.py Fix bugs while generating wiggle files. 1. 'span=' is added to 'variableStep' line; 2. previously, every 10M bps, the coordinates were wrongly shifted to the right for 'd' basepairs. * macs, PeakDetect.py Add an option to save wiggle files on different resolution. 2008-10-02 Tao Liu Version 1.3.2 (tiny bugs fixed) * IO/__init__.py Fix 65536 -> 65535. ( Thank Joon) * Prob.py Improved for binomial function with extra large number. Imported from Cistrome project. * PeakDetect.py If treatment channel misses reads in some chromosome included in control channel, or vice versa, MACS will not exit. (Thank Shaun Mahony) Instead, MACS will fake a tag at position -1 when calling treatment peaks vs control, but will ignore the chromosome while calling negative peaks. 2008-09-04 Tao Liu Version 1.3.1 (tiny bugs fixed version) * Prob.py Hyunjin Gene Shin contributed some codes to Prob.py. Now the binomial functions can tolerate large and small numbers. * IO/__init__.py Parsers now split lines in BED/ELAND file using any whitespaces. 'track' or 'browser' lines will be regarded as comment lines. A bug fixed when throwing StrandFormatError. The maximum redundant tag number at a single position can be no less than 65536. 2008-07-15 Tao Liu Version 1.3 (naming clarification version) * Naming clarification changes according to our manuscript: 'frag_len' is changed to 'd'. 
'fold_change' is changed to 'fold_enrichment'. Suggest '--bw' parameter to be determined by users from the real sonication size. Maximum FDR is 100% in the output file. And other clarifications in 00README file and the documents on the website. * IO/__init__.py If the redundant tag number at a single position is over 32767, just remember 32767, instead of raising an overflow exception. * setup.py fixed a typo. * PeakDetect.py Bug fixed for diagnosis report. 2008-07-10 Tao Liu Version 1.2.2gamma * Serious bugs fix: Poisson distribution CDF and inverse CDF functions are corrected. They can produce right results even for huge lambda now. So that the p-value and FDR values in the final excel sheet are corrected. IO package now can tolerate some rare cases; ELANDParser in IO package is fixed. (Thank Bogdan) * Improvement: Reverse paired peaks in model are rejected. So there will be no negative 'frag_len'. (Thank Bogdan) * Features added: Diagnosis function is completed. Which can output a table file for users to estimate their sequencing depth. 2008-06-30 Tao Liu Version 1.2 * Probe.py is added! GSL is totally removed from MACS. Instead, I have implemented the CDF and inverse CDF for poisson and binomial distribution purely in python. * Constants.py is added! Organize constants used in MACS in the Constants.py file. * All other files are modified! Foldchange calculation is modified. Now the foldchange only be calculated at the peak summit position instead of the whole peak region. The values will be higher and more robust than before. Features added: 1. MACS can save wiggle format files containing the tag number at every 10 bp along the genome. Tags are shifted according to our model before they are calculated. 2. Model building and local lambda calculation can be skipped with certain options. 3. A diagnosis report can be generated through '--diag' option. This report can help you get an assumption about the sequencing saturation. This funtion is only in beta stage. 4. FDR calculation speed is highly improved. 2008-05-28 Tao Liu Version 1.1 * TabIO, PeakModel.py ... Bug fixed to let MACS tolerate some cases while there is no tag on either plus strand or minus strand. * setup.py Check the version of python. If the version is lower than 2.4, refuse to install with warning. MACS-2.0.9/setup.py0000644000175000017500000000536511654316302014605 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-11-02 12:27:16 Tao Liu> """Description Setup script for MACS -- Model Based Analysis for ChIP-Seq data Copyright (c) 2008,2009,2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). 
@status: beta @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ import os import sys from distutils.core import setup, Extension from Cython.Distutils import build_ext def main(): if float(sys.version[:3])<2.6 or float(sys.version[:3])>=2.8: sys.stderr.write("CRITICAL: Python version must be 2.6 or 2.7!\n") sys.exit(1) ext_modules = [Extension("MACS2.cProb", ["MACS2/cProb.pyx"], libraries=["m"]), Extension("MACS2.IO.cParser",["MACS2/IO/cParser.pyx"]), Extension("MACS2.cPileup", ["MACS2/cPileup.pyx"]), Extension("MACS2.cPeakDetect", ["MACS2/cPeakDetect.pyx"]), Extension("MACS2.IO.cPeakIO", ["MACS2/IO/cPeakIO.pyx"],), Extension("MACS2.IO.cFixWidthTrack", ["MACS2/IO/cFixWidthTrack.pyx"],), Extension("MACS2.IO.cBedGraph", ["MACS2/IO/cBedGraph.pyx"], libraries=["m"]), Extension("MACS2.IO.cScoreTrack", ["MACS2/IO/cScoreTrack.pyx"],), Extension("MACS2.IO.cCompositeScoreTrack", ["MACS2/IO/cCompositeScoreTrack.pyx"],), ] setup(name="MACS", version="2.0.9", description="Model Based Analysis for ChIP-Seq data", author='Yong Zhang; Tao (Foo) Liu', author_email='zy@jimmy.harvard.edu; taoliu@jimmy.harvard.edu', url='http://liulab.dfci.harvard.edu/MACS/', package_dir={'MACS2' : 'MACS2'}, packages=['MACS2', 'MACS2.IO'], scripts=['bin/macs2', 'bin/macs2diff', 'bin/filterdup', 'bin/randsample', 'bin/bdgdiff', 'bin/bdgcmp', 'bin/bdgpeakcall', 'bin/bdgbroadcall', ], classifiers=[ 'Development Status :: 4 - experimental', 'Environment :: Console', 'Intended Audience :: Developers', 'License :: OSI Approved :: Artistic License', 'Operating System :: MacOS :: MacOS X', 'Operating System :: Microsoft :: Windows', 'Operating System :: POSIX', 'Programming Language :: Python', ], cmdclass = {'build_ext': build_ext}, ext_modules = ext_modules ) if __name__ == '__main__': main() MACS-2.0.9/bin/0000755000175000017500000000000011657057615013646 5ustar taoliutaoliu00000000000000MACS-2.0.9/bin/randsample0000644000175000017500000001367311654316302015715 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-11-02 14:01:10 Tao Liu> """Description: Random sample certain number/percentage of tags. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: release candidate @version: $Id$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging from subprocess import Popen,PIPE from optparse import OptionParser import gzip # ------------------------------------ # own python modules # ------------------------------------ from MACS2.OptValidator import opt_validate_randsample as opt_validate from MACS2.cProb import binomial_cdf_inv from MACS2.Constants import * # ------------------------------------ # Main function # ------------------------------------ def main(): """The Main function/pipeline for duplication filter. """ # Parse options... options = opt_validate(prepare_optparser()) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error #0 check output file if options.outputfile: assert not os.path.exists(options.outputfile), "%s already exists, please check!" 
% options.outputfile outfhd = open(options.outputfile,"w") else: outfhd = sys.stdout #1 Read tag files info("read tag files...") fwtrack = load_tag_files_options (options) info("tag size = %d" % options.tsize) fwtrack.fw = options.tsize t0 = fwtrack.total info(" total tags in alignment file: %d" % (t0)) if options.number: if options.number > t0: error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!") error(" %.2e > %.2e" % (options.number, t0)) sys.exit(1) info(" Number of tags you want to keep: %.2e" % (options.number)) options.percentage = float(options.number)/t0*100 info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage)) fwtrack.sample_percent(options.percentage/100.0) info(" tags after random sampling in alignment file: %d" % (fwtrack.total)) info("Write to BED file") fwtrack.print_to_bed(fhd=outfhd) info("finished! Check %s." % options.outputfile) def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = "usage: %prog <-t file> [-p percent|-n number] [-o outputfile] [options]" description = "%prog -- Random sample certain number/percentage of tags. Method: 1. Calculate the percentage of tags needed to be kept; 2. For each chromosome, random sample certain percentage of tags." optparser = OptionParser(version="%prog "+RANDSAMPLE_VERSION,description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="show this help message and exit.") optparser.add_option("-t",dest="tfile",type="string", help="Sequencing alignment file. REQUIRED.") optparser.add_option("-p","--percentage",dest="percentage",type="float", help="Percentage of tags you want to keep. Input 80.0 for 80%. This option can't be used at the same time with -n/--num. REQUIRED") optparser.add_option("-n","--number",dest="number",type="float", help="Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED") optparser.add_option("-o",dest="outputfile",type="string", help="Output BED file name. If not specified, will write to standard output. DEFAULT: stdout", default=None) optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None, help="Tag size. This will overide the auto detected tag size. DEFAULT: Not set") optparser.add_option("-f","--format",dest="format",type="string", help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let %prog decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"", default="AUTO") optparser.add_option("--verbose",dest="verbose",type="int",default=2, help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2") return optparser def load_tag_files_options ( options ): """From the options, load alignment tags. 
""" options.info("read alignment tags...") tp = options.parser(open2(options.tfile)) if not options.tsize: # override tsize if user specified --tsize ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() treat.sort() options.info("tag size is determined as %d bps" % options.tsize) return treat def open2(path, mode='r', bufsize=-1): # try gzip first f = gzip.open(path, mode) try: f.read(10) except IOError: # not a gzipped file f.close() f = open(path, mode, bufsize) else: f.seek(0) return f if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) Bye!\n") sys.exit(0) except AssertionError as e: sys.stderr.write(e.message+"\n") sys.exit(0) MACS-2.0.9/bin/bdgpeakcall0000644000175000017500000000705011654316302016010 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-10-13 22:14:51 Tao Liu> """Description: Naive call peaks from a single bedGraph track for scores. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import sys import logging from optparse import OptionParser from MACS2.IO import bedGraphIO # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Classes # ------------------------------------ # ------------------------------------ # Main function # ------------------------------------ def main(): usage = "usage: %prog <-i bedGraph> [-c CUTOFF] [-l MIN] [-g MAX] [-o PREFIX]" description = "Call peaks from MACS pvalue or qscore score bedGraph output, with customized settings. Output encodePeak format peaks, combining peak boundaries, peak summits." optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-i","--ifile",dest="ifile",type="string", help="MACS pvalue score bedGraph") optparser.add_option("-c","--cutoff",dest="cutoff",type="float", help="Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS2, score 5 means pvalue 1e-5. DEFAULT: 5",default=5) optparser.add_option("-l","--min-length",dest="minlen",type="int", help="minimum length of peak, better to set it as d value. DEFAULT: 200",default=200) optparser.add_option("-g","--max-gap",dest="maxgap",type="int", help="maximum gap between significant points in a peak, better to set it as tag size. 
DEFAULT: 30",default=30) optparser.add_option("-o","--o-prefix",dest="oprefix",default="peak", help="output file prefix, DEFAULT: peak") (options,args) = optparser.parse_args() if not options.ifile: optparser.print_help() sys.exit() info("Read and build bedGraph...") bio = bedGraphIO.bedGraphIO(options.ifile) btrack = bio.build_bdgtrack(baseline_value=0) info("Call peaks from bedGraph...") peaks = btrack.call_peaks(cutoff=options.cutoff,min_length=options.minlen,max_gap=options.maxgap) info("Write peaks...") nf = open ("%s_c%.1f_l%d_g%d_peaks.encodePeak" % (options.oprefix,options.cutoff,options.minlen,options.maxgap),"w") peaks.write_to_narrowPeak(nf, name_prefix=options.oprefix+"_encodePeak", score_column="score") info("Done") if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) See you!\n") sys.exit(0) MACS-2.0.9/bin/bdgcmp0000644000175000017500000000717011630217211015007 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-07-08 14:41:19 Tao Liu> import os import sys import logging from optparse import OptionParser from MACS2.IO import bedGraphIO from MACS2.cProb import poisson_cdf # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Main function # ------------------------------------ def main(): usage = "usage: %prog <-t TREATMENT.BEDGRAPH> <-c CONTROL.BEDGRAPH> <-o OUTPUT.BEDGRAPH> [-m METHOD] " description = "Calculate scores using certain method by comparing a bedGraph file from treatment and a file from control representing local bias." optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-t","--tfile",dest="tfile",type="string", help="Required: Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2") optparser.add_option("-c","--cfile",dest="cfile",type="string", help="Required: Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2") optparser.add_option("-o","--output",dest="ofile",type="string", help="Required: The output bedGraph file to write scores.") optparser.add_option("-m","--method",dest="method",type="string", help="Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, substract, divide which represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, substraction from treatment, fold change which may be problematic if there are zero in control. 
Default option is ppois.",default="ppois") (options,args) = optparser.parse_args() if not options.tfile or not options.cfile or not options.ofile: optparser.print_help() sys.exit() available_methods = ['ppois','substract','divide'] if options.method not in available_methods: sys.stderr.write("Method can only be %s\n" % ",".join(available_methods)) sys.exit(1) else: method = options.method info("Read and build treatment bedGraph...") tbio = bedGraphIO.bedGraphIO(options.tfile) tbtrack = tbio.build_bdgtrack() info("Read and build control bedGraph...") cbio = bedGraphIO.bedGraphIO(options.cfile) cbtrack = cbio.build_bdgtrack() info("Calculate scores comparing treatment and control by %s..." % method) # build score track if method == 'ppois': sbtrack = tbtrack.overlie(cbtrack,func=lambda x,y:-1*poisson_cdf(x,y,False,True)) elif method == 'substract': sbtrack = tbtrack.overlie(cbtrack,func=lambda x,y:x-y) elif method == 'divide': sbtrack = tbtrack.overlie(cbtrack,func=lambda x,y:float(x)/y) else: raise Exception("Can't reach here!") info("Write to output bedGraph...") ofhd = open(options.ofile,"w") sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper())) if __name__ == '__main__': main() MACS-2.0.9/bin/filterdup0000644000175000017500000001604111631505063015554 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-09-06 17:01:39 Tao Liu> """Description: Filter duplicate reads depending on sequencing depth. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: release candidate @version: $Id$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging from subprocess import Popen,PIPE from optparse import OptionParser import gzip # ------------------------------------ # own python modules # ------------------------------------ from MACS2.OptValidator import opt_validate_filterdup as opt_validate from MACS2.cProb import binomial_cdf_inv from MACS2.Constants import * # ------------------------------------ # Main function # ------------------------------------ def main(): """The Main function/pipeline for duplication filter. """ # Parse options... options = opt_validate(prepare_optparser()) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error #0 check output file if options.outputfile: assert not os.path.exists(options.outputfile), "%s already exists, please check!"
% options.outputfile outfhd = open(options.outputfile,"w") else: outfhd = sys.stdout #1 Read tag files info("read tag files...") fwtrack = load_tag_files_options (options) info("tag size = %d" % options.tsize) fwtrack.fw = options.tsize t0 = fwtrack.total info(" total tags in alignment file: %d" % (t0)) if options.keepduplicates != "all": if options.keepduplicates == "auto": info("calculate max duplicate tags in single position based on binomial distribution...") max_dup_tags = cal_max_dup_tags(options.gsize,t0) info(" max_dup_tags based on binomial = %d" % (max_dup_tags)) info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags)) else: info("user defined the maximum tags...") max_dup_tags = int(options.keepduplicates) info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags)) fwtrack.filter_dup(max_dup_tags) t1 = fwtrack.total info(" tags after filtering in alignment file: %d" % (t1)) info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0)) info("Write to BED file") fwtrack.print_to_bed(fhd=outfhd) info("finished! Check %s." % options.outputfile) def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = "usage: %prog <-t file> [-o outputfile] [-g genomesize] [options]" description = "%prog -- Filter duplicate reads like in MACS. This script can also be used to convert ELAND result, ELAND multi, ELAND export, SAM, BAM, BOWTIE map formats to BED format." optparser = OptionParser(version="%prog "+FILTERDUP_VERSION,description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="show this help message and exit.") optparser.add_option("-t",dest="tfile",type="string", help="Sequencing alignment file. REQUIRED.") optparser.add_option("-o",dest="outputfile",type="string", help="Output BED file name. If not specified, will write to standard output. DEFAULT: stdout", default=None) optparser.add_option("-f","--format",dest="format",type="string", help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let %prog decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"", default="AUTO") optparser.add_option("-g","--gsize",dest="gsize",type="string",default="hs", help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs") optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None, help="Tag size. This will override the auto-detected tag size. DEFAULT: Not set") optparser.add_option("-p","--pvalue",dest="pvalue",type="float", help="Pvalue cutoff for binomial distribution test. DEFAULT:1e-5") optparser.add_option("--keep-dup",dest="keepduplicates",type="string",default="auto", help="It controls the %prog behavior towards duplicate tags at the exact same location -- the same coordinate and the same strand. The default 'auto' option makes %prog calculate the maximum tags at the exact same location based on binomial distribution using the given -p as the pvalue cutoff; and the 'all' option keeps all tags (useful if you only want to convert formats).
If an integer is given, at most this number of tags will be kept at the same location. Default: auto") optparser.add_option("--verbose",dest="verbose",type="int",default=2, help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2") return optparser def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) def load_tag_files_options ( options ): """From the options, load alignment tags. """ options.info("read alignment tags...") tp = options.parser(open2(options.tfile)) if not options.tsize: # override tsize if user specified --tsize ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() treat.sort() options.info("tag size is determined as %d bps" % options.tsize) return treat def open2(path, mode='r', bufsize=-1): # try gzip first f = gzip.open(path, mode) try: f.read(10) except IOError: # not a gzipped file f.close() f = open(path, mode, bufsize) else: f.seek(0) return f if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) Bye!\n") sys.exit(0) except AssertionError as e: sys.stderr.write(e.message+"\n") sys.exit(0) MACS-2.0.9/bin/bdgdiff0000644000175000017500000001334211630217211015136 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-08-23 12:22:55 Tao Liu> """Description: Naive call differential peaks from 4 bedGraph tracks for scores. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import re import logging from optparse import OptionParser from MACS2.IO import bedGraphIO from MACS2.IO.cCompositeScoreTrack import * # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Classes # ------------------------------------ # ------------------------------------ # Main function # ------------------------------------ def main(): usage = "usage: %prog <--s1 bedGraph> <--s2 bedGraph> <--s12 bedGraph> <--s21 bedGraph> [-c CUTOFF] [-l MIN] [-g MAX] [-o PREFIX]" description = "Call differential regions from four bedGraph files of pair-wise p/qvalue, with customized settings. Please read the descriptions of options carefully." 
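# How the four input tracks combine is sketched below (an illustrative
# example only, not part of the original script -- the real logic lives in
# MACS2.IO.cCompositeScoreTrack's call_diff_regions, whose exact rule is
# not shown here). Assuming each score is a -log10 p/qvalue sampled from
# one of the four tracks over the same region, one plausible reading of
# the three output categories is:
def _classify_region_example(s_t1c1, s_t2c2, s_t1t2, s_t2t1, cutoff=5):
    """Hypothetical helper: label one region from its four scores."""
    if s_t1c1 >= cutoff and s_t2c2 >= cutoff and s_t1t2 < cutoff and s_t2t1 < cutoff:
        return "consistent"          # enriched in both conditions, no difference between them
    elif s_t1c1 >= cutoff and s_t1t2 >= cutoff:
        return "condition1_unique"   # enriched over control 1, and condition 1 beats condition 2
    elif s_t2c2 >= cutoff and s_t2t1 >= cutoff:
        return "condition2_unique"   # enriched over control 2, and condition 2 beats condition 1
    else:
        return "not_significant"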
optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("--s1",dest="st1c1bdg",type="string", help="MACS p/qvalue score bedGraph, condition 1 ChIP treatment against condition 1 control input.") optparser.add_option("--s2",dest="st2c2bdg",type="string", help="MACS p/qvalue score bedGraph, condition 2 ChIP treatment against condition 2 control input.") optparser.add_option("--s12",dest="st1t2bdg",type="string", help="MACS p/qvalue score bedGraph, condition 1 ChIP treatment against condition 2 ChIP treatment.") optparser.add_option("--s21",dest="st2t1bdg",type="string", help="MACS p/qvalue score bedGraph, condition 2 ChIP treatment against condition 1 ChIP treatment.") optparser.add_option("-c","--cutoff",dest="cutoff",type="float", help="Cutoff depends on which method you used for score track. If the file contains p/qvalue scores from MACS2, score 5 means p/qvalue 1e-5. DEFAULT: 5",default=5) optparser.add_option("-l","--min-length",dest="minlen",type="int", help="minimum length of peak, better to set it as d value. DEFAULT: 200",default=200) optparser.add_option("-g","--max-gap",dest="maxgap",type="int", help="maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30",default=30) optparser.add_option("-o","--o-prefix",dest="oprefix",default="peak",type="string", help="output file prefix, DEFAULT: peak") (options,args) = optparser.parse_args() if not (options.st1c1bdg and options.st2c2bdg and options.st1t2bdg and options.st2t1bdg): optparser.print_help() sys.exit() info("Read and build bedGraph...") info("Score of condition 1 treatment vs condition 1 control...") t1c1_bio = bedGraphIO.bedGraphIO(options.st1c1bdg) t1c1_btrack = t1c1_bio.build_bdgtrack(baseline_value=0) info("Score of condition 2 treatment vs condition 2 control...") t2c2_bio = bedGraphIO.bedGraphIO(options.st2c2bdg) t2c2_btrack = t2c2_bio.build_bdgtrack(baseline_value=0) info("Score of condition 1 treatment vs condition 2 treatment...") t1t2_bio = bedGraphIO.bedGraphIO(options.st1t2bdg) t1t2_btrack = t1t2_bio.build_bdgtrack(baseline_value=0) info("Score of condition 2 treatment vs condition 1 treatment...") t2t1_bio = bedGraphIO.bedGraphIO(options.st2t1bdg) t2t1_btrack = t2t1_bio.build_bdgtrack(baseline_value=0) info("Combine four score tracks...") comp_btrack = make_compositeScoreTrack(t1c1_btrack,t2c2_btrack,t1t2_btrack,t2t1_btrack) info("Call differential regions ...") ( consistent_peaks, condition1_peaks, condition2_peaks ) = comp_btrack.call_diff_regions(cutoff=options.cutoff,min_length=options.minlen,max_gap=options.maxgap) info("Write peaks...") consistent_f = open ("%s_c%.0f_l%d_g%d_consistent_peaks.bed" % (options.oprefix,options.cutoff,options.minlen,options.maxgap),"w") condition1_f = open ("%s_c%.0f_l%d_g%d_condition1_unique_peaks.bed" % (options.oprefix,options.cutoff,options.minlen,options.maxgap),"w") condition2_f = open ("%s_c%.0f_l%d_g%d_condition2_unique_peaks.bed" % (options.oprefix,options.cutoff,options.minlen,options.maxgap),"w") consistent_peaks.write_to_bed(consistent_f,name_prefix=options.oprefix+"_consistent_peak_", score_column="score") condition1_peaks.write_to_bed(condition1_f,name_prefix=options.oprefix+"_condition1_unique_peak_", score_column="score") condition2_peaks.write_to_bed(condition2_f,name_prefix=options.oprefix+"_condition2_unique_peak_", score_column="score") info("Done") if __name__ == 
'__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) See you!\n") sys.exit(0) MACS-2.0.9/bin/bdgbroadcall0000644000175000017500000001103111654316302016153 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-09-08 11:57:20 Tao Liu> """Description: Fine-tuning script to call broad peaks from a single bedGraph track for scores. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import sys import logging from optparse import OptionParser from MACS2.IO import bedGraphIO # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Classes # ------------------------------------ # ------------------------------------ # Main function # ------------------------------------ def main(): usage = "usage: %prog <-i bedGraph> [-c CUTOFF1] [-C CUTOFF2] [-l MIN] [-g MAX1] [-G MAX2] [-o PREFIX]" description = "Call broad peaks from MACS pvalue or qscore score bedGraph output, with customized settings. Output one file for narrow peaks in encodePeak format, and one for broad peaks in bed12 format." optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-i","--ifile",dest="ifile",type="string", help="MACS pvalue score bedGraph") optparser.add_option("-c","--cutoff-peak",dest="cutoffpeak",type="float", help="Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 2 means qvalue 0.01. DEFAULT: 2",default=2) optparser.add_option("-C","--cutoff-link",dest="cutofflink",type="float", help="Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1",default=1) optparser.add_option("-l","--min-length",dest="minlen",type="int", help="minimum length of peak, better to set it as d value. DEFAULT: 200",default=200) optparser.add_option("-g","--lvl1-max-gap",dest="lvl1maxgap",type="int", help="maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30",default=30) optparser.add_option("-G","--lvl2-max-gap",dest="lvl2maxgap",type="int", help="maximum linking distance between significant peaks, better to set it as 4 times of d value.
DEFAULT: 800",default=800) optparser.add_option("-o","--o-prefix",dest="oprefix",default="peak", help="output file prefix, DEFAULT: peak") (options,args) = optparser.parse_args() if not options.ifile: optparser.print_help() sys.exit() info("Read and build bedGraph...") bio = bedGraphIO.bedGraphIO(options.ifile) btrack = bio.build_bdgtrack(baseline_value=0) info("Call peaks from bedGraph...") (peaks,bpeaks) = btrack.call_broadpeaks (lvl1_cutoff=options.cutoffpeak, lvl2_cutoff=options.cutofflink, min_length=options.minlen, lvl1_max_gap=options.lvl1maxgap, lvl2_max_gap=options.lvl2maxgap) info("Write peaks...") nf = open ("%s_c%.0f_l%d_g%d_peaks.encodePeak" % (options.oprefix,options.cutoffpeak,options.minlen,options.lvl1maxgap),"w") bf = open ("%s_c%.0f_C%.0f_l%d_g%d_G%d_broad.bed" % (options.oprefix,options.cutoffpeak,options.cutofflink,options.minlen,options.lvl1maxgap,options.lvl2maxgap),"w") peaks.write_to_narrowPeak(nf, name_prefix=options.oprefix+"_encodePeak", score_column="score") bpeaks.write_to_gappedPeak(bf, name_prefix=options.oprefix+"_broadRegion") info("Done") if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) See you!\n") sys.exit(0) MACS-2.0.9/bin/macs20000644000175000017500000005142711654316302014573 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-11-02 15:03:03 Tao Liu> """Description: MACS 2 main executable Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: release candidate @version: $Id$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging from subprocess import Popen,PIPE from optparse import OptionParser import gzip # ------------------------------------ # own python modules # ------------------------------------ from MACS2.OptValidator import opt_validate from MACS2.OutputWriter import * from MACS2.cProb import binomial_cdf_inv from MACS2.PeakModel import PeakModel,NotEnoughPairsException from MACS2.cPeakDetect import PeakDetect from MACS2.Constants import * # ------------------------------------ # Main function # ------------------------------------ def main(): """The Main function/pipeline for MACS. """ # Parse options... options = opt_validate(prepare_optparser()) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error #0 output arguments info("\n"+options.argtxt) #1 Read tag files info("#1 read tag files...") (treat, control) = load_tag_files_options (options) # check common chromosome names if control: tchrnames = set(treat.get_chr_names()) cchrnames = set(control.get_chr_names()) commonnames = tchrnames.intersection(cchrnames) if len(commonnames)==0: error("No common chromosome names can be found from treatment and control! Check your input files! 
MACS will quit...") error("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames))) error("Chromosome names in control: %s" % ",".join(sorted(cchrnames))) sys.exit() info("#1 tag size = %d" % options.tsize) tagsinfo = "# tag size is determined as %d bps\n" % (options.tsize) t0 = treat.total tagsinfo += "# total tags in treatment: %d\n" % (t0) info("#1 total tags in treatment: %d" % (t0)) if options.keepduplicates != "all": if options.keepduplicates == "auto": info("#1 calculate max duplicate tags in single position based on binomial distribution...") treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0) info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags)) info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags)) else: info("#1 user defined the maximum tags...") treatment_max_dup_tags = int(options.keepduplicates) info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags)) treat.filter_dup(treatment_max_dup_tags) t1 = treat.total info("#1 tags after filtering in treatment: %d" % (t1)) tagsinfo += "# tags after filtering in treatment: %d\n" % (t1) tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags) info("#1 Redundant rate of treatment: %.2f" % (float(t0-t1)/t0)) tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0) if control: c0 = control.total tagsinfo += "# total tags in control: %d\n" % (c0) info("#1 total tags in control: %d" % (c0)) if options.keepduplicates != "all": if options.keepduplicates == "auto": info("#1 for control, calculate max duplicate tags in single position based on binomial distribution...") control_max_dup_tags = cal_max_dup_tags(options.gsize,c0) info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags)) info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags)) else: info("#1 user defined the maximum tags...") control_max_dup_tags = int(options.keepduplicates) info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags)) control.filter_dup(control_max_dup_tags) c1 = control.total info("#1 tags after filtering in control: %d" % (c1)) tagsinfo += "# tags after filtering in control: %d\n" % (c1) tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (control_max_dup_tags) info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0)) tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0) info("#1 finished!") #2 Build Model info("#2 Build Peak Model...") if options.nomodel: info("#2 Skipped...") options.d=options.shiftsize*2 info("#2 Use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d)) options.scanwindow=2*options.d # remove the effect of --bw else: try: peakmodel = PeakModel(treatment = treat, max_pairnum = MAX_PAIRNUM, opt = options ) info("#2 finished!") debug("#2 Summary Model:") debug("#2 min_tags: %d" % (peakmodel.min_tags)) debug("#2 d: %d" % (peakmodel.d)) debug("#2 scan_window: %d" % (peakmodel.scan_window)) info("#2 predicted fragment length is %d bps" % peakmodel.d) info("#2.2 Generate R script for model : %s" % (options.modelR)) model2r_script(peakmodel,options.modelR,options.name) options.d = peakmodel.d options.scanwindow= 2*options.d if options.onauto and options.d <=
2*options.tsize: options.d=options.shiftsize*2 options.scanwindow=2*options.d warn("#2 Since the d calculated from paired peaks is smaller than 2*tag length, it may be influenced by an unknown sequencing problem. MACS will use %d as shiftsize, %d as fragment length. NOTE: if the d calculated is still acceptable, please do not use --auto-bimodal option!" % (options.shiftsize,options.d)) except NotEnoughPairsException: if not options.onauto: sys.exit(1) warn("#2 Skipped...") options.d=options.shiftsize*2 options.scanwindow=2*options.d warn("#2 Since --auto-bimodal is set, MACS will use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d)) #3 Call Peaks info("#3 Call peaks...") if options.nolambda: info("# local lambda is disabled!") # decide options.tocontrol according to options.tolarge if control: if options.downsample: # use random sampling to balance treatment and control info("#3 User prefers to use random sampling instead of linear scaling.") if t1 > c1: info("#3 MACS is random sampling treatment tags...") warn("#3 Your results may not be reproducible due to the random sampling!") treat.sample_num(c1) info("#3 %d tags from treatment are kept" % treat.total) elif c1 > t1: info("#3 MACS is random sampling control tags...") warn("#3 Your results may not be reproducible due to the random sampling!") control.sample_num(t1) info("#3 %d tags from control are kept" % control.total) # set options.tocontrol although it wouldn't matter now options.tocontrol = False else: if options.tolarge: if t1 > c1: # treatment has more tags than control, since tolarge is # true, we will scale control to treatment. options.tocontrol = False else: # treatment has fewer tags than control, since tolarge is # true, we will scale treatment to control. options.tocontrol = True else: if t1 > c1: # treatment has more tags than control, since tolarge is # false, we will scale treatment to control. options.tocontrol = True else: # treatment has fewer tags than control, since tolarge is # false, we will scale control to treatment. options.tocontrol = False peakdetect = PeakDetect(treat = treat, control = control, opt = options ) peakdetect.call_peaks() #diag_result = peakdetect.diag_result() #4 output #4.1 peaks in XLS info("#4 Write output xls file... %s" % (options.peakxls)) ofhd_xls = open(options.peakxls,"w") ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION)) ofhd_xls.write(options.argtxt+"\n") ofhd_xls.write(tagsinfo) ofhd_xls.write("# d = %d\n" % (options.d)) if options.nolambda: ofhd_xls.write("# local lambda is disabled!\n") ofhd_xls.write(peakdetect.toxls()) ofhd_xls.close() #4.2 peaks in BED if options.log_pvalue: score_column = "pscore" elif options.log_qvalue: score_column = "qscore" info("#4 Write peak bed file... %s" % (options.peakbed)) ofhd_bed = open(options.peakbed,"w") peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="MACS_peak_", score_column=score_column) ofhd_bed.close() #4.2 peaks in narrowPeak info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak)) ofhd_bed = open(options.peakNarrowPeak,"w") peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="MACS_peak_", score_column=score_column) ofhd_bed.close() #4.2 broad peaks in bed12 if options.broad: info("#4 Write broad peak in bed12 format file...
%s" % (options.peakBroadPeak)) ofhd_bed = open(options.peakBroadPeak,"w") peakdetect.broadpeaks.write_to_gappedPeak (ofhd_bed, name_prefix="MACS_peak_", name=options.name, description=options.name) ofhd_bed.close() #4.2-2 summits in BED info("#4 Write summits bed file... %s" % (options.summitbed)) ofhd_summits = open(options.summitbed,"w") peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="MACS_summit_", score_column=score_column) ofhd_summits.close() def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = """usage: %prog <-t tfile> [-n name] [-g genomesize] [options] Example: %prog -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01 or example for broad peak calling: %prog -t ChIP.bam -c Control.bam --broad -g hs """ description = "%prog -- Model-based Analysis for ChIP-Sequencing" optparser = OptionParser(version="%prog "+MACS_VERSION,description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="show this help message and exit.") optparser.add_option("-t","--treatment",dest="tfile",type="string", help="ChIP-seq treatment file. REQUIRED.") optparser.add_option("-c","--control",dest="cfile",type="string", help="Control file.") optparser.add_option("-n","--name",dest="name",type="string", help="Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"", default="NA") optparser.add_option("-f","--format",dest="format",type="string", help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"", default="AUTO") optparser.add_option("-g","--gsize",dest="gsize",type="string",default="hs", help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs") optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None, help="Tag size. This will overide the auto detected tag size. DEFAULT: Not set") optparser.add_option("--bw",dest="bw",type="int",default=300, help="Band width. This value is only used while building the shifting model. DEFAULT: 300") optparser.add_option("-q","--qvalue",dest="qvalue",type="float",default=0.01, help="Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.01 ") optparser.add_option("-p","--pvalue",dest="pvalue",type="float", help="Pvalue cutoff for peak detection. When set (e.g. -q 0.05 or -q 1e-5), qvalue cutoff will be ignored. Default is not set.") optparser.add_option("-m","--mfold",dest="mfold",type="string",default="10,30", help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. The regions must be lower than upper limit, and higher than the lower limit. DEFAULT:10,30") optparser.add_option("--nolambda",dest="nolambda",action="store_true", help="If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to potential chromatin structure. ", default=False) optparser.add_option("--slocal",dest="smalllocal",type="int",default=1000, help="The small nearby region in basepairs to calculate dynamic lambda. 
This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 1000 ") optparser.add_option("--llocal",dest="largelocal",type="int",default=10000, help="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 10000.") optparser.add_option("--auto-bimodal",dest="onauto",action="store_true", help="Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the '--shiftsize' parameter to shift and extend each tags. Not to use this automate fixation is a default behavior now. DEFAULT: False", default=False) optparser.add_option("--nomodel",dest="nomodel",action="store_true", help="Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set shiftsize to change it. DEFAULT: False", default=False) optparser.add_option("--shiftsize",dest="shiftsize",type="int",default=100, help="The arbitrary shift size in bp. When nomodel is true, MACS will use this value as 1/2 of fragment size. DEFAULT: 100 ") optparser.add_option("--keep-dup",dest="keepduplicates",type="string",default="auto", help="It controls the MACS behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The default 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Default: auto") optparser.add_option("--to-large",dest="tolarge",action="store_true",default=False, help="When set, scale the small sample up to the bigger sample. By default, the bigger dataset will be scaled down towards the smaller dataset, which will lead to smaller p/qvalues and more specific results. Keep in mind that scaling down will bring down background noise more. DEFAULT: False") optparser.add_option("--down-sample",dest="downsample",action="store_true",default=False, help="When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. DEFAULT: False") optparser.add_option("--shift-control",dest="shiftcontrol",action="store_true",default=False, help="When set, control tags will be shifted just as ChIP tags according to their strand before the extension of d, slocal and llocal. By default, control tags are extended centered at their current positions regardless of strand. You may consider to turn this option on while comparing two ChIP datasets of different condition but the same factor. DEFAULT: False") optparser.add_option("--half-ext",dest="halfext",action="store_true",default=False, help="When set, MACS extends 1/2 d size for each fragment centered at its middle point. 
DEFAULT: False") optparser.add_option("-B","--bdg",dest="store_bdg",action="store_true", help="Whether or not to save extended fragment pileup, local lambda and score tracks at every bp into a bedGraph file. DEFAULT: False", default=False) optparser.add_option("--broad",dest="broad",action="store_true", help="If set, MACS will try to call broad peaks by linking nearby highly enriched regions. The linking region is controlled by another cutoff through --linking-cutoff. The maximum linking region length is 4 times of d from MACS. DEFAULT: False",default=False) optparser.add_option("--broad-cutoff",dest="broadcutoff",type="float",default=0.1, help="Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. DEFAULT: 0.1 ") optparser.add_option("--verbose",dest="verbose",type="int",default=2, help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") return optparser def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) def load_tag_files_options ( options ): """From the options, load treatment tags and control tags (if available). """ options.info("#1 read treatment tags...") tp = options.parser(open2(options.tfile)) if not options.tsize: # override tsize if user specified --tsize ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() treat.sort() if options.cfile: options.info("#1.2 read input tags...") control = options.parser(open2(options.cfile)).build_fwtrack() control.sort() else: control = None options.info("#1 tag size is determined as %d bps" % options.tsize) return (treat, control) def open2(path, mode='r', bufsize=-1): # try gzip first f = gzip.open(path, mode) try: f.read(10) except IOError: # not a gzipped file f.close() f = open(path, mode, bufsize) else: f.seek(0) return f if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) Bye!\n") sys.exit(0) MACS-2.0.9/bin/macs2diff0000644000175000017500000005032111654316302015414 0ustar taoliutaoliu00000000000000#!/usr/bin/env python # Time-stamp: <2011-11-02 14:56:55 Tao Liu> """Module Description Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). 
@status: experimental @version: $Revision$ @author: Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging import multiprocessing from subprocess import Popen,PIPE from optparse import OptionParser import gzip # from MACS2 libraries from MACS2.OptValidator import opt_validate_diff from MACS2.Constants import * from MACS2.cProb import binomial_cdf_inv from MACS2.PeakModel import PeakModel,NotEnoughPairsException from MACS2.cPeakDetect import compare_treatment_vs_control from MACS2.IO.cBedGraph import scoreTracktoBedGraph from MACS2.IO.cCompositeScoreTrack import * # ------------------------------------ # constants # ------------------------------------ CPUCOUNT = multiprocessing.cpu_count() # ------------------------------------ # Misc functions # ------------------------------------ def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = """usage: %prog <-t1 tfile1> [-c1 cfile1] <-t2 tfile2> [-c2 cfile2] [-n name] [-g genomesize] [options] Example: %prog --t1 CTCF_GM12878.bam --c1 Control_GM12878.bam --t2 CTCF_K562.bam --c2 Control_K562.bam -g hs -n testdiff -B -q 0.01 """ description = "%prog -- Differential Analysis for ChIP-Sequencing" optparser = OptionParser(version="%prog "+MACSDIFF_VERSION,description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="show this help message and exit.") optparser.add_option("--t1",dest="tfile1",type="string", help="ChIP-seq treatment file for the first condition. REQUIRED.") optparser.add_option("--c1",dest="cfile1",type="string", help="Control file for the first condition. If c1 is missing while c2 is specified, t1 will be paired with c2. At least one of c1 or c2 should be available.") optparser.add_option("--t2",dest="tfile2",type="string", help="ChIP-seq treatment file for the second condition. REQUIRED") optparser.add_option("--c2",dest="cfile2",type="string", help="Control file for the second condition. If c2 is missing while c1 is specified, t2 will be paired with c1. At least one of c1 or c2 should be available.") optparser.add_option("-n","--name",dest="name",type="string", help="Analysis name, which will be used to generate output file names. DEFAULT: \"NA\"", default="NA") optparser.add_option("-f","--format",dest="format",type="string", help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"", default="AUTO") optparser.add_option("-g","--gsize",dest="gsize",type="string",default="hs", help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs") optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None, help="Tag size. This will overide the auto detected tag size. DEFAULT: Not set") optparser.add_option("--bw",dest="bw",type="int",default=300, help="Band width. This value is only used while building the shifting model. DEFAULT: 300") optparser.add_option("-q","--qvalue",dest="qvalue",type="float",default=0.01, help="Minimum FDR (q-value) cutoff for peak detection. 
DEFAULT: 0.01 ") optparser.add_option("-p","--pvalue",dest="pvalue",type="float", help="Pvalue cutoff for peak detection. When set (e.g. -q 0.05 or -q 1e-5), qvalue cutoff will be ignored. Default is not set.") optparser.add_option("-m","--mfold",dest="mfold",type="string",default="10,30", help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. The regions must be lower than upper limit, and higher than the lower limit. DEFAULT:10,30") optparser.add_option("--nolambda",dest="nolambda",action="store_true", help="If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to potential chromatin structure. ", default=False) optparser.add_option("--slocal",dest="smalllocal",type="int",default=1000, help="The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 1000 ") optparser.add_option("--llocal",dest="largelocal",type="int",default=10000, help="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 10000.") optparser.add_option("--auto-bimodal",dest="onauto",action="store_true", help="Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the '--shiftsize' parameter to shift and extend each tags. Not to use this automate fixation is a default behavior now. DEFAULT: False", default=False) optparser.add_option("--nomodel",dest="nomodel",action="store_true", help="Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set shiftsize to change it. DEFAULT: False", default=False) optparser.add_option("--shiftsize",dest="shiftsize",type="int",default=100, help="The arbitrary shift size in bp. When nomodel is true, MACS will use this value as 1/2 of fragment size. DEFAULT: 100 ") optparser.add_option("--keep-dup",dest="keepduplicates",type="string",default="auto", help="It controls the MACS behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The default 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Default: auto") optparser.add_option("-B","--bdg",dest="store_bdg",action="store_true", help="Whether or not to save p/qvalue (depending on whether pvalue or qvalue is used) score tracks at every bp into a bedGraph file. DEFAULT: False", default=False) optparser.add_option("--minlen",dest="minlen",type="int", help="The minimum length for differential calling. By default, it will be decided as the middle of fragment sizes of treatment 1 andtreatment 2. 
Must be an integer larger than 0.") optparser.add_option("-a",dest="nprocesses",type="int",default=1, help="Number of CPUs MACS can use, up to 4. DEFAULT: 1 ") optparser.add_option("--verbose",dest="verbose",type="int",default=2, help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") return optparser def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) def load_tag_files_options ( options ): """From the options, load treatment tags and control tags (if available). """ options.info("#1 read treatment files...") tp1 = options.parser(open2(options.tfile1)) tp2 = options.parser(open2(options.tfile2)) if not options.tsize: # auto-detect tag size unless the user specified --tsize ttsize = tp1.tsize() options.tsize = ttsize options.info("#1 read treatment file for condition 1...") treat1 = tp1.build_fwtrack() options.info("#1 read treatment file for condition 2...") treat2 = tp2.build_fwtrack() treat1.sort() treat2.sort() if options.cfile1: options.info("#1.2 read input control files...") options.info("#1 read control file for condition 1...") control1 = options.parser(open2(options.cfile1)).build_fwtrack() control1.sort() if options.cfile2 == options.cfile1: control2 = control1 else: options.info("#1 read control file for condition 2...") control2 = options.parser(open2(options.cfile2)).build_fwtrack() control2.sort() else: control1 = None control2 = None options.info("#1 tag size is determined as %d bps" % options.tsize) return ( treat1, control1, treat2, control2 ) def open2(path, mode='r', bufsize=-1): # try gzip first f = gzip.open(path, mode) try: f.read(10) except IOError: # not a gzipped file f.close() f = open(path, mode, bufsize) else: f.seek(0) return f def remove_duplicates ( taskoptions): (name, fwtrack_obj, options) = taskoptions t0 = fwtrack_obj.total tagsinfo = "# total tags in %s: %d\n" % ( name, t0 ) if options.keepduplicates != "all": if options.keepduplicates == "auto": #info( "#1 calculate max duplicate tags in single position based on binomial distribution..." ) max_dup_tags = cal_max_dup_tags( options.gsize, t0 ) #info( "#1 max_dup_tags based on binomial = %d" % ( max_dup_tags ) ) #info( "#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % ( max_dup_tags ) ) else: #info( "#1 user defined the maximum tags..."
) max_dup_tags = int( options.keepduplicates ) #info( "#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % ( max_dup_tags ) ) fwtrack_obj.filter_dup( max_dup_tags ) t1 = fwtrack_obj.total #info("#1 tags after filtering in %s: %d" % ( name, t1 ) ) tagsinfo += "# tags after filtering in %s: %d\n" % ( name, t1 ) tagsinfo += "# maximum duplicate tags at the same position in %s = %d\n" % ( name, max_dup_tags ) #info("#1 Redundant rate of %s: %.2f" % ( name, float(t0-t1)/t0) ) tagsinfo += "# Redundant rate in %s: %.2f\n" % ( name, float( t0-t1 )/t0 ) return tagsinfo def macs2pqvalues ( task_options ): ( name, treat, control, diff_options, slocal, llocal, tocontrol, shiftcontrol ) = task_options if diff_options.nomodel: d = diff_options.shiftsize * 2 else: try: peakmodel = PeakModel(treatment = treat, max_pairnum = MAX_PAIRNUM, opt = diff_options, quiet = True, ) d = peakmodel.d if diff_options.onauto and d <= 2 * diff_options.tsize: d = diff_options.shiftsize * 2 except NotEnoughPairsException: if not diff_options.onauto: sys.exit(1) d = diff_options.shiftsize * 2 diff_options.halfext = False scoretrack = compare_treatment_vs_control ( treat, control, d, diff_options.gsize, halfext=False, slocal=slocal, llocal=llocal, tocontrol=tocontrol, shiftcontrol=shiftcontrol ) if diff_options.log_qvalue: bdgtrack = scoreTracktoBedGraph( scoretrack, colname='-100logq' ) elif diff_options.log_pvalue: bdgtrack = scoreTracktoBedGraph( scoretrack, colname='-100logp' ) else: raise Exception("p or q value cutoff not set?!") scoretrack = None return (bdgtrack,d) # ------------------------------------ # Classes # ------------------------------------ # ------------------------------------ # Main function # ------------------------------------ def main(): # Parse options... options = opt_validate_diff( prepare_optparser() ) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error #0 output arguments info( "macs2diff start\n\n" + options.argtxt ) if options.nprocesses > CPUCOUNT: warn( "-a is larger than actual CPU counts, will cap it with %d" % CPUCOUNT ) options.nprocesses = CPUCOUNT if options.nprocesses > 4: warn( "Maximum 4 CPUs can be used!" ) options.nprocesses = 4 #1 Read ( treat1, control1, treat2, control2 ) = load_tag_files_options( options ) #2 Remove duplicates info( "#2 Remove duplicates..." ) pool = multiprocessing.Pool(options.nprocesses) tasks = [('treatment1', treat1, options), ('treatment2', treat2, options), ('control1', control1, options) ] if control1 != control2: tasks.append( ('control2', control2, options) ) results = pool.map(remove_duplicates, tasks) pool.close() pool.join() info( "#2 Summary of duplicates removing:\n\n%s " % "".join(results) ) #3 Build Model info("#3 do 4-way comparisons ...") pool = multiprocessing.Pool(options.nprocesses) # I wish to do something correct... When t1 and t2 have different # sequencing depth, the direction of the scaling is very # tricky. At least, we should keep the direction the same while # comparing t1 vs t2 or t2 vs t1. It means, to always scale t1 to # t2 or t2 to t1. To be safe (specific), we scale larger dataset # to smaller one. 
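# Worked example of the scaling rule described above (illustrative only;
# this helper and the tag totals are hypothetical and are not used by the
# script): with 12e6 tags in treatment 1 and 8e6 in treatment 2, treatment
# 1 is the one scaled down, and both the t1-vs-t2 and the t2-vs-t1
# comparisons then share the same ratio, 8e6/12e6 ~= 0.67.
def _linear_scaling_ratio_example(t1_total, t2_total):
    """Hypothetical helper: the ratio that scales the larger dataset down."""
    return float(min(t1_total, t2_total)) / max(t1_total, t2_total)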
if treat1.total > treat2.total: tocontrol1 = True # tocontrol1: whether scale t1 to t2 tocontrol2 = False # tocontrol2: whether scale t2 to t1 else: tocontrol1 = False tocontrol2 = True # extra parameters passed to macs2pqvalues are 1) slocal, 2) llocal, 3) # whether or not scale 'treatment' towards 'control', 4) whether # or not 'shift control'; tasks = [ ('t1c1', treat1, control1, options, options.smalllocal, options.largelocal, False, False), ('t2c2', treat2, control2, options, options.smalllocal, options.largelocal, False, False), ('t1t2', treat1, treat2, options, 0, 0, tocontrol1, True), ('t2t1', treat2, treat1, options, 0, 0, tocontrol2, True) ] results = pool.map( macs2pqvalues, tasks ) pool.close() pool.join() ( (t1c1,d1),(t2c2,d2),(t1t2,d3),(t2t1,d4) ) = results #print results info("#3 4-way comparisons done!") info("#3 Fragment size for treatment 1 is %s bps" % d1) info("#3 Fragment size for treatment 2 is %s bps" % d2) if not options.minlen: minlen = int((d1+d2)/2) info("#3 The middle value of the above two fragment sizes will be used as the minimum length in differential calling: %s bps" % minlen) else: minlen = max(1,options.minlen) info("#3 User specified minimum length for differential calling: %s bps" % minlen) if options.store_bdg: info("#3 save score to bedGraph files") if options.log_qvalue: info("#3 saving qvalues") t1c1.write_bedGraph(open("%s_t1c1_qvalue.bdg" % options.name,"w"), name="%s t1c1 qvalue" % options.name, description="%s qvalues calculated by comparing treatment1 and control1" % options.name) t2c2.write_bedGraph(open("%s_t2c2_qvalue.bdg" % options.name,"w"), name="%s t2c2 qvalue" % options.name, description="%s qvalues calculated by comparing treatment2 and control2" % options.name) t1t2.write_bedGraph(open("%s_t1t2_qvalue.bdg" % options.name,"w"), name="%s t1t2 qvalue" % options.name, description="%s qvalues calculated by comparing treatment1 and treatment2" % options.name) t2t1.write_bedGraph(open("%s_t2t1_qvalue.bdg" % options.name,"w"), name="%s t2t1 qvalue" % options.name, description="%s qvalues calculated by comparing treatment2 and treatment1" % options.name) elif options.log_pvalue: info("#3 saving pvalues") t1c1.write_bedGraph(open("%s_t1c1_pvalue.bdg" % options.name,"w"), name="%s t1c1 pvalue" % options.name, description="%s pvalues calculated by comparing treatment1 and control1" % options.name) t2c2.write_bedGraph(open("%s_t2c2_pvalue.bdg" % options.name,"w"), name="%s t2c2 pvalue" % options.name, description="%s pvalues calculated by comparing treatment2 and control2" % options.name) t1t2.write_bedGraph(open("%s_t1t2_pvalue.bdg" % options.name,"w"), name="%s t1t2 pvalue" % options.name, description="%s pvalues calculated by comparing treatment1 and treatment2" % options.name) t2t1.write_bedGraph(open("%s_t2t1_pvalue.bdg" % options.name,"w"), name="%s t2t1 pvalue" % options.name, description="%s pvalues calculated by comparing treatment2 and treatment1" % options.name) else: raise Exception("p or q value cutoff not set?!") info("#3 Please check the *.bdg files in the same directory!") #4 Call differential peaks info("#4 Combine four score tracks...") comp_btrack = make_compositeScoreTrack( t1c1,t2c2,t1t2,t2t1 ) info("#4 Call differential regions ...") if options.log_qvalue: cutoff = options.log_qvalue elif options.log_pvalue: cutoff = options.log_pvalue else: raise Exception("p or q value cutoff not set?!") ( consistent_peaks, condition1_peaks, condition2_peaks ) = comp_btrack.call_diff_regions(cutoff=cutoff,min_length=minlen,max_gap=options.tsize) info("#4 Write
peaks...") consistent_f = open ( options.consistent_peakbed, "w" ) condition1_f = open ( options.condition1_peakbed, "w" ) condition2_f = open ( options.condition2_peakbed, "w" ) consistent_peaks.write_to_bed(consistent_f,name_prefix="consistent_site_", score_column="score") condition1_peaks.write_to_bed(condition1_f,name_prefix="condition1_unique_site_", score_column="score") condition2_peaks.write_to_bed(condition2_f,name_prefix="condition2_unique_site_", score_column="score") info("#4 Summary\n\nConsistent sites: %d\nCondition1 unique sites: %d\nCondition2 unique sites: %d\n" % (consistent_peaks.total(), condition1_peaks.total(), condition2_peaks.total() )) info("#4 Done") if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("User interrupt me! ;-) See you!\n") sys.exit(0) MACS-2.0.9/COPYING0000644000175000017500000001373711630217211014121 0ustar taoliutaoliu00000000000000 The "Artistic License" Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder as specified below. "Copyright Holder" is whoever is named in the copyright or copyrights for the package. "You" is you, if you're thinking about copying or distributing this Package. "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as uunet.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. 
c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) give non-standard executables non-standard names, and clearly document the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. You may embed this Package's interpreter within an executable of yours (by linking); this shall be construed as a mere form of aggregation, provided that the complete Standard Version of the interpreter is so embedded. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whoever generated them, and may be sold commercially, and may be aggregated with this Package. If such scripts or library files are aggregated with this Package via the so-called "undump" or "unexec" methods of producing a binary executable image, then distribution of such an image shall neither be construed as a distribution of this Package nor shall it fall under the restrictions of Paragraphs 3 and 4, provided that you do not represent such an executable image as a Standard Version of this Package. 7. C subroutines (or comparably compiled subroutines in other languages) supplied by you and linked into this Package in order to emulate subroutines and variables of the language defined by this Package shall not be considered part of this Package, but are the equivalent of input as in Paragraph 6, provided these subroutines do not change the language in any way that would cause it to fail the regression tests for the language. 8. Aggregation of this Package with a commercial distribution is always permitted provided that the use of this Package is embedded; that is, when no overt attempt is made to make this Package's interfaces visible to the end user of the commercial distribution. Such use shall not be construed as a distribution of this Package. 9. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. The End