././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1706536857.459836 bcbio-gff-0.7.1/0000775002421100242110000000000014555727631013514 5ustar00bchapmanbchapman././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1706536857.455836 bcbio-gff-0.7.1/BCBio/0000775002421100242110000000000014555727631014432 5ustar00bchapmanbchapman././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1706536857.459836 bcbio-gff-0.7.1/BCBio/GFF/0000775002421100242110000000000014555727631015034 5ustar00bchapmanbchapman././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706535351.0 bcbio-gff-0.7.1/BCBio/GFF/GFFOutput.py0000664002421100242110000001661214555724667017245 0ustar00bchapmanbchapman"""Output Biopython SeqRecords and SeqFeatures to GFF3 format. The target format is GFF3, the current GFF standard: http://www.sequenceontology.org/gff3.shtml """ from six.moves import urllib from Bio import SeqIO class _IdHandler: """Generate IDs for GFF3 Parent/Child relationships where they don't exist. """ def __init__(self): self._prefix = "biopygen" self._counter = 1 self._seen_ids = [] def _generate_id(self, quals): """Generate a unique ID not present in our existing IDs. """ gen_id = self._get_standard_id(quals) if gen_id is None: while 1: gen_id = "%s%s" % (self._prefix, self._counter) if gen_id not in self._seen_ids: break self._counter += 1 return gen_id def _get_standard_id(self, quals): """Retrieve standardized IDs from other sources like NCBI GenBank. This tries to find IDs from known key/values when stored differently than GFF3 specifications. """ possible_keys = ["transcript_id", "protein_id"] for test_key in possible_keys: if test_key in quals: cur_id = quals[test_key] if isinstance(cur_id, tuple) or isinstance(cur_id, list): return cur_id[0] else: return cur_id return None def update_quals(self, quals, has_children): """Update a set of qualifiers, adding an ID if necessary. """ cur_id = quals.get("ID", None) # if we have an ID, record it if cur_id: if not isinstance(cur_id, list) and not isinstance(cur_id, tuple): cur_id = [cur_id] for add_id in cur_id: self._seen_ids.append(add_id) # if we need one and don't have it, create a new one elif has_children: new_id = self._generate_id(quals) self._seen_ids.append(new_id) quals["ID"] = [new_id] return quals class GFF3Writer: """Write GFF3 files starting with standard Biopython objects. """ def __init__(self): pass def write(self, recs, out_handle, include_fasta=False): """Write the provided records to the given handle in GFF3 format. 
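        A minimal usage sketch (a hypothetical illustration, not part of the
        library: "chr1", "gene1" and "out.gff3" are placeholders; any
        Biopython SeqRecord objects with SeqFeatures attached should work):

            from Bio.Seq import Seq
            from Bio.SeqRecord import SeqRecord
            from Bio.SeqFeature import SeqFeature, SimpleLocation
            from BCBio.GFF import GFF3Writer

            rec = SeqRecord(Seq("ACGT" * 250), id="chr1")
            rec.features = [SeqFeature(SimpleLocation(99, 500, strand=1),
                                       type="gene",
                                       qualifiers={"ID": ["gene1"]})]
            writer = GFF3Writer()
            with open("out.gff3", "w") as out_handle:
                writer.write([rec], out_handle, include_fasta=False)

        The module level write() function wraps this same method.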
""" id_handler = _IdHandler() self._write_header(out_handle) fasta_recs = [] try: recs = iter(recs) except TypeError: recs = [recs] for rec in recs: self._write_rec(rec, out_handle) self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle) for sf in rec.features: sf = self._clean_feature(sf) id_handler = self._write_feature(sf, rec.id, out_handle, id_handler) if include_fasta and len(rec.seq) > 0: fasta_recs.append(rec) if len(fasta_recs) > 0: self._write_fasta(fasta_recs, out_handle) def _clean_feature(self, feature): quals = {} for key, val in feature.qualifiers.items(): if not isinstance(val, (list, tuple)): val = [val] val = [str(x) for x in val] quals[key] = val feature.qualifiers = quals # Support for Biopython 1.68 and above, which removed sub_features if not hasattr(feature, "sub_features"): feature.sub_features = [] clean_sub = [self._clean_feature(f) for f in feature.sub_features] feature.sub_features = clean_sub return feature def _write_rec(self, rec, out_handle): # if we have a SeqRecord, write out optional directive if len(rec.seq) > 0: out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq))) def _get_phase(self, feature): if "phase" in feature.qualifiers: phase = feature.qualifiers["phase"][0] elif feature.type == "CDS": phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1 else: phase = "." return str(phase) def _write_feature(self, feature, rec_id, out_handle, id_handler, parent_id=None): """Write a feature with location information. """ if feature.location.strand == 1: strand = '+' elif feature.location.strand == -1: strand = '-' else: strand = '.' # remove any standard features from the qualifiers quals = feature.qualifiers.copy() for std_qual in ["source", "score", "phase"]: if std_qual in quals and len(quals[std_qual]) == 1: del quals[std_qual] # add a link to a parent identifier if it exists if parent_id: if not "Parent" in quals: quals["Parent"] = [] quals["Parent"].append(parent_id) quals = id_handler.update_quals(quals, len(feature.sub_features) > 0) if feature.type: ftype = feature.type else: ftype = "sequence_feature" parts = [str(rec_id), feature.qualifiers.get("source", ["feature"])[0], ftype, str(feature.location.start + 1), # 1-based indexing str(feature.location.end), feature.qualifiers.get("score", ["."])[0], strand, self._get_phase(feature), self._format_keyvals(quals)] out_handle.write("\t".join(parts) + "\n") for sub_feature in feature.sub_features: id_handler = self._write_feature(sub_feature, rec_id, out_handle, id_handler, quals["ID"][0]) return id_handler def _format_keyvals(self, keyvals): format_kvs = [] for key in sorted(keyvals.keys()): values = keyvals[key] key = key.strip() format_vals = [] if not isinstance(values, list) or isinstance(values, tuple): values = [values] for val in values: val = urllib.parse.quote(str(val).strip(), safe=":/ ") if ((key and val) and val not in format_vals): format_vals.append(val) format_kvs.append("%s=%s" % (key, ",".join(format_vals))) return ";".join(format_kvs) def _write_annotations(self, anns, rec_id, size, out_handle): """Add annotations which refer to an entire sequence. """ format_anns = self._format_keyvals(anns) if format_anns: parts = [rec_id, "annotation", "remark", "1", str(size if size > 1 else 1), ".", ".", ".", format_anns] out_handle.write("\t".join(parts) + "\n") def _write_header(self, out_handle): """Write out standard header directives. 
""" out_handle.write("##gff-version 3\n") def _write_fasta(self, recs, out_handle): """Write sequence records using the ##FASTA directive. """ out_handle.write("##FASTA\n") SeqIO.write(recs, out_handle, "fasta") def write(recs, out_handle, include_fasta=False): """High level interface to write GFF3 files from SeqRecords and SeqFeatures. If include_fasta is True, the GFF3 file will include sequence information using the ##FASTA directive. """ writer = GFF3Writer() return writer.write(recs, out_handle, include_fasta) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706535879.0 bcbio-gff-0.7.1/BCBio/GFF/GFFParser.py0000664002421100242110000011343314555725707017174 0ustar00bchapmanbchapman"""Parse GFF files into features attached to Biopython SeqRecord objects. This deals with GFF3 formatted files, a tab delimited format for storing sequence features and annotations: http://www.sequenceontology.org/gff3.shtml It will also deal with older GFF versions (GTF/GFF2): http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml http://mblab.wustl.edu/GTF22.html The implementation utilizes map/reduce parsing of GFF using Disco. Disco (http://discoproject.org) is a Map-Reduce framework for Python utilizing Erlang for parallelization. The code works on a single processor without Disco using the same architecture. """ import os import copy import re import collections import io import itertools import six from six.moves import urllib # Make defaultdict compatible with versions of python older than 2.4 try: collections.defaultdict except AttributeError: import _utils collections.defaultdict = _utils.defaultdict unknown_seq_avail = False try: from Bio.Seq import UnknownSeq unknown_seq_avail = True except ImportError: # Starting with biopython 1.81, has been removed from Bio.Seq import _UndefinedSequenceData from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio import SeqFeature from Bio import SeqIO def _gff_line_map(line, params): """Map part of Map-Reduce; parses a line of GFF into a dictionary. Given an input line from a GFF file, this: - decides if the file passes our filtering limits - if so: - breaks it into component elements - determines the type of attribute (flat, parent, child or annotation) - generates a dictionary of GFF info which can be serialized as JSON """ def _merge_keyvals(parts): """Merge key-values escaped by quotes that are improperly split at semicolons. """ out = [] for i, p in enumerate(parts): if i > 0 and len(p) == 1 and p[0].endswith('"') and not p[0].startswith('"'): if out[-1][-1].startswith('"'): prev_p = out.pop(-1) to_merge = prev_p[-1] prev_p[-1] = "%s; %s" % (to_merge, p[0]) out.append(prev_p) else: out.append(p) return out gff3_kw_pat = re.compile("\w+=") def _split_keyvals(keyval_str): """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. GFF3 has key value pairs like: count=9;gene=amx-2;sequence=SAGE:aacggagccg GFF2 and GTF have: Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 """ quals = collections.defaultdict(list) if keyval_str is None: return quals # ensembl GTF has a stray semi-colon at the end if keyval_str[-1] == ';': keyval_str = keyval_str[:-1] # GFF2/GTF has a semi-colon with at least one space after it. # It can have spaces on both sides; wormbase does this. # GFF3 works with no spaces. 
# Split at the first one we can recognize as working parts = keyval_str.split(" ; ") if len(parts) == 1: parts = [x.strip() for x in keyval_str.split(";")] # check if we have GFF3 style key-vals (with =) is_gff2 = True if gff3_kw_pat.match(parts[0]): is_gff2 = False key_vals = _merge_keyvals([p.split('=') for p in parts]) # otherwise, we are separated by a space with a key as the first item else: pieces = [] for p in parts: # fix misplaced semi-colons in keys in some GFF2 files if p and p[0] == ';': p = p[1:] pieces.append(p.strip().split(" ")) key_vals = [(p[0], " ".join(p[1:])) for p in pieces] for item in key_vals: # standard in-spec items are key=value if len(item) == 2: key, val = item # out-of-spec files can have just key values. We set an empty value # which will be changed to true later to standardize. else: assert len(item) == 1, item key = item[0] val = '' # remove quotes in GFF2 files quoted = False if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): quoted = True val = val[1:-1] if val: if quoted: quals[key].append(val) else: quals[key].extend([v for v in val.split(',') if v]) # if we don't have a value, make this a key=True/False style # attribute else: quals[key].append('true') for key, vals in quals.items(): quals[key] = [urllib.parse.unquote(v) for v in vals] return quals, is_gff2 def _nest_gff2_features(gff_parts): """Provide nesting of GFF2 transcript parts with transcript IDs. exons and coding sequences are mapped to a parent with a transcript_id in GFF2. This is implemented differently at different genome centers and this function attempts to resolve that and map things to the GFF3 way of doing them. """ # map protein or transcript ids to a parent for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: try: gff_parts["quals"]["Parent"] = \ gff_parts["quals"][transcript_id] break except KeyError: pass # case for WormBase GFF -- everything labelled as Transcript or CDS for flat_name in ["Transcript", "CDS"]: if flat_name in gff_parts["quals"]: # parent types if gff_parts["type"] in [flat_name]: if not gff_parts["id"]: gff_parts["id"] = gff_parts["quals"][flat_name][0] gff_parts["quals"]["ID"] = [gff_parts["id"]] # children types elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", "coding_exon", "five_prime_UTR", "CDS", "stop_codon", "start_codon"]: gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] break return gff_parts strand_map = {'+' : 1, '-' : -1, '?' : None, None: None} line = line.strip() if line[:2] == "##": return [('directive', line[2:])] elif line and line[0] != "#": parts = line.split('\t') should_do = True if params.limit_info: for limit_name, limit_values in params.limit_info.items(): cur_id = tuple([parts[i] for i in params.filter_info[limit_name]]) if cur_id not in limit_values: should_do = False break if should_do: assert len(parts) >= 8, line # not python2.4 compatible but easier to understand #gff_parts = [(None if p == '.' 
else p) for p in parts] gff_parts = [] for p in parts: if p == ".": gff_parts.append(None) else: gff_parts.append(p) gff_info = dict() # collect all of the base qualifiers for this item if len(parts) > 8: quals, is_gff2 = _split_keyvals(gff_parts[8]) else: quals, is_gff2 = collections.defaultdict(list), False gff_info["is_gff2"] = is_gff2 if gff_parts[1]: quals["source"].append(gff_parts[1]) if gff_parts[5]: quals["score"].append(gff_parts[5]) if gff_parts[7]: quals["phase"].append(gff_parts[7]) gff_info['quals'] = dict(quals) gff_info['rec_id'] = gff_parts[0] # if we are describing a location, then we are a feature if gff_parts[3] and gff_parts[4]: gff_info['location'] = [int(gff_parts[3]) - 1, int(gff_parts[4])] gff_info['type'] = gff_parts[2] gff_info['id'] = quals.get('ID', [''])[0] gff_info['strand'] = strand_map.get(gff_parts[6], None) if is_gff2: gff_info = _nest_gff2_features(gff_info) # features that have parents need to link so we can pick up # the relationship if "Parent" in gff_info['quals']: # check for self referential parent/child relationships # remove the ID, which is not useful for p in gff_info['quals']['Parent']: if p == gff_info['id']: gff_info['id'] = '' del gff_info['quals']['ID'] break final_key = 'child' elif gff_info['id']: final_key = 'parent' # Handle flat features else: final_key = 'feature' # otherwise, associate these annotations with the full record else: final_key = 'annotation' if params.jsonify: return [(final_key, simplejson.dumps(gff_info))] else: return [(final_key, gff_info)] return [] def _gff_line_reduce(map_results, out, params): """Reduce part of Map-Reduce; combines results of parsed features. """ final_items = dict() for gff_type, final_val in map_results: if params.jsonify and gff_type not in ['directive']: final_val = simplejson.loads(final_val) try: final_items[gff_type].append(final_val) except KeyError: final_items[gff_type] = [final_val] for key, vals in final_items.items(): if params.jsonify: vals = simplejson.dumps(vals) out.add(key, vals) class _MultiIDRemapper: """Provide an ID remapping for cases where a parent has a non-unique ID. Real life GFF3 cases have non-unique ID attributes, which we fix here by using the unique sequence region to assign children to the right parent. """ def __init__(self, base_id, all_parents): self._base_id = base_id self._parents = all_parents def remap_id(self, feature_dict): rstart, rend = feature_dict['location'] for index, parent in enumerate(self._parents): pstart, pend = parent['location'] if rstart >= pstart and rend <= pend: if index > 0: return ("%s_%s" % (self._base_id, index + 1)) else: return self._base_id # if we haven't found a location match but parents are umabiguous, return that if len(self._parents) == 1: return self._base_id raise ValueError("Did not find remapped ID location: %s, %s, %s" % ( self._base_id, [p['location'] for p in self._parents], feature_dict['location'])) class _AbstractMapReduceGFF: """Base class providing general GFF parsing for local and remote classes. This class should be subclassed to provide a concrete class to parse GFF under specific conditions. These classes need to implement the _gff_process function, which returns a dictionary of SeqRecord information. """ def __init__(self, create_missing=True): """Initialize GFF parser create_missing - If True, create blank records for GFF ids not in the base_dict. If False, an error will be raised. 
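        For example, a sketch of the create_missing=False behaviour using the
        concrete GFFParser subclass (file names are placeholders; this assumes
        every sequence id in the GFF file is present in the FASTA, otherwise a
        KeyError is raised):

            from Bio import SeqIO
            from BCBio import GFF

            with open("genome.fa") as fa_handle:
                base_dict = SeqIO.to_dict(SeqIO.parse(fa_handle, "fasta"))
            parser = GFF.GFFParser(create_missing=False)
            with open("annotations.gff3") as in_handle:
                for rec in parser.parse(in_handle, base_dict=base_dict):
                    print(rec.id, len(rec.features))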
""" self._create_missing = create_missing self._map_fn = _gff_line_map self._reduce_fn = _gff_line_reduce self._examiner = GFFExaminer() def _gff_process(self, gff_files, limit_info, target_lines=None): raise NotImplementedError("Derived class must define") def parse(self, gff_files, base_dict=None, limit_info=None): """Parse a GFF file, returning an iterator of SeqRecords. limit_info - A dictionary specifying the regions of the GFF file which should be extracted. This allows only relevant portions of a file to be parsed. base_dict - A base dictionary of SeqRecord objects which may be pre-populated with sequences and other features. The new features from the GFF file will be added to this dictionary. """ for rec in self.parse_in_parts(gff_files, base_dict, limit_info): yield rec def parse_in_parts(self, gff_files, base_dict=None, limit_info=None, target_lines=None): """Parse a region of a GFF file specified, returning info as generated. target_lines -- The number of lines in the file which should be used for each partial parse. This should be determined based on available memory. """ for results in self.parse_simple(gff_files, limit_info, target_lines): if base_dict is None: cur_dict = dict() else: cur_dict = copy.deepcopy(base_dict) cur_dict = self._results_to_features(cur_dict, results) all_ids = list(cur_dict.keys()) all_ids.sort() for cur_id in all_ids: yield cur_dict[cur_id] def parse_simple(self, gff_files, limit_info=None, target_lines=1): """Simple parse which does not build or nest features. This returns a simple dictionary representation of each line in the GFF file. """ # gracefully handle a single file passed if not isinstance(gff_files, (list, tuple)): gff_files = [gff_files] limit_info = self._normalize_limit_info(limit_info) for results in self._gff_process(gff_files, limit_info, target_lines): yield results def _normalize_limit_info(self, limit_info): """Turn all limit information into tuples for identical comparisons. """ final_limit_info = {} if limit_info: for key, values in limit_info.items(): final_limit_info[key] = [] for v in values: if isinstance(v, str): final_limit_info[key].append((v,)) else: final_limit_info[key].append(tuple(v)) return final_limit_info def _results_to_features(self, base, results): """Add parsed dictionaries of results to Biopython SeqFeatures. """ base = self._add_annotations(base, results.get('annotation', [])) for feature in results.get('feature', []): (_, base) = self._add_toplevel_feature(base, feature) base = self._add_parent_child_features(base, results.get('parent', []), results.get('child', [])) base = self._add_seqs(base, results.get('fasta', [])) base = self._add_directives(base, results.get('directive', [])) return base def _add_directives(self, base, directives): """Handle any directives or meta-data in the GFF file. Relevant items are added as annotation meta-data to each record. 
""" dir_keyvals = collections.defaultdict(list) for directive in directives: parts = directive.split() if len(parts) > 1: key = parts[0] if len(parts) == 2: val = parts[1] else: val = tuple(parts[1:]) # specific directives that need special handling if key == "sequence-region": # convert to Python 0-based coordinates if len(val) == 2: # handle regions missing contig val = (int(val[0]) - 1, int(val[1])) elif len(val) == 3: val = (val[0], int(val[1]) - 1, int(val[2])) dir_keyvals[key].append(val) for key, vals in dir_keyvals.items(): for rec in base.values(): self._add_ann_to_rec(rec, key, vals) return base def _get_matching_record_id(self, base, find_id): """Find a matching base record with the test identifier, handling tricky cases. NCBI IDs https://en.wikipedia.org/wiki/FASTA_format#NCBI_identifiers """ # Straight matches for identifiers if find_id in base: return find_id # NCBI style IDs in find_id elif find_id and find_id.find("|") > 0: for test_id in [x.strip() for x in find_id.split("|")[1:]]: if test_id and test_id in base: return test_id # NCBI style IDs in base IDs else: for base_id in base.keys(): if base_id.find("|") > 0: for test_id in [x.strip() for x in base_id.split("|")[1:]]: if test_id and test_id == find_id: return base_id return None def _add_seqs(self, base, recs): """Add sequence information contained in the GFF3 to records. """ for rec in recs: match_id = self._get_matching_record_id(base, rec.id) if match_id: base[match_id].seq = rec.seq else: base[rec.id] = rec return base def _add_parent_child_features(self, base, parents, children): """Add nested features with parent child relationships. """ multi_remap = self._identify_dup_ids(parents) # add children features children_prep = collections.defaultdict(list) for child_dict in children: child_feature = self._get_feature(child_dict) for pindex, pid in enumerate(child_feature.qualifiers['Parent']): if pid in multi_remap: pid = multi_remap[pid].remap_id(child_dict) child_feature.qualifiers['Parent'][pindex] = pid children_prep[pid].append((child_dict['rec_id'], child_feature)) children = dict(children_prep) # add children to parents that exist for cur_parent_dict in parents: cur_id = cur_parent_dict['id'] if cur_id in multi_remap: cur_parent_dict['id'] = multi_remap[cur_id].remap_id( cur_parent_dict) cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict) cur_parent, children = self._add_children_to_parent(cur_parent, children) # create parents for children without them (GFF2 or split/bad files) while len(children) > 0: parent_id, cur_children = next(itertools.islice(children.items(), 1)) # one child, do not nest it if len(cur_children) == 1: rec_id, child = cur_children[0] loc = (child.location.start, child.location.end) rec, base = self._get_rec(base, dict(rec_id=rec_id, location=loc)) rec.features.append(child) del children[parent_id] else: cur_parent, base = self._add_missing_parent(base, parent_id, cur_children) cur_parent, children = self._add_children_to_parent(cur_parent, children) return base def _identify_dup_ids(self, parents): """Identify duplicated ID attributes in potential nested parents. According to the GFF3 spec ID attributes are supposed to be unique for a file, but this is not always true in practice. This looks for duplicates, and provides unique IDs sorted by locations. 
""" multi_ids = collections.defaultdict(list) for parent in parents: multi_ids[parent['id']].append(parent) multi_ids = [(mid, ps) for (mid, ps) in multi_ids.items() if len(parents) > 1] multi_remap = dict() for mid, parents in multi_ids: multi_remap[mid] = _MultiIDRemapper(mid, parents) return multi_remap def _add_children_to_parent(self, cur_parent, children): """Recursively add children to parent features. """ if cur_parent.id in children: cur_children = children[cur_parent.id] ready_children = [] for _, cur_child in cur_children: cur_child, _ = self._add_children_to_parent(cur_child, children) ready_children.append(cur_child) # Support Biopython features for 1.62+ CompoundLocations and pre-1.62 if not hasattr(SeqFeature, "CompoundLocation"): cur_parent.location_operator = "join" for cur_child in ready_children: cur_parent.sub_features.append(cur_child) del children[cur_parent.id] return cur_parent, children def _add_annotations(self, base, anns): """Add annotation data from the GFF file to records. """ # add these as a list of annotations, checking not to overwrite # current values for ann in anns: rec, base = self._get_rec(base, ann) for key, vals in ann['quals'].items(): self._add_ann_to_rec(rec, key, vals) return base def _add_ann_to_rec(self, rec, key, vals): """Add a key/value annotation to the given SeqRecord. """ if key in rec.annotations: try: rec.annotations[key].extend(vals) except AttributeError: rec.annotations[key] = [rec.annotations[key]] + vals else: rec.annotations[key] = vals def _get_rec(self, base, info_dict): """Retrieve a record to add features to. """ max_loc = info_dict.get('location', (0, 1))[1] match_id = self._get_matching_record_id(base, info_dict['rec_id']) if match_id: cur_rec = base[match_id] # update generated unknown sequences with the expected maximum length if unknown_seq_avail and isinstance(cur_rec.seq, UnknownSeq): cur_rec.seq._length = max([max_loc, cur_rec.seq._length]) elif not unknown_seq_avail and isinstance(cur_rec.seq._data, _UndefinedSequenceData): cur_rec.seq._data._length = max([max_loc, cur_rec.seq._data._length]) return cur_rec, base elif self._create_missing: if unknown_seq_avail: new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id']) else: new_rec = SeqRecord(Seq(None, length=max_loc), info_dict['rec_id']) base[info_dict['rec_id']] = new_rec return new_rec, base else: raise KeyError("Did not find matching record in %s for %s" % (base.keys(), info_dict)) def _add_missing_parent(self, base, parent_id, cur_children): """Add a new feature that is missing from the GFF file. """ base_rec_id = list(set(c[0] for c in cur_children)) child_strands = list(set(c[1].location.strand for c in cur_children)) inferred_strand = child_strands[0] if len(child_strands) == 1 else None assert len(base_rec_id) > 0 feature_dict = dict(id=parent_id, strand=inferred_strand, type="inferred_parent", quals=dict(ID=[parent_id]), rec_id=base_rec_id[0]) coords = [(c.location.start, c.location.end) for r, c in cur_children] feature_dict["location"] = (min([c[0] for c in coords]), max([c[1] for c in coords])) return self._add_toplevel_feature(base, feature_dict) def _add_toplevel_feature(self, base, feature_dict): """Add a toplevel non-nested feature to the appropriate record. """ new_feature = self._get_feature(feature_dict) rec, base = self._get_rec(base, feature_dict) rec.features.append(new_feature) return new_feature, base def _get_feature(self, feature_dict): """Retrieve a Biopython feature from our dictionary representation. 
""" #location = SeqFeature.FeatureLocation(*feature_dict['location']) rstart, rend = feature_dict['location'] new_feature = SeqFeature.SeqFeature(SeqFeature.SimpleLocation(start=rstart, end=rend, strand=feature_dict['strand']), feature_dict['type'], id=feature_dict['id']) # Support for Biopython 1.68 and above, which removed sub_features if not hasattr(new_feature, "sub_features"): new_feature.sub_features = [] new_feature.qualifiers = feature_dict['quals'] return new_feature def _parse_fasta(self, in_handle): """Parse FASTA sequence information contained in the GFF3 file. """ return list(SeqIO.parse(in_handle, "fasta")) class _GFFParserLocalOut: """Provide a collector for local GFF MapReduce file parsing. """ def __init__(self, smart_breaks=False): self._items = dict() self._smart_breaks = smart_breaks self._missing_keys = collections.defaultdict(int) self._last_parent = None self.can_break = True self.num_lines = 0 def add(self, key, vals): if self._smart_breaks: # if we are not GFF2 we expect parents and break # based on not having missing ones if key == 'directive': if vals[0] == '#': self.can_break = True self._last_parent = None elif not vals[0].get("is_gff2", False): self._update_missing_parents(key, vals) self.can_break = (len(self._missing_keys) == 0) # break when we are done with stretches of child features elif key != 'child': self.can_break = True self._last_parent = None # break when we have lots of child features in a row # and change between parents else: cur_parent = vals[0]["quals"]["Parent"][0] if (self._last_parent): self.can_break = (cur_parent != self._last_parent) self._last_parent = cur_parent self.num_lines += 1 try: self._items[key].extend(vals) except KeyError: self._items[key] = vals def _update_missing_parents(self, key, vals): # smart way of deciding if we can break this. # if this is too much, can go back to not breaking in the # middle of children if key in ["child"]: for val in vals: for p_id in val["quals"]["Parent"]: self._missing_keys[p_id] += 1 for val in vals: try: del self._missing_keys[val["quals"]["ID"][0]] except KeyError: pass def has_items(self): return len(self._items) > 0 def get_results(self): self._last_parent = None return self._items class GFFParser(_AbstractMapReduceGFF): """Local GFF parser providing standardized parsing of GFF3 and GFF2 files. """ def __init__(self, line_adjust_fn=None, create_missing=True): _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) self._line_adjust_fn = line_adjust_fn def _gff_process(self, gff_files, limit_info, target_lines): """Process GFF addition without any parallelization. In addition to limit filtering, this accepts a target_lines attribute which provides a number of lines to parse before returning results. This allows partial parsing of a file to prevent memory issues. """ line_gen = self._file_line_generator(gff_files) for out in self._lines_to_out_info(line_gen, limit_info, target_lines): yield out def _file_line_generator(self, gff_files): """Generate single lines from a set of GFF files. """ for gff_file in gff_files: if hasattr(gff_file, "read"): need_close = False in_handle = gff_file else: need_close = True in_handle = open(gff_file) while 1: line = in_handle.readline() if not line: break yield line if need_close: in_handle.close() def _lines_to_out_info(self, line_iter, limit_info=None, target_lines=None): """Generate SeqRecord and SeqFeatures from GFF file lines. 
""" params = self._examiner._get_local_params(limit_info) out_info = _GFFParserLocalOut((target_lines is not None and target_lines > 1)) found_seqs = False for line in line_iter: results = self._map_fn(line, params) if self._line_adjust_fn and results: if results[0][0] not in ['directive']: results = [(results[0][0], self._line_adjust_fn(results[0][1]))] self._reduce_fn(results, out_info, params) if (target_lines and out_info.num_lines >= target_lines and out_info.can_break): yield out_info.get_results() out_info = _GFFParserLocalOut((target_lines is not None and target_lines > 1)) if (results and results[0][0] == 'directive' and results[0][1] == 'FASTA'): found_seqs = True break class FakeHandle: def __init__(self, line_iter): self._iter = line_iter def __iter__(self): return self def __next__(self): return next(self._iter) next = __next__ def read(self, size=-1): if size < 0: return "".join(l for l in self._iter) elif size == 0: return "" # Used by Biopython to sniff unicode vs bytes else: raise NotImplementedError def readline(self): try: return next(self._iter) except StopIteration: return "" if found_seqs: fasta_recs = self._parse_fasta(FakeHandle(line_iter)) out_info.add('fasta', fasta_recs) if out_info.has_items(): yield out_info.get_results() class DiscoGFFParser(_AbstractMapReduceGFF): """GFF Parser with parallelization through Disco (http://discoproject.org. """ def __init__(self, disco_host, create_missing=True): """Initialize parser. disco_host - Web reference to a Disco host which will be used for parallelizing the GFF reading job. """ _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) self._disco_host = disco_host def _gff_process(self, gff_files, limit_info, target_lines=None): """Process GFF addition, using Disco to parallelize the process. """ assert target_lines is None, "Cannot split parallelized jobs" # make these imports local; only need them when using disco import simplejson import disco # absolute path names unless they are special disco files full_files = [] for f in gff_files: if f.split(":")[0] != "disco": full_files.append(os.path.abspath(f)) else: full_files.append(f) results = disco.job(self._disco_host, name="gff_reader", input=full_files, params=disco.Params(limit_info=limit_info, jsonify=True, filter_info=self._examiner._filter_info), required_modules=["simplejson", "collections", "re"], map=self._map_fn, reduce=self._reduce_fn) processed = dict() for out_key, out_val in disco.result_iterator(results): processed[out_key] = simplejson.loads(out_val) yield processed def parse(gff_files, base_dict=None, limit_info=None, target_lines=None): """High level interface to parse GFF files into SeqRecords and SeqFeatures. """ parser = GFFParser() for rec in parser.parse_in_parts(gff_files, base_dict, limit_info, target_lines): yield rec def parse_simple(gff_files, limit_info=None): """Parse GFF files as line by line dictionary of parts. """ parser = GFFParser() for rec in parser.parse_simple(gff_files, limit_info=limit_info): if "child" in rec: assert "parent" not in rec yield rec["child"][0] elif "parent" in rec: yield rec["parent"][0] elif "feature" in rec: yield rec["feature"][0] # ignore directive lines else: assert "directive" in rec def _file_or_handle(fn): """Decorator to handle either an input handle or a file. 
""" def _file_or_handle_inside(*args, **kwargs): in_file = args[1] if hasattr(in_file, "read"): need_close = False in_handle = in_file if six.PY3 and not isinstance(in_handle, io.TextIOBase): raise TypeError('input handle must be opened in text mode') else: need_close = True in_handle = open(in_file) args = (args[0], in_handle) + args[2:] out = fn(*args, **kwargs) if need_close: in_handle.close() return out return _file_or_handle_inside class GFFExaminer: """Provide high level details about a GFF file to refine parsing. GFF is a spec and is provided by many different centers. Real life files will present the same information in slightly different ways. Becoming familiar with the file you are dealing with is the best way to extract the information you need. This class provides high level summary details to help in learning. """ def __init__(self): self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2], gff_source = [1], gff_type = [2]) def _get_local_params(self, limit_info=None): class _LocalParams: def __init__(self): self.jsonify = False params = _LocalParams() params.limit_info = limit_info params.filter_info = self._filter_info return params @_file_or_handle def available_limits(self, gff_handle): """Return dictionary information on possible limits for this file. This returns a nested dictionary with the following structure: keys -- names of items to filter by values -- dictionary with: keys -- filter choice value -- counts of that filter in this file Not a parallelized map-reduce implementation. """ cur_limits = dict() for filter_key in self._filter_info.keys(): cur_limits[filter_key] = collections.defaultdict(int) for line in gff_handle: # when we hit FASTA sequences, we are done with annotations if line.startswith("##FASTA"): break # ignore empty and comment lines if line.strip() and line.strip()[0] != "#": parts = [p.strip() for p in line.split('\t')] assert len(parts) >= 8, line parts = parts[:9] for filter_key, cur_indexes in self._filter_info.items(): cur_id = tuple([parts[i] for i in cur_indexes]) cur_limits[filter_key][cur_id] += 1 # get rid of the default dicts final_dict = dict() for key, value_dict in cur_limits.items(): if len(key) == 1: key = key[0] final_dict[key] = dict(value_dict) gff_handle.close() return final_dict @_file_or_handle def parent_child_map(self, gff_handle): """Provide a mapping of parent to child relationships in the file. Returns a dictionary of parent child relationships: keys -- tuple of (source, type) for each parent values -- tuple of (source, type) as children of that parent Not a parallelized map-reduce implementation. 
""" # collect all of the parent and child types mapped to IDs parent_sts = dict() child_sts = collections.defaultdict(list) for line in gff_handle: # when we hit FASTA sequences, we are done with annotations if line.startswith("##FASTA"): break if line.strip() and not line.startswith("#"): line_type, line_info = _gff_line_map(line, self._get_local_params())[0] if (line_type == 'parent' or (line_type == 'child' and line_info['id'])): parent_sts[line_info['id']] = ( line_info['quals'].get('source', [""])[0], line_info['type']) if line_type == 'child': for parent_id in line_info['quals']['Parent']: child_sts[parent_id].append(( line_info['quals'].get('source', [""])[0], line_info['type'])) #print parent_sts, child_sts # generate a dictionary of the unique final type relationships pc_map = collections.defaultdict(list) for parent_id, parent_type in parent_sts.items(): for child_type in child_sts[parent_id]: pc_map[parent_type].append(child_type) pc_final_map = dict() for ptype, ctypes in pc_map.items(): unique_ctypes = list(set(ctypes)) unique_ctypes.sort() pc_final_map[ptype] = unique_ctypes return pc_final_map ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536753.0 bcbio-gff-0.7.1/BCBio/GFF/__init__.py0000664002421100242110000000035514555727461017151 0ustar00bchapmanbchapman"""Top level of GFF parsing providing shortcuts for useful classes. """ from BCBio.GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple from BCBio.GFF.GFFOutput import GFF3Writer, write __version__ = "0.7.1" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706534767.0 bcbio-gff-0.7.1/BCBio/GFF/_utils.py0000664002421100242110000000256014555723557016713 0ustar00bchapmanbchapmanclass defaultdict(dict): """Back compatible defaultdict: http://code.activestate.com/recipes/523034/ """ def __init__(self, default_factory=None, *a, **kw): if (default_factory is not None and not hasattr(default_factory, '__call__')): raise TypeError('first argument must be callable') dict.__init__(self, *a, **kw) self.default_factory = default_factory def __getitem__(self, key): try: return dict.__getitem__(self, key) except KeyError: return self.__missing__(key) def __missing__(self, key): if self.default_factory is None: raise KeyError(key) self[key] = value = self.default_factory() return value def __reduce__(self): if self.default_factory is None: args = tuple() else: args = self.default_factory, return type(self), args, None, None, self.items() def copy(self): return self.__copy__() def __copy__(self): return type(self)(self.default_factory, self) def __deepcopy__(self, memo): import copy return type(self)(self.default_factory, copy.deepcopy(self.items())) def __repr__(self): return 'defaultdict(%s, %s)' % (self.default_factory, dict.__repr__(self)) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706534767.0 bcbio-gff-0.7.1/BCBio/__init__.py0000664002421100242110000000002414555723557016542 0ustar00bchapmanbchapman"""BCBio module """ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706534767.0 bcbio-gff-0.7.1/LICENSE0000664002421100242110000000202714555723557014525 0ustar00bchapmanbchapmanBiopython License Agreement Permission to use, copy, modify, and distribute this software and its documentation with or without modifications and for any purpose and without fee is hereby granted, provided that any copyright notices appear in all copies and that both those copyright notices and 
this permission notice appear in supporting documentation, and that the names of the contributors or copyright holders not be used in advertising or publicity pertaining to distribution of the software without specific prior permission. THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706534767.0 bcbio-gff-0.7.1/MANIFEST.in0000664002421100242110000000012714555723557015255 0ustar00bchapmanbchapmanrecursive-include BCBio *.py include distribute_setup.py include *.rst include LICENSE ././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1706536857.459836 bcbio-gff-0.7.1/PKG-INFO0000644002421100242110000000052614555727631014612 0ustar00bchapmanbchapmanMetadata-Version: 2.1 Name: bcbio-gff Version: 0.7.1 Summary: Read and write Generic Feature Format (GFF) with Biopython integration. Home-page: https://github.com/chapmanb/bcbb/tree/master/gff Author: Brad Chapman Author-email: chapmanb@50mail.com License: Biopython License License-File: LICENSE Requires-Dist: six Requires-Dist: biopython ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706534767.0 bcbio-gff-0.7.1/README.rst0000664002421100242110000000165214555723557015212 0ustar00bchapmanbchapmanbcbio-gff --------- A python library to read and write Generic Feature Format (`GFF`_). See the `wiki documentation`_ for details on usage. The goal is to integrate this code in `gffutils`_ and `Biopython`_. Most of the functionality from this library has been integrated into `gffutils `_ and we recommend using that for parsing GFF. It's an improved approach to handling GFF and well maintained by `Ryan Dale `_. Installation from `bcbio-gff in pypi`_:: pip install bcbio-gff This code is freely available for use under the `Biopython license `_. .. _GFF: http://www.sequenceontology.org/gff3.shtml .. _wiki documentation: http://biopython.org/wiki/GFF_Parsing .. _gffutils: https://github.com/daler/gffutils .. _Biopython: http://biopython.org .. _bcbio-gff in pypi: https://pypi.python.org/pypi/bcbio-gff ././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1706536857.459836 bcbio-gff-0.7.1/bcbio_gff.egg-info/0000775002421100242110000000000014555727631017106 5ustar00bchapmanbchapman././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536857.0 bcbio-gff-0.7.1/bcbio_gff.egg-info/PKG-INFO0000644002421100242110000000052614555727631020204 0ustar00bchapmanbchapmanMetadata-Version: 2.1 Name: bcbio-gff Version: 0.7.1 Summary: Read and write Generic Feature Format (GFF) with Biopython integration. 
Home-page: https://github.com/chapmanb/bcbb/tree/master/gff Author: Brad Chapman Author-email: chapmanb@50mail.com License: Biopython License License-File: LICENSE Requires-Dist: six Requires-Dist: biopython ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536857.0 bcbio-gff-0.7.1/bcbio_gff.egg-info/SOURCES.txt0000664002421100242110000000051114555727631020767 0ustar00bchapmanbchapmanLICENSE MANIFEST.in README.rst distribute_setup.py setup.py BCBio/__init__.py BCBio/GFF/GFFOutput.py BCBio/GFF/GFFParser.py BCBio/GFF/__init__.py BCBio/GFF/_utils.py bcbio_gff.egg-info/PKG-INFO bcbio_gff.egg-info/SOURCES.txt bcbio_gff.egg-info/dependency_links.txt bcbio_gff.egg-info/requires.txt bcbio_gff.egg-info/top_level.txt././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536857.0 bcbio-gff-0.7.1/bcbio_gff.egg-info/dependency_links.txt0000664002421100242110000000000114555727631023154 0ustar00bchapmanbchapman ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536857.0 bcbio-gff-0.7.1/bcbio_gff.egg-info/requires.txt0000664002421100242110000000001614555727631021503 0ustar00bchapmanbchapmansix biopython ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536857.0 bcbio-gff-0.7.1/bcbio_gff.egg-info/top_level.txt0000664002421100242110000000000614555727631021634 0ustar00bchapmanbchapmanBCBio ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706534767.0 bcbio-gff-0.7.1/distribute_setup.py0000664002421100242110000004241014555723557017470 0ustar00bchapmanbchapman#!python """Bootstrap distribute installation If you want to use setuptools in your package's setup.py, just include this file in the same directory with it, and add this to the top of your setup.py:: from distribute_setup import use_setuptools use_setuptools() If you want to require a specific version of setuptools, set a download mirror, or use an alternate download directory, you can do so by supplying the appropriate options to ``use_setuptools()``. This file can also be run as a script to install or upgrade setuptools. 
""" import os import shutil import sys import time import fnmatch import tempfile import tarfile import optparse from distutils import log try: from site import USER_SITE except ImportError: USER_SITE = None try: import subprocess def _python_cmd(*args): args = (sys.executable,) + args return subprocess.call(args) == 0 except ImportError: # will be used for python 2.3 def _python_cmd(*args): args = (sys.executable,) + args # quoting arguments if windows if sys.platform == 'win32': def quote(arg): if ' ' in arg: return '"%s"' % arg return arg args = [quote(arg) for arg in args] return os.spawnl(os.P_WAIT, sys.executable, *args) == 0 DEFAULT_VERSION = "0.6.49" DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/" SETUPTOOLS_FAKED_VERSION = "0.6c11" SETUPTOOLS_PKG_INFO = """\ Metadata-Version: 1.0 Name: setuptools Version: %s Summary: xxxx Home-page: xxx Author: xxx Author-email: xxx License: xxx Description: xxx """ % SETUPTOOLS_FAKED_VERSION def _install(tarball, install_args=()): # extracting the tarball tmpdir = tempfile.mkdtemp() log.warn('Extracting in %s', tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) tar = tarfile.open(tarball) _extractall(tar) tar.close() # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) log.warn('Now working in %s', subdir) # installing log.warn('Installing Distribute') if not _python_cmd('setup.py', 'install', *install_args): log.warn('Something went wrong during the installation.') log.warn('See the error message above.') # exitcode will be 2 return 2 finally: os.chdir(old_wd) shutil.rmtree(tmpdir) def _build_egg(egg, tarball, to_dir): # extracting the tarball tmpdir = tempfile.mkdtemp() log.warn('Extracting in %s', tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) tar = tarfile.open(tarball) _extractall(tar) tar.close() # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) log.warn('Now working in %s', subdir) # building an egg log.warn('Building a Distribute egg in %s', to_dir) _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) finally: os.chdir(old_wd) shutil.rmtree(tmpdir) # returning the result log.warn(egg) if not os.path.exists(egg): raise IOError('Could not build the egg.') def _do_download(version, download_base, to_dir, download_delay): egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg' % (version, sys.version_info[0], sys.version_info[1])) if not os.path.exists(egg): tarball = download_setuptools(version, download_base, to_dir, download_delay) _build_egg(egg, tarball, to_dir) sys.path.insert(0, egg) import setuptools setuptools.bootstrap_install_from = egg def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, download_delay=15, no_fake=True): # making sure we use the absolute path to_dir = os.path.abspath(to_dir) was_imported = 'pkg_resources' in sys.modules or \ 'setuptools' in sys.modules try: try: import pkg_resources # Setuptools 0.7b and later is a suitable (and preferable) # substitute for any Distribute version. 
try: pkg_resources.require("setuptools>=0.7b") return except (pkg_resources.DistributionNotFound, pkg_resources.VersionConflict): pass if not hasattr(pkg_resources, '_distribute'): if not no_fake: _fake_setuptools() raise ImportError except ImportError: return _do_download(version, download_base, to_dir, download_delay) try: pkg_resources.require("distribute>=" + version) return except pkg_resources.VersionConflict: e = sys.exc_info()[1] if was_imported: sys.stderr.write( "The required version of distribute (>=%s) is not available,\n" "and can't be installed while this script is running. Please\n" "install a more recent version first, using\n" "'easy_install -U distribute'." "\n\n(Currently using %r)\n" % (version, e.args[0])) sys.exit(2) else: del pkg_resources, sys.modules['pkg_resources'] # reload ok return _do_download(version, download_base, to_dir, download_delay) except pkg_resources.DistributionNotFound: return _do_download(version, download_base, to_dir, download_delay) finally: if not no_fake: _create_fake_setuptools_pkg_info(to_dir) def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay=15): """Download distribute from a specified location and return its filename `version` should be a valid distribute version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where the egg will be downloaded. `delay` is the number of seconds to pause before an actual download attempt. """ # making sure we use the absolute path to_dir = os.path.abspath(to_dir) try: from urllib.request import urlopen except ImportError: from urllib2 import urlopen tgz_name = "distribute-%s.tar.gz" % version url = download_base + tgz_name saveto = os.path.join(to_dir, tgz_name) src = dst = None if not os.path.exists(saveto): # Avoid repeated downloads try: log.warn("Downloading %s", url) src = urlopen(url) # Read/write all in one block, so we don't create a corrupt file # if the download is interrupted. 
data = src.read() dst = open(saveto, "wb") dst.write(data) finally: if src: src.close() if dst: dst.close() return os.path.realpath(saveto) def _no_sandbox(function): def __no_sandbox(*args, **kw): try: from setuptools.sandbox import DirectorySandbox if not hasattr(DirectorySandbox, '_old'): def violation(*args): pass DirectorySandbox._old = DirectorySandbox._violation DirectorySandbox._violation = violation patched = True else: patched = False except ImportError: patched = False try: return function(*args, **kw) finally: if patched: DirectorySandbox._violation = DirectorySandbox._old del DirectorySandbox._old return __no_sandbox def _patch_file(path, content): """Will backup the file then patch it""" f = open(path) existing_content = f.read() f.close() if existing_content == content: # already patched log.warn('Already patched.') return False log.warn('Patching...') _rename_path(path) f = open(path, 'w') try: f.write(content) finally: f.close() return True _patch_file = _no_sandbox(_patch_file) def _same_content(path, content): f = open(path) existing_content = f.read() f.close() return existing_content == content def _rename_path(path): new_name = path + '.OLD.%s' % time.time() log.warn('Renaming %s to %s', path, new_name) os.rename(path, new_name) return new_name def _remove_flat_installation(placeholder): if not os.path.isdir(placeholder): log.warn('Unkown installation at %s', placeholder) return False found = False for file in os.listdir(placeholder): if fnmatch.fnmatch(file, 'setuptools*.egg-info'): found = True break if not found: log.warn('Could not locate setuptools*.egg-info') return log.warn('Moving elements out of the way...') pkg_info = os.path.join(placeholder, file) if os.path.isdir(pkg_info): patched = _patch_egg_dir(pkg_info) else: patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO) if not patched: log.warn('%s already patched.', pkg_info) return False # now let's move the files out of the way for element in ('setuptools', 'pkg_resources.py', 'site.py'): element = os.path.join(placeholder, element) if os.path.exists(element): _rename_path(element) else: log.warn('Could not find the %s element of the ' 'Setuptools distribution', element) return True _remove_flat_installation = _no_sandbox(_remove_flat_installation) def _after_install(dist): log.warn('After install bootstrap.') placeholder = dist.get_command_obj('install').install_purelib _create_fake_setuptools_pkg_info(placeholder) def _create_fake_setuptools_pkg_info(placeholder): if not placeholder or not os.path.exists(placeholder): log.warn('Could not find the install location') return pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1]) setuptools_file = 'setuptools-%s-py%s.egg-info' % \ (SETUPTOOLS_FAKED_VERSION, pyver) pkg_info = os.path.join(placeholder, setuptools_file) if os.path.exists(pkg_info): log.warn('%s already exists', pkg_info) return log.warn('Creating %s', pkg_info) try: f = open(pkg_info, 'w') except EnvironmentError: log.warn("Don't have permissions to write %s, skipping", pkg_info) return try: f.write(SETUPTOOLS_PKG_INFO) finally: f.close() pth_file = os.path.join(placeholder, 'setuptools.pth') log.warn('Creating %s', pth_file) f = open(pth_file, 'w') try: f.write(os.path.join(os.curdir, setuptools_file)) finally: f.close() _create_fake_setuptools_pkg_info = _no_sandbox( _create_fake_setuptools_pkg_info ) def _patch_egg_dir(path): # let's check if it's already patched pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') if os.path.exists(pkg_info): if _same_content(pkg_info, 
SETUPTOOLS_PKG_INFO): log.warn('%s already patched.', pkg_info) return False _rename_path(path) os.mkdir(path) os.mkdir(os.path.join(path, 'EGG-INFO')) pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') f = open(pkg_info, 'w') try: f.write(SETUPTOOLS_PKG_INFO) finally: f.close() return True _patch_egg_dir = _no_sandbox(_patch_egg_dir) def _before_install(): log.warn('Before install bootstrap.') _fake_setuptools() def _under_prefix(location): if 'install' not in sys.argv: return True args = sys.argv[sys.argv.index('install') + 1:] for index, arg in enumerate(args): for option in ('--root', '--prefix'): if arg.startswith('%s=' % option): top_dir = arg.split('root=')[-1] return location.startswith(top_dir) elif arg == option: if len(args) > index: top_dir = args[index + 1] return location.startswith(top_dir) if arg == '--user' and USER_SITE is not None: return location.startswith(USER_SITE) return True def _fake_setuptools(): log.warn('Scanning installed packages') try: import pkg_resources except ImportError: # we're cool log.warn('Setuptools or Distribute does not seem to be installed.') return ws = pkg_resources.working_set try: setuptools_dist = ws.find( pkg_resources.Requirement.parse('setuptools', replacement=False) ) except TypeError: # old distribute API setuptools_dist = ws.find( pkg_resources.Requirement.parse('setuptools') ) if setuptools_dist is None: log.warn('No setuptools distribution found') return # detecting if it was already faked setuptools_location = setuptools_dist.location log.warn('Setuptools installation detected at %s', setuptools_location) # if --root or --preix was provided, and if # setuptools is not located in them, we don't patch it if not _under_prefix(setuptools_location): log.warn('Not patching, --root or --prefix is installing Distribute' ' in another location') return # let's see if its an egg if not setuptools_location.endswith('.egg'): log.warn('Non-egg installation') res = _remove_flat_installation(setuptools_location) if not res: return else: log.warn('Egg installation') pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO') if (os.path.exists(pkg_info) and _same_content(pkg_info, SETUPTOOLS_PKG_INFO)): log.warn('Already patched.') return log.warn('Patching...') # let's create a fake egg replacing setuptools one res = _patch_egg_dir(setuptools_location) if not res: return log.warn('Patching complete.') _relaunch() def _relaunch(): log.warn('Relaunching...') # we have to relaunch the process # pip marker to avoid a relaunch bug _cmd1 = ['-c', 'install', '--single-version-externally-managed'] _cmd2 = ['-c', 'install', '--record'] if sys.argv[:3] == _cmd1 or sys.argv[:3] == _cmd2: sys.argv[0] = 'setup.py' args = [sys.executable] + sys.argv sys.exit(subprocess.call(args)) def _extractall(self, path=".", members=None): """Extract all members from the archive to the current working directory and set owner, modification time and permissions on directories afterwards. `path' specifies a different directory to extract to. `members' is optional and must be a subset of the list returned by getmembers(). """ import copy import operator from tarfile import ExtractError directories = [] if members is None: members = self for tarinfo in members: if tarinfo.isdir(): # Extract directories with a safe mode. directories.append(tarinfo) tarinfo = copy.copy(tarinfo) tarinfo.mode = 448 # decimal for oct 0700 self.extract(tarinfo, path) # Reverse sort directories. 
if sys.version_info < (2, 4): def sorter(dir1, dir2): return cmp(dir1.name, dir2.name) directories.sort(sorter) directories.reverse() else: directories.sort(key=operator.attrgetter('name'), reverse=True) # Set correct owner, mtime and filemode on directories. for tarinfo in directories: dirpath = os.path.join(path, tarinfo.name) try: self.chown(tarinfo, dirpath) self.utime(tarinfo, dirpath) self.chmod(tarinfo, dirpath) except ExtractError: e = sys.exc_info()[1] if self.errorlevel > 1: raise else: self._dbg(1, "tarfile: %s" % e) def _build_install_args(options): """ Build the arguments to 'python setup.py install' on the distribute package """ install_args = [] if options.user_install: if sys.version_info < (2, 6): log.warn("--user requires Python 2.6 or later") raise SystemExit(1) install_args.append('--user') return install_args def _parse_args(): """ Parse the command line for options """ parser = optparse.OptionParser() parser.add_option( '--user', dest='user_install', action='store_true', default=False, help='install in user site package (requires Python 2.6 or later)') parser.add_option( '--download-base', dest='download_base', metavar="URL", default=DEFAULT_URL, help='alternative URL from where to download the distribute package') options, args = parser.parse_args() # positional arguments are ignored return options def main(version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" options = _parse_args() tarball = download_setuptools(download_base=options.download_base) return _install(tarball, _build_install_args(options)) if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1706536857.459836 bcbio-gff-0.7.1/setup.cfg0000664002421100242110000000004614555727631015335 0ustar00bchapmanbchapman[egg_info] tag_build = tag_date = 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1706536445.0 bcbio-gff-0.7.1/setup.py0000664002421100242110000000134314555726775015237 0ustar00bchapmanbchapman#!/usr/bin/env python """Python setup file for Blue Collar Bioinformatics scripts and modules. """ from distribute_setup import use_setuptools use_setuptools() from setuptools import setup, find_packages __version__ = "Undefined" for line in open('BCBio/GFF/__init__.py'): if (line.startswith('__version__')): exec(line.strip()) setup(name="bcbio-gff", version=__version__, author="Brad Chapman", author_email="chapmanb@50mail.com", license="Biopython License", description="Read and write Generic Feature Format (GFF) with Biopython integration.", url="https://github.com/chapmanb/bcbb/tree/master/gff", packages=find_packages(), install_requires=["six", "biopython"] )
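A minimal end-to-end sketch of the package laid out above (file names are placeholders): parse a GFF3 file into Biopython SeqRecord objects, then write them back out with the module level helpers.

from BCBio import GFF

# Read all records and their nested features into memory.
with open("in.gff3") as in_handle:
    records = list(GFF.parse(in_handle))

# Round-trip them back to GFF3.
with open("out.gff3", "w") as out_handle:
    GFF.write(records, out_handle)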