././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3726625 bcbio-gff-0.6.9/0000775002421100242110000000000000000000000014705 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3686624 bcbio-gff-0.6.9/BCBio/0000775002421100242110000000000000000000000015623 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3686624 bcbio-gff-0.6.9/BCBio/GFF/0000775002421100242110000000000000000000000016225 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/BCBio/GFF/GFFOutput.py0000664002421100242110000001661000000000000020426 0ustar00bchapmanbchapman00000000000000"""Output Biopython SeqRecords and SeqFeatures to GFF3 format. The target format is GFF3, the current GFF standard: http://www.sequenceontology.org/gff3.shtml """ from six.moves import urllib from Bio import SeqIO class _IdHandler: """Generate IDs for GFF3 Parent/Child relationships where they don't exist. """ def __init__(self): self._prefix = "biopygen" self._counter = 1 self._seen_ids = [] def _generate_id(self, quals): """Generate a unique ID not present in our existing IDs. """ gen_id = self._get_standard_id(quals) if gen_id is None: while 1: gen_id = "%s%s" % (self._prefix, self._counter) if gen_id not in self._seen_ids: break self._counter += 1 return gen_id def _get_standard_id(self, quals): """Retrieve standardized IDs from other sources like NCBI GenBank. This tries to find IDs from known key/values when stored differently than GFF3 specifications. """ possible_keys = ["transcript_id", "protein_id"] for test_key in possible_keys: if test_key in quals: cur_id = quals[test_key] if isinstance(cur_id, tuple) or isinstance(cur_id, list): return cur_id[0] else: return cur_id return None def update_quals(self, quals, has_children): """Update a set of qualifiers, adding an ID if necessary. """ cur_id = quals.get("ID", None) # if we have an ID, record it if cur_id: if not isinstance(cur_id, list) and not isinstance(cur_id, tuple): cur_id = [cur_id] for add_id in cur_id: self._seen_ids.append(add_id) # if we need one and don't have it, create a new one elif has_children: new_id = self._generate_id(quals) self._seen_ids.append(new_id) quals["ID"] = [new_id] return quals class GFF3Writer: """Write GFF3 files starting with standard Biopython objects. """ def __init__(self): pass def write(self, recs, out_handle, include_fasta=False): """Write the provided records to the given handle in GFF3 format. """ id_handler = _IdHandler() self._write_header(out_handle) fasta_recs = [] try: recs = iter(recs) except TypeError: recs = [recs] for rec in recs: self._write_rec(rec, out_handle) self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle) for sf in rec.features: sf = self._clean_feature(sf) id_handler = self._write_feature(sf, rec.id, out_handle, id_handler) if include_fasta and len(rec.seq) > 0: fasta_recs.append(rec) if len(fasta_recs) > 0: self._write_fasta(fasta_recs, out_handle) def _clean_feature(self, feature): quals = {} for key, val in feature.qualifiers.items(): if not isinstance(val, (list, tuple)): val = [val] val = [str(x) for x in val] quals[key] = val feature.qualifiers = quals # Support for Biopython 1.68 and above, which removed sub_features if not hasattr(feature, "sub_features"): feature.sub_features = [] clean_sub = [self._clean_feature(f) for f in feature.sub_features] feature.sub_features = clean_sub return feature def _write_rec(self, rec, out_handle): # if we have a SeqRecord, write out optional directive if len(rec.seq) > 0: out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq))) def _get_phase(self, feature): if "phase" in feature.qualifiers: phase = feature.qualifiers["phase"][0] elif feature.type == "CDS": phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1 else: phase = "." return str(phase) def _write_feature(self, feature, rec_id, out_handle, id_handler, parent_id=None): """Write a feature with location information. """ if feature.strand == 1: strand = '+' elif feature.strand == -1: strand = '-' else: strand = '.' # remove any standard features from the qualifiers quals = feature.qualifiers.copy() for std_qual in ["source", "score", "phase"]: if std_qual in quals and len(quals[std_qual]) == 1: del quals[std_qual] # add a link to a parent identifier if it exists if parent_id: if not "Parent" in quals: quals["Parent"] = [] quals["Parent"].append(parent_id) quals = id_handler.update_quals(quals, len(feature.sub_features) > 0) if feature.type: ftype = feature.type else: ftype = "sequence_feature" parts = [str(rec_id), feature.qualifiers.get("source", ["feature"])[0], ftype, str(feature.location.nofuzzy_start + 1), # 1-based indexing str(feature.location.nofuzzy_end), feature.qualifiers.get("score", ["."])[0], strand, self._get_phase(feature), self._format_keyvals(quals)] out_handle.write("\t".join(parts) + "\n") for sub_feature in feature.sub_features: id_handler = self._write_feature(sub_feature, rec_id, out_handle, id_handler, quals["ID"][0]) return id_handler def _format_keyvals(self, keyvals): format_kvs = [] for key in sorted(keyvals.keys()): values = keyvals[key] key = key.strip() format_vals = [] if not isinstance(values, list) or isinstance(values, tuple): values = [values] for val in values: val = urllib.parse.quote(str(val).strip(), safe=":/ ") if ((key and val) and val not in format_vals): format_vals.append(val) format_kvs.append("%s=%s" % (key, ",".join(format_vals))) return ";".join(format_kvs) def _write_annotations(self, anns, rec_id, size, out_handle): """Add annotations which refer to an entire sequence. """ format_anns = self._format_keyvals(anns) if format_anns: parts = [rec_id, "annotation", "remark", "1", str(size if size > 1 else 1), ".", ".", ".", format_anns] out_handle.write("\t".join(parts) + "\n") def _write_header(self, out_handle): """Write out standard header directives. """ out_handle.write("##gff-version 3\n") def _write_fasta(self, recs, out_handle): """Write sequence records using the ##FASTA directive. """ out_handle.write("##FASTA\n") SeqIO.write(recs, out_handle, "fasta") def write(recs, out_handle, include_fasta=False): """High level interface to write GFF3 files from SeqRecords and SeqFeatures. If include_fasta is True, the GFF3 file will include sequence information using the ##FASTA directive. """ writer = GFF3Writer() return writer.write(recs, out_handle, include_fasta) ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637247916.0 bcbio-gff-0.6.9/BCBio/GFF/GFFParser.py0000664002421100242110000011253500000000000020365 0ustar00bchapmanbchapman00000000000000"""Parse GFF files into features attached to Biopython SeqRecord objects. This deals with GFF3 formatted files, a tab delimited format for storing sequence features and annotations: http://www.sequenceontology.org/gff3.shtml It will also deal with older GFF versions (GTF/GFF2): http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml http://mblab.wustl.edu/GTF22.html The implementation utilizes map/reduce parsing of GFF using Disco. Disco (http://discoproject.org) is a Map-Reduce framework for Python utilizing Erlang for parallelization. The code works on a single processor without Disco using the same architecture. """ import os import copy import re import collections import io import itertools import warnings import six from six.moves import urllib # Make defaultdict compatible with versions of python older than 2.4 try: collections.defaultdict except AttributeError: import _utils collections.defaultdict = _utils.defaultdict from Bio.Seq import UnknownSeq from Bio.SeqRecord import SeqRecord from Bio import SeqFeature from Bio import SeqIO from Bio import BiopythonDeprecationWarning warnings.simplefilter("ignore", BiopythonDeprecationWarning) def _gff_line_map(line, params): """Map part of Map-Reduce; parses a line of GFF into a dictionary. Given an input line from a GFF file, this: - decides if the file passes our filtering limits - if so: - breaks it into component elements - determines the type of attribute (flat, parent, child or annotation) - generates a dictionary of GFF info which can be serialized as JSON """ def _merge_keyvals(parts): """Merge key-values escaped by quotes that are improperly split at semicolons. """ out = [] for i, p in enumerate(parts): if i > 0 and len(p) == 1 and p[0].endswith('"') and not p[0].startswith('"'): if out[-1][-1].startswith('"'): prev_p = out.pop(-1) to_merge = prev_p[-1] prev_p[-1] = "%s; %s" % (to_merge, p[0]) out.append(prev_p) else: out.append(p) return out gff3_kw_pat = re.compile("\w+=") def _split_keyvals(keyval_str): """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. GFF3 has key value pairs like: count=9;gene=amx-2;sequence=SAGE:aacggagccg GFF2 and GTF have: Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 """ quals = collections.defaultdict(list) if keyval_str is None: return quals # ensembl GTF has a stray semi-colon at the end if keyval_str[-1] == ';': keyval_str = keyval_str[:-1] # GFF2/GTF has a semi-colon with at least one space after it. # It can have spaces on both sides; wormbase does this. # GFF3 works with no spaces. # Split at the first one we can recognize as working parts = keyval_str.split(" ; ") if len(parts) == 1: parts = [x.strip() for x in keyval_str.split(";")] # check if we have GFF3 style key-vals (with =) is_gff2 = True if gff3_kw_pat.match(parts[0]): is_gff2 = False key_vals = _merge_keyvals([p.split('=') for p in parts]) # otherwise, we are separated by a space with a key as the first item else: pieces = [] for p in parts: # fix misplaced semi-colons in keys in some GFF2 files if p and p[0] == ';': p = p[1:] pieces.append(p.strip().split(" ")) key_vals = [(p[0], " ".join(p[1:])) for p in pieces] for item in key_vals: # standard in-spec items are key=value if len(item) == 2: key, val = item # out-of-spec files can have just key values. We set an empty value # which will be changed to true later to standardize. else: assert len(item) == 1, item key = item[0] val = '' # remove quotes in GFF2 files quoted = False if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): quoted = True val = val[1:-1] if val: if quoted: quals[key].append(val) else: quals[key].extend([v for v in val.split(',') if v]) # if we don't have a value, make this a key=True/False style # attribute else: quals[key].append('true') for key, vals in quals.items(): quals[key] = [urllib.parse.unquote(v) for v in vals] return quals, is_gff2 def _nest_gff2_features(gff_parts): """Provide nesting of GFF2 transcript parts with transcript IDs. exons and coding sequences are mapped to a parent with a transcript_id in GFF2. This is implemented differently at different genome centers and this function attempts to resolve that and map things to the GFF3 way of doing them. """ # map protein or transcript ids to a parent for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: try: gff_parts["quals"]["Parent"] = \ gff_parts["quals"][transcript_id] break except KeyError: pass # case for WormBase GFF -- everything labelled as Transcript or CDS for flat_name in ["Transcript", "CDS"]: if flat_name in gff_parts["quals"]: # parent types if gff_parts["type"] in [flat_name]: if not gff_parts["id"]: gff_parts["id"] = gff_parts["quals"][flat_name][0] gff_parts["quals"]["ID"] = [gff_parts["id"]] # children types elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", "coding_exon", "five_prime_UTR", "CDS", "stop_codon", "start_codon"]: gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] break return gff_parts strand_map = {'+' : 1, '-' : -1, '?' : None, None: None} line = line.strip() if line[:2] == "##": return [('directive', line[2:])] elif line and line[0] != "#": parts = line.split('\t') should_do = True if params.limit_info: for limit_name, limit_values in params.limit_info.items(): cur_id = tuple([parts[i] for i in params.filter_info[limit_name]]) if cur_id not in limit_values: should_do = False break if should_do: assert len(parts) >= 8, line # not python2.4 compatible but easier to understand #gff_parts = [(None if p == '.' else p) for p in parts] gff_parts = [] for p in parts: if p == ".": gff_parts.append(None) else: gff_parts.append(p) gff_info = dict() # collect all of the base qualifiers for this item if len(parts) > 8: quals, is_gff2 = _split_keyvals(gff_parts[8]) else: quals, is_gff2 = collections.defaultdict(list), False gff_info["is_gff2"] = is_gff2 if gff_parts[1]: quals["source"].append(gff_parts[1]) if gff_parts[5]: quals["score"].append(gff_parts[5]) if gff_parts[7]: quals["phase"].append(gff_parts[7]) gff_info['quals'] = dict(quals) gff_info['rec_id'] = gff_parts[0] # if we are describing a location, then we are a feature if gff_parts[3] and gff_parts[4]: gff_info['location'] = [int(gff_parts[3]) - 1, int(gff_parts[4])] gff_info['type'] = gff_parts[2] gff_info['id'] = quals.get('ID', [''])[0] gff_info['strand'] = strand_map.get(gff_parts[6], None) if is_gff2: gff_info = _nest_gff2_features(gff_info) # features that have parents need to link so we can pick up # the relationship if "Parent" in gff_info['quals']: # check for self referential parent/child relationships # remove the ID, which is not useful for p in gff_info['quals']['Parent']: if p == gff_info['id']: gff_info['id'] = '' del gff_info['quals']['ID'] break final_key = 'child' elif gff_info['id']: final_key = 'parent' # Handle flat features else: final_key = 'feature' # otherwise, associate these annotations with the full record else: final_key = 'annotation' if params.jsonify: return [(final_key, simplejson.dumps(gff_info))] else: return [(final_key, gff_info)] return [] def _gff_line_reduce(map_results, out, params): """Reduce part of Map-Reduce; combines results of parsed features. """ final_items = dict() for gff_type, final_val in map_results: if params.jsonify and gff_type not in ['directive']: final_val = simplejson.loads(final_val) try: final_items[gff_type].append(final_val) except KeyError: final_items[gff_type] = [final_val] for key, vals in final_items.items(): if params.jsonify: vals = simplejson.dumps(vals) out.add(key, vals) class _MultiIDRemapper: """Provide an ID remapping for cases where a parent has a non-unique ID. Real life GFF3 cases have non-unique ID attributes, which we fix here by using the unique sequence region to assign children to the right parent. """ def __init__(self, base_id, all_parents): self._base_id = base_id self._parents = all_parents def remap_id(self, feature_dict): rstart, rend = feature_dict['location'] for index, parent in enumerate(self._parents): pstart, pend = parent['location'] if rstart >= pstart and rend <= pend: if index > 0: return ("%s_%s" % (self._base_id, index + 1)) else: return self._base_id # if we haven't found a location match but parents are umabiguous, return that if len(self._parents) == 1: return self._base_id raise ValueError("Did not find remapped ID location: %s, %s, %s" % ( self._base_id, [p['location'] for p in self._parents], feature_dict['location'])) class _AbstractMapReduceGFF: """Base class providing general GFF parsing for local and remote classes. This class should be subclassed to provide a concrete class to parse GFF under specific conditions. These classes need to implement the _gff_process function, which returns a dictionary of SeqRecord information. """ def __init__(self, create_missing=True): """Initialize GFF parser create_missing - If True, create blank records for GFF ids not in the base_dict. If False, an error will be raised. """ self._create_missing = create_missing self._map_fn = _gff_line_map self._reduce_fn = _gff_line_reduce self._examiner = GFFExaminer() def _gff_process(self, gff_files, limit_info, target_lines=None): raise NotImplementedError("Derived class must define") def parse(self, gff_files, base_dict=None, limit_info=None): """Parse a GFF file, returning an iterator of SeqRecords. limit_info - A dictionary specifying the regions of the GFF file which should be extracted. This allows only relevant portions of a file to be parsed. base_dict - A base dictionary of SeqRecord objects which may be pre-populated with sequences and other features. The new features from the GFF file will be added to this dictionary. """ for rec in self.parse_in_parts(gff_files, base_dict, limit_info): yield rec def parse_in_parts(self, gff_files, base_dict=None, limit_info=None, target_lines=None): """Parse a region of a GFF file specified, returning info as generated. target_lines -- The number of lines in the file which should be used for each partial parse. This should be determined based on available memory. """ for results in self.parse_simple(gff_files, limit_info, target_lines): if base_dict is None: cur_dict = dict() else: cur_dict = copy.deepcopy(base_dict) cur_dict = self._results_to_features(cur_dict, results) all_ids = list(cur_dict.keys()) all_ids.sort() for cur_id in all_ids: yield cur_dict[cur_id] def parse_simple(self, gff_files, limit_info=None, target_lines=1): """Simple parse which does not build or nest features. This returns a simple dictionary representation of each line in the GFF file. """ # gracefully handle a single file passed if not isinstance(gff_files, (list, tuple)): gff_files = [gff_files] limit_info = self._normalize_limit_info(limit_info) for results in self._gff_process(gff_files, limit_info, target_lines): yield results def _normalize_limit_info(self, limit_info): """Turn all limit information into tuples for identical comparisons. """ final_limit_info = {} if limit_info: for key, values in limit_info.items(): final_limit_info[key] = [] for v in values: if isinstance(v, str): final_limit_info[key].append((v,)) else: final_limit_info[key].append(tuple(v)) return final_limit_info def _results_to_features(self, base, results): """Add parsed dictionaries of results to Biopython SeqFeatures. """ base = self._add_annotations(base, results.get('annotation', [])) for feature in results.get('feature', []): (_, base) = self._add_toplevel_feature(base, feature) base = self._add_parent_child_features(base, results.get('parent', []), results.get('child', [])) base = self._add_seqs(base, results.get('fasta', [])) base = self._add_directives(base, results.get('directive', [])) return base def _add_directives(self, base, directives): """Handle any directives or meta-data in the GFF file. Relevant items are added as annotation meta-data to each record. """ dir_keyvals = collections.defaultdict(list) for directive in directives: parts = directive.split() if len(parts) > 1: key = parts[0] if len(parts) == 2: val = parts[1] else: val = tuple(parts[1:]) # specific directives that need special handling if key == "sequence-region": # convert to Python 0-based coordinates if len(val) == 2: # handle regions missing contig val = (int(val[0]) - 1, int(val[1])) elif len(val) == 3: val = (val[0], int(val[1]) - 1, int(val[2])) dir_keyvals[key].append(val) for key, vals in dir_keyvals.items(): for rec in base.values(): self._add_ann_to_rec(rec, key, vals) return base def _get_matching_record_id(self, base, find_id): """Find a matching base record with the test identifier, handling tricky cases. NCBI IDs https://en.wikipedia.org/wiki/FASTA_format#NCBI_identifiers """ # Straight matches for identifiers if find_id in base: return find_id # NCBI style IDs in find_id elif find_id and find_id.find("|") > 0: for test_id in [x.strip() for x in find_id.split("|")[1:]]: if test_id and test_id in base: return test_id # NCBI style IDs in base IDs else: for base_id in base.keys(): if base_id.find("|") > 0: for test_id in [x.strip() for x in base_id.split("|")[1:]]: if test_id and test_id == find_id: return base_id return None def _add_seqs(self, base, recs): """Add sequence information contained in the GFF3 to records. """ for rec in recs: match_id = self._get_matching_record_id(base, rec.id) if match_id: base[match_id].seq = rec.seq else: base[rec.id] = rec return base def _add_parent_child_features(self, base, parents, children): """Add nested features with parent child relationships. """ multi_remap = self._identify_dup_ids(parents) # add children features children_prep = collections.defaultdict(list) for child_dict in children: child_feature = self._get_feature(child_dict) for pindex, pid in enumerate(child_feature.qualifiers['Parent']): if pid in multi_remap: pid = multi_remap[pid].remap_id(child_dict) child_feature.qualifiers['Parent'][pindex] = pid children_prep[pid].append((child_dict['rec_id'], child_feature)) children = dict(children_prep) # add children to parents that exist for cur_parent_dict in parents: cur_id = cur_parent_dict['id'] if cur_id in multi_remap: cur_parent_dict['id'] = multi_remap[cur_id].remap_id( cur_parent_dict) cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict) cur_parent, children = self._add_children_to_parent(cur_parent, children) # create parents for children without them (GFF2 or split/bad files) while len(children) > 0: parent_id, cur_children = next(itertools.islice(children.items(), 1)) # one child, do not nest it if len(cur_children) == 1: rec_id, child = cur_children[0] loc = (child.location.nofuzzy_start, child.location.nofuzzy_end) rec, base = self._get_rec(base, dict(rec_id=rec_id, location=loc)) rec.features.append(child) del children[parent_id] else: cur_parent, base = self._add_missing_parent(base, parent_id, cur_children) cur_parent, children = self._add_children_to_parent(cur_parent, children) return base def _identify_dup_ids(self, parents): """Identify duplicated ID attributes in potential nested parents. According to the GFF3 spec ID attributes are supposed to be unique for a file, but this is not always true in practice. This looks for duplicates, and provides unique IDs sorted by locations. """ multi_ids = collections.defaultdict(list) for parent in parents: multi_ids[parent['id']].append(parent) multi_ids = [(mid, ps) for (mid, ps) in multi_ids.items() if len(parents) > 1] multi_remap = dict() for mid, parents in multi_ids: multi_remap[mid] = _MultiIDRemapper(mid, parents) return multi_remap def _add_children_to_parent(self, cur_parent, children): """Recursively add children to parent features. """ if cur_parent.id in children: cur_children = children[cur_parent.id] ready_children = [] for _, cur_child in cur_children: cur_child, _ = self._add_children_to_parent(cur_child, children) ready_children.append(cur_child) # Support Biopython features for 1.62+ CompoundLocations and pre-1.62 if not hasattr(SeqFeature, "CompoundLocation"): cur_parent.location_operator = "join" for cur_child in ready_children: cur_parent.sub_features.append(cur_child) del children[cur_parent.id] return cur_parent, children def _add_annotations(self, base, anns): """Add annotation data from the GFF file to records. """ # add these as a list of annotations, checking not to overwrite # current values for ann in anns: rec, base = self._get_rec(base, ann) for key, vals in ann['quals'].items(): self._add_ann_to_rec(rec, key, vals) return base def _add_ann_to_rec(self, rec, key, vals): """Add a key/value annotation to the given SeqRecord. """ if key in rec.annotations: try: rec.annotations[key].extend(vals) except AttributeError: rec.annotations[key] = [rec.annotations[key]] + vals else: rec.annotations[key] = vals def _get_rec(self, base, info_dict): """Retrieve a record to add features to. """ max_loc = info_dict.get('location', (0, 1))[1] match_id = self._get_matching_record_id(base, info_dict['rec_id']) if match_id: cur_rec = base[match_id] # update generated unknown sequences with the expected maximum length if isinstance(cur_rec.seq, UnknownSeq): cur_rec.seq._length = max([max_loc, cur_rec.seq._length]) return cur_rec, base elif self._create_missing: new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id']) base[info_dict['rec_id']] = new_rec return new_rec, base else: raise KeyError("Did not find matching record in %s for %s" % (base.keys(), info_dict)) def _add_missing_parent(self, base, parent_id, cur_children): """Add a new feature that is missing from the GFF file. """ base_rec_id = list(set(c[0] for c in cur_children)) child_strands = list(set(c[1].strand for c in cur_children)) inferred_strand = child_strands[0] if len(child_strands) == 1 else None assert len(base_rec_id) > 0 feature_dict = dict(id=parent_id, strand=inferred_strand, type="inferred_parent", quals=dict(ID=[parent_id]), rec_id=base_rec_id[0]) coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end) for r, c in cur_children] feature_dict["location"] = (min([c[0] for c in coords]), max([c[1] for c in coords])) return self._add_toplevel_feature(base, feature_dict) def _add_toplevel_feature(self, base, feature_dict): """Add a toplevel non-nested feature to the appropriate record. """ new_feature = self._get_feature(feature_dict) rec, base = self._get_rec(base, feature_dict) rec.features.append(new_feature) return new_feature, base def _get_feature(self, feature_dict): """Retrieve a Biopython feature from our dictionary representation. """ location = SeqFeature.FeatureLocation(*feature_dict['location']) new_feature = SeqFeature.SeqFeature(location, feature_dict['type'], id=feature_dict['id'], strand=feature_dict['strand']) # Support for Biopython 1.68 and above, which removed sub_features if not hasattr(new_feature, "sub_features"): new_feature.sub_features = [] new_feature.qualifiers = feature_dict['quals'] return new_feature def _parse_fasta(self, in_handle): """Parse FASTA sequence information contained in the GFF3 file. """ return list(SeqIO.parse(in_handle, "fasta")) class _GFFParserLocalOut: """Provide a collector for local GFF MapReduce file parsing. """ def __init__(self, smart_breaks=False): self._items = dict() self._smart_breaks = smart_breaks self._missing_keys = collections.defaultdict(int) self._last_parent = None self.can_break = True self.num_lines = 0 def add(self, key, vals): if self._smart_breaks: # if we are not GFF2 we expect parents and break # based on not having missing ones if key == 'directive': if vals[0] == '#': self.can_break = True self._last_parent = None elif not vals[0].get("is_gff2", False): self._update_missing_parents(key, vals) self.can_break = (len(self._missing_keys) == 0) # break when we are done with stretches of child features elif key != 'child': self.can_break = True self._last_parent = None # break when we have lots of child features in a row # and change between parents else: cur_parent = vals[0]["quals"]["Parent"][0] if (self._last_parent): self.can_break = (cur_parent != self._last_parent) self._last_parent = cur_parent self.num_lines += 1 try: self._items[key].extend(vals) except KeyError: self._items[key] = vals def _update_missing_parents(self, key, vals): # smart way of deciding if we can break this. # if this is too much, can go back to not breaking in the # middle of children if key in ["child"]: for val in vals: for p_id in val["quals"]["Parent"]: self._missing_keys[p_id] += 1 for val in vals: try: del self._missing_keys[val["quals"]["ID"][0]] except KeyError: pass def has_items(self): return len(self._items) > 0 def get_results(self): self._last_parent = None return self._items class GFFParser(_AbstractMapReduceGFF): """Local GFF parser providing standardized parsing of GFF3 and GFF2 files. """ def __init__(self, line_adjust_fn=None, create_missing=True): _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) self._line_adjust_fn = line_adjust_fn def _gff_process(self, gff_files, limit_info, target_lines): """Process GFF addition without any parallelization. In addition to limit filtering, this accepts a target_lines attribute which provides a number of lines to parse before returning results. This allows partial parsing of a file to prevent memory issues. """ line_gen = self._file_line_generator(gff_files) for out in self._lines_to_out_info(line_gen, limit_info, target_lines): yield out def _file_line_generator(self, gff_files): """Generate single lines from a set of GFF files. """ for gff_file in gff_files: if hasattr(gff_file, "read"): need_close = False in_handle = gff_file else: need_close = True in_handle = open(gff_file) while 1: line = in_handle.readline() if not line: break yield line if need_close: in_handle.close() def _lines_to_out_info(self, line_iter, limit_info=None, target_lines=None): """Generate SeqRecord and SeqFeatures from GFF file lines. """ params = self._examiner._get_local_params(limit_info) out_info = _GFFParserLocalOut((target_lines is not None and target_lines > 1)) found_seqs = False for line in line_iter: results = self._map_fn(line, params) if self._line_adjust_fn and results: if results[0][0] not in ['directive']: results = [(results[0][0], self._line_adjust_fn(results[0][1]))] self._reduce_fn(results, out_info, params) if (target_lines and out_info.num_lines >= target_lines and out_info.can_break): yield out_info.get_results() out_info = _GFFParserLocalOut((target_lines is not None and target_lines > 1)) if (results and results[0][0] == 'directive' and results[0][1] == 'FASTA'): found_seqs = True break class FakeHandle: def __init__(self, line_iter): self._iter = line_iter def __iter__(self): return self def __next__(self): return next(self._iter) next = __next__ def read(self, size=-1): if size < 0: return "".join(l for l in self._iter) elif size == 0: return "" # Used by Biopython to sniff unicode vs bytes else: raise NotImplementedError def readline(self): try: return next(self._iter) except StopIteration: return "" if found_seqs: fasta_recs = self._parse_fasta(FakeHandle(line_iter)) out_info.add('fasta', fasta_recs) if out_info.has_items(): yield out_info.get_results() class DiscoGFFParser(_AbstractMapReduceGFF): """GFF Parser with parallelization through Disco (http://discoproject.org. """ def __init__(self, disco_host, create_missing=True): """Initialize parser. disco_host - Web reference to a Disco host which will be used for parallelizing the GFF reading job. """ _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) self._disco_host = disco_host def _gff_process(self, gff_files, limit_info, target_lines=None): """Process GFF addition, using Disco to parallelize the process. """ assert target_lines is None, "Cannot split parallelized jobs" # make these imports local; only need them when using disco import simplejson import disco # absolute path names unless they are special disco files full_files = [] for f in gff_files: if f.split(":")[0] != "disco": full_files.append(os.path.abspath(f)) else: full_files.append(f) results = disco.job(self._disco_host, name="gff_reader", input=full_files, params=disco.Params(limit_info=limit_info, jsonify=True, filter_info=self._examiner._filter_info), required_modules=["simplejson", "collections", "re"], map=self._map_fn, reduce=self._reduce_fn) processed = dict() for out_key, out_val in disco.result_iterator(results): processed[out_key] = simplejson.loads(out_val) yield processed def parse(gff_files, base_dict=None, limit_info=None, target_lines=None): """High level interface to parse GFF files into SeqRecords and SeqFeatures. """ parser = GFFParser() for rec in parser.parse_in_parts(gff_files, base_dict, limit_info, target_lines): yield rec def parse_simple(gff_files, limit_info=None): """Parse GFF files as line by line dictionary of parts. """ parser = GFFParser() for rec in parser.parse_simple(gff_files, limit_info=limit_info): if "child" in rec: assert "parent" not in rec yield rec["child"][0] elif "parent" in rec: yield rec["parent"][0] elif "feature" in rec: yield rec["feature"][0] # ignore directive lines else: assert "directive" in rec def _file_or_handle(fn): """Decorator to handle either an input handle or a file. """ def _file_or_handle_inside(*args, **kwargs): in_file = args[1] if hasattr(in_file, "read"): need_close = False in_handle = in_file if six.PY3 and not isinstance(in_handle, io.TextIOBase): raise TypeError('input handle must be opened in text mode') else: need_close = True in_handle = open(in_file) args = (args[0], in_handle) + args[2:] out = fn(*args, **kwargs) if need_close: in_handle.close() return out return _file_or_handle_inside class GFFExaminer: """Provide high level details about a GFF file to refine parsing. GFF is a spec and is provided by many different centers. Real life files will present the same information in slightly different ways. Becoming familiar with the file you are dealing with is the best way to extract the information you need. This class provides high level summary details to help in learning. """ def __init__(self): self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2], gff_source = [1], gff_type = [2]) def _get_local_params(self, limit_info=None): class _LocalParams: def __init__(self): self.jsonify = False params = _LocalParams() params.limit_info = limit_info params.filter_info = self._filter_info return params @_file_or_handle def available_limits(self, gff_handle): """Return dictionary information on possible limits for this file. This returns a nested dictionary with the following structure: keys -- names of items to filter by values -- dictionary with: keys -- filter choice value -- counts of that filter in this file Not a parallelized map-reduce implementation. """ cur_limits = dict() for filter_key in self._filter_info.keys(): cur_limits[filter_key] = collections.defaultdict(int) for line in gff_handle: # when we hit FASTA sequences, we are done with annotations if line.startswith("##FASTA"): break # ignore empty and comment lines if line.strip() and line.strip()[0] != "#": parts = [p.strip() for p in line.split('\t')] assert len(parts) >= 8, line parts = parts[:9] for filter_key, cur_indexes in self._filter_info.items(): cur_id = tuple([parts[i] for i in cur_indexes]) cur_limits[filter_key][cur_id] += 1 # get rid of the default dicts final_dict = dict() for key, value_dict in cur_limits.items(): if len(key) == 1: key = key[0] final_dict[key] = dict(value_dict) gff_handle.close() return final_dict @_file_or_handle def parent_child_map(self, gff_handle): """Provide a mapping of parent to child relationships in the file. Returns a dictionary of parent child relationships: keys -- tuple of (source, type) for each parent values -- tuple of (source, type) as children of that parent Not a parallelized map-reduce implementation. """ # collect all of the parent and child types mapped to IDs parent_sts = dict() child_sts = collections.defaultdict(list) for line in gff_handle: # when we hit FASTA sequences, we are done with annotations if line.startswith("##FASTA"): break if line.strip() and not line.startswith("#"): line_type, line_info = _gff_line_map(line, self._get_local_params())[0] if (line_type == 'parent' or (line_type == 'child' and line_info['id'])): parent_sts[line_info['id']] = ( line_info['quals'].get('source', [""])[0], line_info['type']) if line_type == 'child': for parent_id in line_info['quals']['Parent']: child_sts[parent_id].append(( line_info['quals'].get('source', [""])[0], line_info['type'])) #print parent_sts, child_sts # generate a dictionary of the unique final type relationships pc_map = collections.defaultdict(list) for parent_id, parent_type in parent_sts.items(): for child_type in child_sts[parent_id]: pc_map[parent_type].append(child_type) pc_final_map = dict() for ptype, ctypes in pc_map.items(): unique_ctypes = list(set(ctypes)) unique_ctypes.sort() pc_final_map[ptype] = unique_ctypes return pc_final_map ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637247195.0 bcbio-gff-0.6.9/BCBio/GFF/__init__.py0000664002421100242110000000035500000000000020341 0ustar00bchapmanbchapman00000000000000"""Top level of GFF parsing providing shortcuts for useful classes. """ from BCBio.GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple from BCBio.GFF.GFFOutput import GFF3Writer, write __version__ = "0.6.9" ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/BCBio/GFF/_utils.py0000664002421100242110000000256000000000000020101 0ustar00bchapmanbchapman00000000000000class defaultdict(dict): """Back compatible defaultdict: http://code.activestate.com/recipes/523034/ """ def __init__(self, default_factory=None, *a, **kw): if (default_factory is not None and not hasattr(default_factory, '__call__')): raise TypeError('first argument must be callable') dict.__init__(self, *a, **kw) self.default_factory = default_factory def __getitem__(self, key): try: return dict.__getitem__(self, key) except KeyError: return self.__missing__(key) def __missing__(self, key): if self.default_factory is None: raise KeyError(key) self[key] = value = self.default_factory() return value def __reduce__(self): if self.default_factory is None: args = tuple() else: args = self.default_factory, return type(self), args, None, None, self.items() def copy(self): return self.__copy__() def __copy__(self): return type(self)(self.default_factory, self) def __deepcopy__(self, memo): import copy return type(self)(self.default_factory, copy.deepcopy(self.items())) def __repr__(self): return 'defaultdict(%s, %s)' % (self.default_factory, dict.__repr__(self)) ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/BCBio/__init__.py0000664002421100242110000000002400000000000017730 0ustar00bchapmanbchapman00000000000000"""BCBio module """ ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/LICENSE0000664002421100242110000000202700000000000015713 0ustar00bchapmanbchapman00000000000000Biopython License Agreement Permission to use, copy, modify, and distribute this software and its documentation with or without modifications and for any purpose and without fee is hereby granted, provided that any copyright notices appear in all copies and that both those copyright notices and this permission notice appear in supporting documentation, and that the names of the contributors or copyright holders not be used in advertising or publicity pertaining to distribution of the software without specific prior permission. THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/MANIFEST.in0000664002421100242110000000012700000000000016443 0ustar00bchapmanbchapman00000000000000recursive-include BCBio *.py include distribute_setup.py include *.rst include LICENSE ././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3726625 bcbio-gff-0.6.9/PKG-INFO0000664002421100242110000000047300000000000016006 0ustar00bchapmanbchapman00000000000000Metadata-Version: 1.0 Name: bcbio-gff Version: 0.6.9 Summary: Read and write Generic Feature Format (GFF) with Biopython integration. Home-page: https://github.com/chapmanb/bcbb/tree/master/gff Author: Brad Chapman Author-email: chapmanb@50mail.com License: Biopython License Description: UNKNOWN Platform: UNKNOWN ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/README.rst0000664002421100242110000000165200000000000016400 0ustar00bchapmanbchapman00000000000000bcbio-gff --------- A python library to read and write Generic Feature Format (`GFF`_). See the `wiki documentation`_ for details on usage. The goal is to integrate this code in `gffutils`_ and `Biopython`_. Most of the functionality from this library has been integrated into `gffutils `_ and we recommend using that for parsing GFF. It's an improved approach to handling GFF and well maintained by `Ryan Dale `_. Installation from `bcbio-gff in pypi`_:: pip install bcbio-gff This code is freely available for use under the `Biopython license `_. .. _GFF: http://www.sequenceontology.org/gff3.shtml .. _wiki documentation: http://biopython.org/wiki/GFF_Parsing .. _gffutils: https://github.com/daler/gffutils .. _Biopython: http://biopython.org .. _bcbio-gff in pypi: https://pypi.python.org/pypi/bcbio-gff ././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3646624 bcbio-gff-0.6.9/Scripts/0000775002421100242110000000000000000000000016334 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3686624 bcbio-gff-0.6.9/Scripts/gff/0000775002421100242110000000000000000000000017076 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Scripts/gff/access_gff_index.py0000664002421100242110000000602200000000000022722 0ustar00bchapmanbchapman00000000000000"""Access an GFF file using bx-python's interval indexing. Requires: bx-python: http://bitbucket.org/james_taylor/bx-python/wiki/Home gff library: http://github.com/chapmanb/bcbb/tree/master/gff Index time: 44 Mb file 11 seconds Index is 7.5Mb """ from __future__ import with_statement import os import sys from bx import interval_index_file from BCBio import GFF def main(gff_file): gff_index = gff_file + ".index" if not os.path.exists(gff_index): print "Indexing GFF file" index(gff_file) index = GFFIndexedAccess(gff_file, keep_open=True) print index.seqids print for feature in index.get_features_in_region("Chr2", 17500, 20000): print feature for feature in index.get_features_in_region("Chr5", 500000, 502500): print feature exam = GFF.GFFExaminer() #print exam.available_limits(gff_file) #print exam.parent_child_map(gff_file) found = 0 limit_info = dict( gff_type = ["protein", "gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR"] ) for feature in index.get_features_in_region("Chr1", 0, 50000, limit_info): found += 1 print found class GFFIndexedAccess(interval_index_file.AbstractIndexedAccess): """Provide indexed access to a GFF file. """ def __init__(self, *args, **kwargs): interval_index_file.AbstractIndexedAccess.__init__(self, *args, **kwargs) self._parser = GFF.GFFParser() @property def seqids(self): return self.indexes.indexes.keys() def get_features_in_region(self, seqid, start, end, limit_info=None): """Retrieve features located on a given region in start/end coordinates. """ limit_info = self._parser._normalize_limit_info(limit_info) line_gen = self.get_as_iterator(seqid, int(start), int(end)) recs = None for results in self._parser._lines_to_out_info(line_gen, limit_info): assert not recs, "Unexpected multiple results" recs = self._parser._results_to_features(dict(), results) if recs is None: return [] else: assert len(recs) == 1 rec = recs[seqid] return rec.features def read_at_current_offset(self, handle, **kwargs): line = handle.readline() return line def index(gff_file, index_file=None): index = interval_index_file.Indexes() with open(gff_file) as in_handle: while 1: pos = in_handle.tell() line = in_handle.readline() if not line: break if not line.startswith("#"): parts = line.split("\t") (seqid, gtype, source, start, end) = parts[:5] index.add(seqid, int(start), int(end), pos) if index_file is None: index_file = gff_file + ".index" with open(index_file, "w") as index_handle: index.write(index_handle) return index_file if __name__ == "__main__": main(*sys.argv[1:]) ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Scripts/gff/genbank_to_gff.py0000664002421100242110000000066300000000000022406 0ustar00bchapmanbchapman00000000000000#!/usr/bin/env python """Convert a GenBank file into GFF format. Usage: genbank_to_gff.py """ import sys import os from Bio import SeqIO from Bio import Seq from BCBio import GFF def main(gb_file): out_file = "%s.gff" % os.path.splitext(gb_file)[0] with open(out_file, "w") as out_handle: GFF.write(SeqIO.parse(gb_file, "genbank"), out_handle) if __name__ == "__main__": main(*sys.argv[1:]) ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Scripts/gff/gff2_to_gff3.py0000664002421100242110000000132200000000000021701 0ustar00bchapmanbchapman00000000000000#!/usr/bin/env python """Convert a GFF2 file to an updated GFF3 format file. Usage: gff2_to_gff3.py The output file has the same name with the extension gff3. """ import sys import os from BCBio.GFF import GFFParser, GFF3Writer def main(in_file): base, ext = os.path.splitext(in_file) out_file = "%s.gff3" % (base) in_handle = open(in_file) out_handle = open(out_file, "w") reader = GFFParser() writer = GFF3Writer() writer.write(reader.parse_in_parts(in_handle, target_lines=25000), out_handle) in_handle.close() out_handle.close() if __name__ == "__main__": if len(sys.argv) != 2: print __doc__ sys.exit() main(sys.argv[1]) ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Scripts/gff/gff_to_biosql.py0000664002421100242110000000502200000000000022264 0ustar00bchapmanbchapman00000000000000#!/usr/bin/env python """Load a fasta file of sequences and associated GFF file into BioSQL. You will need to adjust the database parameters and have a BioSQL database set up. See: http://biopython.org/wiki/BioSQL Depending on the size of the sequences being loaded, you may also get errors on loading very large chromosome sequences. Updating these options can help: set global max_allowed_packet=1000000000; set global net_buffer_length=1000000; Usage: gff_to_biosql.py """ from __future__ import with_statement import sys from BioSQL import BioSeqDatabase from Bio import SeqIO from BCBio.GFF import GFFParser def main(seq_file, gff_file): # -- To be customized # You need to update these parameters to point to your local database # XXX demo example could be swapped to use SQLite when that is integrated user = "chapmanb" passwd = "cdev" host = "localhost" db_name = "wb199_gff" biodb_name = "wb199_gff_cds_pcr" # These need to be updated to reflect what you would like to parse # out of the GFF file. Set limit_info=None to parse everything, but # be sure the file is small or you may deal with memory issues. rnai_types = [('Orfeome', 'PCR_product'), ('GenePair_STS', 'PCR_product'), ('Promoterome', 'PCR_product')] gene_types = [('Non_coding_transcript', 'gene'), ('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')] limit_info = dict(gff_source_type = rnai_types + gene_types) # -- print "Parsing FASTA sequence file..." with open(seq_file) as seq_handle: seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta")) print "Parsing GFF data file..." parser = GFFParser() recs = parser.parse(gff_file, seq_dict, limit_info=limit_info) print "Writing to BioSQL database..." server = BioSeqDatabase.open_database(driver="MySQLdb", user=user, passwd=passwd, host=host, db=db_name) try: if biodb_name not in server.keys(): server.new_database(biodb_name) else: server.remove_database(biodb_name) server.adaptor.commit() server.new_database(biodb_name) db = server[biodb_name] db.load(recs) server.adaptor.commit() except: server.adaptor.rollback() raise if __name__ == "__main__": if len(sys.argv) != 3: print __doc__ sys.exit() main(sys.argv[1], sys.argv[2]) ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633513641.0 bcbio-gff-0.6.9/Scripts/gff/gff_to_genbank.py0000664002421100242110000000415600000000000022407 0ustar00bchapmanbchapman00000000000000#!/usr/bin/env python """Convert a GFF and associated FASTA file into GenBank format. Usage: gff_to_genbank.py [ ] FASTA sequence file: input sequences matching records in GFF. Optional if sequences are in the GFF molecule type: type of molecule in the GFF file. Defaults to DNA, the most common case. """ from __future__ import print_function import sys import os from Bio import SeqIO from BCBio import GFF def main(gff_file, fasta_file=None, molecule_type="DNA"): out_file = "%s.gb" % os.path.splitext(gff_file)[0] if fasta_file: fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta")) else: fasta_input = {} gff_iter = GFF.parse(gff_file, fasta_input) SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter), molecule_type), out_file, "genbank") def _fix_ncbi_id(fasta_iter): """GenBank identifiers can only be 16 characters; try to shorten NCBI. """ for rec in fasta_iter: if len(rec.name) > 16 and rec.name.find("|") > 0: new_id = [x for x in rec.name.split("|") if x][-1] print("Warning: shortening NCBI name %s to %s" % (rec.id, new_id)) rec.id = new_id rec.name = new_id yield rec def _check_gff(gff_iterator, molecule_type): """Check GFF files before feeding to SeqIO to be sure they have sequences. """ for rec in gff_iterator: if "molecule_type" not in rec.annotations: rec.annotations["molecule_type"] = molecule_type yield _flatten_features(rec) def _flatten_features(rec): """Make sub_features in an input rec flat for output. GenBank does not handle nested features, so we want to make everything top level. """ out = [] for f in rec.features: cur = [f] while len(cur) > 0: nextf = [] for curf in cur: out.append(curf) if len(curf.sub_features) > 0: nextf.extend(curf.sub_features) cur = nextf rec.features = out return rec if __name__ == "__main__": main(*sys.argv[1:]) ././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3686624 bcbio-gff-0.6.9/Tests/0000775002421100242110000000000000000000000016007 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3726625 bcbio-gff-0.6.9/Tests/GFF/0000775002421100242110000000000000000000000016411 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/F3-unique-3.v2.gff0000664002421100242110000004215600000000000021347 0ustar00bchapmanbchapman00000000000000##solid-gff-version 0.2 ##gff-version 2 ##source-version MaToGff.java v1.5 ##date 2008-05-28 ##time 13:11:03 ##Type solid_read ##color-code AA=0,AC=1,AG=2,AT=3,CA=1,CC=0,CG=3,CT=2,GA=2,GC=3,GG=0,GT=1,TA=3,TC=2,TG=1,TT=0 ##primer-base F3=T ##max-num-mismatches 3 ##max-read-length 20 ##line-order fragment ##history filter_fasta.pl --noduplicates --output=/data/results/DAEMON/DAEMON_MATE_PAIRS_2_20070326/S1/results.01/primary.20071218094706805 --name=DAEMON_MATE_PAIRS_2_20070326_S1 --tag=F3 --minlength=20 --prefix=T /data/results/DAEMON/DAEMON_MATE_PAIRS_2_20070326/S1/jobs/postPrimerSetPrimary.117/rawseq ##history map /data/results/RegressionDriver/CaseManager/results/r12/integration/case0002/reads1/test_S1_F3.csfasta /data/results/RegressionDriver/CaseManager/knownData/validatedReference/matchingPipeline/ecoli_k12_MG1655.fasta T=30 L=19 C=1 E=.Tmpfile1211939575SVhDtd F=0 B=1 D=1 u=1 r=0 n=1 Z=1000 P="0000000111111111111" M=0 U=0.000000 H=0 > .Tmpfile1211939575SVhDtd.out.1 ##history MaToGff.java --sort --qvs=test_S1_F3_QV.qual.txt --convert=unique --clear=3 --tempdir=../tmp test_S1_F3.csfasta.ma.20.3 ##hdr seqname source feature start end score strand frame [attributes] [comments] 3_336_815_F3 solid read 55409 55428 10.4 + . g=A3233312322232122211;i=1;p=1.000;q=23,12,18,17,10,24,19,14,27,9,23,9,16,20,11,7,8,4,4,14;u=0,0,0,1 3_142_1011_F3 solid read 91290 91309 5.0 - . g=T0330222333132222222;i=1;p=1.000;q=4,4,14,4,4,4,4,21,4,4,4,4,25,4,4,4,5,21,4,4;u=0,0,0,1 3_341_424_F3 solid read 102717 102736 10.6 - . g=T2203031313223113212;i=1;p=1.000;q=9,27,25,16,18,9,27,26,23,13,14,25,27,5,24,5,26,26,4,5;u=0,0,1 3_6_37_F3 solid read 181053 181072 9.4 + . g=C3220221332111020310;i=1;p=1.000;q=9,5,13,9,10,22,6,12,21,7,13,4,21,16,23,6,20,20,13,6;u=0,0,0,1 3_34_202_F3 solid read 284207 284226 6.9 + . g=G0301333332232122333;i=1;p=1.000;q=6,15,21,8,12,4,4,5,12,8,4,12,4,7,10,6,8,16,4,6;u=0,1 3_277_712_F3 solid read 304136 304155 11.8 - . g=A2033101122223322133;i=1;p=1.000;q=26,11,14,27,4,17,4,26,26,23,17,25,26,27,21,23,5,20,26,23;u=0,1 3_394_71_F3 solid read 308736 308755 10.8 + . g=T3203322323203312331;i=1;p=1.000;q=9,24,19,15,20,18,20,10,13,13,11,21,12,7,4,11,20,24,4,25;u=0,1 3_285_1497_F3 solid read 404055 404074 8.4 - . g=T1221231003202232221;i=1;p=1.000;q=8,10,6,25,16,14,23,27,8,14,21,19,5,4,4,6,22,12,4,6;u=0,0,0,1 3_228_178_F3 solid read 453227 453246 9.5 - . g=G1130333332331110323;i=1;p=1.000;q=4,19,25,18,18,5,19,6,8,24,4,26,21,11,15,4,26,13,13,15;u=0,0,0,1 3_406_794_F3 solid read 504835 504854 8.3 - . g=T3033331301320201111;i=1;p=1.000;q=27,4,13,4,21,11,7,11,5,26,10,8,9,4,6,18,9,26,17,6;u=0,0,0,1 3_303_251_F3 solid read 561501 561520 5.3 + . g=C0011111112222112221;i=1;p=1.000;q=9,8,4,4,10,4,4,4,6,14,4,4,4,4,16,4,4,4,4,23;u=0,0,1 3_152_112_F3 solid read 624012 624031 7.7 - . g=G0301122312213122221;i=1;p=1.000;q=22,14,7,13,18,5,11,4,15,6,6,11,4,8,15,5,10,4,6,24;u=0,0,0,1 3_112_1154_F3 solid read 630582 630601 11.3 - . g=T1333312011131131011;i=1;p=1.000;q=27,27,4,5,17,24,20,19,7,4,25,17,18,15,22,23,17,25,16,26;u=0,0,1 3_196_392_F3 solid read 661664 661683 19.7 - . g=T3321013301122133323;i=1;p=1.000;q=27,25,13,26,21,25,23,27,27,27,27,11,16,27,27,19,26,27,26,27;u=1 3_192_1248_F3 solid read 672037 672056 4.5 - . g=A0333232333121222222;i=1;p=1.000;q=4,7,4,4,4,4,4,4,6,4,4,4,4,4,7,7,4,4,6,4;u=0,0,0,1 3_63_479_F3 solid read 742582 742601 7.9 - . g=A0133333333233232332;i=1;p=1.000;q=4,9,6,11,20,12,11,9,13,20,18,4,4,14,9,15,4,6,21,4;u=0,0,0,1 3_30_710_F3 solid read 816069 816088 9.2 - . g=T3311001223313333313;i=1;p=1.000;q=22,27,18,25,25,7,26,25,14,23,6,25,5,11,7,4,15,7,4,6;u=0,0,0,1 3_284_77_F3 solid read 864876 864895 7.4 + . g=T2003133033233112331;i=1;p=1.000;q=13,19,4,11,22,24,6,16,4,6,13,4,12,18,4,6,7,11,4,5;u=0,0,0,1 3_411_1040_F3 solid read 876023 876042 10.9 - . g=T2121301233200033221;i=1;p=1.000;q=9,9,5,12,11,8,4,16,27,27,18,21,24,9,18,24,21,9,23,17;u=0,0,0,1 3_188_171_F3 solid read 884683 884702 5.8 - . g=A1322330132213322231;i=1;p=1.000;q=4,8,4,5,7,6,5,4,11,6,6,11,4,8,4,8,4,6,4,15;u=0,0,0,1 3_63_787_F3 solid read 1022149 1022168 7.5 + . g=C3131132013020123031;i=1;p=1.000;q=12,13,26,14,9,9,13,14,4,7,8,5,11,4,17,4,4,6,4,21;u=0,1 3_391_2015_F3 solid read 1074989 1075008 18.5 - . g=A2323101222321232322;i=1;p=1.000;q=27,25,18,20,27,27,24,23,27,23,27,25,19,26,12,26,9,21,27,21;u=1 3_8_425_F3 solid read 1119124 1119143 6.7 - . g=T0321201132230303323;i=1;p=1.000;q=6,5,8,6,4,4,23,9,12,10,15,4,13,13,8,4,4,5,5,12;u=0,0,1 3_53_745_F3 solid read 1130179 1130198 7.6 - . g=C0213313233333113321;i=1;p=1.000;q=27,6,9,22,18,9,8,15,6,8,14,5,8,6,16,4,5,4,4,14;u=0,0,0,1 3_123_576_F3 solid read 1219122 1219141 8.7 + . g=A3333133323333323323;i=1;p=1.000;q=18,22,5,11,16,16,8,14,8,5,19,8,9,10,7,11,6,11,9,4;u=0,0,1 3_81_12_F3 solid read 1236732 1236751 8.6 + . g=G2210332302233112321;i=1;p=1.000;q=7,16,17,9,7,9,9,16,9,4,10,21,17,8,4,6,9,16,6,12;u=0,0,0,1 3_96_1862_F3 solid read 1264409 1264428 6.9 - . g=G0301032323231222021;i=1;p=1.000;q=26,23,11,20,15,8,6,4,6,6,9,7,6,4,8,6,4,5,6,5;u=0,0,0,1 3_40_136_F3 solid read 1266177 1266196 7.4 - . g=T2332222332203312221;i=1;p=1.000;q=9,23,6,19,13,9,4,8,17,9,4,4,13,9,8,5,4,6,10,8;u=0,0,1 3_124_1781_F3 solid read 1385416 1385435 10.3 + . g=A1322302333332222132;i=1;p=1.000;q=13,17,8,6,5,9,24,4,7,9,18,27,18,16,16,23,18,18,11,23;u=0,0,1 3_134_1165_F3 solid read 1393169 1393188 9.0 - . g=T3301123202321131311;i=1;p=1.000;q=4,27,18,7,27,4,27,26,4,20,4,27,26,9,27,4,27,14,10,27;u=1 3_224_587_F3 solid read 1490044 1490063 6.1 + . g=G2032313231111233321;i=1;p=1.000;q=4,4,6,6,13,24,4,4,5,15,6,7,9,14,4,4,4,25,5,5;u=0,0,0,1 3_25_747_F3 solid read 1513598 1513617 9.5 + . g=T1223213101133121231;i=1;p=1.000;q=26,27,8,27,27,27,26,27,26,19,8,14,4,17,11,5,7,4,7,6;u=0,0,1 3_143_14_F3 solid read 1528236 1528255 9.7 + . g=T3233113323230202011;i=1;p=1.000;q=13,23,17,19,23,16,24,25,14,15,9,6,4,11,4,9,12,4,16,10;u=0,0,0,1 3_164_1025_F3 solid read 1570107 1570126 7.9 - . g=T3220332323303320231;i=1;p=1.000;q=7,10,20,8,4,24,4,4,21,6,26,22,9,6,11,9,6,4,17,14;u=0,0,0,1 3_137_552_F3 solid read 1630276 1630295 9.1 - . g=G3030333223233102131;i=1;p=1.000;q=6,28,9,4,6,26,27,6,10,9,27,21,6,16,9,25,6,7,23,12;u=0,0,0,1 3_125_1810_F3 solid read 1634104 1634123 10.5 + . g=G1232220322032311332;i=1;p=1.000;q=27,8,26,26,10,6,26,12,27,27,26,4,27,27,23,8,8,4,27,12;u=0,0,0,1 3_314_1310_F3 solid read 1639981 1640000 9.2 + . g=A2221332230322203033;i=1;p=1.000;q=19,12,6,27,11,27,6,11,5,6,9,13,27,27,8,18,5,22,4,27;u=0,0,0,1 3_384_591_F3 solid read 1654341 1654360 6.8 + . g=A3323221133121102313;i=1;p=1.000;q=19,8,7,7,15,4,20,7,4,6,14,7,19,6,8,4,5,9,4,4;u=0,0,0,1 3_145_739_F3 solid read 1791040 1791059 11.9 - . g=A0221223333323131212;i=1;p=1.000;q=20,27,23,13,27,14,27,28,27,25,12,24,8,16,8,4,8,21,9,11;u=0,0,0,1 3_326_2020_F3 solid read 1830564 1830583 9.3 + . g=A3321322331103233322;i=1;p=1.000;q=14,4,25,16,10,12,16,5,14,10,25,5,25,5,9,18,13,26,4,26;u=0,0,0,1 3_233_1265_F3 solid read 1857564 1857583 8.9 + . g=T3112113020130223311;i=1;p=1.000;q=7,27,25,26,27,14,26,27,27,27,4,6,5,10,17,4,5,7,6,12;u=0,0,1 3_235_100_F3 solid read 1912460 1912479 9.6 - . g=G2233020000132311231;i=1;p=1.000;q=23,24,25,16,17,6,21,25,9,4,6,11,8,19,6,6,19,14,13,6;u=0,0,0,1 3_111_107_F3 solid read 1944496 1944515 7.6 - . g=C3023223333211322231;i=1;p=1.000;q=15,5,6,14,5,13,4,12,11,4,9,9,11,12,4,11,11,13,6,6;u=0,0,0,1 3_457_1514_F3 solid read 1956598 1956617 9.9 - . g=T0013331013332110221;i=1;p=1.000;q=18,24,10,24,23,25,22,11,20,10,15,11,4,5,27,4,9,13,5,27;u=0,1 3_183_74_F3 solid read 1992040 1992059 9.8 + . g=C3332233131131222322;i=1;p=1.000;q=27,27,25,23,25,8,11,11,7,11,4,12,14,10,15,7,14,4,9,12;u=0,0,1 3_357_1303_F3 solid read 2037917 2037936 10.9 - . g=T3331331323320311331;i=1;p=1.000;q=7,27,5,19,26,8,27,12,14,27,8,27,23,9,19,4,26,20,9,27;u=0,0,0,1 3_153_186_F3 solid read 2083441 2083460 6.7 + . g=T3112233331133323322;i=1;p=1.000;q=7,14,19,7,12,6,11,4,11,8,4,6,6,4,11,4,6,4,4,18;u=0,1 3_65_1741_F3 solid read 2107441 2107460 8.4 + . g=T3333332330233132123;i=1;p=1.000;q=4,4,6,25,9,4,26,16,21,9,18,15,27,27,4,21,9,7,9,6;u=0,0,0,1 3_98_323_F3 solid read 2118821 2118840 7.5 + . g=A3222212322131112031;i=1;p=1.000;q=13,14,8,10,8,14,4,13,10,7,15,4,6,4,4,12,6,11,6,8;u=0,0,1 3_48_258_F3 solid read 2153882 2153901 9.4 - . g=G0330113313201122321;i=1;p=1.000;q=22,15,20,4,16,17,14,24,4,5,4,22,19,8,10,9,13,22,8,15;u=0,0,0,1 3_140_1125_F3 solid read 2182909 2182928 7.9 + . g=T3231331302232001131;i=1;p=1.000;q=10,4,12,6,4,12,13,6,18,5,8,11,4,26,6,25,5,18,11,12;u=0,0,0,1 3_359_118_F3 solid read 2188393 2188412 8.4 + . g=A0301311133331131322;i=1;p=1.000;q=11,5,7,13,20,6,6,25,8,18,9,15,27,9,6,7,15,17,4,4;u=0,0,0,1 3_203_483_F3 solid read 2272874 2272893 9.1 - . g=C3031223110333133311;i=1;p=1.000;q=23,21,25,27,10,5,22,15,17,18,5,18,17,5,19,4,4,13,4,22;u=0,0,0,1 3_66_301_F3 solid read 2286038 2286057 6.6 - . g=C1113113330132222311;i=1;p=1.000;q=10,4,6,4,8,13,9,4,10,9,4,6,13,9,5,6,11,6,4,9;u=0,0,0,1 3_78_130_F3 solid read 2291021 2291040 7.6 + . g=G3233131332212222321;i=1;p=1.000;q=13,16,6,12,17,11,10,4,12,8,13,4,8,6,4,4,12,10,4,11;u=0,0,0,1 3_141_110_F3 solid read 2291354 2291373 9.3 + . g=T1312203322212123321;i=1;p=1.000;q=9,21,24,11,16,4,23,27,16,16,8,22,6,10,16,4,9,4,7,25;u=0,0,1 3_51_1383_F3 solid read 2374918 2374937 8.8 + . g=T3311203033322222231;i=1;p=1.000;q=24,26,6,27,27,23,27,4,21,27,4,27,6,9,24,4,23,4,4,27;u=0,0,1 3_231_366_F3 solid read 2392091 2392110 10.0 - . g=T2022333223101331322;i=1;p=1.000;q=18,12,9,9,13,8,7,22,7,7,4,26,12,17,9,20,24,8,18,14;u=0,0,0,1 3_214_1802_F3 solid read 2394604 2394623 8.8 - . g=T1232111001220211133;i=1;p=1.000;q=17,18,14,6,19,4,21,4,6,12,11,4,26,20,9,18,7,16,5,18;u=0,0,0,1 3_67_1434_F3 solid read 2454508 2454527 15.2 - . g=T3121311232222231203;i=1;p=1.000;q=9,27,27,18,16,14,25,27,26,21,19,27,27,27,15,5,24,27,24,24;u=0,0,1 3_124_1647_F3 solid read 2493617 2493636 7.5 + . g=A0211320203220231332;i=1;p=1.000;q=9,12,12,9,6,14,12,7,4,4,12,9,4,9,16,4,4,9,9,16;u=0,0,0,1 3_39_328_F3 solid read 2500759 2500778 7.8 + . g=T1332333033231132333;i=1;p=1.000;q=24,27,26,26,25,21,7,8,4,5,20,4,11,6,8,4,6,4,11,7;u=0,0,1 3_378_322_F3 solid read 2541624 2541643 8.9 + . g=T2333331001023011220;i=1;p=1.000;q=14,6,13,25,27,4,24,22,14,19,9,23,15,6,8,4,22,4,4,20;u=0,0,0,1 3_216_848_F3 solid read 2550573 2550592 11.5 - . g=G2320322020031220322;i=1;p=1.000;q=21,24,8,21,20,25,18,6,24,14,21,9,7,18,8,18,7,9,19,12;u=0,0,0,1 3_221_516_F3 solid read 2607559 2607578 11.1 - . g=T2132333313222333332;i=1;p=1.000;q=9,19,27,26,24,26,26,25,25,26,21,4,6,10,21,6,20,13,5,24;u=0,0,0,1 3_56_45_F3 solid read 2662103 2662122 5.5 + . g=G3021122332232122321;i=1;p=1.000;q=4,4,4,6,4,6,4,5,18,9,4,16,10,4,4,4,12,4,6,6;u=0,0,0,1 3_127_210_F3 solid read 2798906 2798925 10.2 + . g=G2331321333232203222;i=1;p=1.000;q=11,25,9,4,23,16,26,14,7,22,9,25,9,8,21,8,15,17,4,26;u=0,0,1 3_417_422_F3 solid read 2812322 2812341 8.8 - . g=T3321222333313333132;i=1;p=1.000;q=9,26,7,19,7,13,23,4,25,4,6,19,4,16,15,15,23,4,19,13;u=0,0,0,1 3_42_1403_F3 solid read 2830264 2830283 9.6 - . g=T3212330132120221212;i=1;p=1.000;q=7,4,25,18,6,17,12,12,17,14,8,26,13,15,10,4,21,5,12,22;u=0,1 3_457_42_F3 solid read 2874245 2874264 7.6 - . g=G0301123332223122221;i=1;p=1.000;q=18,10,14,9,19,4,10,8,11,10,6,8,5,8,11,4,13,6,4,6;u=0,0,1 3_361_728_F3 solid read 2893879 2893898 14.6 + . g=C3213223312310132221;i=1;p=1.000;q=14,18,7,7,17,19,23,24,17,26,12,15,21,23,21,19,17,20,22,24;u=0,0,0,1 3_77_718_F3 solid read 2913092 2913111 9.4 + . g=T3021331333313131231;i=1;p=1.000;q=15,26,7,24,20,18,5,6,17,18,6,11,4,13,19,15,7,4,22,25;u=0,0,0,1 3_116_154_F3 solid read 2917672 2917691 9.8 - . g=A0323231223233132311;i=1;p=1.000;q=20,9,19,18,10,18,8,16,25,6,18,6,12,24,6,7,5,15,7,17;u=0,0,0,1 3_239_1415_F3 solid read 2923256 2923275 19.2 + . g=T3233113121300032200;i=1;p=1.000;q=25,27,27,26,27,24,27,27,25,27,22,27,21,26,22,19,26,9,14,21;u=1 3_142_1468_F3 solid read 2930117 2930136 10.5 - . g=A3233323333303103330;i=1;p=1.000;q=9,20,6,26,16,18,8,13,20,25,25,18,6,12,11,18,4,16,16,6;u=0,0,1 3_394_295_F3 solid read 2930118 2930137 8.1 - . g=T3023333333333311331;i=1;p=1.000;q=4,14,6,12,7,22,10,4,13,24,18,12,12,4,6,9,9,9,14,4;u=0,0,0,1 3_222_1773_F3 solid read 2934040 2934059 11.6 + . g=T1303031311123232302;i=1;p=1.000;q=11,10,24,15,28,6,19,5,13,27,8,26,8,22,25,27,26,27,8,13;u=0,0,0,1 3_276_1344_F3 solid read 2969950 2969969 13.2 - . g=G3211212131233322233;i=1;p=1.000;q=27,27,12,16,11,23,27,8,23,12,27,22,20,12,15,25,8,27,16,6;u=0,1 3_155_1814_F3 solid read 3107393 3107412 13.6 + . g=A2332222213113120221;i=1;p=1.000;q=27,26,20,25,26,27,12,27,26,18,26,4,27,10,23,26,6,23,26,26;u=0,0,0,1 3_373_2014_F3 solid read 3143956 3143975 12.0 - . g=T3013322223222221211;i=1;p=1.000;q=16,8,17,21,10,10,18,18,18,13,4,23,16,24,8,19,14,15,23,11;u=0,1 3_81_1637_F3 solid read 3413619 3413638 9.1 + . g=G2313032322122302111;i=1;p=1.000;q=9,4,7,19,27,6,11,5,12,15,20,27,8,27,6,16,6,27,21,6;u=0,0,1 3_291_969_F3 solid read 3438323 3438342 17.4 + . g=T0021120212032121313;i=1;p=1.000;q=24,27,6,27,27,27,27,13,27,27,25,27,26,27,27,20,23,26,27,20;u=1 3_179_1617_F3 solid read 3475164 3475183 8.0 + . g=A2100132222332123123;i=1;p=1.000;q=21,25,11,22,4,19,7,21,20,4,5,24,25,16,4,4,11,19,4,4;u=0,0,0,1 3_446_861_F3 solid read 3476173 3476192 11.6 - . g=G1213302212022132321;i=1;p=1.000;q=27,27,27,27,26,25,12,27,24,18,24,6,27,26,20,9,6,6,4,23;u=0,0,1 3_397_317_F3 solid read 3545152 3545171 11.1 + . g=T3110031332233111131;i=1;p=1.000;q=22,27,9,9,26,5,22,20,9,10,16,22,24,6,23,25,22,4,17,18;u=0,0,0,1 3_323_713_F3 solid read 3575287 3575306 16.2 - . g=A0322222200213223302;i=1;p=1.000;q=27,25,21,27,26,26,24,26,27,18,27,26,26,27,22,22,6,26,25,8;u=0,1 3_294_1906_F3 solid read 3727542 3727561 8.4 - . g=A3030310223202311021;i=1;p=1.000;q=14,7,5,4,7,18,4,6,13,6,12,12,10,11,15,14,16,7,9,12;u=0,0,0,1 3_443_223_F3 solid read 3730805 3730824 17.1 - . g=T1113320033330133111;i=1;p=1.000;q=28,27,18,27,27,27,20,26,27,14,25,16,19,19,8,23,16,21,16,15;u=0,0,1 3_94_809_F3 solid read 3841898 3841917 21.8 - . g=A2032223110001131310;i=1;p=1.000;q=27,27,27,27,26,27,25,24,27,27,27,25,27,27,27,12,23,16,27,27;u=0,0,0,1 3_245_387_F3 solid read 3878549 3878568 24.4 - . g=A0222211220333132122;i=1;p=1.000;q=27,27,26,27,26,27,27,25,27,25,26,27,18,21,26,25,26,23,24,24;u=1 3_190_1089_F3 solid read 3900038 3900057 13.7 - . g=T1111110323122301202;i=1;p=1.000;q=27,11,27,11,8,9,27,9,9,26,25,27,11,27,23,14,24,20,22,26;u=0,0,1 3_442_1501_F3 solid read 3912610 3912629 8.5 + . g=A0012333103302132301;i=1;p=1.000;q=11,11,15,19,15,6,12,10,4,11,21,5,9,16,7,14,4,4,8,19;u=0,0,1 3_342_678_F3 solid read 4044575 4044594 4.0 + . g=A3333112332213322323;i=1;p=1.000;q=4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4;u=0,0,0,1 3_56_1294_F3 solid read 4058789 4058808 12.7 + . g=G3323331232322213322;i=1;p=1.000;q=26,17,18,27,23,8,8,24,27,27,9,27,25,14,26,4,27,9,24,23;u=0,0,0,1 3_69_1575_F3 solid read 4070467 4070486 9.9 + . g=A2222011012222112121;i=1;p=1.000;q=16,25,14,9,9,9,21,9,4,24,6,21,13,6,27,10,19,8,6,27;u=0,0,0,1 3_198_476_F3 solid read 4080622 4080641 8.9 + . g=C2010231122212011133;i=1;p=1.000;q=16,8,8,16,12,17,4,16,12,15,10,4,9,6,4,25,9,9,23,11;u=0,1 3_24_715_F3 solid read 4136503 4136522 4.0 - . g=G1313332132232313233;i=1;p=1.000;q=4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4;u=0,0,0,1 3_151_283_F3 solid read 4148264 4148283 9.7 + . g=T3230210232022111220;i=1;p=1.000;q=9,14,6,25,25,19,6,4,16,11,12,20,10,13,26,19,6,4,19,14;u=0,0,1 3_164_774_F3 solid read 4156157 4156176 9.6 + . g=G2311112210110223313;i=1;p=1.000;q=8,24,19,7,6,16,12,9,4,8,26,14,26,24,7,18,6,16,14,7;u=0,0,0,1 3_275_1212_F3 solid read 4171385 4171404 8.3 + . g=G0223122231333302232;i=1;p=1.000;q=13,8,5,4,10,7,12,25,4,25,6,15,6,27,6,11,12,7,14,10;u=0,0,0,1 3_148_289_F3 solid read 4177672 4177691 8.0 - . g=T1203101332223323323;i=1;p=1.000;q=9,21,11,6,5,7,25,24,26,24,8,9,7,12,7,4,11,9,4,4;u=0,0,0,1 3_437_1000_F3 solid read 4179623 4179642 12.3 + . g=A0112222212231131001;i=1;p=1.000;q=26,27,26,27,4,27,17,6,22,13,27,24,6,27,21,27,22,15,24,9;u=0,0,1 3_318_2011_F3 solid read 4218181 4218200 12.9 - . g=T2133330223033303323;i=1;p=1.000;q=25,27,27,5,5,16,27,16,27,15,18,25,26,11,27,19,16,24,9,15;u=0,0,0,1 3_14_11_F3 solid read 4222697 4222716 7.8 - . g=T2323310222232322122;i=1;p=1.000;q=6,23,16,25,25,9,7,4,12,4,14,6,10,7,6,9,18,4,10,4;u=0,0,0,1 3_402_391_F3 solid read 4274545 4274564 6.2 - . g=C3303323321111111111;i=1;p=1.000;q=10,19,15,15,7,8,13,4,7,4,5,16,4,4,5,4,9,4,4,4;u=0,0,0,1 3_293_504_F3 solid read 4339235 4339254 9.5 + . g=C2133223303331120213;i=1;p=1.000;q=6,4,5,26,13,7,17,6,24,10,27,24,5,9,21,9,23,24,20,14;u=0,0,0,1 3_360_914_F3 solid read 4407004 4407023 10.7 + . g=T3012102130232022001;i=1;p=1.000;q=23,24,19,17,24,6,26,17,25,15,7,24,14,11,26,9,22,4,8,5;u=0,0,0,1 3_118_1532_F3 solid read 4431702 4431721 10.2 + . g=C3233220201223200322;i=1;p=1.000;q=20,9,17,22,17,23,13,4,9,5,16,11,10,6,17,7,9,22,27,27;u=0,0,1 3_358_133_F3 solid read 4460191 4460210 9.1 + . g=T0221223112322112233;i=1;p=1.000;q=6,23,12,22,7,6,7,4,13,5,9,23,12,9,24,8,14,7,20,26;u=0,0,0,1 3_397_195_F3 solid read 4499390 4499409 6.9 - . g=T3302332313332212121;i=1;p=1.000;q=23,14,15,5,9,8,6,4,4,13,4,16,13,16,4,7,4,12,4,5;u=0,0,0,1 3_158_642_F3 solid read 4533144 4533163 7.1 - . g=A1332103332323233212;i=1;p=1.000;q=8,20,9,22,8,14,4,16,17,4,8,13,7,8,4,12,5,4,4,4;u=0,0,0,1 3_300_1439_F3 solid read 4580452 4580471 12.3 - . g=A0331111211302100201;i=1;p=1.000;q=5,17,21,14,4,16,11,27,21,9,17,17,27,23,12,21,16,27,25,25;u=0,0,0,1 # Elapsed time 0.846 secs ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/c_elegans_WS199_ann_gff.txt0000664002421100242110000000022100000000000023417 0ustar00bchapmanbchapman00000000000000# modified GFF file to remove location coordinates and test annotations I Expr_profile experimental_result_region . . . + . expr_profile=B0019.1 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/c_elegans_WS199_dna_shortened.fa0000664002421100242110000000134700000000000024417 0ustar00bchapmanbchapman00000000000000>I gcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaa gcctaagcctaagcctaagcctaagcctaagcctaagcct >II cctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaag cctaagcctaagcctaagcctaagcctaagcctaagccta >III cctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaag cctaagcctaagcctaagcctaagcctaagcctaagccta >IV cctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaag cctaagcctaagcctaagcctaagcctaagcctaagccta >V gaattcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagc ctaagcctaagcctaagcctaagcctaagcctaagcctaa >X ctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagc ctaagcctaagcctaagcctaagcctaagcctaagcctaa >MtDNA cagtaaatagtttaataaaaatatagcatttgggttgctaagatattattactgatagaa tttttagtttaatttagaatgtatcacttacaatgatggg ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/c_elegans_WS199_shortened_gff.txt0000664002421100242110000004326200000000000024652 0ustar00bchapmanbchapman00000000000000I Orfeome PCR_product 12759747 12764936 . - . amplified=1;pcr_product=mv_B0019.1 I SAGE_tag_unambiguously_mapped SAGE_tag 12763533 12763553 . - . count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;transcript=B0019.1 I SAGE_tag_unambiguously_mapped SAGE_tag 12761492 12761512 . - . count=5;gene=amx-2;sequence=SAGE:aacggagccgtacacgc;transcript=B0019.1 I SAGE_tag_most_three_prime SAGE_tag 12761499 12761512 . - . count=9;gene=amx-2;sequence=SAGE:aacggagccg;transcript=B0019.1 X SAGE_tag SAGE_tag 6819353 6819366 . + . count=9;gene=amx-2;sequence=SAGE:aacggagccg;transcript=B0019.1 I Expr_profile experimental_result_region 12762449 12764118 . + . expr_profile=B0019.1 I Coding_transcript CDS 12759745 12759828 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12759949 12760013 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12760227 12760319 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12760365 12760494 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12760834 12760904 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12761172 12761516 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12761799 12761953 . - 1 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12762127 12762268 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12762648 12762806 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12763112 12763249 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12763448 12763655 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12763729 12763882 . - 1 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12763979 12764102 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12764291 12764471 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I Coding_transcript CDS 12764812 12764937 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797 I history CDS 12759745 12759828 . - 0 ID=CDS:B0019.1:wp173 I history CDS 12759949 12760013 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12760227 12760319 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12760365 12760494 . - 0 ID=CDS:B0019.1:wp173 I history CDS 12760834 12760904 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12761172 12761516 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12761577 12761626 . - 1 ID=CDS:B0019.1:wp173 I history CDS 12761795 12761953 . - 1 ID=CDS:B0019.1:wp173 I history CDS 12762127 12762268 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12762648 12762806 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12763112 12763249 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12763448 12763655 . - 0 ID=CDS:B0019.1:wp173 I history CDS 12763729 12763882 . - 1 ID=CDS:B0019.1:wp173 I history CDS 12763979 12764102 . - 2 ID=CDS:B0019.1:wp173 I history CDS 12764291 12764471 . - 0 ID=CDS:B0019.1:wp173 I history CDS 12764812 12764937 . - 0 ID=CDS:B0019.1:wp173 I history CDS 12759745 12759828 . - 0 ID=CDS:B0019.1:wp90 I history CDS 12759949 12760013 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12760227 12760319 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12761172 12761516 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12761577 12761626 . - 1 ID=CDS:B0019.1:wp90 I history CDS 12761795 12761953 . - 1 ID=CDS:B0019.1:wp90 I history CDS 12762127 12762268 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12762648 12762806 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12763112 12763249 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12763469 12763655 . - 0 ID=CDS:B0019.1:wp90 I history CDS 12763729 12763882 . - 1 ID=CDS:B0019.1:wp90 I history CDS 12763979 12764102 . - 2 ID=CDS:B0019.1:wp90 I history CDS 12764291 12764471 . - 0 ID=CDS:B0019.1:wp90 I history CDS 12764812 12764937 . - 0 ID=CDS:B0019.1:wp90 I mass_spec_genome translated_nucleotide_match 12761920 12761953 . - . ID=Target:381130;Target=Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK 10 21 +;Note=MSP:FADFSPLDVSDVNFATDDLAK;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=3 I mass_spec_genome translated_nucleotide_match 12762127 12762155 . - . ID=Target:381130;Target=Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK 1 10 +;Note=MSP:FADFSPLDVSDVNFATDDLAK;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=3 I mass_spec_genome translated_nucleotide_match 12763506 12763559 . - . ID=Target:381133;Target=Mass_spec_peptide:MSP:FGHGQSLLAQGGMNEVVR 1 18 +;Note=MSP:FGHGQSLLAQGGMNEVVR;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=1 I mass_spec_genome translated_nucleotide_match 12764361 12764411 . - . ID=Target:381144;Target=Mass_spec_peptide:MSP:NIQQNRPGLSVLVLEAR 1 17 +;Note=MSP:NIQQNRPGLSVLVLEAR;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=2 I Coding_transcript mRNA 12759582 12764949 . - . ID=Transcript:B0019.1;Note=amx-2;Parent=Gene:WBGene00000138;cds=B0019.1;prediction_status=Partially_confirmed;wormpep=CE:CE40797 I Allele SNP 12764272 12764272 . + . interpolated_map_position=14.003;rflp=No;variation=snp_B0019[1] I Oligo_set reagent 12759745 12761589 . - . oligo_set=Aff_B0019.1 I Coding_transcript exon 12759745 12759828 . - 0 Parent=Transcript:B0019.1 I Coding_transcript exon 12759949 12760013 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12760227 12760319 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12760365 12760494 . - 0 Parent=Transcript:B0019.1 I Coding_transcript exon 12760834 12760904 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12761172 12761516 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12761799 12761953 . - 1 Parent=Transcript:B0019.1 I Coding_transcript exon 12762127 12762268 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12762648 12762806 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12763112 12763249 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12763448 12763655 . - 0 Parent=Transcript:B0019.1 I Coding_transcript exon 12763729 12763882 . - 1 Parent=Transcript:B0019.1 I Coding_transcript exon 12763979 12764102 . - 2 Parent=Transcript:B0019.1 I Coding_transcript exon 12764291 12764471 . - 0 Parent=Transcript:B0019.1 I Coding_transcript exon 12764812 12764937 . - 0 Parent=Transcript:B0019.1 I Coding_transcript five_prime_UTR 12764938 12764949 . - . Parent=Transcript:B0019.1 I Coding_transcript three_prime_UTR 12759582 12759744 . - . Parent=Transcript:B0019.1 I Coding_transcript intron 12760495 12760833 . - . Parent=Transcript:B0019.1;confirmed_est=EC027594 I Coding_transcript intron 12760905 12761171 . - . Parent=Transcript:B0019.1;confirmed_est=EC027594 I Coding_transcript intron 12761517 12761798 . - . Parent=Transcript:B0019.1;confirmed_est=EC027594 I Coding_transcript intron 12759829 12759948 . - . Parent=Transcript:B0019.1;confirmed_est=EC034652 I Coding_transcript intron 12760014 12760226 . - . Parent=Transcript:B0019.1;confirmed_est=EC034652 I Coding_transcript intron 12760320 12760364 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.3 I Coding_transcript intron 12763883 12763978 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.5,OSTF088D9_1 I Coding_transcript intron 12764103 12764290 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.5,OSTF088D9_1 I Coding_transcript intron 12764472 12764811 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.5,OSTF088D9_1 I Coding_transcript intron 12762807 12763111 . - . Parent=Transcript:B0019.1;confirmed_est=yk1056c07.5 I Coding_transcript intron 12763250 12763447 . - . Parent=Transcript:B0019.1;confirmed_est=yk1056c07.5 I Coding_transcript intron 12763656 12763728 . - . Parent=Transcript:B0019.1;confirmed_est=yk1056c07.5 I Coding_transcript intron 12761954 12762126 . - . Parent=Transcript:B0019.1;confirmed_est=yk262g9.5 I Coding_transcript intron 12762269 12762647 . - . Parent=Transcript:B0019.1;confirmed_est=yk262g9.5 I Promoterome PCR_product 12764938 12766937 . + . pcr_product=p_B0019.1_93 I GenePair_STS PCR_product 12762449 12764118 . + . pcr_product=sjj_B0019.1 I Coding_transcript gene 12759582 12764949 . - . ID=Gene:WBGene00000138 III Orfeome PCR_product 13780230 13780850 . + . amplified=1;pcr_product=mv_3R5.1.v6 IV Orfeome PCR_product 17486939 17488952 . - . amplified=1;pcr_product=mv_4R79.1 IV Orfeome PCR_product 17480353 17483284 . - . amplified=1;pcr_product=mv_4R79.2 X Orfeome PCR_product 17714881 17718531 . + . amplified=1;pcr_product=mv_6R55.1 X Orfeome PCR_product 17712787 17714742 . + . amplified=1;pcr_product=mv_6R55.2 II Orfeome PCR_product 6995874 7010146 . + . amplified=1;pcr_product=mv_AAA03517 III Orfeome PCR_product 5625097 5631795 . + . amplified=1;pcr_product=mv_AAA03544 X GenePair_STS PCR_product 9962853 9963737 . + . pcr_product=cenix:102-c3 II GenePair_STS PCR_product 5507236 5508135 . + . pcr_product=cenix:102-c4 V GenePair_STS PCR_product 10117842 10118735 . + . pcr_product=cenix:102-c5 IV GenePair_STS PCR_product 3566130 3567025 . + . pcr_product=cenix:102-c6 X GenePair_STS PCR_product 6117180 6117930 . + . pcr_product=cenix:102-c7 IV GenePair_STS PCR_product 7189492 7190369 . + . pcr_product=cenix:102-c9 II GenePair_STS PCR_product 14462527 14463202 . + . pcr_product=cenix:102-d1 X Promoterome PCR_product 2258069 2259336 . + . pcr_product=p_AH9.2_93 IV Promoterome PCR_product 12157449 12159448 . + . pcr_product=p_B0001.6_93 I Promoterome PCR_product 12764938 12766937 . + . pcr_product=p_B0019.1_93 V Promoterome PCR_product 10320122 10320689 . + . pcr_product=p_B0024.12_93 I Coding_transcript CDS 4581214 4581237 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577 I Coding_transcript CDS 4581664 4582026 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577 I Coding_transcript CDS 4582412 4582718 . - 1 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577 I Coding_transcript CDS 4583190 4583374 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577 I Coding_transcript CDS 4583426 4583509 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577 I Coding_transcript CDS 4583560 4583805 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577 I Coding_transcript mRNA 4580734 4583815 . - . ID=Transcript:D1007.5b.1;Parent=Gene:WBGene00017003;cds=D1007.5b;prediction_status=Confirmed;wormpep=WP:CE33577 I Coding_transcript mRNA 4581214 4583811 . - . ID=Transcript:D1007.5b.2;Parent=Gene:WBGene00017003;cds=D1007.5b;prediction_status=Confirmed;wormpep=WP:CE33577 I Coding_transcript exon 4581214 4581237 . - 0 Parent=Transcript:D1007.5b.1 I Coding_transcript exon 4581664 4582026 . - 0 Parent=Transcript:D1007.5b.1 I Coding_transcript exon 4582412 4582718 . - 1 Parent=Transcript:D1007.5b.1 I Coding_transcript exon 4583190 4583374 . - 0 Parent=Transcript:D1007.5b.1 I Coding_transcript exon 4583426 4583509 . - 0 Parent=Transcript:D1007.5b.1 I Coding_transcript exon 4583560 4583805 . - 0 Parent=Transcript:D1007.5b.1 I Coding_transcript five_prime_UTR 4583806 4583815 . - . Parent=Transcript:D1007.5b.1 I Coding_transcript three_prime_UTR 4580734 4581213 . - . Parent=Transcript:D1007.5b.1 I Coding_transcript intron 4582027 4582411 . - . Parent=Transcript:D1007.5b.1;confirmed_est=EB994038 I Coding_transcript intron 4583375 4583425 . - . Parent=Transcript:D1007.5b.1;confirmed_est=EC038345,OSTF085G5_1 I Coding_transcript intron 4583510 4583559 . - . Parent=Transcript:D1007.5b.1;confirmed_est=EC038345,OSTF085G5_1 I Coding_transcript intron 4582719 4583189 . - . Parent=Transcript:D1007.5b.1;confirmed_est=yk1055g06.5,OSTF085G5_1 I Coding_transcript intron 4581238 4581663 . - . Parent=Transcript:D1007.5b.1;confirmed_est=yk1057e08.3 I Coding_transcript exon 4581214 4581237 . - 0 Parent=Transcript:D1007.5b.2 I Coding_transcript exon 4581664 4582026 . - 0 Parent=Transcript:D1007.5b.2 I Coding_transcript exon 4582412 4582718 . - 1 Parent=Transcript:D1007.5b.2 I Coding_transcript exon 4583190 4583374 . - 0 Parent=Transcript:D1007.5b.2 I Coding_transcript exon 4583426 4583509 . - 0 Parent=Transcript:D1007.5b.2 I Coding_transcript exon 4583560 4583805 . - 0 Parent=Transcript:D1007.5b.2 I Coding_transcript five_prime_UTR 4583806 4583811 . - . Parent=Transcript:D1007.5b.2 I Coding_transcript intron 4582027 4582411 . - . Parent=Transcript:D1007.5b.2;confirmed_est=EB994038 I Coding_transcript intron 4583375 4583425 . - . Parent=Transcript:D1007.5b.2;confirmed_est=EC038345,OSTF085G5_1 I Coding_transcript intron 4583510 4583559 . - . Parent=Transcript:D1007.5b.2;confirmed_est=EC038345,OSTF085G5_1 I Coding_transcript intron 4582719 4583189 . - . Parent=Transcript:D1007.5b.2;confirmed_est=yk1055g06.5,OSTF085G5_1 I Coding_transcript intron 4581238 4581663 . - . Parent=Transcript:D1007.5b.2;confirmed_est=yk1057e08.3 I Coding_transcript gene 4580693 4583815 . - . ID=Gene:WBGene00017003 I SAGE_tag_unambiguously_mapped SAGE_tag 4581093 4581113 . - . count=10;gene=D1007.5;sequence=SAGE:tttgcgaattacttgct;transcript=D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4580748 4580768 . - . count=112;gene=D1007.5;sequence=SAGE:ttttccattaattttga;transcript=D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4582415 4582428 . - . count=1;gene=D1007.5;sequence=SAGE:cattttcgtg;transcript=D1007.5b.2,D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4580914 4580927 . - . count=1;gene=D1007.5;sequence=SAGE:taaatttcaa;transcript=D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4581193 4581206 . - . count=1;gene=D1007.5;sequence=SAGE:tgctcgttcg;transcript=D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4583465 4583478 . - . count=1;gene=D1007.5;sequence=SAGE:tgttggcctt;transcript=D1007.5b.2,D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4583458 4583478 . - . count=1;gene=D1007.5;sequence=SAGE:tgttggccttttacttg;transcript=D1007.5b.2,D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4582533 4582553 . - . count=2;gene=D1007.5;sequence=SAGE:tgcagtgatagtccagc;transcript=D1007.5b.2,D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4581100 4581113 . - . count=2;gene=D1007.5;sequence=SAGE:tttgcgaatt;transcript=D1007.5b.1,D1007.5a I SAGE_tag_unambiguously_mapped SAGE_tag 4580755 4580768 . - . count=43;gene=D1007.5;sequence=SAGE:ttttccatta;transcript=D1007.5b.1,D1007.5a I Coding_transcript CDS 4580993 4581241 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034 I Coding_transcript CDS 4581664 4582026 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034 I Coding_transcript CDS 4582412 4582718 . - 1 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034 I Coding_transcript CDS 4583190 4583374 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034 I Coding_transcript CDS 4583426 4583509 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034 I Coding_transcript CDS 4583560 4583805 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034 I mass_spec_genome translated_nucleotide_match 4580996 4581052 . - . ID=Target:277116;Target=Mass_spec_peptide:MSP:IYEPSQEDLLLMHQLQQER 1 19 +;Note=MSP:IYEPSQEDLLLMHQLQQER;cds_matches=D1007.5a;protein_matches=WP:CE29034;times_observed=1 I mass_spec_genome translated_nucleotide_match 4581838 4581882 . - . ID=Target:277138;Target=Mass_spec_peptide:MSP:AAIHLGSWHQIEGPR 1 15 +;Note=MSP:AAIHLGSWHQIEGPR;cds_matches=D1007.5b D1007.5a;protein_matches=WP:CE33577 WP:CE29034;times_observed=1 I mass_spec_genome translated_nucleotide_match 4583581 4583601 . - . ID=Target:277176;Target=Mass_spec_peptide:MSP:TLWWLPK 1 7 +;Note=MSP:TLWWLPK;cds_matches=D1007.5b D1007.5a;protein_matches=WP:CE33577 WP:CE29034;times_observed=1 I Coding_transcript mRNA 4580693 4583811 . - . ID=Transcript:D1007.5a;Parent=Gene:WBGene00017003;cds=D1007.5a;prediction_status=Confirmed;wormpep=CE:CE29034 I Coding_transcript exon 4580993 4581241 . - 0 Parent=Transcript:D1007.5a I Coding_transcript exon 4581664 4582026 . - 0 Parent=Transcript:D1007.5a I Coding_transcript exon 4582412 4582718 . - 1 Parent=Transcript:D1007.5a I Coding_transcript exon 4583190 4583374 . - 0 Parent=Transcript:D1007.5a I Coding_transcript exon 4583426 4583509 . - 0 Parent=Transcript:D1007.5a I Coding_transcript exon 4583560 4583805 . - 0 Parent=Transcript:D1007.5a I Coding_transcript five_prime_UTR 4583806 4583811 . - . Parent=Transcript:D1007.5a I Coding_transcript three_prime_UTR 4580693 4580992 . - . Parent=Transcript:D1007.5a I Coding_transcript intron 4582027 4582411 . - . Parent=Transcript:D1007.5a;confirmed_est=EB994038 I Coding_transcript intron 4581242 4581663 . - . Parent=Transcript:D1007.5a;confirmed_est=EB994038,OSTR085G5_1 I Coding_transcript intron 4583375 4583425 . - . Parent=Transcript:D1007.5a;confirmed_est=EC038345,OSTF085G5_1 I Coding_transcript intron 4583510 4583559 . - . Parent=Transcript:D1007.5a;confirmed_est=EC038345,OSTF085G5_1 I Coding_transcript intron 4582719 4583189 . - . Parent=Transcript:D1007.5a;confirmed_est=yk1055g06.5,OSTF085G5_1 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/ensembl_gtf.txt0000664002421100242110000001245000000000000021441 0ustar00bchapmanbchapman00000000000000I snoRNA exon 3747 3909 . - . gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2"; I protein_coding exon 12764812 12764949 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12764812 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding start_codon 12764935 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding exon 12764291 12764471 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12764291 12764471 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12763979 12764102 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12763979 12764102 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12763729 12763882 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12763729 12763882 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12763448 12763655 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12763448 12763655 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12763112 12763249 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12763112 12763249 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12762648 12762806 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12762648 12762806 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12762127 12762268 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12762127 12762268 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12761799 12761953 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12761799 12761953 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12761172 12761516 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12761172 12761516 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12760834 12760904 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12760834 12760904 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12760365 12760494 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12760365 12760494 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12760227 12760319 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12760227 12760319 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12759949 12760013 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12759949 12760013 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding exon 12759579 12759828 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; I protein_coding CDS 12759748 12759828 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; I protein_coding stop_codon 12759745 12759747 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/glimmer_nokeyval.gff30000664002421100242110000000060100000000000022521 0ustar00bchapmanbchapman00000000000000##gff-version 3 ##sequence-region scaffold4215_3 1 6526 scaffold4215_3 glimmer gene 3 62 . - . ID=GL0000006;Name=GL0000006;Lack 3'-end; scaffold4215_3 glimmer mRNA 3 62 . - . ID=GL0000006;Name=GL0000006;Parent=GL0000006;Lack 3'-end; scaffold4215_3 glimmer CDS 3 62 2.84 - 0 Parent=GL0000006;Lack 3'-end; scaffold4215_3 glimmer gene 124 1983 . - . ID=GL0000007;Name=GL0000007;Complete; ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/hybrid1.gff30000664002421100242110000000124500000000000020524 0ustar00bchapmanbchapman00000000000000##gff-version 3 ##sequence-region foo 1 100 ##feature-ontology bar ##attribute-ontology baz ##source-ontology boo ##sequence-region chr17 62467934 62469545 chr17 UCSC mRNA 62467934 62469545 . - . ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1 chr17 UCSC CDS 62468039 62468236 . - 1 Parent=A00469 chr17 UCSC CDS 62468490 62468654 . - 2 Parent=A00469 chr17 UCSC CDS 62468747 62468866 . - 1 Parent=A00469 chr17 UCSC CDS 62469076 62469236 . - 1 Parent=A00469 chr17 UCSC CDS 62469497 62469506 . - 0 Parent=A00469 ### ##FASTA >chr17 GATTACA GATTACA ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1635903054.0 bcbio-gff-0.6.9/Tests/GFF/hybrid2.fa0000664002421100242110000000003300000000000020260 0ustar00bchapmanbchapman00000000000000>lcl|chr17 GATTACA GATTACA ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1635903025.0 bcbio-gff-0.6.9/Tests/GFF/hybrid2.gff30000664002421100242110000000125100000000000020522 0ustar00bchapmanbchapman00000000000000##gff-version 3 ##sequence-region foo 1 100 ##feature-ontology bar ##attribute-ontology baz ##source-ontology boo ##sequence-region chr17 62467934 62469545 chr17 UCSC mRNA 62467934 62469545 . - . ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1 chr17 UCSC CDS 62468039 62468236 . - 1 Parent=A00469 chr17 UCSC CDS 62468490 62468654 . - 2 Parent=A00469 chr17 UCSC CDS 62468747 62468866 . - 1 Parent=A00469 chr17 UCSC CDS 62469076 62469236 . - 1 Parent=A00469 chr17 UCSC CDS 62469497 62469506 . - 0 Parent=A00469 ### ##FASTA >lcl|chr17 GATTACA GATTACA ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/jgi_gff2.txt0000664002421100242110000000102600000000000020626 0ustar00bchapmanbchapman00000000000000chr_1 JGI exon 37061 37174 . - . name "fgenesh1_pg.C_chr_1000007"; transcriptId 873 chr_1 JGI CDS 37061 37174 . - 0 name "fgenesh1_pg.C_chr_1000007"; proteinId 873; exonNumber 3 chr_1 JGI exon 37315 37620 . - . name "fgenesh1_pg.C_chr_1000007"; transcriptId 873 chr_1 JGI CDS 37315 37620 . - 0 name "fgenesh1_pg.C_chr_1000007"; proteinId 873; exonNumber 2 chr_1 JGI exon 37752 38216 . - . name "fgenesh1_pg.C_chr_1000007"; transcriptId 873 chr_1 JGI CDS 37752 38216 . - 0 name "fgenesh1_pg.C_chr_1000007"; proteinId 873; exonNumber 1 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/mouse_extra_comma.gff30000664002421100242110000000245600000000000022676 0ustar00bchapmanbchapman00000000000000chr17 RefSeq gene 6797760 6818159 . + . ID=NC_000083.5:LOC100040603;Name=NC_000083.5:LOC100040603 chr17 RefSeq mRNA 6797760 6818159 . + . ID=XM_001475631.1;Parent=NC_000083.5:LOC100040603 chr17 RefSeq protein 6806527 6812289 . + . ID=;Parent=XM_001475631.1 chr17 RefSeq five_prime_UTR 6797760 6797769 . + . Parent=XM_001475631.1 chr17 RefSeq five_prime_UTR 6806513 6806526 . + . Parent=XM_001475631.1 chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1, chr17 RefSeq CDS 6808204 6808245 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1, chr17 RefSeq CDS 6811330 6811453 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1, chr17 RefSeq CDS 6811792 6811869 . + 2 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1, chr17 RefSeq CDS 6812219 6812289 . + 2 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1, chr17 RefSeq three_prime_UTR 6812290 6818159 . + . Parent=XM_001475631.1 chr17 RefSeq exon 6797760 6797769 . + . Parent=XM_001475631.1 chr17 RefSeq exon 6806513 6806553 . + . Parent=XM_001475631.1 chr17 RefSeq exon 6808204 6808245 . + . Parent=XM_001475631.1 chr17 RefSeq exon 6811330 6811453 . + . Parent=XM_001475631.1 chr17 RefSeq exon 6811792 6811869 . + . Parent=XM_001475631.1 chr17 RefSeq exon 6812219 6818159 . + . Parent=XM_001475631.1 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/ncbi_gff3.txt0000664002421100242110000001274000000000000020776 0ustar00bchapmanbchapman00000000000000##gff-version 3 ##source-version NCBI C++ formatter 0.2 ##date 2009-04-25 ##Type DNA NC_008596.1 NC_008596.1 RefSeq gene 12272 13301 . + . locus_tag=MSMEG_0013;note=ferric%20enterobactin%20transport%20system%20permease%20protein%20FepG%3B%20this%20gene%20contains%20a%20frame%20shift%20which%20is%20not%20the%20result%20of%20sequencing%20error%3B%20identified%20by%20match%20to%20protein%20family%20HMM%20PF01032;pseudo=;db_xref=GeneID:4537201 NC_008596.1 RefSeq gene 1137579 1138550 . + . ID=NC_008596.1:speB;locus_tag=MSMEG_1072;db_xref=GeneID:4535378 NC_008596.1 RefSeq CDS 1137579 1138547 . + 0 ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;exon_number=1 NC_008596.1 RefSeq start_codon 1137579 1137581 . + 0 ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;exon_number=1 NC_008596.1 RefSeq stop_codon 1138548 1138550 . + 0 ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;exon_number=1 NC_008596.1 RefSeq gene 3597069 3598112 . + . ID=NC_008596.1:speB;locus_tag=MSMEG_3535;db_xref=GeneID:4533678 NC_008596.1 RefSeq CDS 3597069 3598109 . + 0 ID=NC_008596.1:speB:unknown_transcript_2;Parent=NC_008596.1:speB;locus_tag=MSMEG_3535;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_887838.1;db_xref=GI:118470943;db_xref=GeneID:4533678;exon_number=1 NC_008596.1 RefSeq start_codon 3597069 3597071 . + 0 ID=NC_008596.1:speB:unknown_transcript_2;Parent=NC_008596.1:speB;locus_tag=MSMEG_3535;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_887838.1;db_xref=GI:118470943;db_xref=GeneID:4533678;exon_number=1 NC_008596.1 RefSeq stop_codon 3598110 3598112 . + 0 ID=NC_008596.1:speB:unknown_transcript_2;Parent=NC_008596.1:speB;locus_tag=MSMEG_3535;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_887838.1;db_xref=GI:118470943;db_xref=GeneID:4533678;exon_number=1 NC_008596.1 RefSeq gene 4460713 4461672 . - . ID=NC_008596.1:speB;locus_tag=MSMEG_4374;db_xref=GeneID:4535424 NC_008596.1 RefSeq CDS 4460716 4461672 . - 0 ID=NC_008596.1:speB:unknown_transcript_3;Parent=NC_008596.1:speB;locus_tag=MSMEG_4374;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888649.1;db_xref=GI:118469662;db_xref=GeneID:4535424;exon_number=1 NC_008596.1 RefSeq start_codon 4461670 4461672 . - 0 ID=NC_008596.1:speB:unknown_transcript_3;Parent=NC_008596.1:speB;locus_tag=MSMEG_4374;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888649.1;db_xref=GI:118469662;db_xref=GeneID:4535424;exon_number=1 NC_008596.1 RefSeq stop_codon 4460713 4460715 . - 0 ID=NC_008596.1:speB:unknown_transcript_3;Parent=NC_008596.1:speB;locus_tag=MSMEG_4374;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888649.1;db_xref=GI:118469662;db_xref=GeneID:4535424;exon_number=1 NC_008596.1 RefSeq gene 4539385 4540344 . + . ID=NC_008596.1:speB;locus_tag=MSMEG_4459;db_xref=GeneID:4537057 NC_008596.1 RefSeq CDS 4539385 4540341 . + 0 ID=NC_008596.1:speB:unknown_transcript_4;Parent=NC_008596.1:speB;locus_tag=MSMEG_4459;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888732.1;db_xref=GI:118472833;db_xref=GeneID:4537057;exon_number=1 NC_008596.1 RefSeq start_codon 4539385 4539387 . + 0 ID=NC_008596.1:speB:unknown_transcript_4;Parent=NC_008596.1:speB;locus_tag=MSMEG_4459;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888732.1;db_xref=GI:118472833;db_xref=GeneID:4537057;exon_number=1 NC_008596.1 RefSeq stop_codon 4540342 4540344 . + 0 ID=NC_008596.1:speB:unknown_transcript_4;Parent=NC_008596.1:speB;locus_tag=MSMEG_4459;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888732.1;db_xref=GI:118472833;db_xref=GeneID:4537057;exon_number=1 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633526505.0 bcbio-gff-0.6.9/Tests/GFF/problem_sequence_region.gff30000664002421100242110000000064400000000000024057 0ustar00bchapmanbchapman00000000000000##gff-version 3 #!gff-spec-version 1.21 #!processor NCBI annotwriter ##sequence-region 1 2482535 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1282 1 Local region 1 2482535 . + . ID=1:1..2482535;Dbxref=taxon:1282;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA 1 . gene 1 1356 . + . ID=gene-test_000001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/spaces.gff30000664002421100242110000000043200000000000020435 0ustar00bchapmanbchapman00000000000000##gff-version 3 contig1 . gene 1544 2057 . - . ID=contig1.1 contig1 . mRNA 1544 2057 . - . ID=mRNA.contig1.1;Parent=contig1.1 contig1 . mRNA 1544 2057 . - . foo=bar;ID=mRNA.contig1.1;Parent=contig1.1 contig1 . mRNA 1544 2057 . - . ID=mRNA.contig1.1;Parent=contig1.1; foo=bar ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/trans_splicing.gff30000664002421100242110000000151700000000000022203 0ustar00bchapmanbchapman000000000000001 manual gene 9559 9672 . + . ID=gene83;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8-gene;exception=trans-splicing 1 manual gene 112442 113241 . + . ID=gene84;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8-gene;exception=trans-splicing 1 manual mRNA 9559 9672 . + . ID=mRNA43;Parent=gene83,gene84;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8;exception=trans-splicing 1 manual mRNA 112442 113241 . + . ID=mRNA43;Parent=gene83,gene84;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8;exception=trans-splicing 1 manual exon 9559 9672 . + . Parent=mRNA43 1 manual CDS 9559 9672 . + 0 Parent=mRNA43 1 manual exon 112442 112673 . + . Parent=mRNA43 1 manual CDS 112442 112673 . + 0 Parent=mRNA43 1 manual intron 112674 113215 . + . Parent=mRNA43 1 manual exon 113216 113241 . + . Parent=mRNA43 1 manual CDS 113216 113241 . + 2 Parent=mRNA43 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/transcripts.gff30000664002421100242110000000167700000000000021547 0ustar00bchapmanbchapman00000000000000##gff-version 3 ##date 2013-11-13 edit_test.fa . gene 500 2610 . + . ID=newGene edit_test.fa . mRNA 500 2385 . + . Parent=newGene;Namo=reinhard+did+this;Name=t1%28newGene%29;ID=t1;uri=http%3A//www.yahoo.com edit_test.fa . five_prime_UTR 500 802 . + . Parent=t1 edit_test.fa . CDS 803 1012 . + . Parent=t1 edit_test.fa . three_prime_UTR 1013 1168 . + . Parent=t1 edit_test.fa . three_prime_UTR 1475 1654 . + . Parent=t1 edit_test.fa . three_prime_UTR 1720 1908 . + . Parent=t1 edit_test.fa . three_prime_UTR 2047 2385 . + . Parent=t1 edit_test.fa . mRNA 1050 2610 . + . Parent=newGene;Name=t2%28newGene%29;ID=t2 edit_test.fa . CDS 1050 1196 . + . Parent=t2 edit_test.fa . CDS 1472 1651 . + . Parent=t2 edit_test.fa . CDS 1732 2610 . + . Parent=t2 edit_test.fa . mRNA 1050 2610 . + . Parent=newGene;Name=t3%28newGene%29;ID=t3 edit_test.fa . CDS 1050 1196 . + . Parent=t3 edit_test.fa . CDS 1472 1651 . + . Parent=t3 edit_test.fa . CDS 1732 2610 . + . Parent=t3 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/unescaped-semicolon.gff30000664002421100242110000000032100000000000023111 0ustar00bchapmanbchapman00000000000000##gff-version 3 chr1 . gene 1 100 . + . ID=PH01000020G1780;Description="osFTL6 FT-Like6 homologous to Flowering Locus T gene; contains Pfam profile PF01161: Phosphatidylethanolamine-binding protein, expressed"././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/wormbase_gff2.txt0000664002421100242110000001464000000000000021702 0ustar00bchapmanbchapman00000000000000I Genomic_canonical region 1 2679 . + . Sequence "cTel33B" ; Note "Clone cTel33B; Genbank AC199162" ; Note "Clone cTel33B; Genbank AC199162" I Coding_transcript Transcript 12759582 12764949 . - . Transcript "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138" ; CDS "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138" I Coding_transcript intron 12759829 12759948 . - . Transcript "B0019.1" ; Confirmed_EST EC034652 I Coding_transcript intron 12760014 12760226 . - . Transcript "B0019.1" ; Confirmed_EST EC034652 I Coding_transcript intron 12760320 12760364 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.3 I Coding_transcript intron 12760495 12760833 . - . Transcript "B0019.1" ; Confirmed_EST EC027594 I Coding_transcript intron 12760905 12761171 . - . Transcript "B0019.1" ; Confirmed_EST EC027594 I Coding_transcript intron 12761517 12761798 . - . Transcript "B0019.1" ; Confirmed_EST EC027594 I Coding_transcript intron 12761954 12762126 . - . Transcript "B0019.1" ; Confirmed_EST yk262g9.5 I Coding_transcript intron 12762269 12762647 . - . Transcript "B0019.1" ; Confirmed_EST yk262g9.5 I Coding_transcript intron 12762807 12763111 . - . Transcript "B0019.1" ; Confirmed_EST yk1056c07.5 I Coding_transcript intron 12763250 12763447 . - . Transcript "B0019.1" ; Confirmed_EST yk1056c07.5 I Coding_transcript intron 12763656 12763728 . - . Transcript "B0019.1" ; Confirmed_EST yk1056c07.5 I Coding_transcript intron 12763883 12763978 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.5 ; Confirmed_EST OSTF088D9_1 I Coding_transcript intron 12764103 12764290 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.5 ; Confirmed_EST OSTF088D9_1 I Coding_transcript intron 12764472 12764811 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.5 ; Confirmed_EST OSTF088D9_1 I Coding_transcript exon 12759582 12759828 . - . Transcript "B0019.1" I Coding_transcript exon 12759949 12760013 . - . Transcript "B0019.1" I Coding_transcript exon 12760227 12760319 . - . Transcript "B0019.1" I Coding_transcript exon 12760365 12760494 . - . Transcript "B0019.1" I Coding_transcript exon 12760834 12760904 . - . Transcript "B0019.1" I Coding_transcript exon 12761172 12761516 . - . Transcript "B0019.1" I Coding_transcript exon 12761799 12761953 . - . Transcript "B0019.1" I Coding_transcript exon 12762127 12762268 . - . Transcript "B0019.1" I Coding_transcript exon 12762648 12762806 . - . Transcript "B0019.1" I Coding_transcript exon 12763112 12763249 . - . Transcript "B0019.1" I Coding_transcript exon 12763448 12763655 . - . Transcript "B0019.1" I Coding_transcript exon 12763729 12763882 . - . Transcript "B0019.1" I Coding_transcript exon 12763979 12764102 . - . Transcript "B0019.1" I Coding_transcript exon 12764291 12764471 . - . Transcript "B0019.1" I Coding_transcript exon 12764812 12764949 . - . Transcript "B0019.1" I SAGE_tag_unambiguously_mapped SAGE_tag 12761492 12761512 . - . Sequence SAGE:aacggagccgtacacgc;count 5;Gene amx-2;Transcript B0019.1 I SAGE_tag_most_three_prime SAGE_tag 12761499 12761512 . - . Sequence SAGE:aacggagccg;count 9;Gene amx-2;Transcript B0019.1 I mass_spec_genome translated_nucleotide_match 12761920 12761953 . - . Target "Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK" 10 21 ; Note "MSP:FADFSPLDVSDVNFATDDLAK" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "3" I mass_spec_genome translated_nucleotide_match 12762127 12762155 . - . Target "Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK" 1 10 ; Note "MSP:FADFSPLDVSDVNFATDDLAK" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "3" I mass_spec_genome translated_nucleotide_match 12763506 12763559 . - . Target "Mass_spec_peptide:MSP:FGHGQSLLAQGGMNEVVR" 1 18 ; Note "MSP:FGHGQSLLAQGGMNEVVR" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "1" I SAGE_tag_unambiguously_mapped SAGE_tag 12763533 12763553 . - . Sequence SAGE:ggcagagtcttttggca;count 1;Gene amx-2;Transcript B0019.1 I mass_spec_genome translated_nucleotide_match 12764361 12764411 . - . Target "Mass_spec_peptide:MSP:NIQQNRPGLSVLVLEAR" 1 17 ; Note "MSP:NIQQNRPGLSVLVLEAR" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "2" I GenePair_STS PCR_product 12762449 12764118 . + . PCR_product "sjj_B0019.1" I Expr_profile experimental_result_region 12762449 12764118 . + . Expr_profile "B0019.1" I Allele SNP 12764272 12764272 . + . Variation "snp_B0019[1]" ; Interpolated_map_position "14.003" ; ; RFLP "No" I Promoterome PCR_product 12764938 12766937 . + . PCR_product "p_B0019.1_93" I Oligo_set reagent 12759745 12761589 . - . Oligo_set "Aff_B0019.1" I Orfeome PCR_product 12759747 12764936 . - . PCR_product "mv_B0019.1" ; Amplified 1 ; Amplified 1 I Coding_transcript three_prime_UTR 12759582 12759744 . - . Transcript "B0019.1" I Coding_transcript coding_exon 12759745 12759828 . - 0 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12759949 12760013 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12760227 12760319 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12760365 12760494 . - 0 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12760834 12760904 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12761172 12761516 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12761799 12761953 . - 1 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12762127 12762268 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12762648 12762806 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12763112 12763249 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12763448 12763655 . - 0 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12763729 12763882 . - 1 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12763979 12764102 . - 2 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript coding_exon 12764291 12764471 . - 0 Transcript "B0019.1" ; CDS "B0019.1" I Coding_transcript five_prime_UTR 12764938 12764949 . - . Transcript "B0019.1" I Coding_transcript coding_exon 12764812 12764937 . - 0 Transcript "B0019.1" ; CDS "B0019.1" X SAGE_tag SAGE_tag 6819353 6819366 . + . Sequence SAGE:aacggagccg;count 9;Gene amx-2;Transcript B0019.1 X gene processed_transcript 944828 948883 . - . Gene "WBGene00004893" ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/Tests/GFF/wormbase_gff2_alt.txt0000664002421100242110000000121500000000000022534 0ustar00bchapmanbchapman00000000000000Remanei_genome Genomic_canonical region 1 7816 . + . Sequence "Contig1020"; Contig102 WU_MERGED CDS 1629 3377 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED coding_exon 2927 3377 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED coding_exon 2474 2875 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED coding_exon 1928 2430 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED coding_exon 1629 1883 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED intron 2876 2926 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED intron 2431 2473 . - . CDS "cr01.sctg102.wum.2.1" Contig102 WU_MERGED intron 1884 1927 . - . CDS "cr01.sctg102.wum.2.1" ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1635903471.0 bcbio-gff-0.6.9/Tests/test_GFFSeqIOFeatureAdder.py0000664002421100242110000007171000000000000023205 0ustar00bchapmanbchapman00000000000000"""Test decoration of existing SeqRecords with GFF through a SeqIO interface. """ import sys import os import unittest import pprint import six from six import StringIO from Bio import SeqIO from BCBio import GFF from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.SeqFeature import SeqFeature, FeatureLocation from BCBio.GFF import (GFFExaminer, GFFParser, DiscoGFFParser) class MapReduceGFFTest(unittest.TestCase): """Tests GFF parsing using a map-reduce framework for parallelization. """ def setUp(self): self._test_dir = os.path.join(os.path.dirname(__file__), "GFF") self._test_gff_file = os.path.join(self._test_dir, "c_elegans_WS199_shortened_gff.txt") self._disco_host = "http://localhost:7000" def t_local_map_reduce(self): """General map reduce framework without parallelization. """ cds_limit_info = dict(gff_type=["gene", "mRNA", "CDS"], gff_id=['I']) rec_dict = SeqIO.to_dict(GFF.parse(self._test_gff_file, limit_info=cds_limit_info)) test_rec = rec_dict['I'] assert len(test_rec.features) == 32 def t_disco_map_reduce(self): """Map reduce framework parallelized using disco. """ # this needs to be more generalized but fails okay with no disco try: import disco import simplejson except ImportError: print("Skipping -- disco and json not found") return cds_limit_info = dict( gff_source_type=[('Non_coding_transcript', 'gene'), ('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')], gff_id=['I'] ) parser = DiscoGFFParser(disco_host=self._disco_host) rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, limit_info=cds_limit_info)) final_rec = rec_dict['I'] # second gene feature is multi-parent assert len(final_rec.features) == 2 # two gene feature class GFF3Test(unittest.TestCase): """Real live GFF3 tests from WormBase and NCBI. Uses GFF3 data from: ftp://ftp.wormbase.org/pub/wormbase/genomes/c_elegans/ genome_feature_tables/GFF3/ ftp://ftp.wormbase.org/pub/wormbase/genomes/c_elegans/sequences/dna/ and from NCBI. """ def setUp(self): self._test_dir = os.path.join(os.path.dirname(__file__), "GFF") self._test_seq_file = os.path.join(self._test_dir, "c_elegans_WS199_dna_shortened.fa") self._test_gff_file = os.path.join(self._test_dir, "c_elegans_WS199_shortened_gff.txt") self._test_gff_ann_file = os.path.join(self._test_dir, "c_elegans_WS199_ann_gff.txt") self._full_dir = "/usr/home/chapmanb/mgh/ruvkun_rnai/wormbase/" + \ "data_files_WS198" self._test_ncbi = os.path.join(self._test_dir, "ncbi_gff3.txt") def not_t_full_celegans(self): """Test the full C elegans chromosome and GFF files. This is used to test GFF on large files and is not run as a standard test. You will need to download the files and adjust the paths to run this. """ # read the sequence information seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa") gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3") seq_handle = open(seq_file) seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta")) seq_handle.close() #with open(gff_file) as gff_handle: # possible_limits = feature_adder.available_limits(gff_handle) # pprint.pprint(possible_limits) rnai_types = [('Orfeome', 'PCR_product'), ('GenePair_STS', 'PCR_product'), ('Promoterome', 'PCR_product')] gene_types = [('Non_coding_transcript', 'gene'), ('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')] limit_info = dict(gff_source_type=rnai_types + gene_types) for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info): pass def _get_seq_dict(self): """Internal reusable function to get the sequence dictionary. """ seq_handle = open(self._test_seq_file) seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta")) seq_handle.close() return seq_dict def t_possible_limits(self): """Calculate possible queries to limit a GFF file. """ gff_examiner = GFFExaminer() possible_limits = gff_examiner.available_limits(self._test_gff_file) print() pprint.pprint(possible_limits) def t_parent_child(self): """Summarize parent-child relationships in a GFF file. """ gff_examiner = GFFExaminer() pc_map = gff_examiner.parent_child_map(self._test_gff_file) print() pprint.pprint(pc_map) def t_parent_child_file_modes(self): """Summarize parent-child relationships in a GFF file. """ gff_examiner = GFFExaminer() # Use the loaded-from-filename as reference pc_map = gff_examiner.parent_child_map(self._test_gff_file) with open(self._test_gff_file, "rt") as handle: assert pc_map == gff_examiner.parent_child_map(handle) with open(self._test_gff_file, "rb") as handle: if six.PY2: assert pc_map == gff_examiner.parent_child_map(handle) else: try: gff_examiner.parent_child_map(handle) except TypeError as e: assert str(e) == "input handle must be opened in text mode", e else: assert False, "expected TypeError to be raised" def t_flat_features(self): """Check addition of flat non-nested features to multiple records. """ seq_dict = self._get_seq_dict() pcr_limit_info = dict( gff_source_type=[('Orfeome', 'PCR_product'), ('GenePair_STS', 'PCR_product'), ('Promoterome', 'PCR_product')] ) parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict, limit_info=pcr_limit_info)) assert len(rec_dict['I'].features) == 4 assert len(rec_dict['X'].features) == 5 def t_nested_features(self): """Check three-deep nesting of features with gene, mRNA and CDS. """ seq_dict = self._get_seq_dict() cds_limit_info = dict( gff_source_type=[('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')], gff_id=['I'] ) parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict, limit_info=cds_limit_info)) final_rec = rec_dict['I'] # first gene feature is plain assert len(final_rec.features) == 2 # two gene feature assert len(final_rec.features[0].sub_features) == 1 # one transcript # 15 final CDS regions assert len(final_rec.features[0].sub_features[0].sub_features) == 15 def t_nested_multiparent_features(self): """Verify correct nesting of features with multiple parents. """ seq_dict = self._get_seq_dict() cds_limit_info = dict( gff_source_type=[('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')], gff_id=['I'] ) parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict, limit_info=cds_limit_info)) final_rec = rec_dict['I'] # second gene feature is multi-parent assert len(final_rec.features) == 2 # two gene feature cur_subs = final_rec.features[1].sub_features assert len(cur_subs) == 3 # three transcripts # the first and second transcript have the same CDSs assert len(cur_subs[0].sub_features) == 6 assert len(cur_subs[1].sub_features) == 6 assert cur_subs[0].sub_features[0] is cur_subs[1].sub_features[0] def t_no_dict_error(self): """Ensure an error is raised when no dictionary to map to is present. """ parser = GFFParser(create_missing=False) try: for rec in parser.parse(self._test_gff_file): pass # no error -- problem raise AssertionError('Did not complain with missing dictionary') except KeyError: pass def t_unknown_seq(self): """Prepare unknown base sequences with the correct length. """ rec_dict = SeqIO.to_dict(GFF.parse(self._test_gff_file)) assert len(rec_dict["I"].seq) == 12766937 assert len(rec_dict["X"].seq) == 17718531 def t_gff_annotations(self): """Check GFF annotations placed on an entire sequence. """ parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_ann_file)) final_rec = rec_dict['I'] assert len(final_rec.annotations.keys()) == 2 assert final_rec.annotations['source'] == ['Expr_profile'] assert final_rec.annotations['expr_profile'] == ['B0019.1'] def t_gff3_iterator(self): """Iterated parsing in GFF3 files with nested features. """ parser = GFFParser() recs = [r for r in parser.parse_in_parts(self._test_gff_file, target_lines=70)] # should be one big set because we don't have a good place to split assert len(recs) == 6 assert len(recs[0].features) == 59 def t_gff3_iterator_limit(self): """Iterated interface using a limit query on GFF3 files. """ cds_limit_info = dict( gff_source_type=[('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')], gff_id=['I'] ) parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, limit_info=cds_limit_info)) assert len(rec_dict) == 1 tfeature = rec_dict["I"].features[0].sub_features[0] for sub_test in tfeature.sub_features: assert sub_test.type == "CDS", sub_test def t_gff3_noval_attrib(self): """Parse GFF3 file from NCBI with a key/value pair with no value. """ parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi)) assert len(rec_dict) == 1 t_feature = list(rec_dict.values())[0].features[0] assert t_feature.qualifiers["pseudo"] == ["true"] def t_gff3_multiple_ids(self): """Deal with GFF3 with non-unique ID attributes, using NCBI example. """ parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi)) assert len(rec_dict) == 1 t_features = list(rec_dict.values())[0].features[1:] # 4 feature sets, same ID, different positions, different attributes assert len(t_features) == 4 for f in t_features: assert len(f.sub_features) == 3 def t_simple_parsing(self): """Parse GFF into a simple line by line dictionary without nesting. """ parser = GFFParser() num_lines = 0 for line_info in parser.parse_simple(self._test_gff_file): num_lines += 1 assert num_lines == 177, num_lines line_info = line_info['child'][0] assert line_info['quals']['confirmed_est'] == \ ['yk1055g06.5', 'OSTF085G5_1'] assert line_info['location'] == [4582718, 4583189] def t_simple_parsing_nesting(self): """Simple parsing for lines with nesting, using the simplified API. """ test_gff = os.path.join(self._test_dir, "transcripts.gff3") num_lines = 0 for line_info in GFF.parse_simple(test_gff): num_lines += 1 assert num_lines == 16, num_lines def t_extra_comma(self): """Correctly handle GFF3 files with extra trailing commas. """ tfile = os.path.join(self._test_dir, "mouse_extra_comma.gff3") in_handle = open(tfile) for rec in GFF.parse(in_handle): pass in_handle.close() tested = False for sub_top in rec.features[0].sub_features: for sub in sub_top.sub_features: if sub.qualifiers.get("Name", "") == ["CDS:NC_000083.5:LOC100040603"]: tested = True assert len(sub.qualifiers["Parent"]) == 1 assert tested, "Did not find sub-feature to test" def t_novalue_key(self): """Handle GFF3 files with keys and no values. """ tfile = os.path.join(self._test_dir, "glimmer_nokeyval.gff3") rec = six.next(GFF.parse(tfile)) f1, f2 = rec.features assert f1.qualifiers['ID'] == ['GL0000006'] assert len(f1.sub_features) == 2 assert f1.sub_features[0].qualifiers["Lack 3'-end"] == ["true"] assert not "ID" in f1.sub_features[0].qualifiers assert f2.qualifiers["Complete"] == ["true"] def t_key_whitespace(self): """Fix keys with problematic whitespace. """ tfile = os.path.join(self._test_dir, "spaces.gff3") for i, line_info in enumerate(GFF.parse_simple(tfile)): if i > 2: assert line_info["quals"]["foo"] == ["bar"] def t_trans_spliicing(self): """Parsing of transspliced genes from GFF3 spec where child locations don't match to parents. """ fname = os.path.join(self._test_dir, "trans_splicing.gff3") with open(fname) as in_handle: rec = six.next(GFF.parse(in_handle)) assert len(rec.features) == 2 assert rec.features[0].id == "gene83" assert len(rec.features[0].sub_features) == 2 assert len(rec.features[0].sub_features[0].sub_features) == 7 assert rec.features[1].id == "gene84" assert len(rec.features[1].sub_features) == 2 assert len(rec.features[1].sub_features[0].sub_features) == 7 class SolidGFFTester(unittest.TestCase): """Test reading output from SOLiD analysis, as GFF3. See more details on SOLiD GFF here: http://solidsoftwaretools.com/gf/project/matogff/ """ def setUp(self): self._test_dir = os.path.join(os.path.dirname(__file__), "GFF") self._test_gff_file = os.path.join(self._test_dir, "F3-unique-3.v2.gff") def t_basic_solid_parse(self): """Basic parsing of SOLiD GFF results files. """ parser = GFFParser() rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file)) test_feature = rec_dict['3_341_424_F3'].features[0] assert test_feature.location.nofuzzy_start == 102716 assert test_feature.location.nofuzzy_end == 102736 assert len(test_feature.qualifiers) == 7 assert test_feature.qualifiers['score'] == ['10.6'] assert test_feature.qualifiers['source'] == ['solid'] assert test_feature.strand == -1 assert test_feature.type == 'read' assert test_feature.qualifiers['g'] == ['T2203031313223113212'] assert len(test_feature.qualifiers['q']) == 20 def t_solid_iterator(self): """Iterated parsing in a flat file without nested features. """ parser = GFFParser() feature_sizes = [] for rec in parser.parse_in_parts(self._test_gff_file, target_lines=5): feature_sizes.append(len(rec.features)) assert len(feature_sizes) == 112 assert max(feature_sizes) == 1 def t_line_adjust(self): """Adjust lines during parsing to fix potential GFF problems. """ def adjust_fn(results): rec_index = results['quals']['i'][0] read_name = results['rec_id'] results['quals']['read_name'] = [read_name] results['rec_id'] = rec_index return results parser = GFFParser(line_adjust_fn=adjust_fn) recs = [r for r in parser.parse(self._test_gff_file)] assert len(recs) == 1 work_rec = recs[0] assert work_rec.id == '1' assert len(work_rec.features) == 112 assert work_rec.features[0].qualifiers['read_name'] == \ ['3_336_815_F3'] class GFF2Tester(unittest.TestCase): """Parse GFF2 and GTF files, building features. """ def setUp(self): self._test_dir = os.path.join(os.path.dirname(__file__), "GFF") self._ensembl_file = os.path.join(self._test_dir, "ensembl_gtf.txt") self._wormbase_file = os.path.join(self._test_dir, "wormbase_gff2.txt") self._jgi_file = os.path.join(self._test_dir, "jgi_gff2.txt") self._wb_alt_file = os.path.join(self._test_dir, "wormbase_gff2_alt.txt") def t_basic_attributes(self): """Parse out basic attributes of GFF2 from Ensembl GTF. """ limit_info = dict(gff_source_type=[('snoRNA', 'exon')]) rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file, limit_info=limit_info)) work_rec = rec_dict['I'] assert len(work_rec.features) == 1 test_feature = work_rec.features[0] qual_keys = list(test_feature.qualifiers.keys()) qual_keys.sort() assert qual_keys == [ 'Parent', 'exon_number', 'gene_id', 'gene_name', 'source', 'transcript_id', 'transcript_name' ] assert test_feature.qualifiers['source'] == ['snoRNA'] assert test_feature.qualifiers['transcript_name'] == ['NR_001477.2'] assert test_feature.qualifiers['exon_number'] == ['1'] def t_tricky_semicolons(self): """Parsing of tricky semi-colon positions in WormBase GFF2. """ limit_info = dict(gff_source_type=[('Genomic_canonical', 'region')]) rec_dict = SeqIO.to_dict(GFF.parse(self._wormbase_file, limit_info=limit_info)) work_rec = rec_dict['I'] assert len(work_rec.features) == 1 test_feature = work_rec.features[0] assert test_feature.qualifiers['Note'] == \ ['Clone cTel33B; Genbank AC199162', 'Clone cTel33B; Genbank AC199162'], test_feature.qualifiers["Note"] def t_unescaped_semicolons(self): """Parse inputs with unescaped semi-colons. This is a band-aid to not fail rather than correct parsing, since the combined feature will not be maintained. """ f = os.path.join(self._test_dir, "unescaped-semicolon.gff3") rec_dict = SeqIO.to_dict(GFF.parse(f)) f = rec_dict['chr1'].features[0] assert f.qualifiers["Description"][0].startswith('osFTL6') assert f.qualifiers["Description"][0].endswith('protein, expressed') def t_jgi_gff(self): """Parsing of JGI formatted GFF2, nested using transcriptId and proteinID """ rec_dict = SeqIO.to_dict(GFF.parse(self._jgi_file)) tfeature = rec_dict['chr_1'].features[0] assert tfeature.location.nofuzzy_start == 37060 assert tfeature.location.nofuzzy_end == 38216 assert tfeature.type == 'inferred_parent' assert len(tfeature.sub_features) == 6 sfeature = tfeature.sub_features[1] assert sfeature.qualifiers['proteinId'] == ['873'] assert sfeature.qualifiers['phase'] == ['0'] def t_ensembl_nested_features(self): """Test nesting of features with GFF2 files using transcript_id. XXX sub_features no longer supported in Biopython """ rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file)) assert len(rec_dict["I"].features) == 2 t_feature = rec_dict["I"].features[0] #assert len(t_feature.sub_features) == 32, len(t_feature.sub_features) def t_wormbase_nested_features(self): """Test nesting of features with GFF2 files using Transcript only. """ rec_dict = SeqIO.to_dict(GFF.parse(self._wormbase_file)) assert len(rec_dict) == 3 parent_features = [f for f in rec_dict["I"].features if f.type == "Transcript"] assert len(parent_features) == 1 inferred_features = [f for f in rec_dict["I"].features if f.type == "inferred_parent"] assert len(inferred_features) == 0 tfeature = parent_features[0] assert tfeature.qualifiers["WormPep"][0] == "WP:CE40797" assert len(tfeature.sub_features) == 46 def t_wb_cds_nested_features(self): """Nesting of GFF2 features with a flat CDS key value pair. """ rec_dict = SeqIO.to_dict(GFF.parse(self._wb_alt_file)) assert len(rec_dict) == 2 features = list(rec_dict.values())[0].features assert len(features) == 1 tfeature = features[0] assert tfeature.id == "cr01.sctg102.wum.2.1" assert len(tfeature.sub_features) == 7 def t_gff2_iteration(self): """Test iterated features with GFF2 files, breaking without parents. """ recs = [] for rec in GFF.parse(self._wormbase_file, target_lines=15): recs.append(rec) assert len(recs) == 4 assert recs[0].features[0].type == 'region' assert recs[0].features[1].type == 'SAGE_tag' assert len(recs[0].features[2].sub_features) == 29 class DirectivesTest(unittest.TestCase): """Tests for parsing directives and other meta-data. """ def setUp(self): self._test_dir = os.path.join(os.path.dirname(__file__), "GFF") self._gff_file = os.path.join(self._test_dir, "hybrid1.gff3") self._ncbi_gff = os.path.join(self._test_dir, "hybrid2.gff3") self._ncbi_fa = os.path.join(self._test_dir, "hybrid2.fa") self._problem_seq_region_file = os.path.join(self._test_dir, "problem_sequence_region.gff3") def t_basic_directives(self): """Parse out top level meta-data supplied in a GFF3 file. """ recs = SeqIO.to_dict(GFF.parse(self._gff_file)) anns = recs['chr17'].annotations assert anns['gff-version'] == ['3'] assert anns['attribute-ontology'] == ['baz'] assert anns['feature-ontology'] == ['bar'] assert anns['source-ontology'] == ['boo'] assert anns['sequence-region'] == [('foo', 0, 100), ('chr17', 62467933, 62469545)] def t_fasta_directive(self): """Parse FASTA sequence information contained in a GFF3 file. """ recs = SeqIO.to_dict(GFF.parse(self._gff_file)) assert len(recs) == 1 test_rec = recs['chr17'] assert str(test_rec.seq) == "GATTACAGATTACA" def t_fasta_directive_w_ncbi(self): """Parse FASTA sequence information contained in a GFF3 file with NCBI style IDs. """ with open(self._ncbi_fa) as seq_handle: seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta")) recs = SeqIO.to_dict(GFF.parse(self._ncbi_gff, seq_dict)) assert len(recs) == 1 test_rec = recs['lcl|chr17'] assert str(test_rec.seq) == "GATTACAGATTACA" def t_fasta_directive_w_ncbi_fa(self): """Parse FASTA sequence information contained in a separate file with NCBI style IDs. """ recs = SeqIO.to_dict(GFF.parse(self._ncbi_gff)) assert len(recs) == 1 test_rec = recs['chr17'] assert str(test_rec.seq) == "GATTACAGATTACA" def t_examiner_with_fasta(self): """Perform high level examination of files with FASTA directives. """ examiner = GFFExaminer() pc_map = examiner.parent_child_map(self._gff_file) assert pc_map[('UCSC', 'mRNA')] == [('UCSC', 'CDS')] limits = examiner.available_limits(self._gff_file) assert list(limits['gff_id'].keys())[0][0] == 'chr17' assert sorted(limits['gff_source_type'].keys()) == \ [('UCSC', 'CDS'), ('UCSC', 'mRNA')] def t_problem_sequence_region(self): """Avoid issues with sequence region directives lacking contigs """ recs = SeqIO.to_dict(GFF.parse(self._problem_seq_region_file)) anns = recs['1'].annotations assert anns['gff-version'] == ['3'] assert anns['sequence-region'] == [(0, 2482535)] class OutputTest(unittest.TestCase): """Tests to write SeqFeatures to GFF3 output format. """ def setUp(self): self._test_dir = os.path.join(os.path.dirname(__file__), "GFF") self._test_seq_file = os.path.join(self._test_dir, "c_elegans_WS199_dna_shortened.fa") self._test_gff_file = os.path.join(self._test_dir, "c_elegans_WS199_shortened_gff.txt") self._test_gff_ann_file = os.path.join(self._test_dir, "c_elegans_WS199_ann_gff.txt") self._wormbase_file = os.path.join(self._test_dir, "wormbase_gff2.txt") def t_gff3_to_gff3(self): """Read in and write out GFF3 without any loss of information. """ recs = SeqIO.to_dict(GFF.parse(self._test_gff_file)) out_handle = StringIO() GFF.write(recs.values(), out_handle) wrote_handle = StringIO(out_handle.getvalue()) recs_two = SeqIO.to_dict(GFF.parse(wrote_handle)) orig_rec = list(recs.values())[0] re_rec = list(recs.values())[0] assert len(orig_rec.features) == len(re_rec.features) for i, orig_f in enumerate(orig_rec.features): assert str(orig_f) == str(re_rec.features[i]) def t_gff2_to_gff3(self): """Read in GFF2 and write out as GFF3. """ recs = SeqIO.to_dict(GFF.parse(self._wormbase_file)) out_handle = StringIO() GFF.write(recs.values(), out_handle) wrote_handle = StringIO(out_handle.getvalue()) # check some tricky lines in the GFF2 file checks = 0 for line in wrote_handle: if line.find("Interpolated_map_position") >= 0: checks += 1 assert line.find("RFLP=No") > 0 if line.find("Gene=WBGene00000138") > 0: checks += 1 assert line.find("ID=B0019.1") > 0 if line.find("translated_nucleotide_match\t12762127") > 0: checks += 1 assert line.find("Note=MSP:FADFSPLDVSDVNFATDDLAK") > 0 assert checks == 3, "Missing check line" def t_write_from_recs(self): """Write out GFF3 from SeqRecord inputs. """ seq = Seq("GATCGATCGATCGATCGATC") rec = SeqRecord(seq, "ID1") qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"} sub_qualifiers = {"source": "prediction"} top_feature = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers) top_feature.sub_features = [ SeqFeature(FeatureLocation(0, 5), type="exon", strand=1, qualifiers=sub_qualifiers), SeqFeature(FeatureLocation(15, 20), type="exon", strand=1, qualifiers=sub_qualifiers) ] rec.features = [top_feature] out_handle = StringIO() GFF.write([rec], out_handle) wrote_info = out_handle.getvalue().split("\n") assert wrote_info[0] == "##gff-version 3" assert wrote_info[1] == "##sequence-region ID1 1 20" print(wrote_info[2].split("\t")) assert wrote_info[2].split("\t") == [ 'ID1', 'prediction', 'gene', '1', '20', '10.0', '+', '.', 'ID=gene1;other=Some,annotations' ] assert wrote_info[3].split("\t") == ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.', 'Parent=gene1'] def t_write_fasta(self): """Include FASTA records in GFF output. """ seq = Seq("GATCGATCGATCGATCGATC") rec = SeqRecord(seq, "ID1") qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"} rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers)] out_handle = StringIO() GFF.write([rec], out_handle, include_fasta=True) wrote_info = out_handle.getvalue().split("\n") fasta_parts = wrote_info[3:] assert fasta_parts[0] == "##FASTA" assert fasta_parts[1] == ">ID1 " assert fasta_parts[2] == str(seq) def t_write_seqrecord(self): """Write single SeqRecords. """ seq = Seq("GATCGATCGATCGATCGATC") rec = SeqRecord(seq, "ID1") qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"} rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers)] out_handle = StringIO() GFF.write([rec], out_handle, include_fasta=True) wrote_info = out_handle.getvalue().split("\n") gff_line = wrote_info[2] assert gff_line.split("\t")[0] == "ID1" def run_tests(argv): test_suite = testing_suite() runner = unittest.TextTestRunner(sys.stdout, verbosity=2) runner.run(test_suite) def testing_suite(): """Generate the suite of tests. """ test_suite = unittest.TestSuite() test_loader = unittest.TestLoader() test_loader.testMethodPrefix = 't_' tests = [GFF3Test, MapReduceGFFTest, SolidGFFTester, GFF2Tester, DirectivesTest, OutputTest] #tests = [GFF3Test] for test in tests: cur_suite = test_loader.loadTestsFromTestCase(test) test_suite.addTest(cur_suite) return test_suite if __name__ == "__main__": sys.exit(run_tests(sys.argv)) ././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3726625 bcbio-gff-0.6.9/bcbio_gff.egg-info/0000775002421100242110000000000000000000000020277 5ustar00bchapmanbchapman00000000000000././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637248150.0 bcbio-gff-0.6.9/bcbio_gff.egg-info/PKG-INFO0000664002421100242110000000047300000000000021400 0ustar00bchapmanbchapman00000000000000Metadata-Version: 1.0 Name: bcbio-gff Version: 0.6.9 Summary: Read and write Generic Feature Format (GFF) with Biopython integration. Home-page: https://github.com/chapmanb/bcbb/tree/master/gff Author: Brad Chapman Author-email: chapmanb@50mail.com License: Biopython License Description: UNKNOWN Platform: UNKNOWN ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637248151.0 bcbio-gff-0.6.9/bcbio_gff.egg-info/SOURCES.txt0000664002421100242110000000207500000000000022167 0ustar00bchapmanbchapman00000000000000LICENSE MANIFEST.in README.rst distribute_setup.py setup.py BCBio/__init__.py BCBio/GFF/GFFOutput.py BCBio/GFF/GFFParser.py BCBio/GFF/__init__.py BCBio/GFF/_utils.py Scripts/gff/access_gff_index.py Scripts/gff/genbank_to_gff.py Scripts/gff/gff2_to_gff3.py Scripts/gff/gff_to_biosql.py Scripts/gff/gff_to_genbank.py Tests/test_GFFSeqIOFeatureAdder.py Tests/GFF/F3-unique-3.v2.gff Tests/GFF/c_elegans_WS199_ann_gff.txt Tests/GFF/c_elegans_WS199_dna_shortened.fa Tests/GFF/c_elegans_WS199_shortened_gff.txt Tests/GFF/ensembl_gtf.txt Tests/GFF/glimmer_nokeyval.gff3 Tests/GFF/hybrid1.gff3 Tests/GFF/hybrid2.fa Tests/GFF/hybrid2.gff3 Tests/GFF/jgi_gff2.txt Tests/GFF/mouse_extra_comma.gff3 Tests/GFF/ncbi_gff3.txt Tests/GFF/problem_sequence_region.gff3 Tests/GFF/spaces.gff3 Tests/GFF/trans_splicing.gff3 Tests/GFF/transcripts.gff3 Tests/GFF/unescaped-semicolon.gff3 Tests/GFF/wormbase_gff2.txt Tests/GFF/wormbase_gff2_alt.txt bcbio_gff.egg-info/PKG-INFO bcbio_gff.egg-info/SOURCES.txt bcbio_gff.egg-info/dependency_links.txt bcbio_gff.egg-info/requires.txt bcbio_gff.egg-info/top_level.txt././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637248150.0 bcbio-gff-0.6.9/bcbio_gff.egg-info/dependency_links.txt0000664002421100242110000000000100000000000024345 0ustar00bchapmanbchapman00000000000000 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637248150.0 bcbio-gff-0.6.9/bcbio_gff.egg-info/requires.txt0000664002421100242110000000001600000000000022674 0ustar00bchapmanbchapman00000000000000six biopython ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1637248150.0 bcbio-gff-0.6.9/bcbio_gff.egg-info/top_level.txt0000664002421100242110000000000600000000000023025 0ustar00bchapmanbchapman00000000000000BCBio ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/distribute_setup.py0000664002421100242110000004241000000000000020656 0ustar00bchapmanbchapman00000000000000#!python """Bootstrap distribute installation If you want to use setuptools in your package's setup.py, just include this file in the same directory with it, and add this to the top of your setup.py:: from distribute_setup import use_setuptools use_setuptools() If you want to require a specific version of setuptools, set a download mirror, or use an alternate download directory, you can do so by supplying the appropriate options to ``use_setuptools()``. This file can also be run as a script to install or upgrade setuptools. """ import os import shutil import sys import time import fnmatch import tempfile import tarfile import optparse from distutils import log try: from site import USER_SITE except ImportError: USER_SITE = None try: import subprocess def _python_cmd(*args): args = (sys.executable,) + args return subprocess.call(args) == 0 except ImportError: # will be used for python 2.3 def _python_cmd(*args): args = (sys.executable,) + args # quoting arguments if windows if sys.platform == 'win32': def quote(arg): if ' ' in arg: return '"%s"' % arg return arg args = [quote(arg) for arg in args] return os.spawnl(os.P_WAIT, sys.executable, *args) == 0 DEFAULT_VERSION = "0.6.49" DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/" SETUPTOOLS_FAKED_VERSION = "0.6c11" SETUPTOOLS_PKG_INFO = """\ Metadata-Version: 1.0 Name: setuptools Version: %s Summary: xxxx Home-page: xxx Author: xxx Author-email: xxx License: xxx Description: xxx """ % SETUPTOOLS_FAKED_VERSION def _install(tarball, install_args=()): # extracting the tarball tmpdir = tempfile.mkdtemp() log.warn('Extracting in %s', tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) tar = tarfile.open(tarball) _extractall(tar) tar.close() # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) log.warn('Now working in %s', subdir) # installing log.warn('Installing Distribute') if not _python_cmd('setup.py', 'install', *install_args): log.warn('Something went wrong during the installation.') log.warn('See the error message above.') # exitcode will be 2 return 2 finally: os.chdir(old_wd) shutil.rmtree(tmpdir) def _build_egg(egg, tarball, to_dir): # extracting the tarball tmpdir = tempfile.mkdtemp() log.warn('Extracting in %s', tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) tar = tarfile.open(tarball) _extractall(tar) tar.close() # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) log.warn('Now working in %s', subdir) # building an egg log.warn('Building a Distribute egg in %s', to_dir) _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) finally: os.chdir(old_wd) shutil.rmtree(tmpdir) # returning the result log.warn(egg) if not os.path.exists(egg): raise IOError('Could not build the egg.') def _do_download(version, download_base, to_dir, download_delay): egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg' % (version, sys.version_info[0], sys.version_info[1])) if not os.path.exists(egg): tarball = download_setuptools(version, download_base, to_dir, download_delay) _build_egg(egg, tarball, to_dir) sys.path.insert(0, egg) import setuptools setuptools.bootstrap_install_from = egg def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, download_delay=15, no_fake=True): # making sure we use the absolute path to_dir = os.path.abspath(to_dir) was_imported = 'pkg_resources' in sys.modules or \ 'setuptools' in sys.modules try: try: import pkg_resources # Setuptools 0.7b and later is a suitable (and preferable) # substitute for any Distribute version. try: pkg_resources.require("setuptools>=0.7b") return except (pkg_resources.DistributionNotFound, pkg_resources.VersionConflict): pass if not hasattr(pkg_resources, '_distribute'): if not no_fake: _fake_setuptools() raise ImportError except ImportError: return _do_download(version, download_base, to_dir, download_delay) try: pkg_resources.require("distribute>=" + version) return except pkg_resources.VersionConflict: e = sys.exc_info()[1] if was_imported: sys.stderr.write( "The required version of distribute (>=%s) is not available,\n" "and can't be installed while this script is running. Please\n" "install a more recent version first, using\n" "'easy_install -U distribute'." "\n\n(Currently using %r)\n" % (version, e.args[0])) sys.exit(2) else: del pkg_resources, sys.modules['pkg_resources'] # reload ok return _do_download(version, download_base, to_dir, download_delay) except pkg_resources.DistributionNotFound: return _do_download(version, download_base, to_dir, download_delay) finally: if not no_fake: _create_fake_setuptools_pkg_info(to_dir) def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay=15): """Download distribute from a specified location and return its filename `version` should be a valid distribute version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where the egg will be downloaded. `delay` is the number of seconds to pause before an actual download attempt. """ # making sure we use the absolute path to_dir = os.path.abspath(to_dir) try: from urllib.request import urlopen except ImportError: from urllib2 import urlopen tgz_name = "distribute-%s.tar.gz" % version url = download_base + tgz_name saveto = os.path.join(to_dir, tgz_name) src = dst = None if not os.path.exists(saveto): # Avoid repeated downloads try: log.warn("Downloading %s", url) src = urlopen(url) # Read/write all in one block, so we don't create a corrupt file # if the download is interrupted. data = src.read() dst = open(saveto, "wb") dst.write(data) finally: if src: src.close() if dst: dst.close() return os.path.realpath(saveto) def _no_sandbox(function): def __no_sandbox(*args, **kw): try: from setuptools.sandbox import DirectorySandbox if not hasattr(DirectorySandbox, '_old'): def violation(*args): pass DirectorySandbox._old = DirectorySandbox._violation DirectorySandbox._violation = violation patched = True else: patched = False except ImportError: patched = False try: return function(*args, **kw) finally: if patched: DirectorySandbox._violation = DirectorySandbox._old del DirectorySandbox._old return __no_sandbox def _patch_file(path, content): """Will backup the file then patch it""" f = open(path) existing_content = f.read() f.close() if existing_content == content: # already patched log.warn('Already patched.') return False log.warn('Patching...') _rename_path(path) f = open(path, 'w') try: f.write(content) finally: f.close() return True _patch_file = _no_sandbox(_patch_file) def _same_content(path, content): f = open(path) existing_content = f.read() f.close() return existing_content == content def _rename_path(path): new_name = path + '.OLD.%s' % time.time() log.warn('Renaming %s to %s', path, new_name) os.rename(path, new_name) return new_name def _remove_flat_installation(placeholder): if not os.path.isdir(placeholder): log.warn('Unkown installation at %s', placeholder) return False found = False for file in os.listdir(placeholder): if fnmatch.fnmatch(file, 'setuptools*.egg-info'): found = True break if not found: log.warn('Could not locate setuptools*.egg-info') return log.warn('Moving elements out of the way...') pkg_info = os.path.join(placeholder, file) if os.path.isdir(pkg_info): patched = _patch_egg_dir(pkg_info) else: patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO) if not patched: log.warn('%s already patched.', pkg_info) return False # now let's move the files out of the way for element in ('setuptools', 'pkg_resources.py', 'site.py'): element = os.path.join(placeholder, element) if os.path.exists(element): _rename_path(element) else: log.warn('Could not find the %s element of the ' 'Setuptools distribution', element) return True _remove_flat_installation = _no_sandbox(_remove_flat_installation) def _after_install(dist): log.warn('After install bootstrap.') placeholder = dist.get_command_obj('install').install_purelib _create_fake_setuptools_pkg_info(placeholder) def _create_fake_setuptools_pkg_info(placeholder): if not placeholder or not os.path.exists(placeholder): log.warn('Could not find the install location') return pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1]) setuptools_file = 'setuptools-%s-py%s.egg-info' % \ (SETUPTOOLS_FAKED_VERSION, pyver) pkg_info = os.path.join(placeholder, setuptools_file) if os.path.exists(pkg_info): log.warn('%s already exists', pkg_info) return log.warn('Creating %s', pkg_info) try: f = open(pkg_info, 'w') except EnvironmentError: log.warn("Don't have permissions to write %s, skipping", pkg_info) return try: f.write(SETUPTOOLS_PKG_INFO) finally: f.close() pth_file = os.path.join(placeholder, 'setuptools.pth') log.warn('Creating %s', pth_file) f = open(pth_file, 'w') try: f.write(os.path.join(os.curdir, setuptools_file)) finally: f.close() _create_fake_setuptools_pkg_info = _no_sandbox( _create_fake_setuptools_pkg_info ) def _patch_egg_dir(path): # let's check if it's already patched pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') if os.path.exists(pkg_info): if _same_content(pkg_info, SETUPTOOLS_PKG_INFO): log.warn('%s already patched.', pkg_info) return False _rename_path(path) os.mkdir(path) os.mkdir(os.path.join(path, 'EGG-INFO')) pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') f = open(pkg_info, 'w') try: f.write(SETUPTOOLS_PKG_INFO) finally: f.close() return True _patch_egg_dir = _no_sandbox(_patch_egg_dir) def _before_install(): log.warn('Before install bootstrap.') _fake_setuptools() def _under_prefix(location): if 'install' not in sys.argv: return True args = sys.argv[sys.argv.index('install') + 1:] for index, arg in enumerate(args): for option in ('--root', '--prefix'): if arg.startswith('%s=' % option): top_dir = arg.split('root=')[-1] return location.startswith(top_dir) elif arg == option: if len(args) > index: top_dir = args[index + 1] return location.startswith(top_dir) if arg == '--user' and USER_SITE is not None: return location.startswith(USER_SITE) return True def _fake_setuptools(): log.warn('Scanning installed packages') try: import pkg_resources except ImportError: # we're cool log.warn('Setuptools or Distribute does not seem to be installed.') return ws = pkg_resources.working_set try: setuptools_dist = ws.find( pkg_resources.Requirement.parse('setuptools', replacement=False) ) except TypeError: # old distribute API setuptools_dist = ws.find( pkg_resources.Requirement.parse('setuptools') ) if setuptools_dist is None: log.warn('No setuptools distribution found') return # detecting if it was already faked setuptools_location = setuptools_dist.location log.warn('Setuptools installation detected at %s', setuptools_location) # if --root or --preix was provided, and if # setuptools is not located in them, we don't patch it if not _under_prefix(setuptools_location): log.warn('Not patching, --root or --prefix is installing Distribute' ' in another location') return # let's see if its an egg if not setuptools_location.endswith('.egg'): log.warn('Non-egg installation') res = _remove_flat_installation(setuptools_location) if not res: return else: log.warn('Egg installation') pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO') if (os.path.exists(pkg_info) and _same_content(pkg_info, SETUPTOOLS_PKG_INFO)): log.warn('Already patched.') return log.warn('Patching...') # let's create a fake egg replacing setuptools one res = _patch_egg_dir(setuptools_location) if not res: return log.warn('Patching complete.') _relaunch() def _relaunch(): log.warn('Relaunching...') # we have to relaunch the process # pip marker to avoid a relaunch bug _cmd1 = ['-c', 'install', '--single-version-externally-managed'] _cmd2 = ['-c', 'install', '--record'] if sys.argv[:3] == _cmd1 or sys.argv[:3] == _cmd2: sys.argv[0] = 'setup.py' args = [sys.executable] + sys.argv sys.exit(subprocess.call(args)) def _extractall(self, path=".", members=None): """Extract all members from the archive to the current working directory and set owner, modification time and permissions on directories afterwards. `path' specifies a different directory to extract to. `members' is optional and must be a subset of the list returned by getmembers(). """ import copy import operator from tarfile import ExtractError directories = [] if members is None: members = self for tarinfo in members: if tarinfo.isdir(): # Extract directories with a safe mode. directories.append(tarinfo) tarinfo = copy.copy(tarinfo) tarinfo.mode = 448 # decimal for oct 0700 self.extract(tarinfo, path) # Reverse sort directories. if sys.version_info < (2, 4): def sorter(dir1, dir2): return cmp(dir1.name, dir2.name) directories.sort(sorter) directories.reverse() else: directories.sort(key=operator.attrgetter('name'), reverse=True) # Set correct owner, mtime and filemode on directories. for tarinfo in directories: dirpath = os.path.join(path, tarinfo.name) try: self.chown(tarinfo, dirpath) self.utime(tarinfo, dirpath) self.chmod(tarinfo, dirpath) except ExtractError: e = sys.exc_info()[1] if self.errorlevel > 1: raise else: self._dbg(1, "tarfile: %s" % e) def _build_install_args(options): """ Build the arguments to 'python setup.py install' on the distribute package """ install_args = [] if options.user_install: if sys.version_info < (2, 6): log.warn("--user requires Python 2.6 or later") raise SystemExit(1) install_args.append('--user') return install_args def _parse_args(): """ Parse the command line for options """ parser = optparse.OptionParser() parser.add_option( '--user', dest='user_install', action='store_true', default=False, help='install in user site package (requires Python 2.6 or later)') parser.add_option( '--download-base', dest='download_base', metavar="URL", default=DEFAULT_URL, help='alternative URL from where to download the distribute package') options, args = parser.parse_args() # positional arguments are ignored return options def main(version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" options = _parse_args() tarball = download_setuptools(download_base=options.download_base) return _install(tarball, _build_install_args(options)) if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000003400000000000011452 xustar000000000000000028 mtime=1637248151.3726625 bcbio-gff-0.6.9/setup.cfg0000664002421100242110000000004600000000000016526 0ustar00bchapmanbchapman00000000000000[egg_info] tag_build = tag_date = 0 ././@PaxHeader0000000000000000000000000000002600000000000011453 xustar000000000000000022 mtime=1633480952.0 bcbio-gff-0.6.9/setup.py0000664002421100242110000000134300000000000016420 0ustar00bchapmanbchapman00000000000000#!/usr/bin/env python """Python setup file for Blue Collar Bioinformatics scripts and modules. """ from distribute_setup import use_setuptools use_setuptools() from setuptools import setup, find_packages __version__ = "Undefined" for line in open('BCBio/GFF/__init__.py'): if (line.startswith('__version__')): exec(line.strip()) setup(name="bcbio-gff", version=__version__, author="Brad Chapman", author_email="chapmanb@50mail.com", license="Biopython License", description="Read and write Generic Feature Format (GFF) with Biopython integration.", url="https://github.com/chapmanb/bcbb/tree/master/gff", packages=find_packages(), install_requires=["six", "biopython"] )