cgecore-1.5.2/
cgecore-1.5.2/PKG-INFO
Metadata-Version: 1.0
Name: cgecore
Version: 1.5.2
Summary: Center for Genomic Epidemiology Core Module
Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
Author: Center for Genomic Epidemiology
Author-email: cgehelp@cbs.dtu.dk
License: Apache License, Version 2.0
Description: # cge_core_module
        Core module for the Center for Genomic Epidemiology

        This module contains classes and functions needed to run the service
        wrappers and pipeline scripts.

        The pypi project can be found here: https://pypi.org/project/cgecore/

        # How to update:
        1. Make changes to the modules
        2. Bump the version number accordingly in cgecore/__init__.py
        3. Install package locally
        4. Test the changes locally (for both python2 and python3)
        5. Distribute to PyPI

        # Install package locally
        python2 setup.py install
        python3 setup.py install

        # Distribute to PyPI
        python3 setup.py sdist bdist_wheel
        twine upload dist/*

        *deprecated:* ~~python setup.py sdist upload -r pypi~~
Platform: UNKNOWN
cgecore-1.5.2/README.md
# cge_core_module
Core module for the Center for Genomic Epidemiology

This module contains classes and functions needed to run the service wrappers
and pipeline scripts.

The pypi project can be found here: https://pypi.org/project/cgecore/

# How to update:
1. Make changes to the modules
2. Bump the version number accordingly in cgecore/__init__.py
3. Install package locally
4. Test the changes locally (for both python2 and python3)
5. Distribute to PyPI

# Install package locally
python2 setup.py install
python3 setup.py install

# Distribute to PyPI
python3 setup.py sdist bdist_wheel
twine upload dist/*

*deprecated:* ~~python setup.py sdist upload -r pypi~~
cgecore-1.5.2/cgecore/
cgecore-1.5.2/cgecore/__init__.py
#!/usr/bin/env python
"""
The CGE functions module
"""
from .utility import (adv_dict, copy_dir, copy_file, create_zip_dir, debug,
                      open_, file_unzipper, file_zipper, mkpath, move_file,
                      seqs_from_file, Reg, REGroup, sort2groups, load_json,
                      sort_and_distribute
                      )
from .cmdline import Program, proglist, cmd2list
from .argumentparsing import (check_file_type, get_arguments, get_string,
                              make_file_list
                              )

#####################
__version__ = "1.5.2"
__all__ = [
    "argumentparsing",
    "cmdline",
    "utility"
]

# Initiate Shared Objects
# debug = Debug()
# proglist = programlist_obj()
cgecore-1.5.2/cgecore/alignment.py
#!/usr/bin/env python3
"""
This module contains core functions and classes related to alignment.
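The main entry points are extended_cigar() and cigar2query(), which convert
between pairwise alignments and the extended cigar format, and Blaster(),
which runs BLAST (blastn by default) against a set of databases and parses
the resulting hits.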
""" ################################################################################ # CGE ALIGNMENT MODULE # ################################################################################ import os, subprocess, collections from Bio.Blast import NCBIXML from Bio import SeqIO # Python2 / Python3 specifik imports try: from string import maketrans except: maketrans = str.maketrans def extended_cigar(aligned_template, aligned_query): ''' Convert mutation annotations to extended cigar format https://github.com/lh3/minimap2#the-cs-optional-tag USAGE: >>> template = 'CGATCGATAAATAGAGTAG---GAATAGCA' >>> query = 'CGATCG---AATAGAGTAGGTCGAATtGCA' >>> extended_cigar(template, query) == ':6-ata:10+gtc:4*at:3' True ''' # - Go through each position in the alignment insertion = [] deletion = [] matches = [] cigar = [] for r_aa, q_aa in zip(aligned_template.lower(), aligned_query.lower()): gap_ref = r_aa == '-' gap_que = q_aa == '-' match = r_aa == q_aa if matches and not match: # End match block cigar.append(":%s"%len(matches)) matches = [] if insertion and not gap_ref: # End insertion cigar.append("+%s"%''.join(insertion)) insertion = [] elif deletion and not gap_que: # End deletion cigar.append("-%s"%''.join(deletion)) deletion = [] if gap_ref: if insertion: # Extend insertion insertion.append(q_aa) else: # Start insertion insertion = [q_aa] elif gap_que: if deletion: # Extend deletion deletion.append(r_aa) else: # Start deletion deletion = [r_aa] elif match: if matches: # Extend match block matches.append(r_aa) else: # Start match block matches = [r_aa] else: # Add SNP annotation cigar.append("*%s%s"%(r_aa, q_aa)) if matches: cigar.append(":%s"%len(matches)) del matches if insertion: # End insertion cigar.append("+%s"%''.join(insertion)) del insertion elif deletion: # End deletion cigar.append("-%s"%''.join(deletion)) del deletion return ''.join(cigar) def cigar2query(template, cigar): ''' Generate query sequence from the template and extended cigar annotation USAGE: >>> template = 'CGATCGATAAATAGAGTAGGAATAGCA' >>> cigar = ':6-ata:10+gtc:4*at:3' >>> cigar2query(template, cigar) == 'CGATCGAATAGAGTAGGTCGAATtGCA'.upper() True ''' query = [] entries = ['+','-','*',':'] number = list(map(str,range(10))) cigar_length = len(cigar) num = [] entry = None pos = 0 i = 0 while i < cigar_length: if cigar[i] in entries: # New entry if entry == ':': old_pos = pos pos += int(''.join(num)) query.append(template[old_pos:pos]) num = [] entry = cigar[i] if entry == '*': i += 2 query.append(cigar[i]) pos += 1 elif cigar[i] in number: num.append(cigar[i]) elif entry == '-': pos += 1 elif entry == '+': query.append(cigar[i]) i += 1 if entry == ':': old_pos = pos pos += int(''.join(num)) query.append(template[old_pos:pos]) return ''.join(query).upper() def Blaster(inputfile, databases, db_path, out_path='.', min_cov=0.6, threshold=0.9, blast='blastn', cut_off=True): ''' BLAST wrapper method, that takes a simple input and produces a overview list of the hits to templates, and their alignments Usage >>> import os, subprocess, collections >>> from Bio.Blast import NCBIXML >>> from Bio import SeqIO >>> from string import maketrans >>> inputfile = 'test.fsa' >>> databases = ['enterobacteriaceae'] >>> db_path = '/path/to/databases/plasmidfinder/' >>> Blaster(inputfile, databases, db_path) ''' min_cov = 100 * float(min_cov) threshold = 100 * float(threshold) # For alignment gene_align_query = dict() #will contain the sequence alignment lines gene_align_homo = dict() #will contain the sequence alignment homolog string 
gene_align_sbjct = dict() #will contain the sequence alignment allele string results = dict() #will contain the results for db in databases: # Adding the path to the database and output db_file = "%s/%s.fsa"%(db_path, db) os.system("mkdir -p %s/tmp"%(out_path)) os.system("chmod 775 %s/tmp"%(out_path)) out_file = "%s/tmp/out_%s.xml"%(out_path, db) # Running blast cmd = "%s -subject %s -query %s -out %s -outfmt '5' -perc_identity %s -dust 'no'"%(blast, db_file, inputfile, out_file, threshold) process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() # Getting the results result_handle = open(out_file) blast_records = NCBIXML.parse(result_handle) # Declaring variables for saving the results gene_results = dict() #will contain the results for each gene # For finding the best hits best_hsp = dict() # Keeping track of gene split gene_split = collections.defaultdict(dict) # Making the dicts for sequence outputs gene_align_query[db] = dict() gene_align_homo[db] = dict() gene_align_sbjct[db] = dict() # Parsing over the hits and only keeping the best for blast_record in blast_records: query = blast_record.query blast_record.alignments.sort(key = lambda align: -max((len(hsp.query) * (int(hsp.identities)/float(len(hsp.query))) for hsp in align.hsps))) for alignment in blast_record.alignments: # Setting the e-value as 1 and bit as 0 to get the best HSP fragment best_e_value = 1 best_bit = 0 for hsp in alignment.hsps: if hsp.expect < best_e_value or hsp.bits > best_bit: best_e_value = hsp.expect best_bit = hsp.bits tmp = alignment.title.split(" ") sbjct_header = tmp[1] bit = hsp.bits sbjct_length = alignment.length sbjct_start = hsp.sbjct_start sbjct_end = hsp.sbjct_end gaps = hsp.gaps query_string = str(hsp.query) homo_string = str(hsp.match) sbjct_string = str(hsp.sbjct) contig_name = query.replace(">","") query_start = hsp.query_start query_end = hsp.query_end HSP_length = len(query_string) perc_ident = int(hsp.identities)/float(HSP_length) * 100 strand = 0 coverage = ((int(HSP_length) - int(gaps))/float(sbjct_length)) perc_coverage = ((int(HSP_length) - int(gaps))/float(sbjct_length)) * 100 if int(HSP_length) == int(sbjct_length): cal_score = perc_ident * coverage * 100 else: cal_score = perc_ident * coverage hit_id = "%s:%s..%s:%s:%f"%(contig_name, query_start, query_end, sbjct_header, cal_score) # If the hit is on the other strand if sbjct_start > sbjct_end: tmp = sbjct_start sbjct_start = sbjct_end sbjct_end = tmp query_string = reverse_complement(query_string) homo_string = homo_string[::-1] sbjct_string = reverse_complement(sbjct_string) strand = 1 if cut_off == True: if perc_coverage > 20 : best_hsp = {'evalue': hsp.expect, 'sbjct_header': sbjct_header, 'bit': bit, 'perc_ident': perc_ident, 'sbjct_length':sbjct_length, 'sbjct_start': sbjct_start, 'sbjct_end': sbjct_end, 'gaps': gaps, 'query_string': query_string, 'homo_string': homo_string, 'sbjct_string': sbjct_string, 'contig_name': contig_name, 'query_start': query_start, 'query_end': query_end, 'HSP_length': HSP_length, 'coverage': coverage, 'cal_score': cal_score, 'hit_id': hit_id, 'strand': strand, 'perc_coverage': perc_coverage } else: best_hsp = {'evalue': hsp.expect, 'sbjct_header': sbjct_header, 'bit': bit, 'perc_ident': perc_ident, 'sbjct_length':sbjct_length, 'sbjct_start': sbjct_start, 'sbjct_end': sbjct_end, 'gaps': gaps, 'query_string': query_string, 'homo_string': homo_string, 'sbjct_string': sbjct_string, 'contig_name': contig_name, 'query_start': query_start, 
'query_end': query_end, 'HSP_length': HSP_length, 'coverage': coverage, 'cal_score': cal_score, 'hit_id': hit_id, 'strand': strand, 'perc_coverage': perc_coverage } # Saving the result if any if best_hsp: save = 1 # If there are other gene alignments they are compared if gene_results: tmp_gene_split = gene_split tmp_results = gene_results # Compare the hit results save, gene_split, gene_results = compare_results(save, best_hsp, tmp_results, tmp_gene_split) # If the hit is not overlapping with other hit seqeunces it is kept if save == 1: gene_results[hit_id] = best_hsp else: pass # If the hit does not cover the entire database reference the missing seqence data are extracted for hit_id in list(gene_results): hit = gene_results[hit_id] # Calculate possible split gene coverage perc_coverage = hit['perc_coverage'] if hit['sbjct_header'] in gene_split and len(gene_split[hit['sbjct_header']]) > 1: # Calculate new length new_length = calculate_new_length(gene_split, gene_results, hit) hit['split_length'] = new_length # Calculate new coverage perc_coverage = new_length / float(hit['sbjct_length']) * 100 # If the hit is above the minimum length threshold it is kept if perc_coverage >= min_cov: if hit['coverage'] == 1: gene_align_query[db][hit_id] = hit['query_string'] gene_align_homo[db][hit_id] = hit['homo_string'] gene_align_sbjct[db][hit_id] = hit['sbjct_string'] elif hit['coverage'] != 1: # Getting the whole database sequence for seq_record in SeqIO.parse(db_file, "fasta"): if seq_record.description == hit['sbjct_header']: gene_align_sbjct[db][hit_id] = str(seq_record.seq) break # Getting the whole contig to extract extra query seqeunce contig = '' for seq_record in SeqIO.parse(inputfile, "fasta"): if seq_record.description == hit['contig_name']: contig = str(seq_record.seq) break # Extract extra sequence from query query_seq, homo_seq = get_query_align(hit, contig) # Saving the new alignment sequences gene_align_query[db][hit_id] = query_seq gene_align_homo[db][hit_id] = homo_seq else: del gene_results[hit_id] if hit['sbjct_header'] in gene_split: del gene_split[hit['sbjct_header']] # Save the database result if gene_results: results[db] = gene_results else: results[db] = "No hit found" return (results, gene_align_query, gene_align_homo, gene_align_sbjct) trans = maketrans("AGCT","TCGA") def reverse_complement(seq): ''' Make reverse complement strand ''' return seq.translate(trans)[::-1] def compare_results(save, best_hsp, tmp_results, tmp_gene_split): ''' Function for comparing hits and saving only the best hit ''' # Get data for comparison hit_id = best_hsp['hit_id'] new_start_query = best_hsp['query_start'] new_end_query = best_hsp['query_end'] new_start_sbjct = int(best_hsp['sbjct_start']) new_end_sbjct = int(best_hsp['sbjct_end']) new_score = best_hsp['cal_score'] new_db_hit = best_hsp['sbjct_header'] new_contig = best_hsp['contig_name'] new_HSP = best_hsp['HSP_length'] # See if the best HSP fragment overlap with another allignment and keep the # allignment with the highest score - if the new fragment is not providing new seqeunce for hit in list(tmp_results): hit_data = tmp_results[hit] old_start_query = hit_data['query_start'] old_end_query = hit_data['query_end'] old_start_sbjct = int(hit_data['sbjct_start']) old_end_sbjct = int(hit_data['sbjct_end']) old_score = hit_data['cal_score'] old_db_hit = hit_data['sbjct_header'] old_contig = hit_data['contig_name'] old_HSP = hit_data['HSP_length'] remove_old = 0 # If they align to the same gene in the database they are compared if new_db_hit 
== old_db_hit: # If the hit provids additional sequence it is kept and the new coverage is saved # otherwise the one with the highest score is kept if new_start_sbjct < (old_start_sbjct) or new_end_sbjct > (old_end_sbjct): # Save the hits as splitted tmp_gene_split[old_db_hit][hit_id] = 1 if not hit in tmp_gene_split[old_db_hit]: tmp_gene_split[old_db_hit][hit] = 1 else: if new_score > old_score: # Set to remove old hit remove_old = 1 # Save a split if the new hit still creats one if new_db_hit in tmp_gene_split and not hit_id in tmp_gene_split[new_db_hit]: tmp_gene_split[new_db_hit][hit_id] = 1 else: save = 0 # If the old and new hit is not identical the possible saved gene split for the new hit is removed if hit_id != hit: if new_db_hit in tmp_gene_split and hit_id in tmp_gene_split[new_db_hit]: del tmp_gene_split[new_db_hit][hit_id] break # If the hits comes form the same part of the contig sequnce but match different genes only the best hit is kept if new_contig == old_contig: # if the two hits cover the exact same place on the contig only # the percentage of identity is compared if old_start_query == new_start_query and old_end_query == new_end_query: if best_hsp['perc_ident'] > hit_data['perc_ident']: # Set to remove old hit remove_old = 1 # Save a split if the new hit still creats one if new_db_hit in tmp_gene_split and not hit_id in tmp_gene_split[new_db_hit]: tmp_gene_split[new_db_hit][hit_id] = 1 elif best_hsp['perc_ident'] == hit_data['perc_ident']: # Save both # Save a split if the new hit still creats one if new_db_hit in tmp_gene_split and not hit_id in tmp_gene_split[new_db_hit]: tmp_gene_split[new_db_hit][hit_id] = 1 else: save = 0 # Remove new gene from gene split if present if new_db_hit in tmp_gene_split and hit_id in tmp_gene_split[new_db_hit]: del tmp_gene_split[new_db_hit][hit_id] break elif (max(old_end_query, new_end_query) - min(old_start_query, new_start_query)) <= ((old_end_query - old_start_query) + (new_end_query - new_start_query)): if new_score > old_score: # Set to remove old gene remove_old = 1 # Save a split if the new hit still creats one if new_db_hit in tmp_gene_split and not hit_id in tmp_gene_split[new_db_hit]: tmp_gene_split[new_db_hit][hit_id] = 1 elif new_score == old_score: # If both genes are completly covered the longest hit is chosen if int(best_hsp['perc_coverage']) == 100 and int(hit_data['perc_coverage']) == 100 and new_HSP > old_HSP: # Set to remove old gene remove_old = 1 # Save a split if the new hit creats one - both hits are saved if new_db_hit in tmp_gene_split and not hit_id in tmp_gene_split[new_db_hit]: tmp_gene_split[new_db_hit][hit_id] = 1 else: # Remove new gene from gene split if present if new_db_hit in tmp_gene_split and hit_id in tmp_gene_split[new_db_hit]: del tmp_gene_split[new_db_hit][hit_id] save = 0 break # Remove old hit if new hit is better if remove_old == 1: del tmp_results[hit] # Remove gene from gene split if present if old_db_hit in tmp_gene_split and hit in tmp_gene_split[old_db_hit]: del tmp_gene_split[old_db_hit][hit] return save, tmp_gene_split, tmp_results def calculate_new_length(gene_split, gene_results, hit): ''' Function for calcualting new length if the gene is split on several contigs ''' # Looping over splitted hits and calculate new length first = 1 for split in gene_split[hit['sbjct_header']]: new_start = int(gene_results[split]['sbjct_start']) new_end = int(gene_results[split]['sbjct_end']) # Get the frist HSP if first == 1: new_length = int(gene_results[split]['HSP_length']) old_start = new_start 
old_end = new_end first = 0 continue if new_start < old_start: new_length = new_length + (old_start - new_start) old_start = new_start if new_end > old_end: new_length = new_length + (new_end - old_end) old_end = new_end return(new_length) def get_query_align(hit, contig): ''' Function for extracting extra seqeunce data to the query alignment if the full reference length are not covered ''' # Getting data needed to extract sequences query_seq = hit['query_string'] homo_seq = hit['homo_string'] sbjct_start = int(hit['sbjct_start']) sbjct_end = int(hit['sbjct_end']) query_start = int(hit['query_start']) query_end = int(hit['query_end']) length = int(hit['sbjct_length']) # If the alignment doesn't start at the first position data is added to the begnning if sbjct_start!= 1: missing = sbjct_start - 1 if query_start >= missing and hit['strand'] != 1 or hit['strand'] == 1 and missing <= (len(contig) - query_end): # Getting the query sequence # If the the hit is on the other strand the characters are reversed if hit['strand'] == 1: start_pos = query_end end_pos = query_end + missing chars = contig[start_pos:end_pos] chars = reverse_complement(chars) else: start_pos = query_start - missing - 1 end_pos = query_start - 1 chars = contig[start_pos:end_pos] query_seq = chars + str(query_seq) else: # Getting the query sequence # If the the hit is on the other strand the characters are reversed if hit['strand'] == 1: if query_end == len(contig): query_seq = "-" * missing + str(query_seq) else: start_pos = query_end chars = contig[start_pos:] chars = reverse_complement(chars) query_seq = "-" * (missing - len(chars)) + chars + str(query_seq) elif query_start < 3: query_seq = "-" * missing + str(query_seq) else: end_pos = query_start - 2 chars = contig[0:end_pos] query_seq = "-" * (missing - len(chars)) + chars + str(query_seq) # Adding to the homo sequence spaces = " " * missing homo_seq = str(spaces) + str(homo_seq) # If the alignment dosen't end and the last position data is added to the end if sbjct_end < length: missing = length - sbjct_end if missing <= (len(contig) - query_end) and hit['strand'] != 1 or hit['strand'] == 1 and query_start >= missing: # Getting the query sequence # If the the hit is on the other strand the characters are reversed if hit['strand'] == 1: start_pos = query_start - missing - 1 end_pos = query_start - 1 chars = contig[start_pos:end_pos] chars = reverse_complement(chars) else: start_pos = query_end end_pos = query_end + missing chars = contig[start_pos:end_pos] query_seq = query_seq + chars else: # If the hit is on the other strand the characters are reversed if hit['strand'] == 1: if query_start < 3: query_seq = query_seq + "-" * missing else: end_pos = query_start - 2 chars = contig[0:end_pos] chars = reverse_complement(chars) query_seq = query_seq + chars + "-" * (missing - len(chars)) elif query_end == len(contig): query_seq = query_seq + "-" * missing else: start_pos = query_end chars = contig[start_pos:] query_seq = query_seq + chars + "-" * (missing - len(chars)) # Adding to the homo sequence spaces = " " * int(missing) homo_seq = str(homo_seq) + str(spaces) return query_seq, homo_seq cgecore-1.5.2/cgecore/argumentparsing.py0000644000077000000240000000762213425017076020412 0ustar rolfstaff00000000000000#!/usr/bin/env python3 """ THIS MODULE CONTAINS ALL THE SHARED WRAPPER FUNCTIONS """ ################################################################################ # CGE FUNCTION MODULE # ################################################################################ 
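# A minimal usage sketch (hypothetical option specs; each entry is
# [flag, dest, default, help], matching what get_arguments below consumes):
#
#     options = [
#         ['--input', 'input', None, 'Path to the input file'],
#         ['--quiet', 'quiet', False, 'Suppress verbose output'],
#     ]
#     args = get_arguments(options)
#     input_string = get_string(args.input)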
# This script is part of the CGE Pipeline structure
import os, sys
from subprocess import Popen, PIPE
from argparse import ArgumentParser, RawDescriptionHelpFormatter, SUPPRESS

# CGE modules
from .utility import debug, open_

def get_string(string):
    """ This function checks whether the given string is a path to an
    existing file, and if so reads the file and returns its content as a
    single string.
    """
    truestring = string
    if string is not None:
        if '/' in string:
            if os.path.isfile(string):
                try:
                    with open_(string, 'r') as f:
                        truestring = ' '.join(line.strip() for line in f)
                except:
                    pass
        if truestring.strip() == '':
            truestring = None
    return truestring

def get_arguments(options):
    """ This function handles and validates the wrapper arguments. """
    # The next couple of lines define the header of the help output
    parser = ArgumentParser(
        formatter_class=RawDescriptionHelpFormatter,
        usage=("""%(prog)s
--------------------------------------------------------------------------------
"""),
        description=("""
Service Wrapper
===============
This is the service wrapper script, which is a part of the CGE services.
Read the online manual for help.
A list of all published services can be found at: cge.cbs.dtu.dk/services
"""),
        epilog=("""
--------------------------------------------------------------------------------
"""))
    # ADDING ARGUMENTS
    setarg = parser.add_argument
    # SERVICE SPECIFIC ARGUMENTS
    if isinstance(options, str):
        options = [[x for i, x in enumerate(line.split()) if i in [1, 2]]
                   for line in options.split('\n') if len(line) > 0]
        for o in options:
            try:
                setarg(o[1], type=str, dest=o[0], default=None, help=SUPPRESS)
            except:
                None
    else:
        for o in options:
            if o[2] is True:
                # Handle negative flags
                setarg(o[0], action="store_false", dest=o[1], default=o[2],
                       help=o[3])
            elif o[2] is False:
                # Handle positive flags
                setarg(o[0], action="store_true", dest=o[1], default=o[2],
                       help=o[3])
            else:
                help_ = o[3] if o[2] is None else "%s [%s]" % (o[3], '%(default)s')
                setarg(o[0], type=str, dest=o[1], default=o[2], help=help_)
    # VALIDATION OF ARGUMENTS
    args = parser.parse_args()
    debug.log("ARGS: %s" % args)
    return args

def check_file_type(files):
    """ Check whether the input files are in fasta format, reads format or
    other/mix formats.
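    Returns one of the strings 'fasta', 'fastq', 'empty' or 'other'.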
""" all_are_fasta = True all_are_reads = True all_are_empty = True if sys.version_info < (3, 0): if isinstance(files, (str, unicode)): files = [files] else: if isinstance(files, str): files = [files] for file_ in files: debug.log('Checking file type: %s'%file_) # Check if file is empty if os.stat(file_).st_size == 0: continue else: all_are_empty = False with open_(file_) as f: fc = f.readline()[0] if fc != "@": all_are_reads = False if fc != ">": all_are_fasta = False if all_are_empty: return 'empty' elif all_are_fasta: return 'fasta' elif all_are_reads: return 'fastq' else: return 'other' def make_file_list(upload_path): """ This function returns list of files in the given dir """ newlist = [] for el in sorted(os.listdir(upload_path)): if ' ' in el: raise Exception('Error: Spaces are not allowed in file names!\n') newlist.append(os.path.normpath(upload_path+'/'+el)) debug.log('InputFiles: %s\n'%newlist) return newlist cgecore-1.5.2/cgecore/blaster/0000755000077000000240000000000013577151366016270 5ustar rolfstaff00000000000000cgecore-1.5.2/cgecore/blaster/__init__.py0000644000077000000240000000003513425017076020366 0ustar rolfstaff00000000000000from .blaster import Blaster cgecore-1.5.2/cgecore/blaster/blaster.py0000755000077000000240000006656213577150240020306 0ustar rolfstaff00000000000000#!/usr/bin/env python3 from __future__ import division import sys import os import time import random import re import subprocess from Bio.Blast import NCBIXML from Bio import SeqIO import collections class Blaster(): def __init__(self, inputfile, databases, db_path, out_path='', min_cov=0.6, threshold=0.9, blast='blastn', cut_off=True, max_target_seqs=50000, reuse_results=False, allowed_overlap=0): min_cov = 100 * float(min_cov) threshold = 100 * float(threshold) # For alignment data storage self.gene_align_query = dict() # Sequence alignment lines self.gene_align_homo = dict() # Sequence alignment homolog string self.gene_align_sbjct = dict() # Sequence alignment allele string self.results = dict() # Results # TODO: Add excluded results to this dictionay self.results["excluded"] = dict() for db in databases: # Adding the path to the database and output db_file = "%s/%s.fsa" % (db_path, db) tmp_out_path = "%s/tmp" % (out_path) out_file = "%s/out_%s.xml" % (tmp_out_path, db) os.makedirs(tmp_out_path, exist_ok=True) os.chmod(tmp_out_path, 0o775) # Running blast if (os.path.isfile(out_file) and os.access(out_file, os.R_OK) and reuse_results): print("Found " + out_file + " skipping DB.") out, err = (b'', b'') else: cmd = ("%s -subject %s -query %s -out %s -outfmt '5'" " -perc_identity %s -max_target_seqs %s" " -dust 'no'" % (blast, db_file, inputfile, out_file, threshold, max_target_seqs)) process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() # Get results file try: # Test if output file exist result_handle = open(out_file, "r") except IOError: sys.exit(("Error: BLAST did not run as expected. " "The expected output file, {}, did not exist.\n" "BLAST finished with the following response:" "\n{}\n{}").format(os.path.abspath(out_file), out.decode("utf-8"), err.decode("utf-8"))) # Test if blast output is empty if os.stat(out_file).st_size == 0: sys.exit(("Error: BLAST did not run as expected. 
" "The output file {} was empty.\n" "BLAST finished with the following response:" "\n{}\n{}").format(os.path.abspath(out_file), out.decode("utf-8"), err.decode("utf-8"))) # Get blast output blast_records = NCBIXML.parse(result_handle) # Declaring variables for saving the results # results for each gene gene_results = dict() # For finding the best hits best_hsp = dict() # Keeping track of gene split gene_split = collections.defaultdict(dict) # Making the dicts for sequence outputs self.gene_align_query[db] = dict() self.gene_align_homo[db] = dict() self.gene_align_sbjct[db] = dict() # Parsing over the hits and only keeping the best for blast_record in blast_records: # OLD CODE TO BE REMOVED # query = blast_record.query # blast_record.alignments.sort(key=lambda align: ( # -max((len(hsp.query) # * (int(hsp.identities) / float(len(hsp.query))) # for hsp in align.hsps))) # ) # Sort BLAST alignments by the hsp in each alignment with the # highest number of identical nucleotides (hsp.identities) blast_record.alignments.sort( key=lambda align: (max((int(hsp.identities) for hsp in align.hsps))), reverse=True) query = blast_record.query for alignment in blast_record.alignments: # Setting the e-value as 1 and bit as 0 to get the best # HSP fragment best_e_value = 1 best_bit = 0 start_hsp = 0 end_hsp = 0 for hsp in alignment.hsps: if hsp.expect < best_e_value or hsp.bits > best_bit: # best_e_value = hsp.expect # best_bit = hsp.bits tmp = alignment.title.split(" ") sbjct_header = tmp[1] # DEBUG print("Found: {}".format(sbjct_header)) bit = hsp.bits sbjct_length = alignment.length sbjct_start = hsp.sbjct_start sbjct_end = hsp.sbjct_end gaps = hsp.gaps query_string = str(hsp.query) homo_string = str(hsp.match) sbjct_string = str(hsp.sbjct) contig_name = query.replace(">", "") query_start = hsp.query_start query_end = hsp.query_end HSP_length = len(query_string) perc_ident = (int(hsp.identities) / float(HSP_length) * 100) strand = 0 coverage = ((int(HSP_length) - int(gaps)) / float(sbjct_length)) perc_coverage = (((int(HSP_length) - int(gaps)) / float(sbjct_length)) * 100) # cal_score is later used to select the best hit cal_score = perc_ident * coverage hit_id = "%s:%s..%s:%s:%f" % ( contig_name, query_start, query_end, sbjct_header, cal_score) # If the hit is on the other strand if sbjct_start > sbjct_end: tmp = sbjct_start sbjct_start = sbjct_end sbjct_end = tmp query_string = self.reversecomplement( query_string) homo_string = homo_string[::-1] sbjct_string = self.reversecomplement( sbjct_string) strand = 1 # Save hit if((cut_off and perc_coverage > 20) or cut_off is False): best_hsp = {'evalue': hsp.expect, 'sbjct_header': sbjct_header, 'bit': bit, 'perc_ident': perc_ident, 'sbjct_length': sbjct_length, 'sbjct_start': sbjct_start, 'sbjct_end': sbjct_end, 'gaps': gaps, 'query_string': query_string, 'homo_string': homo_string, 'sbjct_string': sbjct_string, 'contig_name': contig_name, 'query_start': query_start, 'query_end': query_end, 'HSP_length': HSP_length, 'coverage': coverage, 'cal_score': cal_score, 'hit_id': hit_id, 'strand': strand, 'perc_coverage': perc_coverage } # Saving the result if any if best_hsp: save = 1 # If there are other gene alignments they are compared if gene_results: tmp_gene_split = gene_split tmp_results = gene_results # Compare the hit results save, gene_split, gene_results = ( self.compare_results(save, best_hsp, tmp_results, tmp_gene_split, allowed_overlap) ) # If the hit is not overlapping with other hit # seqeunces it is kept if save == 1: # DEBUG print("Saving: 
{}".format(hit_id)) gene_results[hit_id] = best_hsp result_handle.close() # If the hit does not cover the entire database reference the # missing seqence data are extracted keys = list(gene_results.keys()) for hit_id in keys: hit = gene_results[hit_id] # Calculate possible split gene coverage perc_coverage = hit['perc_coverage'] if(hit['sbjct_header'] in gene_split and len(gene_split[hit['sbjct_header']]) > 1): # Calculate new length new_length = self.calculate_new_length(gene_split, gene_results, hit) hit['split_length'] = new_length # Calculate new coverage perc_coverage = new_length / float(hit['sbjct_length']) * 100 # If the hit is above the minimum length threshold it is kept if perc_coverage >= min_cov: if hit['coverage'] == 1: self.gene_align_query[db][hit_id] = hit['query_string'] self.gene_align_homo[db][hit_id] = hit['homo_string'] self.gene_align_sbjct[db][hit_id] = hit['sbjct_string'] elif hit['coverage'] != 1: # Getting the whole database sequence for seq_record in SeqIO.parse(db_file, "fasta"): if seq_record.description.replace(" ", "") == hit['sbjct_header'].replace(" ", ""): start_seq = str(seq_record.seq)[:int(hit["sbjct_start"])-1] end_seq = str(seq_record.seq)[int(hit["sbjct_end"]):] self.gene_align_sbjct[db][hit_id] = start_seq + hit['sbjct_string'] + end_seq #self.gene_align_sbjct[db][hit_id] = str(seq_record.seq) break # Getting the whole contig to extract extra query seqeunce contig = '' for seq_record in SeqIO.parse(inputfile, "fasta"): if seq_record.description.replace(" ", "") == hit['contig_name'].replace(" ", ""): contig = str(seq_record.seq) break # Extract extra sequence from query query_seq, homo_seq = self.get_query_align(hit, contig) # Saving the new alignment sequences self.gene_align_query[db][hit_id] = query_seq self.gene_align_homo[db][hit_id] = homo_seq else: del gene_results[hit_id] if hit['sbjct_header'] in gene_split: del gene_split[hit['sbjct_header']] # Save the database result if gene_results: self.results[db] = gene_results else: self.results[db] = "No hit found" @staticmethod def reversecomplement(seq): # Make reverse complement strand trans = str.maketrans("ATGC", "TACG") return seq.translate(trans)[::-1] @staticmethod def compare_results(save, best_hsp, tmp_results, tmp_gene_split, allowed_overlap): """ Function for comparing hits and saving only the best hit """ # Get data for comparison hit_id = best_hsp['hit_id'] new_start_query = best_hsp['query_start'] new_end_query = best_hsp['query_end'] new_start_sbjct = int(best_hsp['sbjct_start']) new_end_sbjct = int(best_hsp['sbjct_end']) new_score = best_hsp['cal_score'] new_db_hit = best_hsp['sbjct_header'] new_contig = best_hsp['contig_name'] new_HSP = best_hsp['HSP_length'] # See if the best HSP fragment overlap with another allignment # and keep the allignment with the highest score - if the new # fragment is not providing new sequence keys = list(tmp_results.keys()) for hit in keys: hit_data = tmp_results[hit] old_start_query = hit_data['query_start'] old_end_query = hit_data['query_end'] old_start_sbjct = int(hit_data['sbjct_start']) old_end_sbjct = int(hit_data['sbjct_end']) old_score = hit_data['cal_score'] old_db_hit = hit_data['sbjct_header'] old_contig = hit_data['contig_name'] old_HSP = hit_data['HSP_length'] remove_old = 0 # If they align to the same gene in the database they are # compared if new_db_hit == old_db_hit: #If the hit comes from different contig if old_contig != new_contig: # Save a split if the new hit still creats one if(new_db_hit in tmp_gene_split and hit_id not in 
tmp_gene_split[new_db_hit]): tmp_gene_split[new_db_hit][hit_id] = 1 # If the hit provides additional sequence it is kept and # the new coverage is saved otherwise the one with the # highest score is kept elif(new_start_sbjct < old_start_sbjct or new_end_sbjct > old_end_sbjct): # Save the hits as split tmp_gene_split[old_db_hit][hit_id] = 1 if hit not in tmp_gene_split[old_db_hit]: tmp_gene_split[old_db_hit][hit] = 1 # else: # if new_score > old_score: # Set to remove old hit # remove_old = 1 # Save a split if the new hit still creats one # if(new_db_hit in tmp_gene_split # and hit_id not in tmp_gene_split[new_db_hit]): # tmp_gene_split[new_db_hit][hit_id] = 1 # else: # save = 0 # If the old and new hit is not identical the # possible saved gene split for the new hit is # removed # if hit_id != hit: # if(new_db_hit in tmp_gene_split # and hit_id in tmp_gene_split[new_db_hit]): # del tmp_gene_split[new_db_hit][hit_id] # break # If the hits comes form the same part of the contig # sequnce but match different genes only the best hit is # kept if new_contig == old_contig: print("Same contig: {} == {}".format(new_contig, old_contig)) print("\t{} vs {}".format(new_db_hit, old_db_hit)) # Check if saved hits overlaps with current hit hit_union_length = (max(old_end_query, new_end_query) - min(old_start_query, new_start_query)) hit_lengths_sum = ((old_end_query - old_start_query) + (new_end_query - new_start_query)) overlap_len = (hit_lengths_sum - hit_union_length) if overlap_len < allowed_overlap: print("\tignore overlap ({}): {}".format(overlap_len, new_db_hit)) continue print("\toverlap found ({}): {}".format(overlap_len, new_db_hit)) # If the two hits cover the exact same place on the # contig only the percentage of identity is compared if(old_start_query == new_start_query and old_end_query == new_end_query): if best_hsp['perc_ident'] > hit_data['perc_ident']: # Set to remove old hit remove_old = 1 # Save a split if the new hit still creats one if(new_db_hit in tmp_gene_split and hit_id not in tmp_gene_split[new_db_hit]): tmp_gene_split[new_db_hit][hit_id] = 1 elif best_hsp['perc_ident'] == hit_data['perc_ident']: # Save both # Save a split if the new hit still creats one if(new_db_hit in tmp_gene_split and hit_id not in tmp_gene_split[new_db_hit]): tmp_gene_split[new_db_hit][hit_id] = 1 else: save = 0 # Remove new gene from gene split if present if(new_db_hit in tmp_gene_split and hit_id in tmp_gene_split[new_db_hit]): del tmp_gene_split[new_db_hit][hit_id] break # If new hit overlaps with the saved hit elif(hit_union_length <= hit_lengths_sum): print("\t{} <= {}".format(hit_union_length, hit_lengths_sum)) print("\t\tScores: {} cmp {}".format(new_score, old_score)) if new_score > old_score: # Set to remove old gene remove_old = 1 # Save a split if the new hit still creats one if(new_db_hit in tmp_gene_split and hit_id not in tmp_gene_split[new_db_hit]): tmp_gene_split[new_db_hit][hit_id] = 1 elif new_score == old_score: # If both genes are of same coverage # and identity is the same implied by new_score == old_score # hit is chosen based on length if((int(best_hsp['perc_coverage']) == int(hit_data['perc_coverage'])) and new_HSP > old_HSP): # Set to remove old gene remove_old = 1 elif((int(best_hsp['perc_coverage']) == int(hit_data['perc_coverage'])) and old_HSP > new_HSP): # Remove current hit save = 0 elif((int(best_hsp['perc_coverage']) == int(hit_data['perc_coverage'])) and old_HSP==new_HSP): # Both hits has same coverage, and same identity # and same length, how to choose only one hit? 
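                                # (No tie-breaker exists for this case yet:
                                #  neither save nor remove_old is changed, so
                                #  both hits are kept.)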
pass # TODO # If new_score == old_score but identity and coverages are not the same. # which gene should be chosen?? Now they are both keept. # Save a split if the new hit creats one - both # hits are saved if(new_db_hit in tmp_gene_split and hit_id not in tmp_gene_split[new_db_hit]): tmp_gene_split[new_db_hit][hit_id] = 1 else: # Remove new gene from gene split if present if(new_db_hit in tmp_gene_split and hit_id in tmp_gene_split[new_db_hit]): del tmp_gene_split[new_db_hit][hit_id] save = 0 break # Remove old hit if new hit is better if remove_old == 1: del tmp_results[hit] # Remove gene from gene split if present if(old_db_hit in tmp_gene_split and hit in tmp_gene_split[old_db_hit]): del tmp_gene_split[old_db_hit][hit] return save, tmp_gene_split, tmp_results @staticmethod def calculate_new_length(gene_split, gene_results, hit): """ Function for calcualting new length if the gene is split on several contigs """ # Looping over splitted hits and calculate new length first = 1 for split in gene_split[hit['sbjct_header']]: new_start = int(gene_results[split]['sbjct_start']) new_end = int(gene_results[split]['sbjct_end']) # Get the first HSP if first == 1: new_length = int(gene_results[split]['HSP_length']) old_start = new_start old_end = new_end first = 0 continue if new_start < old_start: new_length = new_length + (old_start - new_start) old_start = new_start if new_end > old_end: new_length = new_length + (new_end - old_end) old_end = new_end return(new_length) @staticmethod def get_query_align(hit, contig): """ Function for extracting extra seqeunce data to the query alignment if the full reference length are not covered """ # Getting data needed to extract sequences query_seq = hit['query_string'] homo_seq = hit['homo_string'] sbjct_start = int(hit['sbjct_start']) sbjct_end = int(hit['sbjct_end']) query_start = int(hit['query_start']) query_end = int(hit['query_end']) length = int(hit['sbjct_length']) # If the alignment doesn't start at the first position data is # added to the begnning if sbjct_start != 1: missing = sbjct_start - 1 if(query_start >= missing and hit['strand'] != 1 or hit['strand'] == 1 and missing <= (len(contig) - query_end)): # Getting the query sequence. # If the the hit is on the other strand the characters # are reversed. if hit['strand'] == 1: start_pos = query_end end_pos = query_end + missing chars = contig[start_pos:end_pos] chars = Blaster.reversecomplement(chars) else: start_pos = query_start - missing - 1 end_pos = query_start - 1 chars = contig[start_pos:end_pos] query_seq = chars + str(query_seq) else: # Getting the query sequence. # If the the hit is on the other strand the characters # are reversed. if hit['strand'] == 1: if query_end == len(contig): query_seq = "-" * missing + str(query_seq) else: start_pos = query_end chars = contig[start_pos:] chars = Blaster.reversecomplement(chars) query_seq = ("-" * (missing - len(chars)) + chars + str(query_seq)) elif query_start < 3: query_seq = "-" * missing + str(query_seq) else: end_pos = query_start - 2 chars = contig[0:end_pos] query_seq = ("-" * (missing - len(chars)) + chars + str(query_seq)) # Adding to the homo sequence spaces = " " * missing homo_seq = str(spaces) + str(homo_seq) # If the alignment dosen't end and the last position data is # added to the end if sbjct_end < length: missing = length - sbjct_end if(missing <= (len(contig) - query_end) and hit['strand'] != 1 or hit['strand'] == 1 and query_start >= missing): # Getting the query sequence. 
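                # (Tail case: the alignment ends missing = length - sbjct_end
                #  bases before the reference end, so those bases are taken
                #  from the contig adjacent to the hit.)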
# If the the hit is on the other strand the characters # are reversed. if hit['strand'] == 1: start_pos = query_start - missing - 1 end_pos = query_start - 1 chars = contig[start_pos:end_pos] chars = Blaster.reversecomplement(chars) else: start_pos = query_end end_pos = query_end + missing chars = contig[start_pos:end_pos] query_seq = query_seq + chars else: # If the hit is on the other strand the characters are reversed if hit['strand'] == 1: if query_start < 3: query_seq = query_seq + "-" * missing else: end_pos = query_start - 2 chars = contig[0:end_pos] chars = Blaster.reversecomplement(chars) query_seq = (query_seq + chars + "-" * (missing - len(chars))) elif query_end == len(contig): query_seq = query_seq + "-" * missing else: start_pos = query_end chars = contig[start_pos:] query_seq = query_seq + chars + "-" * (missing - len(chars)) # Adding to the homo sequence spaces = " " * int(missing) homo_seq = str(homo_seq) + str(spaces) return query_seq, homo_seq cgecore-1.5.2/cgecore/blaster/run_blaster.py0000755000077000000240000001505413446162151021157 0ustar rolfstaff00000000000000#!/usr/bin/env python import sys import os import time import random import re import subprocess from argparse import ArgumentParser from blaster import Blaster ########################################################################## # PARSE COMMAND LINE OPTIONS ########################################################################## parser = ArgumentParser() parser.add_argument('-i', '--input', help="Input file") parser.add_argument('-l', '--input_list', help="File with list of files to analyse") parser.add_argument('-d', '--databases', help="Comma seperated list of databases to blast against") parser.add_argument('-o', '--out_path', help="output directory") parser.add_argument("-b", "--blastPath", dest="blast_path", help="Path to blast", default='blastn') parser.add_argument("-p", "--databasePath", dest="db_path", help="Path to the databases", default='') parser.add_argument("-c", "--min_cov", dest="min_cov", help="Minimum coverage", type=float, default=0.60) parser.add_argument("-t", "--threshold", dest="threshold", help="Blast threshold for identity", type=float, default=0.90) parser.add_argument("--overlap", help=("Allow hits/genes to overlap with this number of " "nucleotides. 
Default: 30."), type=int, default=30) args = parser.parse_args() ########################################################################## # MAIN ########################################################################## min_cov = args.min_cov threshold = args.threshold # Check if valid input file is provided input_list = [] if args.input is not None and args.input_list is not None: sys.exit("Input Error: Please only provide file list or single input " "file\n") elif args.input is None and args.input_list is None: sys.exit("Input Error: No Input were provided!\n") elif args.input is not None and not os.path.exists(args.input): sys.exit("Input Error: Input file does not exist!\n") elif args.input is not None: inputfile = args.input input_list.append(inputfile) elif args.input_list is not None and not os.path.exists(args.input_list): sys.exit("Input Error: No Input were provided!\n") elif args.input_list is not None: inputfile = args.input_list with open(inputfile, "r") as f: for line in f: line = line.rstrip() input_list.append(line) # Check if valid output directory is provided if not os.path.exists(args.out_path): os.makedirs(args.out_path, exist_ok=True) out_path = args.out_path else: out_path = args.out_path # Check if valid file with genes is provided if args.databases is None: sys.exit("Input Error: No databases sepcified!\n") else: databases = args.databases.split(",") # Check if valid path to BLAST is provided blast = args.blast_path db_path = args.db_path for inp_file in input_list: # Calling blast and parsing output blast_run = Blaster(inputfile=inp_file, databases=databases, db_path=db_path, out_path=out_path, min_cov=min_cov, threshold=threshold, blast=blast, allowed_overlap=args.overlap) results = blast_run.results query_align = blast_run.gene_align_query homo_align = blast_run.gene_align_homo sbjct_align = blast_run.gene_align_sbjct file_name = inp_file.split("/")[-1].split(".")[0] out_name = "%s/%s_hit_alignments.txt" % (out_path, file_name) txt_file_seq_text = dict() pos_result = list() # Make result file tab_file = "%s/%s_results.txt" % (out_path, file_name) tab = open(tab_file, "w") for db in results: if results[db] == "No hit found": tab.write("%s\n%s\n\n" % (db, results[db])) else: pos_result.append(db) tab.write("%s\n" % (db)) tab.write("Hit\tIdentity\tAlignment Length/Gene Length\tPosition in " "reference\tContig\tPosition in contig\n") txt_file_seq_text[db] = list() for hit in results[db]: header = results[db][hit]["sbjct_header"] ID = results[db][hit]["perc_ident"] sbjt_length = results[db][hit]["sbjct_length"] HSP = results[db][hit]["HSP_length"] positions_contig = "%s..%s" % (results[db][hit]["query_start"], results[db][hit]["query_end"]) positions_ref = "%s..%s" % (results[db][hit]["sbjct_start"], results[db][hit]["sbjct_end"]) contig_name = results[db][hit]["contig_name"] # Write tabels tab.write("%s\t%.2f\t%s/%s\t%s\t%s\t%s\n" % (header, ID, HSP, sbjt_length, positions_ref, contig_name, positions_contig)) # Writing subjet/ref sequence ref_seq = sbjct_align[db][hit] hit_seq = query_align[db][hit] # Getting the header and text for the txt file output sbjct_start = results[db][hit]["sbjct_start"] sbjct_end = results[db][hit]["sbjct_end"] text = ("%s, ID: %.2f %%, Alignment Length/Gene Length: %s/%s, " "Positions in reference: %s..%s, Contig name: %s, " "Position: %s" % (header, ID, HSP, sbjt_length, sbjct_start, sbjct_end, contig_name, positions_contig) ) # Saving the output to print the txt result file allignemts txt_file_seq_text[db].append((text, ref_seq, 
homo_align[db][hit], hit_seq)) tab.close() txt_file = open(out_name, "w") for db in pos_result: # Txt file alignments txt_file.write("##################### %s #####################\n" % (db)) for text in txt_file_seq_text[db]: txt_file.write("%s\n\n" % (text[0])) for i in range(0, len(text[1]), 60): txt_file.write("%s\n" % (text[1][i:i + 60])) txt_file.write("%s\n" % (text[2][i:i + 60])) txt_file.write("%s\n\n" % (text[3][i:i + 60])) txt_file.write("\n") txt_file.close() cgecore-1.5.2/cgecore/cgefinder.py0000755000077000000240000002542313527765770017152 0ustar rolfstaff00000000000000#!/usr/bin/env python3 import subprocess import re import os.path import sys # TODO import blaster and make blaster function in CGEFinder # from cge.blaster.blaster import Blaster def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) class FinderResult(): def __init__(self, results, align_sbjct=None, align_query=None, align_homo=None): self.results = results # Results self.gene_align_query = align_query # Sequence alignment lines self.gene_align_homo = align_homo # Sequence alignment homolog string self.gene_align_sbjct = align_sbjct # Sequence alignment allele string class CGEFinder(): # Variables used by methods to distinguish results created by different # methods. TYPE_BLAST = "blast" TYPE_KMA = "kma" @staticmethod def kma(inputfile_1, out_path, databases, db_path_kma, min_cov=0.6, threshold=0.9, kma_path="cge/kma/kma", sample_name="", inputfile_2=None, kma_mrs=None, kma_gapopen=None, kma_gapextend=None, kma_penalty=None, kma_reward=None, kma_apm=None, kma_memmode=False, kma_nanopore=False, debug=False, kma_add_args=None, kma_cge=False, kma_1t1=False): """ TODO: Result storage - Too complex. Not effective. Before changing code: Check downstream dependencies of results dicts. Currently the code stores results in four different dicts. This can be reduced to just one. The main dict that stores all results currently stores one result pr hit, and distiguishes hits to the same gene by adding an increasing integer (obtained by getting the length of the internal gene dict). The main dict also stores an internal dict for each 'gene' containing an empty dict for each hit. Solution: Create a main -> gene -> hit design, and remove all the references main -> hit. This reduces redundancy and the need for manipulating gene names with an incremental integer. TODO: Method too many responsibilities Solution: Create KMA class, create additional functions. Original comment: "I expect that there will only be one hit pr gene, but if there are more, I assume that the sequence of the hits are the same in the res file and the aln file." Not sure if this holds for the current code. """ threshold = threshold * 100 min_cov = min_cov * 100 kma_results = dict() kma_results["excluded"] = dict() if(sample_name): sample_name = "_" + sample_name # Initiate output dicts. 
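        # (Filled in below as dict[db][hit] -> full-length aligned string,
        #  mirroring the alignment dicts produced by the BLAST-based finder.)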
gene_align_sbjct = {} gene_align_query = {} gene_align_homo = {} for db in databases: kma_db = db_path_kma + "/" + db kma_outfile = out_path + "/kma_" + db + sample_name kma_cmd = ("%s -t_db %s -o %s -e 1.0" % (kma_path, kma_db, kma_outfile)) if(inputfile_2 is not None): kma_cmd += " -ipe " + inputfile_1 + " " + inputfile_2 else: kma_cmd += " -i " + inputfile_1 if(kma_mrs is not None): kma_cmd += " -mrs " + str(kma_mrs) if(kma_gapopen is not None): kma_cmd += " -gapopen " + str(kma_gapopen) if(kma_gapextend is not None): kma_cmd += " -gapextend " + str(kma_gapextend) if(kma_penalty is not None): kma_cmd += " -penalty " + str(kma_penalty) if(kma_reward is not None): kma_cmd += " -reward " + str(kma_reward) if(kma_apm is not None): kma_cmd += " -apm " + kma_apm if(kma_cge): kma_cmd += " -cge " if(kma_1t1): kma_cmd += " -1t1 " if (kma_memmode): kma_cmd += " -mem_mode " if (kma_nanopore): kma_cmd += " -bcNano " kma_cmd += " -mp 20 " if (kma_add_args is not None): kma_cmd += " " + kma_add_args + " " # kma output files align_filename = kma_outfile + ".aln" res_filename = kma_outfile + ".res" # If .res file exists then skip mapping if(os.path.isfile(res_filename) and os.access(res_filename, os.R_OK)): eprint("Found " + res_filename + " skipping DB.") else: # Call KMA if(debug): eprint("KMA cmd: " + kma_cmd) process = subprocess.Popen(kma_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() kma_results[db] = 'No hit found' # Open res file try: res_file = open(res_filename, "r") header = res_file.readline() except IOError as error: sys.exit("Error: KMA did not run as expected.\n" "KMA finished with the following response:" "\n{}\n{}".format(out.decode("utf-8"), err.decode("utf-8"))) gene_res_count = {} for line in res_file: if kma_results[db] == 'No hit found': kma_results[db] = dict() data = [data.strip() for data in line.split("\t")] gene = data[0] gene_count = gene_res_count.get(gene, 0) gene_count = gene_count + 1 gene_res_count[gene] = gene_count if(gene_count == 1): hit = gene else: hit = "{}_{}".format(gene, gene_count) sbjct_len = int(data[3]) sbjct_ident = float(data[4]) coverage = float(data[5]) depth = float(data[-3]) q_value = float(data[-2]) p_value = float(data[-1]) exclude_reasons = [] if(coverage < min_cov or sbjct_ident < threshold): exclude_reasons.append(coverage) exclude_reasons.append(sbjct_ident) if(exclude_reasons): kma_results["excluded"][hit] = exclude_reasons kma_results[db][hit] = dict() kma_results[db][hit]['sbjct_length'] = sbjct_len kma_results[db][hit]["perc_coverage"] = coverage kma_results[db][hit]["sbjct_string"] = [] kma_results[db][hit]["query_string"] = [] kma_results[db][hit]["homo_string"] = [] kma_results[db][hit]["sbjct_header"] = gene kma_results[db][hit]["perc_ident"] = sbjct_ident kma_results[db][hit]["query_start"] = "NA" kma_results[db][hit]["query_end"] = "NA" kma_results[db][hit]["contig_name"] = "NA" kma_results[db][hit]["HSP_length"] = "" kma_results[db][hit]["cal_score"] = q_value kma_results[db][hit]["depth"] = depth kma_results[db][hit]["p_value"] = p_value res_file.close() if kma_results[db] == 'No hit found': continue # Open align file with open(align_filename, "r") as align_file: gene_aln_count = {} gene = "" # Parse through alignments for line in align_file: # Skip empty lines if(not line.strip()): continue # Check when a new gene alignment start if line.startswith("#"): gene = line[1:].strip() gene_count = gene_aln_count.get(gene, 0) gene_aln_count[gene] = gene_count + 1 else: 
if(gene_aln_count[gene] == 1): hit = gene else: hit = "{}_{}".format(gene, gene_aln_count[gene]) if hit in kma_results[db]: line_data = line.split("\t")[-1].strip() if line.startswith("template"): kma_results[db][hit]["sbjct_string"] += ( [line_data]) elif line.startswith("query"): kma_results[db][hit]["query_string"] += ( [line_data]) else: kma_results[db][hit]["homo_string"] += ( [line_data]) else: print(hit + " not in results: ", kma_results) # concatinate all sequence lists and find subject start # and subject end gene_align_sbjct[db] = {} gene_align_query[db] = {} gene_align_homo[db] = {} for hit in kma_results[db]: align_sbjct = "".join(kma_results[db][hit]['sbjct_string']) align_query = "".join(kma_results[db][hit]['query_string']) align_homo = "".join(kma_results[db][hit]['homo_string']) # Extract only aligned sequences start = re.search("^-*(\w+)", align_query).start(1) end = re.search("\w+(-*)$", align_query).start(1) kma_results[db][hit]['sbjct_string'] = align_sbjct[start:end] kma_results[db][hit]['query_string'] = align_query[start:end] kma_results[db][hit]['homo_string'] = align_homo[start:end] # Save align start and stop positions relative to # subject sequence kma_results[db][hit]['sbjct_start'] = start + 1 kma_results[db][hit]["sbjct_end"] = end + 1 kma_results[db][hit]["HSP_length"] = end - start # Count gaps in the alignment kma_results[db][hit]["gaps"] = ( kma_results[db][hit]['sbjct_string'].count("-") + kma_results[db][hit]['query_string'].count("-")) # Save sequences covering the entire subject sequence # in seperate variables gene_align_sbjct[db][hit] = align_sbjct gene_align_query[db][hit] = align_query gene_align_homo[db][hit] = align_homo return FinderResult(kma_results, gene_align_sbjct, gene_align_query, gene_align_homo) cgecore-1.5.2/cgecore/cmdline.py0000644000077000000240000005451513471234333016620 0ustar rolfstaff00000000000000#!/usr/bin/env python3 """ THIS MODULE CONTAINS ALL THE SHARED WRAPPER FUNCTIONS """ ################################################################################ # CGE FUNCTION MODULE # ################################################################################ # This script is part of the CGE Pipeline structure import sys, os from subprocess import Popen, PIPE from pipes import quote from time import time, sleep from datetime import timedelta # CGE modules from .utility import debug, open_, mkpath ############# CLASSES ############# class ProgramList(object): ''' This Class keeps track of all the programs USAGE: proglist = ProgramList() proglist.add2list(prog_obj) for progname in proglist.list: proglist[progname].wait() proglist.print_timers() ''' def __init__(self): ''' ''' self.timer = -time() # Wrapper timer self.list = [] def __getitem__(self, key): ''' ''' return getattr(self, key) def add_program(self, prog_obj): ''' ''' setattr(self, prog_obj.name, prog_obj) self.list.append(prog_obj.name) return prog_obj def empty_list(self, forcefully=False): ''' ''' removed = [self.remove_program(name, forcefully=forcefully) for name in self.list] if all(removed): return True else: return False def remove_name_from_list(self, name): ''' ''' self.list[:] = (ent for ent in self.list if ent != name) def remove_program(self, name, forcefully=False): ''' ''' # Check if program in list if name in self.list: # Check if program object is a local attribute if name in dir(self): # Check if program have completed prog_obj = getattr(self, name) if prog_obj.status != 'Executing' or forcefully: # Remove program from list delattr(self, name) 
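                    # Also drop the name from self.list below, so the program
                    # is fully deregistered (the reverse of add_program).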
self.remove_name_from_list(name) return True else: debug.log("Warning: Program %s status %s!"%( name, prog_obj.status)) return False else: self.remove_name_from_list(name) return True else: self.remove_name_from_list(name) return True def add2list(self, prog_obj): ''' ''' setattr(self, prog_obj.name, prog_obj) self.list.append(prog_obj.name) def exists(self, name): ''' Checks whether the program exists in the program list. ''' return name in dir(self) def return_timer(self, name, status, timer): ''' Return a text formatted timer ''' timer_template = '%s %s %s : %s : %9s' t = str(timedelta(0, timer)).split(',')[-1].strip().split(':') #t = str(timedelta(0, timer)).split(':') if len(t) == 4: h, m, s = int(t[0])*24 + int(t[1]), int(t[2]), float(t[3]) elif len(t) == 3: h, m, s = int(t[0]), int(t[1]), float(t[2]) else: h, m, s = 0, 0, str(t) return timer_template%( name[:20].ljust(20), status[:7].ljust(7), '%3d'%h if h != 0 else ' --', '%2d'%m if m != 0 else '--', '%.6f'%s if isinstance(s, float) else s ) def print_timers(self): ''' PRINT EXECUTION TIMES FOR THE LIST OF PROGRAMS ''' self.timer += time() total_time = self.timer tmp = '* %s *' debug.log( '', '* '*29, tmp%(' '*51), tmp%('%s %s %s'%('Program Name'.ljust(20), 'Status'.ljust(7), 'Execute Time (H:M:S)')), tmp%('='*51) ) for name in self.list: if self.exists(name): timer = getattr(self, name).get_time() status = getattr(self, name).get_status() self.timer -= timer debug.log(tmp%(self.return_timer(name, status, timer))) else: debug.log(tmp%("%s %s -- : -- : --"%(name[:20].ljust(20),' '*8))) debug.log( tmp%(self.return_timer('Wrapper', '', self.timer)), tmp%('='*51), tmp%(self.return_timer('Total', '', total_time)), tmp%(' '*51), '* '*29, '' ) class Program: """ This class defines a program structure for the CGE system OPTIONS: name - Set the name of the program. path - Set the path to the program. timer - Set the initial time expenditure for the program. ptype - Set the program type (EG. python or /bin/bash). wdir - Set the working directory for the program. server - Set the server for which to execute the program on. queue - The name of the TORQUE queue to use. If not set, the program is executed as a normal subprocess. forcewait - Set to True to make the script wait for the program to finish. args - Provide arguments for the program. walltime - Set the max limit for how many hours the program may run. mem - Set the max limit of how much memory the program may use. procs - Set the amount of processors to reserve for the program. """ def __init__(self, name, path=None, timer=0, ptype=None, wdir='', queue=None, wait=False, args=None, walltime=2, mem=4, procs=1, server=None): debug.log('\n\nInitiating %s...'%name) # INIT VALUES self.path = path self.name = name self.timer = timer self.ptype = ptype self.queue = queue self.forcewait = wait self.args = [] self.unquoted_args = [] self.stderr = "%s.err"%(name) self.stdout = "%s.out"%(name) self.walltime = walltime # Maximum amount of hours required to run program self.mem = mem # GB RAM allocation requirement self.procs = procs # Number of processors required for the job self.p = None # The Subprocess Object self.server = server self.status = 'Initialised' self.verbose = False # If True, Error messages are written to stdout. 
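        # (args may be a string, a number or a list; append_args stores every
        #  element as a string, and get_cmd later shell-quotes them.)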
        if args: self.append_args(args)
        self.wdir = ''
        if wdir is not None:
            if wdir != '' and not os.path.exists(wdir):
                try: # Create working directory
                    mkpath(wdir)
                except Exception as e:
                    debug.graceful_exit(("Error: The specified working directory "
                                         "(%s) does not exist, and could not be "
                                         "created!")%(wdir))
            self.wdir = wdir
    def get_time(self):
        """ This function returns the amount of time used by the program
        (in seconds).
        """
        return self.timer
    def get_status(self):
        """ This function returns the current status of the program. """
        # debug.log('status (%s): %s\n'%(self.name, self.status))
        return self.status
    def get_cmd(self):
        """ This function combines and returns the command line call of the
        program.
        """
        cmd = []
        if self.path is not None:
            if '/' in self.path and not os.path.exists(self.path):
                debug.log('Error: path contains / but does not exist: %s'%self.path)
            else:
                if self.ptype is not None:
                    if os.path.exists(self.ptype):
                        cmd.append(self.ptype)
                    elif '/' not in self.ptype:
                        for path in os.environ["PATH"].split(os.pathsep):
                            path = path.strip('"')
                            ppath = os.path.join(path, self.ptype)
                            if os.path.isfile(ppath):
                                cmd.append(ppath)
                                break
                cmd.append(self.path)
                if sys.version_info < (3, 0):
                    cmd.extend([str(x) if not isinstance(x, (unicode))
                                else x.encode('utf-8')
                                for x in [quote(str(x))
                                          for x in self.args]+self.unquoted_args])
                else:
                    cmd.extend([str(x) for x in [quote(str(x))
                                for x in self.args]+self.unquoted_args])
        else:
            debug.log('Error: Program path not set!')
        return ' '.join(cmd)
    def update_timer(self, time):
        """ This function updates the program timer. """
        self.timer += time
    def append_args(self, arg):
        """ This function appends the provided arguments to the program
        object.
        """
        debug.log("Adding Arguments: %s"%(arg))
        if isinstance(arg, (int,float)): self.args.append(str(arg))
        if isinstance(arg, str): self.args.append(arg)
        if isinstance(arg, list):
            if sys.version_info < (3, 0):
                self.args.extend([str(x) if not isinstance(x, (unicode))
                                  else x.encode('utf-8') for x in arg])
            else:
                self.args.extend([str(x) for x in arg])
    def execute(self):
        """ This function executes the program with the set arguments. """
        prog_cmd = self.get_cmd().strip()
        if prog_cmd == '':
            self.status = 'Failure'
            debug.log("Error: No program to execute for %s!"%self.name)
            debug.log(("Could not combine path and arguments into cmdline:"
                       "\n%s %s)\n")%(self.path, ' '.join(self.args)))
        else:
            debug.log("\n\nExecute %s...\n%s" % (self.name, prog_cmd))
            # Create shell script
            script = '%s.sh'%self.name
            if self.wdir != '':
                script = '%s/%s'%(self.wdir, script)
            else:
                script = '%s/%s'%(os.getcwd(), script)
            with open_(script, 'w') as f:
                f.write('#!/bin/bash\n')
                if self.wdir != '':
                    f.write('cd {workdir}\n'.format(workdir=self.wdir))
                f.write(
                    ('touch {stdout} {stderr}\n'
                     'chmod a+r {stdout} {stderr}\n'
                     '{cmd} 1> {stdout} 2> {stderr}\n'
                     'ec=$?\n').format(
                        stdout=self.stdout,
                        stderr=self.stderr,
                        cmd=prog_cmd
                    )
                )
                if not self.forcewait:
                    f.write(('if [ "$ec" -ne "0" ]; then echo "Error" >> {stderr}; '
                             'else echo "Done" >> {stderr}; fi\n').format(
                        stderr=self.stderr))
                f.write('exit $ec\n')
            os.chmod(script, 0o755)
            if self.queue is not None:
                # Setup execution of shell script through TORQUE
                other_args = ''
                if self.forcewait: other_args += "-K " # ADDING -K argument if wait() is forced
                # QSUB INFO :: run_time_limit(walltime, dd:hh:mm:ss),
                #              memory(mem, up to 100GB *gigabyte),
                #              processors(ppn, up to 16)
                # USE AS LITTLE AS NEEDED!
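                # For reference, the assembled call takes this general shape
                # (illustrative values only, matching the defaults above, not
                # taken from a real run):
                #   /usr/bin/qsub -l nodes=1:ppn=1,walltime=2:00:00,mem=4g \
                #       -r y -d /path/to/wdir -K program_name.sh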
cmd = ('/usr/bin/qsub ' '-l nodes=1:ppn={procs},walltime={hours}:00:00,mem={mem}g ' '-r y {workdir_arg} {other_args} {cmd}').format( procs=self.procs, hours=self.walltime, mem=self.mem, workdir_arg="-d %s"%(self.wdir) if self.wdir != '' else '', other_args=other_args, cmd=script) debug.log("\n\nTORQUE SETUP %s...\n%s\n" % (self.name, cmd)) else: cmd = script if self.server is not None: cmd = "ssh {server} {cmd}".format( server=self.server, cmd=quote(cmd) ) self.status = 'Executing' # EXECUTING PROGRAM self.update_timer(-time()) # TIME START if self.forcewait: self.p = Popen(cmd) ec = self.p.wait() if ec == 0: debug.log("Program finished successfully!") self.status = 'Done' else: debug.log("Program failed on execution!") self.status = 'Failure' self.p = None else: # WaitOn should be called to determine if the program has ended debug.log("CMD: %s"%cmd) self.p = Popen(cmd) # shell=True, executable="/bin/bash" self.update_timer(time()) # TIME END debug.log("timed: %s" % (self.get_time())) def wait(self, pattern='Done', interval=None, epatterns=['error','Error','STACK','Traceback']): """ This function will wait on a given pattern being shown on the last line of a given outputfile. OPTIONS pattern - The string pattern to recognise when a program finished properly. interval - The amount of seconds to wait between checking the log file. epatterns - A list of string patterns to recognise when a program has finished with an error. """ increasing_interval = False if interval is None: increasing_interval = True interval = 10 if self.wdir != '': stderr = "%s/%s"%(self.wdir, self.stderr) else: stderr = self.stderr debug.log("\nWaiting for %s to finish..."%str(self.name)) if self.status == 'Executing': self.update_timer(-time()) # TIME START found = False if self.queue is not None: # Handling programs running on the compute servers # Waiting for error log to be created. # Prolonged waiting can be caused by the queue being full, or the # server being unavailable. debug.log(" Waiting for the error log to be created (%s)..."%( stderr)) # Set maximum amount of seconds to wait on the errorlog creation, # before assuming queue failure. max_queued_time = 10800 while ( not os.path.exists(stderr) and time()+self.timer < max_queued_time and time()+self.timer > 0 ): debug.log(" Waiting... (max wait time left: %s seconds)"%( str(max_queued_time-time()-self.timer))) sleep(interval) if increasing_interval: interval *= 1.1 if os.path.exists(stderr): if increasing_interval: interval = 10 # File created looking for pattern debug.log('\nError log created, waiting for program to finish...') # calculate max loops left based on set walltime and check interval max_time = time() + self.walltime * 60 * 60 while time() < max_time: with open_(stderr) as f: for l in f.readlines()[-5:]: # last five lines if pattern in l: found = True max_time = 0 break elif any([ep in l for ep in epatterns]): found = False max_time = 0 break if max_time > 0: debug.log(' Waiting... 
(max wait-time left: %s seconds)'%( str(max_time-time()))) sleep(interval) if found: debug.log(" Program finished successfully!") self.status = 'Done' else: debug.log("Error: Program took too long, or finished with error!") if self.verbose: debug.print_out( "Technical error occurred!\n", "The service was not able to produce a result.\n", ("Please check your settings are correct, and the file " "type matches what you specified.\n"), ("Try again, and if the problem persists please notify the" " technical support.\n") ) self.status = 'Failure' else: debug.log( ("Error: %s still does not exist!\n")%(stderr), ("This error might be caused by the cgebase not being " "available!") ) if self.verbose: debug.print_out( "Technical error occurred!\n", ("This error might be caused by the server not being " "available!\n"), ("Try again later, and if the problem persists please notify " "the technical support.\n"), "Sorry for any inconvenience.\n" ) self.status = 'Failure' if not self.p is None: self.p.wait() self.p = None else: # Handling wrappers running on the webserver if self.p is None: debug.log("Program not instanciated!") self.status = 'Failure' else: ec = self.p.wait() if ec != 0: debug.log("Program failed on execution!") self.status = 'Failure' elif os.path.exists(stderr): with open_(stderr) as f: for l in f.readlines()[-5:]: # last five lines if pattern in l: found = True break elif any([ep in l for ep in epatterns]): found = False break if found: debug.log(" Program finished successfully!") self.status = 'Done' else: debug.log("Error: Program failed to finish properly!") if self.verbose: debug.print_out("Technical error occurred!\n", "The service was not able to produce a result.\n", "Please check your settings are correct, and the file "+ "type matches what you specified.", "Try again, and if "+ "the problem persists please notify the technical "+ "support.\n") self.status = 'Failure' else: debug.log(("Error: %s does not exist!\n")%(stderr), "This error might be caused by the cgebase not being "+ "available!") if self.verbose: debug.print_out("Technical error occurred!\n", "This error might be caused by the server not being "+ "available!\n", "Try again later, and if the problem "+ "persists please notify the technical support.\n", "Sorry for any inconvenience.\n") self.status = 'Failure' self.p = None self.update_timer(time()) # TIME END debug.log(" timed: %s"%(self.get_time())) else: debug.log(" The check-out of the program has been sorted previously.") def print_stdout(self): """ This function will read the standard out of the program and print it """ # First we check if the file we want to print does exists if self.wdir != '': stdout = "%s/%s"%(self.wdir, self.stdout) else: stdout = self.stdout if os.path.exists(stdout): with open_(stdout, 'r') as f: debug.print_out("\n".join([line for line in f])) else: # FILE DOESN'T EXIST debug.log("Error: The stdout file %s does not exist!"%(stdout)) def find_out_var(self, varnames=[]): """ This function will read the standard out of the program, catch variables and return the values EG. 
#varname=value
        """
        if self.wdir != '':
            stdout = "%s/%s"%(self.wdir, self.stdout)
        else:
            stdout = self.stdout
        response = [None]*len(varnames)
        # First we check if the file we want to read exists
        if os.path.exists(stdout):
            with open_(stdout, 'r') as f:
                for line in f:
                    if '=' in line:
                        var = line.strip('#').split('=')
                        value = var[1].strip()
                        var = var[0].strip()
                        if var in varnames: response[varnames.index(var)] = value
        else: # FILE DOESN'T EXIST
            debug.log("Error: The stdout file %s does not exist!"%(stdout))
        return response
    def find_err_pattern(self, pattern):
        """ This function will read the standard error of the program and
        return any lines matching the given pattern.
        EG. prog_obj.find_err_pattern("Update of mySQL failed")
        """
        if self.wdir != '':
            stderr = "%s/%s"%(self.wdir, self.stderr)
        else:
            stderr = self.stderr
        response = []
        # First we check if the file we want to read exists
        if os.path.exists(stderr):
            with open_(stderr, 'r') as f:
                for line in f:
                    if pattern in line: response.append(line.strip())
        else: # FILE DOESN'T EXIST
            debug.log("Error: The stderr file %s does not exist!"%(stderr))
        return response
    def find_out_pattern(self, pattern):
        """ This function will read the standard out of the program and
        return any lines matching the given pattern.
        EG. prog_obj.find_out_pattern("Update of mySQL failed")
        """
        if self.wdir != '':
            stdout = "%s/%s"%(self.wdir, self.stdout)
        else:
            stdout = self.stdout
        response = []
        # First we check if the file we want to read exists
        if os.path.exists(stdout):
            with open_(stdout, 'r') as f:
                for line in f:
                    if pattern in line: response.append(line.strip())
        else: # FILE DOESN'T EXIST
            debug.log("Error: The stdout file %s does not exist!"%(stdout))
        return response

def cmd2list(cmd):
    ''' Executes a command through the operating system and returns the output
    as a list, or on error a string with the standard error.
    EXAMPLE:
        >>> cmd2list('ls -l')
    '''
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    # Popen.communicate returns bytes objects under Python 3; decode them so
    # the comparison and split below work for both Python 2 and Python 3.
    if isinstance(stdout, bytes): stdout = stdout.decode("utf-8", "replace")
    if isinstance(stderr, bytes): stderr = stderr.decode("utf-8", "replace")
    if p.returncode != 0 and stderr != '':
        return "ERROR: %s\n"%(stderr)
    else:
        return stdout.split('\n')

# Initiate Shared Global Objects
proglist = ProgramList()
cgecore-1.5.2/cgecore/organisminfo/0000755000077000000240000000000013577151366017327 5ustar rolfstaff00000000000000cgecore-1.5.2/cgecore/organisminfo/__init__.py0000644000077000000240000000000013425022222021402 0ustar rolfstaff00000000000000cgecore-1.5.2/cgecore/organisminfo/gram_neg.txt0000644000077000000240000000323213426054125021633 0ustar rolfstaff00000000000000Achromobacter xylosoxidans
Acinetobacter baumannii
Aeromonas hydrophila
Aeromonas shubertii
Aeromonas veronii
Aggregatibacter aphrophilus
Bacteroides acidifaciens
Bacteroides fragilis
Bacteroides gracilis
Bacteroides melaninogenicus
Bacteroides oris
Bacteroides ovatus
Bacteroides pyogenes
Bacteroides stercoris
Bacteroides vulgatus
Bartonella bacilliformis
Bartonella henselae
Bartonella Quintana
Bordetella pertussis
Bordetella pertussis
Branhamella catarrhalis
Branhamella catarrhalis
Brucella abortus
Brucella canis
Brucella melitensis
Brucella suis
Burkholderia cepacia
Burkholderia pseudomallei
Campylobacter fetus
Campylobacter jejuni
Capnocytophaga canimorsus
Capnocytophaga ochracea
Chryseobacterium meningosepticum
Citrobacter freundii
Citrobacter koseri
Eikenella corrodens
Elizabethkingia meningoseptica
Enterobacter aerogenes
Enterobacter cloacae
Escherichia coli
Francisella tularensis
Fusobacterium necrophorum
Haemophilus aphrophilus
Haemophilus avium
Haemophilus ducreyi
Haemophilus influenzae
Helicobacter pylori
Klebsiella ozaenae
Klebsiella pneumoniae
Klebsiella rhinoscleromatis
Legionella pneumophila
Moraxella catarrhalis
Moraxella catarrhalis
Morganella morganii
Neisseria gonorrhoeae
Neisseria meningitidis
Pasteurella multocida
Plesiomonas shigelloides
Prevotella intermedia
Prevotella melaninogenica
Proteus mirabilis
Proteus vulgaris
Providencia stuartii
Providencia_rettgeri
Pseudomonas aeruginosa
Pseudomonas cepacia
Pseudomonas maltophilia
Pseudomonas pseudomallei
Salmonella enterica
Serratia marcescens
Shigella boydii
Shigella dysenteriae
Shigella flexneri
Shigella sonnei
Stenotrophomonas maltophilia
Vibrio cholera
Vibrio damsela
Vibrio parahaemolyticus
cgecore-1.5.2/cgecore/organisminfo/gram_pos.txt0000644000077000000240000000210313425020340021650 0ustar rolfstaff00000000000000Actinomyces israelii
Arcanobacterium haemolyticum
Bacillus anthracis
Bacillus cereus
Bacillus subtilis
Clostridium difficile
Clostridium perfringens
Clostridium tetani
Corynebacterium diphtheria
Corynebacterium equi
Corynebacterium haemolyticum
Corynebacterium jeikeium
Corynebacterium urealyticum
Enterococcus faecalis
Enterococcus faecium
Erysipelothrix rhusiopathiae
Lactobacillus acidophilus
Lactobacillus brevis
Lactobacillus buchneri
Lactobacillus casei
Lactobacillus fermentum
Lactobacillus gallinarum
Lactobacillus gasseri
Lactobacillus species
Leuconostoc
Listeria monocytogenes
Nocardia asteroides
Nocardia brasiliensis
Prescottia equi
Propionibacterium acnes
Rhodococcus equi
Staphylococcus aureus
Staphylococcus capitis
Staphylococcus epidermidis
Staphylococcus haemolyticus
Staphylococcus hominis
Staphylococcus lugdunensis
Staphylococcus saprophyticus
Streptobacillus moniliformis
Streptococcus agalactiae
Streptococcus anginosus
Streptococcus anginosus
Streptococcus constellatus
Streptococcus intermedius
Streptococcus milleri
Streptococcus pneumoniae
Streptococcus pyogenes
cgecore-1.5.2/cgecore/organisminfo/gramstain.py0000644000077000000240000000161113426054165021655 0ustar rolfstaff00000000000000#!/usr/bin/env python3
import os.path
import sys


def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


class Gramstain(dict):
    """ Dictionary mapping lower-case species names to their Gram reaction
        ('+' or '-'), loaded from the bundled gram_pos/gram_neg text files.
    """
    gram_neg_file = ("{}/gram_neg.txt"
                     .format(os.path.abspath(os.path.dirname(__file__))))
    gram_pos_file = ("{}/gram_pos.txt"
                     .format(os.path.abspath(os.path.dirname(__file__))))

    def __init__(self):
        """ Load both Gram stain files into the dictionary. """
        self.load_gram_file(file=Gramstain.gram_neg_file, gram="-")
        self.load_gram_file(file=Gramstain.gram_pos_file, gram="+")

    def load_gram_file(self, file, gram):
        """ Read a species list and map each species to the given Gram
            reaction, skipping empty lines and comment lines.
        """
        with open(file, "r") as fh:
            for line in fh:
                line = line.rstrip()
                if(not line):
                    continue
                if(line.startswith("#")):
                    continue
                self[line.lower()] = gram
cgecore-1.5.2/cgecore/organisminfo/species.py0000644000077000000240000000570313426047266021335 0ustar rolfstaff00000000000000#!/usr/bin/env python3
import urllib.parse
import sys
import requests
import time


def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


class Species():
    ensembl_server = "http://rest.ensembl.org/taxonomy/classification/"

    def __init__(self, species):
        """ Query the ENSEMBL taxonomy service for the given species and
            store the resulting taxonomy as a tuple and a dictionary.
        """
        self.species = species.lower()
        self.ensembl_response = Species.get_ensembl_tax(self.species)
        self.ensembl_dict = self.ensembl_list_to_dict(self.ensembl_response)
        self.tax_tuple = self.get_taxonomy_as_tuple(self.ensembl_dict)
        self.tax_dict = self.get_taxonomy_as_dict(self.tax_tuple)

    def ensembl_list_to_dict(self, ensembl_response):
        """ Map each scientific name in the response to the name of its
            first child taxon.
        """
        ensembl_dict = {self.species: None}
        for entry in ensembl_response:
            name = entry["scientific_name"].lower()
            child_name = entry["children"][0]["scientific_name"].lower()
            ensembl_dict[name] = child_name
        return ensembl_dict

    @staticmethod
    def get_ensembl_tax(species):
        """ Query the ENSEMBL REST API for the taxonomic classification of
            the given species and return the parsed JSON response.
        """
        ensembl_query = "{}{}?".format(Species.ensembl_server,
                                       urllib.parse.quote(species, safe=""))
        try:
            r = requests.get(ensembl_query,
                             headers={"Content-Type": "application/json"})
            # Make sure less than 15 requests per second (ENSEMBL rule).
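            # A one-second pause per request keeps the rate well below that
            # limit.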
            time.sleep(1)
        except Exception as e:
            print("Error while interacting with the ENSEMBL REST API")
            print("Error: {}".format(e))
            raise Exception("Error while interacting with the ENSEMBL REST "
                            "API")

        if not r.ok:
            eprint("Possible source of error: Missing or misspelling of "
                   "organism name.")
            eprint("Query was: {}".format(ensembl_query))
            eprint("r url: {}".format(r.url))
            eprint("r text: {}".format(r.text))
            raise Exception("Possible source of error: Missing or misspelling "
                            "of organism name.")

        return r.json()

    @staticmethod
    def get_taxonomy_as_tuple(ensembl_dict):
        """ Order the parent-to-child mapping into a root-to-species tuple. """
        root = None
        children = tuple(ensembl_dict.values())
        for name in ensembl_dict:
            if(name not in children):
                root = name
                break
        out_lst = [root, ]
        parent = root
        child = ""
        while(True):
            child = ensembl_dict.get(parent, None)
            if(child is None):
                break
            out_lst.append(child)
            parent = child
            child = ""
        return tuple(out_lst)

    @staticmethod
    def get_taxonomy_as_dict(tax_tuple):
        """ Name the seven expected taxonomic ranks in the tuple. """
        return {"domain": tax_tuple[0],
                "phylum": tax_tuple[1],
                "class": tax_tuple[2],
                "order": tax_tuple[3],
                "family": tax_tuple[4],
                "genus": tax_tuple[5],
                "species": tax_tuple[6]}
cgecore-1.5.2/cgecore/utility.py0000644000077000000240000005441413425017076016710 0ustar rolfstaff00000000000000#!/usr/bin/env python3
"""
THIS MODULE CONTAINS ALL THE SHARED WRAPPER FUNCTIONS
"""
################################################################################
#                             CGE FUNCTION MODULE                              #
################################################################################
# This script is part of the CGE Pipeline structure
import sys, os, gzip, shutil, glob, re, json
from subprocess import Popen
from zipfile import ZipFile
from contextlib import closing

############# CLASSES #############
class Debug():
    """ Debug object, keeps track of all debug related matters, and provides
    easy access to logging debug text

    USAGE
        import os, sys
        debug = Debug()
        debug.setup(debug=True, logfile=None, stdout=None, stderr=None)
        debug.print_out('hello')
        debug.log_no_newline('Oh')
        debug.log_no_newline(' my')
        debug.log(' god!')
        # The graceful exit is only meant to be used with the integrated CGE
        # platform
        debug.graceful_exit('error message')

    IMPORTING THE DEBUG OBJECT
        from cgecore.utility import debug
    """
    def __init__(self):
        """ Initialise the debug flag and the default output streams. """
        self.debug = False
        self.logfile = sys.stderr
        self.stdout = sys.stdout
        self.stderr = sys.stderr
        self.caught_error = None
    def setup(self, debug=None, logfile=None, stdout=None, stderr=None):
        """ Override the debug flag and the output streams. """
        if debug is not None: self.debug = debug
        if logfile is not None: self.logfile = logfile
        if stdout is not None: self.stdout = stdout
        if stderr is not None: self.stderr = stderr
    def print_out(self, *lst):
        """ Print list of strings to the predefined stdout. """
        self.print2file(self.stdout, True, True, *lst)
    def print_err(self, *lst):
        """ Print list of strings to the predefined stderr. """
        self.print2file(self.stderr, False, True, *lst)
    def print2file(self, logfile, print2screen, addLineFeed, *lst):
        """ This function prints to the screen and logs to a file all the
        strings given.
        print2screen - if True, the strings are also printed to the screen
        *lst         - a comma-separated list of strings
        """
        if addLineFeed:
            linefeed = '\n'
        else:
            linefeed = ''
        if print2screen:
            print(linefeed.join(str(string) for string in lst))
        try:
            file_instance = isinstance(logfile, file)
        except NameError as e:
            from io import IOBase
            try:
                file_instance = isinstance(logfile, IOBase)
            except:
                raise e
        if file_instance:
            logfile.write(linefeed.join(str(string) for string in lst) +
                          linefeed)
        elif isinstance(logfile, str) and os.path.exists(logfile):
            with open_(logfile, 'a') as f:
                f.write(linefeed.join(str(string) for string in lst) +
                        linefeed)
        elif not print2screen:
            # Print to screen if there is no outputfile
            print(linefeed.join(str(string) for string in lst))
    def log(self, *lst):
        """ Print list of strings to the predefined logfile if debug is set,
        and set the caught_error message if an error is found.
        """
        self.print2file(self.logfile, self.debug, True, *lst)
        if 'Error' in '\n'.join([str(x) for x in lst]):
            self.caught_error = '\n'.join([str(x) for x in lst])
    def log_no_newline(self, msg):
        """ print the message to the predefined log file without newline """
        self.print2file(self.logfile, False, False, msg)
    def graceful_exit(self, msg):
        """ This function prints any stored error to stderr, logs the given
        message and exits.
        """
        # Print stored errors to stderr
        if self.caught_error:
            self.print2file(self.stderr, False, False, self.caught_error)
        # Kill process with error message
        self.log(msg)
        sys.exit(1)

class adv_dict(dict):
    """ This class expands on the dictionary class by adding the get_tree
    class method.
    """
    def get_tree(self, list_of_keys):
        """ get_tree will extract the value from a nested tree

        INPUT
            list_of_keys: a list of keys ie. ['key1', 'key2']
        USAGE
            >>> # Access the value for key2 within the nested dictionary
            >>> adv_dict({'key1': {'key2': 'value'}}).get_tree(['key1', 'key2'])
            'value'
        """
        cur_obj = self
        for key in list_of_keys:
            cur_obj = cur_obj.get(key)
            if not cur_obj: break
        return cur_obj
    def invert(self):
        ''' Return inverse mapping of dictionary with sorted values.

        USAGE
            >>> # Switch the keys and values
            >>> adv_dict({
            ...     'A': [1, 2, 3],
            ...     'B': [4, 2],
            ...     'C': [1, 4],
            ... }).invert()
            {1: ['A', 'C'], 2: ['A', 'B'], 3: ['A'], 4: ['B', 'C']}
        '''
        inv_map = {}
        for k, v in self.items():
            if sys.version_info < (3, 0):
                acceptable_v_instance = isinstance(v, (str, int, float, long))
            else:
                acceptable_v_instance = isinstance(v, (str, int, float))
            if acceptable_v_instance: v = [v]
            elif not isinstance(v, list):
                raise Exception('Error: Non supported value format! Values may '
                                'only be numerical, strings, or lists of '
                                'numbers and strings.')
            for val in v:
                inv_map[val] = inv_map.get(val, [])
                inv_map[val].append(k)
                inv_map[val].sort()
        return inv_map

class Reg:
    """
    NAME:          Reg - Extended Regular Expression Handler
    AUTHOR:        Martin Thomsen
    DESCRIPTION:
        This class enables a simplistic usage of regular expression to get
        contained groups in a match statement. But it also allows to do some
        of the normal re calls, such as findall and sub.
    DEPENDENCIES:  re (regular expression module)
    USAGE:
        >>> import re
        >>> RegEx = Reg(pattern, flag)
        >>> if RegEx.match(string):
        >>>     RegEx.get_group(index)
    EXAMPLE:
        >>> RegEx = Reg('[^a]*(a)[^b]*(b)[^Y]*(Y)(a)?', 'I')
        >>> if RegEx.match('aBcdefgHIJKLmnOpqrstuvwxyz'):
        ...     print(RegEx.get_group(0), # index=0 -> full match
        ...           RegEx.get_group(1),
        ...           RegEx.get_group(2),
        ...           RegEx.get_group(3),
        ...           RegEx.get_group(4))
        ...
        ('aBcdefgHIJKLmnOpqrstuvwxy', 'a', 'B', 'y', None)

        # NIFTY SUBSTITUTION LOOP
        >>> string = 'There are {%=count%} {%=animal%} on the {%=location%}!'
        >>> # Dictionary containing place-holders and values
        ... # (make sure all placeholders in the string are included!)
        ... d = { 'count': 5, 'animal': 'cows', 'location': 'Battle Field' }
        >>> # RE Object which matches place-holders in the string
        ... tmpPH = Reg('\{\%\=(\w+)\%\}', 'I')
        >>> # substitute all placeholders
        ... while tmpPH.match(string): string = tmpPH.sub(str(d[tmpPH.get_group(1)]), string, 1)
        ...
        >>> print(string)
        There are 5 cows on the Battle Field!
    """
    def __init__(self, pattern, *flags):
        sd = {'T': 1, 'I': 2, 'L':4,'M':8,'S':16,'U':32,'X':64}
        try:
            flag = sum([sd[f] if f in sd else int(f) for f in set(flags)]) if flags else 0
        except:
            for f in flags:
                if not isinstance(f, int) and not f in sd:
                    raise Exception("Error: Unrecognised flag argument '%s' for Reg call."%f)
            flag=0
        if flag:
            self.re = re.compile(pattern, flag)
        else:
            self.re = re.compile(pattern)
        self.matches = None
    def sub(self, replace, string, count=0):
        """ Returns a new string where the matching cases (limited by the
        count) in the string are replaced. """
        return self.re.sub(replace, string, count)
    def find_all(self, s):
        """ Finds all matches in the string and returns them in a tuple. """
        return self.re.findall(s)
    def match(self, s):
        """ Matches the string to the stored regular expression, and stores
        all groups in matches. Returns False on negative match. """
        self.matches = self.re.search(s)
        return self.matches
    def get_group(self, x):
        """ Returns requested subgroup. """
        return self.matches.group(x)
    def get_groups(self):
        """ Returns all subgroups. """
        return self.matches.groups()

class REGroup():
    """ Regular Expression group object

    This class simplifies the use of groups for the sort2groups function.
    """
    def __init__(self, pattern, flags=''):
        self.re = Reg(pattern, flags)
        self.list = []
    def match(self, s):
        """ Matches the pattern to the input string, returns True/False and
        saves the matched string in the internal list """
        if self.re.match(s):
            self.list.append(s)
            return True
        else:
            return False

############# ITERATORS #############
def seqs_from_file(filename, exit_on_err=False, return_qual=False):
    """Extract sequences from a file

    Name:        seqs_from_file
    Author(s):   Martin C F Thomsen
    Date:        18 Jul 2013
    Description: Iterator which extracts sequence data from the input file
    Args:
        filename: string which contains a path to the input file
    Supported Formats: fasta, fastq

    USAGE:
        >>> import os, sys
        >>> # Create fasta test file
        >>> file_content = ('>head1 desc1\nthis_is_seq_1\n>head2 desc2\n'
        ...                 'this_is_seq_2\n>head3 desc3\nthis_is_seq_3\n')
        >>> with open_('test.fsa', 'w') as f: f.write(file_content)
        >>> # Parse and print the fasta file
        >>> for seq, name, desc in seqs_from_file('test.fsa'):
        ...     print(">%s %s\n%s"%(name, desc, seq))
        ...
        >head1 desc1
        this_is_seq_1
        >head2 desc2
        this_is_seq_2
        >head3 desc3
        this_is_seq_3
    """
    # VALIDATE INPUT
    if not isinstance(filename, str):
        msg = 'Filename has to be a string.'
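        # Depending on exit_on_err, either report the error and exit, or
        # raise an IOError with the message above.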
if exit_on_err: sys.stderr.write('Error: %s\n'%msg) sys.exit(1) else: raise IOError(msg) if not os.path.exists(filename): msg = 'File "%s" does not exist.'%filename if exit_on_err: sys.stderr.write('Error: %s\n'%msg) sys.exit(1) else: raise IOError(msg) # EXTRACT DATA with open_(filename,"rt") as f: query_seq_segments = [] seq, name, desc, qual = '', '', '', '' add_segment = query_seq_segments.append for l in f: if len(l.strip()) == 0: continue #sys.stderr.write("%s\n"%line) fields=l.strip().split() if l.startswith(">"): # FASTA HEADER FOUND if query_seq_segments != []: # YIELD SEQUENCE AND RESET seq = ''.join(query_seq_segments) yield (seq, name, desc) seq, name, desc = '', '', '' del query_seq_segments[:] name = fields[0][1:] desc = ' '.join(fields[1:]) elif l.startswith("@"): # FASTQ HEADER FOUND name = fields[0][1:] desc = ' '.join(fields[1:]) try: # EXTRACT FASTQ SEQUENCE seq = next(f).strip().split()[0] # SKIP SECOND HEADER LINE AND QUALITY SCORES l = next(f) qual = next(f).strip() # Qualities except: break else: # YIELD SEQUENCE AND RESET if return_qual: yield (seq, qual, name, desc) else: yield (seq, name, desc) seq, name, desc, qual = '', '', '', '' elif len(fields[0])>0: # EXTRACT FASTA SEQUENCE add_segment(fields[0]) # CHECK FOR LAST FASTA SEQUENCE if query_seq_segments != []: # YIELD SEQUENCE seq = ''.join(query_seq_segments) yield (seq, name, desc) ############# FUNCTIONS ############# def open_(filename, mode=None, compresslevel=9): """Switch for both open() and gzip.open(). Determines if the file is normal or gzipped by looking at the file extension. The filename argument is required; mode defaults to 'rb' for gzip and 'r' for normal and compresslevel defaults to 9 for gzip. >>> import gzip >>> from contextlib import closing >>> with open_(filename) as f: ... 
f.read() """ if filename[-3:] == '.gz': if mode is None: mode = 'rt' return closing(gzip.open(filename, mode, compresslevel)) else: if mode is None: mode = 'r' return open(filename, mode) def load_json(json_object): ''' Load json from file or file name ''' content = None if isinstance(json_object, str) and os.path.exists(json_object): with open_(json_object) as f: try: content = json.load(f) except Exception as e: debug.log("Warning: Content of '%s' file is not json."%f.name) elif hasattr(json_object, 'read'): try: content = json.load(json_object) except Exception as e: debug.log("Warning: Content of '%s' file is not json."%json_object.name) else: debug.log("%s\nWarning: Object type invalid!"%json_object) return content def sort2groups(array, gpat=['_R1','_R2']): """ Sort an array of strings to groups by patterns """ groups = [REGroup(gp) for gp in gpat] unmatched = [] for item in array: matched = False for m in groups: if m.match(item): matched = True break if not matched: unmatched.append(item) return [sorted(m.list) for m in groups], sorted(unmatched) def sort_and_distribute(array, splits=2): """ Sort an array of strings to groups by alphabetically continuous distribution """ if not isinstance(array, (list,tuple)): raise TypeError("array must be a list") if not isinstance(splits, int): raise TypeError("splits must be an integer") remaining = sorted(array) if sys.version_info < (3, 0): myrange = xrange(splits) else: myrange = range(splits) groups = [[] for i in myrange] while len(remaining) > 0: for i in myrange: if len(remaining) > 0: groups[i].append(remaining.pop(0)) return groups def mkpath(filepath, permissions=0o777): """ This function executes a mkdir command for filepath and with permissions (octal number with leading 0 or string only) # eg. mkpath("path/to/file", "0o775") """ # Converting string of octal to integer, if string is given. if isinstance(permissions, str): permissions = sum([int(x)*8**i for i,x in enumerate(reversed(permissions))]) # Creating directory if not os.path.exists(filepath): debug.log("Creating Directory %s (permissions: %s)"%( filepath, permissions)) os.makedirs(filepath, permissions) else: debug.log("Warning: The directory "+ filepath +" already exists") return filepath def create_zip_dir(zipfile_path, *file_list): """ This function creates a zipfile located in zipFilePath with the files in the file list # fileList can be both a comma separated list or an array """ try: if isinstance(file_list, (list, tuple)): #unfolding list of list or tuple if len(file_list) == 1: if isinstance(file_list[0], (list, tuple)): file_list = file_list[0] #converting string to iterable list if isinstance(file_list, str): file_list = [file_list] if file_list: with ZipFile(zipfile_path, 'w') as zf: for cur_file in file_list: if '/' in cur_file: os.chdir('/'.join(cur_file.split('/')[:-1])) elif '/' in zipfile_path: os.chdir('/'.join(zipfile_path.split('/')[:-1])) zf.write(cur_file.split('/')[-1]) else: debug.log('Error: No Files in list!',zipfile_path+' was not created!') except Exception as e: debug.log('Error: Could not create zip dir! 
argtype: '+ str(type(file_list)), "FileList: "+ str(file_list), "Errormessage: "+ str(e)) def file_zipper(root_dir): """ This function will zip the files created in the runroot directory and subdirectories """ # FINDING AND ZIPPING UNZIPPED FILES for root, dirs, files in os.walk(root_dir, topdown=False): if root != "": if root[-1] != '/': root += '/' for current_file in files: filepath = "%s/%s"%(root, current_file) try: file_size = os.path.getsize(filepath) except Exception as e: file_size = 0 debug.log('Error: file_zipper failed to zip following file '+filepath, e) # Excluding small files, gzipped files and links if ( file_size > 50 and current_file[-3:] != ".gz" and not os.path.islink(filepath) ): if current_file[-4:] == ".zip": # Unzip file ec = Popen('unzip -qq "%s" -d %s > /dev/null 2>&1'%(filepath, root), shell=True).wait() if ec > 0: debug.log('Error: fileZipper failed to unzip following file %s'%filepath) continue else: ec = Popen('rm -f "%s" > /dev/null 2>&1'%(filepath), shell=True).wait() if ec > 0: debug.log('Error: fileZipper failed to delete the original zip file (%s)'%filepath) filepath = filepath[:-4] # Saving a gzipped version with open_(filepath, 'rb') as f, open_(filepath+".gz", 'wb', 9) as gz: gz.writelines(f) # Deleting old (non-zipped) file try: os.remove(filepath) except OSError as e: debug.log(("WARNING! The file %s could not be " "removed!\n%s")%(current_file, e)) def file_unzipper(directory): """ This function will unzip all files in the runroot directory and subdirectories """ debug.log("Unzipping directory (%s)..."%directory) #FINDING AND UNZIPPING ZIPPED FILES for root, dirs, files in os.walk(directory, topdown=False): if root != "": orig_dir = os.getcwd() os.chdir(directory) Popen('gunzip -q -f *.gz > /dev/null 2>&1', shell=True).wait() Popen('unzip -qq -o "*.zip" > /dev/null 2>&1', shell=True).wait() Popen('rm -f *.zip > /dev/null 2>&1', shell=True).wait() os.chdir(orig_dir) def move_file(src, dst): """ this function will simply move the file from the source path to the dest path given as input """ # Sanity checkpoint src = re.sub('[^\w/\-\.\*]', '', src) dst = re.sub('[^\w/\-\.\*]', '', dst) if len(re.sub('[\W]', '', src)) < 5 or len(re.sub('[\W]', '', dst)) < 5: debug.log("Error: Moving file failed. Provided paths are invalid! src='%s' dst='%s'"%(src, dst)) else: # Check destination check = False if dst[-1] == '/': if os.path.exists(dst): check = True # Valid Dir else: debug.log("Error: Moving file failed. Destination directory does not exist (%s)"%(dst)) #DEBUG elif os.path.exists(dst): if os.path.isdir(dst): check = True # Valid Dir dst += '/' # Add missing slash else: debug.log("Error: Moving file failed. %s exists!"%dst) elif os.path.exists(os.path.dirname(dst)): check = True # Valid file path else: debug.log("Error: Moving file failed. %s is an invalid distination!"%dst) if check: # Check source files = glob.glob(src) if len(files) != 0: debug.log("Moving File(s)...", "Move from %s"%src, "to %s"%dst) for file_ in files: # Check if file contains invalid symbols: invalid_chars = re.findall('[^\w/\-\.\*]', os.path.basename(file_)) if invalid_chars: debug.graceful_exit(("Error: File %s contains invalid " "characters %s!" )%(os.path.basename(file_), invalid_chars)) continue # Check file exists if os.path.isfile(file_): debug.log("Moving file: %s"%file_) shutil.move(file_, dst) else: debug.log("Error: Moving file failed. %s is not a regular file!"%file_) else: debug.log("Error: Moving file failed. No files were found! 
(%s)"%src) def copy_file(src, dst, ignore=None): """ this function will simply copy the file from the source path to the dest path given as input """ # Sanity checkpoint src = re.sub('[^\w/\-\.\*]', '', src) dst = re.sub('[^\w/\-\.\*]', '', dst) if len(re.sub('[\W]', '', src)) < 5 or len(re.sub('[\W]', '', dst)) < 5: debug.log("Error: Copying file failed. Provided paths are invalid! src='%s' dst='%s'"%(src, dst)) else: # Check destination check = False if dst[-1] == '/': if os.path.exists(dst): check = True # Valid Dir else: debug.log("Error: Copying file failed. Destination directory does not exist (%s)"%(dst)) #DEBUG elif os.path.exists(dst): if os.path.isdir(dst): check = True # Valid Dir dst += '/' # Add missing slash else: debug.log("Error: Copying file failed. %s exists!"%dst) elif os.path.exists(os.path.dirname(dst)): check = True # Valid file path else: debug.log("Error: Copying file failed. %s is an invalid distination!"%dst) if check: # Check source files = glob.glob(src) if ignore is not None: files = [fil for fil in files if not ignore in fil] if len(files) != 0: debug.log("Copying File(s)...", "Copy from %s"%src, "to %s"%dst) #DEBUG for file_ in files: # Check file exists if os.path.isfile(file_): debug.log("Copying file: %s"%file_) #DEBUG shutil.copy(file_, dst) else: debug.log("Error: Copying file failed. %s is not a regular file!"%file_) #DEBUG else: debug.log("Error: Copying file failed. No files were found! (%s)"%src) #DEBUG def copy_dir(src, dst): """ this function will simply copy the file from the source path to the dest path given as input """ try: debug.log("copy dir from "+ src, "to "+ dst) shutil.copytree(src, dst) except Exception as e: debug.log("Error: happened while copying!\n%s\n"%e) # Initiate Shared Global Objects debug = Debug() cgecore-1.5.2/cgecore.egg-info/0000755000077000000240000000000013577151366016326 5ustar rolfstaff00000000000000cgecore-1.5.2/cgecore.egg-info/PKG-INFO0000644000077000000240000000227413577151366017430 0ustar rolfstaff00000000000000Metadata-Version: 1.0 Name: cgecore Version: 1.5.2 Summary: Center for Genomic Epidemiology Core Module Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module Author: Center for Genomic Epidemiology Author-email: cgehelp@cbs.dtu.dk License: Apache License, Version 2.0 Description: # cge_core_module Core module for the Center for Genomic Epidemiology This module contains classes and functions needed to run the service wrappers and pipeline scripts The pypi project can be found here: https://pypi.org/project/cgecore/ # How to update: 1. Make changes to the modules 2. Bump the version number accordingly in cgecore/__init__.py 3. Install package locally 4. Test the changes locally (for both python2 and python3) 5. 
Distribute to Pypi # Install package locally python2 setup.py install python3 setup.py install # Distribute to PyPi python3 setup.py sdist bdist_wheel twine upload dist/* *deprecated:* ~~python setup.py sdist upload -r pypi~~ Platform: UNKNOWN cgecore-1.5.2/cgecore.egg-info/SOURCES.txt0000644000077000000240000000101213577151366020204 0ustar rolfstaff00000000000000README.md setup.py cgecore/__init__.py cgecore/alignment.py cgecore/argumentparsing.py cgecore/cgefinder.py cgecore/cmdline.py cgecore/utility.py cgecore.egg-info/PKG-INFO cgecore.egg-info/SOURCES.txt cgecore.egg-info/dependency_links.txt cgecore.egg-info/top_level.txt cgecore/blaster/__init__.py cgecore/blaster/blaster.py cgecore/blaster/run_blaster.py cgecore/organisminfo/__init__.py cgecore/organisminfo/gram_neg.txt cgecore/organisminfo/gram_pos.txt cgecore/organisminfo/gramstain.py cgecore/organisminfo/species.pycgecore-1.5.2/cgecore.egg-info/dependency_links.txt0000644000077000000240000000000113577151366022374 0ustar rolfstaff00000000000000 cgecore-1.5.2/cgecore.egg-info/top_level.txt0000644000077000000240000000001013577151366021047 0ustar rolfstaff00000000000000cgecore cgecore-1.5.2/setup.cfg0000644000077000000240000000004613577151366015046 0ustar rolfstaff00000000000000[egg_info] tag_build = tag_date = 0 cgecore-1.5.2/setup.py0000644000077000000240000000137413440471550014731 0ustar rolfstaff00000000000000#!/usr/bin/env python from setuptools import setup, find_packages with open("README.md", 'r') as f: long_description = f.read() with open("cgecore/__init__.py", 'r') as f: for l in f: if l.startswith('__version__'): version = l.split('=')[1].strip().strip('"') setup( name='cgecore', version=version, description='Center for Genomic Epidemiology Core Module', long_description=long_description, license="Apache License, Version 2.0", author='Center for Genomic Epidemiology', author_email='cgehelp@cbs.dtu.dk', url="https://bitbucket.org/genomicepidemiology/cge_core_module", packages=['cgecore', 'cgecore.blaster', 'cgecore.organisminfo'], package_data={'cgecore.organisminfo': ['*.txt']}, )