pax_global_header00006660000000000000000000000064140107355260014515gustar00rootroot0000000000000052 comment=b2b47b0ea3d159f9c3acfd790b6192f6274e178c CAT-5.2.3/000077500000000000000000000000001401073552600121335ustar00rootroot00000000000000CAT-5.2.3/CAT_pack/000077500000000000000000000000001401073552600135405ustar00rootroot00000000000000CAT-5.2.3/CAT_pack/CAT000077500000000000000000000042461401073552600141030ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import about import add_names import bins import contigs import prepare import single_bin import summarise def usage(): message = ( 'usage: CAT (prepare | contigs | bin | bins | add_names | ' 'summarise) [-v / --version] [-h / --help]\n' 'CAT: error: one of the arguments prepare contigs bin bins ' 'add_names summarise is required') sys.stdout.write('{0}\n'.format(message)) return def version(): message = ('CAT v{0} ({1}) by {2}.'.format( about.__version__, about.__date__, about.__author__)) sys.stdout.write('{0}\n'.format(message)) return def help(): message = ( 'usage: CAT (prepare | contigs | bin | bins | add_names | ' 'summarise) [-v / --version] [-h / --help]\n\n' 'Run Contig Annotation Tool (CAT) or ' 'Bin Annotation Tool (BAT).\n\n' 'Required choice:\n' ' prepare\t\tConstruct database files.\n' ' contigs\t\tRun CAT.\n' ' bin\t\t\tRun BAT on a single bin.\n' ' bins\t\t\tRun BAT on a set of bins.\n' ' add_names\t\tAdd taxonomic names to CAT or BAT output files.\n' ' summarise\t\tSummarise a named CAT or BAT classification file.' 
'\n\n' 'Optional arguments:\n' ' -v, --version\t\tPrint version information and exit.\n' ' -h, --help\t\tShow this help message and exit.') sys.stdout.write('{0}\n'.format(message)) return def main(): if len(sys.argv) == 1: usage() elif sys.argv[1] == 'prepare': prepare.run() elif sys.argv[1] == 'contigs': contigs.run() elif sys.argv[1] == 'bin': single_bin.run() elif sys.argv[1] == 'bins': bins.run() elif sys.argv[1] == 'add_names': add_names.run() elif sys.argv[1] == 'summarise': summarise.run() elif sys.argv[1] == '-v' or sys.argv[1] == '--version': version() elif sys.argv[1] == '-h' or sys.argv[1] == '--help': help() else: usage() return if __name__ == '__main__': main() CAT-5.2.3/CAT_pack/about.py000066400000000000000000000001731401073552600152250ustar00rootroot00000000000000#!/usr/bin/env python3 __author__ = 'F. A. Bastiaan von Meijenfeldt' __version__ = '5.2.3' __date__ = '10 February, 2021' CAT-5.2.3/CAT_pack/add_names.py000066400000000000000000000123071401073552600160300ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import sys import about import check import shared import tax def parse_arguments(): parser = argparse.ArgumentParser( prog='CAT add_names', description='Add taxonomic names to CAT or BAT output files.', usage='CAT add_names -i -o -t [options] [-h / --help]', add_help=False) required = parser.add_argument_group('Required arguments') shared.add_argument(required, 'input_file', True, help_=('Path to input file. 
Can be classification or ORF2LCA ' 'output file from CAT or BAT.')) shared.add_argument(required, 'output_file', True) shared.add_argument(required, 'taxonomy_folder', True) optional = parser.add_argument_group('Optional arguments') shared.add_argument(optional, 'only_official', False) shared.add_argument(optional, 'exclude_scores', False) shared.add_argument(optional, 'force', False) shared.add_argument(optional, 'quiet', False) shared.add_argument(optional, 'help', False) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, 'add_names')] if len(extra_args) > 0: sys.exit('error: too much arguments supplied:\n{0}'.format( '\n'.join(extra_args))) # Add extra arguments. shared.expand_arguments(args) return args def run(): args = parse_arguments() message = '# CAT v{0}.'.format(about.__version__) shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) errors = [] errors.append( check.check_input_file(args.input_file, args.log_file, args.quiet)) if not args.force: errors.append( check.check_output_file( args.output_file, args.log_file, args.quiet)) errors.append( check.check_in_and_output_file( args.input_file, args.output_file, args.log_file, args.quiet)) if True in errors: sys.exit(1) (taxid2parent, taxid2rank) = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) taxid2name = tax.import_names(args.names_dmp, args.log_file, args.quiet) message = 'Appending names...' shared.give_user_feedback(message, args.log_file, args.quiet) with open(args.input_file, 'r') as f1: for line in f1: if line.startswith('#'): line = line.rstrip().split('\t') if 'lineage' in line: lineage_index = line.index('lineage') else: message = ('{0} is not a supported classification file.' 
''.format(input_file)) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) try: scores_index = line.index('lineage scores') except: scores_index = None full_length = len(line) break else: message = ('{0} is not a supported classification file.'.format( args.input_file)) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) with open(args.input_file, 'r') as f1, open(args.output_file, 'w') as outf1: for line in f1: line = line.rstrip() if line.startswith('#'): if args.only_official: outf1.write('{0}\tsuperkingdom\tphylum\tclass\torder\t' 'family\tgenus\tspecies\n'.format(line)) else: outf1.write('{0}\tfull lineage names\n'.format(line)) continue line = line.split('\t') if len(line) != full_length: # Entry does not have a full annotation. outf1.write('{0}\n'.format('\t'.join(line))) continue if any([c.startswith('no taxid found') for c in line[2:4]]): # ORF has database hits but the accession number is not found # in the taxonomy files. 
outf1.write('{0}\n'.format('\t'.join(line))) continue lineage = line[lineage_index].split(';') if scores_index is not None and not args.exclude_scores: scores = line[scores_index].split(';') else: scores = None if args.only_official: names = tax.convert_to_official_names( lineage, taxid2rank, taxid2name, scores) else: names = tax.convert_to_names( lineage, taxid2rank, taxid2name, scores) outf1.write('{0}\t{1}\n'.format('\t'.join(line), '\t'.join(names))) message = 'Names written to {0}!'.format(args.output_file) shared.give_user_feedback(message, args.log_file, args.quiet) return if __name__ == '__main__': sys.exit('Run \'CAT add_names\' to add taxonomic names to CAT or BAT ' 'output files.') CAT-5.2.3/CAT_pack/bins.py000066400000000000000000000454671401073552600150650ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import decimal import multiprocessing import os import sys import about import check import shared import tax def parse_arguments(): parser = argparse.ArgumentParser( prog='CAT bins', description='Run Bin Annotation Tool (BAT) on a set of bins.', usage='CAT bins -b -d -t [options] [-h / --help]', add_help=False) required = parser.add_argument_group('Required arguments') shared.add_argument(required, 'bin_folder', True) shared.add_argument(required, 'database_folder', True) shared.add_argument(required, 'taxonomy_folder', True) optional = parser.add_argument_group('Optional arguments') shared.add_argument(optional, 'bin_suffix', False, default='.fna') shared.add_argument(optional, 'r', False, default=decimal.Decimal(5)) shared.add_argument(optional, 'f', False, default=decimal.Decimal(0.3)) shared.add_argument(optional, 'out_prefix', False, default='./out.BAT') shared.add_argument(optional, 'proteins_fasta', False, help_=( 'Path to concatenated predicted proteins fasta file ' 'generated during an earlier run of BAT on the same bins. 
If ' 'supplied, BAT will skip the protein prediction step.')) shared.add_argument(optional, 'alignment_file', False, help_=( 'Path to alignment table generated during an earlier run of ' 'BAT on the same bins. If supplied, BAT will skip the ' 'alignment step and directly classify the bins. A ' 'concatenated predicted proteins fasta file should also be ' 'supplied with argument [-p / --proteins].')) shared.add_argument(optional, 'path_to_prodigal', False, default='prodigal') shared.add_argument(optional, 'path_to_diamond', False, default='diamond') shared.add_argument(optional, 'no_stars', False) shared.add_argument(optional, 'force', False) shared.add_argument(optional, 'quiet', False) shared.add_argument(optional, 'verbose', False) shared.add_argument(optional, 'no_log', False) shared.add_argument(optional, 'help', False) shared.add_argument(optional, 'IkwId', False) specific = parser.add_argument_group('DIAMOND specific optional arguments') shared.add_all_diamond_arguments(specific) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, 'bins')] if len(extra_args) > 0: sys.exit('error: too much arguments supplied:\n{0}'.format( '\n'.join(extra_args))) # Check experimental features. if not args.IkwId: if args.top < 50: sys.exit('error: --top can only be set lower than 50 in ' 'combination with the --I_know_what_Im_doing flag. See ' 'README.md as to why this is the case.') # Add extra arguments. shared.expand_arguments(args) return args def import_bins(bin_folder, bin_suffix, log_file, quiet): message = 'Importing bins from {0}.'.format(bin_folder) shared.give_user_feedback(message, log_file, quiet) bin2contigs = {} contig_names = set() for file_ in os.listdir(bin_folder): if file_.startswith('.'): # Skip hidden files. continue if not file_.endswith(bin_suffix): continue if '.concatenated.' in file_: # Skip concatenated contig fasta and predicted protein fasta files # from earlier runs. 
continue # Keep the suffix in the bin name. bin_ = file_ bin2contigs[bin_] = [] with open('{0}{1}'.format(bin_folder, file_), 'r') as f1: for line in f1: if line.startswith('>'): contig = line.split(' ')[0].rstrip().lstrip('>') # Add bin name in front of the contig name. new_contig_name = '{0}_{1}'.format(bin_, contig) if new_contig_name in contig_names: message = ( 'BAT has encountered {0} twice in bin {1}. ' 'Each fasta header should be unique in each ' 'bin.'.format(contig, bin_)) shared.give_user_feedback( message, log_file, quiet, error=True) sys.exit(1) contig_names.add(new_contig_name) bin2contigs[bin_].append(new_contig_name) if len(bin2contigs) == 1: message = '1 bin found!' else: message = '{0:,d} bins found!'.format(len(bin2contigs)) shared.give_user_feedback(message, log_file, quiet) return (bin2contigs, contig_names) def make_concatenated_fasta( concatenated_fasta, bin2contigs, bin_folder, log_file, quiet): message = 'Writing {0}.'.format(concatenated_fasta) shared.give_user_feedback(message, log_file, quiet) with open(concatenated_fasta, 'w') as outf1: for bin_ in sorted(bin2contigs): with open('{0}{1}'.format(bin_folder, bin_), 'r') as f1: for line in f1: if line.startswith('>'): contig = line.split(' ')[0].rstrip().lstrip('>') # add bin name in front of the contig name. outf1.write('>{0}_{1}\n'.format(bin_, contig)) else: outf1.write(line) return def run(): args = parse_arguments() message = '# CAT v{0}.'.format(about.__version__) shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) # Check at which state to start. step_list = [] if not args.proteins_fasta and not args.alignment_file: message = ( '\n' 'BAT is running. Protein prediction, alignment, and bin ' 'classification are carried out.') shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) step_list.append('predict_proteins') step_list.append('align') elif args.proteins_fasta and not args.alignment_file: message = ( '\n' 'BAT is running. 
Since a predicted protein fasta is supplied, ' 'only alignment and bin classification are carried out.') shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) step_list.append('align') elif args.proteins_fasta and args.alignment_file: message = ( '\n' 'BAT is running. Since a predicted protein fasta and ' 'alignment file are supplied, only bin classification is ' 'carried out.') shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) elif not args.proteins_fasta and args.alignment_file: message = ( 'if you want BAT to directly classify a set of bins, you ' 'should not only supply a DIAMOND alignment table but also a ' 'concatenated predicted protein fasta file with argument ' '[-p / --proteins].') shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) step_list.append('classify') # Print variables. message = ( 'Rarw!\n\n' 'Supplied command: {0}\n\n' 'Bin folder: {1}\n' 'Taxonomy folder: {2}\n' 'Database folder: {3}\n' 'Parameter r: {4}\n' 'Parameter f: {5}\n' 'Log file: {6}\n\n' '-----------------\n'.format( ' '.join(sys.argv), args.bin_folder, args.taxonomy_folder, args.database_folder, int(args.r), float(args.f), args.log_file)) shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) # Check binaries, output files, taxonomy folder and database folder, and # set variables. message = 'Doing some pre-flight checks first.' 
shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) errors = [] errors.append( check.check_bin_folder( args.bin_folder, args.bin_suffix, args.log_file, args.quiet)) errors.append( check.check_out_prefix(args.out_prefix, args.log_file, args.quiet)) if 'predict_proteins' in step_list: errors.append( check.check_prodigal_binaries( args.path_to_prodigal, args.log_file, args.quiet)) setattr(args, 'concatenated_fasta', '{0}.concatenated.fasta'.format(args.out_prefix)) setattr(args, 'proteins_fasta', '{0}.concatenated.predicted_proteins.faa'.format( args.out_prefix)) setattr(args, 'proteins_gff', '{0}.concatenated.predicted_proteins.gff'.format( args.out_prefix)) if not args.force: errors.append( check.check_output_file( args.concatenated_fasta, args.log_file, args.quiet)) errors.append( check.check_output_file( args.proteins_fasta, args.log_file, args.quiet)) errors.append( check.check_output_file( args.proteins_gff, args.log_file, args.quiet)) if 'align' in step_list: errors.append( check.check_diamond_binaries( args.path_to_diamond, args.log_file, args.quiet)) setattr(args, 'alignment_file', '{0}.concatenated.alignment.diamond'.format(args.out_prefix)) if not args.force: errors.append( check.check_output_file( args.alignment_file, args.log_file, args.quiet)) errors.append( check.check_folders_for_run( args.taxonomy_folder, args.nodes_dmp, args.names_dmp, args.database_folder, args.diamond_database, args.fastaid2LCAtaxid_file, args.taxids_with_multiple_offspring_file, step_list, args.log_file, args.quiet)) setattr(args, 'bin2classification_output_file', '{0}.bin2classification.txt'.format(args.out_prefix)) setattr(args, 'ORF2LCA_output_file', '{0}.ORF2LCA.txt'.format(args.out_prefix)) if not args.force: errors.append( check.check_output_file( args.bin2classification_output_file, args.log_file, args.quiet)) errors.append( check.check_output_file( args.ORF2LCA_output_file, args.log_file, args.quiet)) if 'predict_proteins' not in step_list: 
errors.append( check.check_fasta( args.proteins_fasta, args.log_file, args.quiet)) if 'align' in step_list: errors.append( check.check_top(args.top, args.r, args.log_file, args.quiet)) # Print all variables. shared.print_variables(args, step_list) if True in errors: sys.exit(1) message = 'Ready to fly!\n\n-----------------\n' shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) # Start BAT. (bin2contigs, contig_names) = import_bins( args.bin_folder, args.bin_suffix, args.log_file, args.quiet) if 'predict_proteins' in step_list: make_concatenated_fasta( args.concatenated_fasta, bin2contigs, args.bin_folder, args.log_file, args.quiet) shared.run_prodigal( args.path_to_prodigal, args.concatenated_fasta, args.proteins_fasta, args.proteins_gff, args.log_file, args.quiet) contig2ORFs = shared.import_ORFs( args.proteins_fasta, args.log_file, args.quiet) check.check_whether_ORFs_are_based_on_contigs( contig_names, contig2ORFs, args.log_file, args.quiet) if 'align' in step_list: shared.run_diamond(args) (ORF2hits, all_hits) = shared.parse_tabular_alignment( args.alignment_file, args.one_minus_r, args.log_file, args.quiet) (taxid2parent, taxid2rank) = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid( args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet) taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring( args.taxids_with_multiple_offspring_file, args.log_file, args.quiet) message = 'BAT is flying! 
Files {0} and {1} are created.'.format( args.bin2classification_output_file, args.ORF2LCA_output_file) shared.give_user_feedback(message, args.log_file, args.quiet) n_classified_bins = 0 with open(args.bin2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2: outf1.write('# bin\tclassification\treason\tlineage\tlineage scores\n') outf2.write('# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n') for bin_ in sorted(bin2contigs): LCAs_ORFs = [] for contig in sorted(bin2contigs[bin_]): if contig not in contig2ORFs: continue for ORF in contig2ORFs[contig]: if ORF not in ORF2hits: outf2.write('{0}\t{1}\tORF has no hit to database\n' ''.format(ORF, bin_)) continue n_hits = len(ORF2hits[ORF]) (taxid, top_bitscore) = tax.find_LCA_for_ORF( ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent) if taxid.startswith('no taxid found'): outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format( ORF, bin_, n_hits, taxid, top_bitscore)) else: lineage = tax.find_lineage(taxid, taxid2parent) if not args.no_stars: lineage = tax.star_lineage( lineage, taxids_with_multiple_offspring) outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format( ORF, bin_, n_hits, ';'.join(lineage[::-1]), top_bitscore)) LCAs_ORFs.append((taxid, top_bitscore),) if len(LCAs_ORFs) == 0: outf1.write('{0}\tno taxid assigned\tno hits to database\n' ''.format(bin_)) continue (lineages, lineages_scores, based_on_n_ORFs) = tax.find_weighted_LCA( LCAs_ORFs, taxid2parent, args.f) if lineages == 'no ORFs with taxids found.': outf1.write('{0}\tno taxid assigned\t' 'hits not found in taxonomy files\n'.format(bin_)) continue if lineages == 'no lineage whitelisted.': outf1.write( '{0}\tno taxid assigned\t' 'no lineage reached minimum bit-score support\n' ''.format(bin_)) continue # The bin has a valid classification. 
n_classified_bins += 1 total_n_ORFs = sum([len(contig2ORFs[contig]) for contig in bin2contigs[bin_] if contig in contig2ORFs]) for (i, lineage) in enumerate(lineages): if not args.no_stars: lineage = tax.star_lineage( lineage, taxids_with_multiple_offspring) scores = ['{0:.2f}'.format(score) for score in lineages_scores[i]] if len(lineages) == 1: # There is only one classification. outf1.write( '{0}\t' 'taxid assigned\t' 'based on {1}/{2} ORFs\t' '{3}\t' '{4}\n'.format( bin_, based_on_n_ORFs, total_n_ORFs, ';'.join(lineage[::-1]), ';'.join(scores[::-1]))) else: # There are multiple classifications. outf1.write( '{0}\t' 'taxid assigned ({1}/{2})\t' 'based on {3}/{4} ORFs\t' '{5}\t' '{6}\n'.format( bin_, i + 1, len(lineages), based_on_n_ORFs, total_n_ORFs, ';'.join(lineage[::-1]), ';'.join(scores[::-1]))) message = ('\n-----------------\n\n' '{0} BAT is done! {1:,d}/{2:,d} bins have taxonomy assigned.' ''.format(shared.timestamp(), n_classified_bins, len(bin2contigs))) shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) if args.f < 0.5: message = ('\nWARNING: since f is set to smaller than 0.5, one bin ' 'may have multiple classifications.') shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) return if __name__ == '__main__': sys.exit('Run \'CAT bins\' to run Bin Annotation Tool (BAT) on a ' 'set of bins.') CAT-5.2.3/CAT_pack/check.py000066400000000000000000000242301401073552600151700ustar00rootroot00000000000000#!/usr/bin/env/ python3 import hashlib import os import subprocess import sys import shared def check_md5_gz(gz_file, md5_file, log_file, quiet): message = 'Checking file integrity via MD5 checksum.' shared.give_user_feedback(message, log_file, quiet) with open(md5_file, 'r') as f: md5_exp = f.read().split(' ')[0] if md5_exp == '': message = ('WARNING: no MD5 found in {0}. 
Integrity of {1} can not be ' 'established.'.format(md5_file, gz_file)) shared.give_user_feedback(message, log_file, quiet) else: md5 = hashlib.md5() block_size = 4096 with open(gz_file, 'rb') as f: for chunk in iter(lambda: f.read(block_size), b''): md5.update(chunk) md5 = md5.hexdigest() if md5 != md5_exp: message = 'MD5 of {0} does not check out.'.format(gz_file) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) else: message = 'MD5 of {0} checks out.'.format(gz_file) shared.give_user_feedback(message, log_file, quiet) return def check_memory(Gb): total_memory = None error = False if sys.platform == 'linux' or sys.platform == 'linux2': # It's a Linux! meminfo_file = '/proc/meminfo' with open(meminfo_file, 'r') as f: for line in f: if line.startswith('MemTotal:'): mem = int(line.split(' ')[-2]) # Mem is given in Kb, convert to Gb. total_memory = mem / 2 ** 20 elif sys.platform == 'darwin': # It's a Mac! meminfo = subprocess.check_output(['sysctl', 'hw.memsize']) mem = int(meminfo.decode('utf-8').rstrip().split(' ')[-1]) # Mem is given in b, convert to Gb. 
total_memory = mem / 2 ** 30 if total_memory < Gb: error = True return ('{0:.1f}'.format(total_memory), error) def check_out_prefix(out_prefix, log_file, quiet): error = False if os.path.isdir(out_prefix): message = 'prefix for output files ({0}) is a directory.'.format( out_prefix) shared.give_user_feedback(message, log_file, quiet, error=True) error = True dir_ = out_prefix.rsplit('/', 1)[0] if not os.path.isdir(dir_): message = ('can not find output directory {0} to which output files ' 'should be written.'.format(dir_)) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_prodigal_binaries(path_to_prodigal, log_file, quiet): error = False try: p = subprocess.Popen([path_to_prodigal, '-v'], stderr=subprocess.PIPE) c = p.communicate() output = c[1].decode().rstrip().lstrip() message = 'Prodigal found: {0}.'.format(output) shared.give_user_feedback(message, log_file, quiet) except OSError: message = ('can not find Prodigal. Please check whether it is ' 'installed or the path to the binaries is provided.') shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_diamond_binaries(path_to_diamond, log_file, quiet): error = False try: p = subprocess.Popen([path_to_diamond, '--version'], stdout=subprocess.PIPE) c = p.communicate() output = c[0].decode().rstrip() message = 'DIAMOND found: {0}.'.format(output) shared.give_user_feedback(message, log_file, quiet) except OSError: message = ('can not find DIAMOND. Please check whether it is ' 'installed or the path to the binaries is provided.') shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_bin_folder(bin_folder, bin_suffix, log_file, quiet): error = False if not os.path.isdir(bin_folder): message = 'can not find the bin folder.' 
shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error tmp = [] for file_ in os.listdir(bin_folder): if file_.startswith('.'): # Skip hidden files. continue if not file_.endswith(bin_suffix): continue if '.concatenated.' in file_: # Skip concatenated contig fasta and predicted protein fasta files # from earlier runs. continue tmp.append(file_) if len(tmp) == 0: message = ( 'no bins found with suffix {0} in bin folder. You can set the ' 'suffix with the [-s / --bin_suffix] argument.'.format( bin_suffix)) shared.give_user_feedback(message, log_file, quiet, error=True) error = True elif len(tmp) == 1: message = ( 'WARNING: a single bin is found. You can run BAT in single ' 'bin mode, with \'CAT bin\' as opposed to \'CAT bins\' for a ' 'set of bins. Both modes will give the same results, but you ' 'might find single mode more convenient for your workflow.') shared.give_user_feedback(message, log_file, quiet) return error def check_bin_fasta(bin_fasta, log_file, quiet): error = False if check_fasta(bin_fasta, log_file, quiet): error = True if os.path.isdir(bin_fasta): message = ( '{0} is a directory. If you want to classify more than 1 bin ' 'you can run \'CAT bins\' instead of \'CAT bin\'.'.format( bin_fasta)) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_folders_for_run( taxonomy_folder, nodes_dmp, names_dmp, database_folder, diamond_database, fastaid2LCAtaxid_file, taxids_with_multiple_offspring_file, step_list, log_file, quiet): error = False if not os.path.isdir(taxonomy_folder): message = 'can not find the taxonomy folder.' shared.give_user_feedback(message, log_file, quiet, error=True) error = True else: if not nodes_dmp or not names_dmp: message = ('nodes.dmp and / or names.dmp not found in the ' 'taxonomy folder.') shared.give_user_feedback(message, log_file, quiet, error=True) error = True if not os.path.isdir(database_folder): message = 'can not find the database folder.' 
shared.give_user_feedback(message, log_file, quiet, error=True) error = True else: if not diamond_database and 'align' in step_list: message = 'DIAMOND database not found in database folder.' shared.give_user_feedback(message, log_file, quiet, error=True) error = True if not fastaid2LCAtaxid_file: message = 'file fastaid2LCAtaxid is not found in database folder.' shared.give_user_feedback(message, log_file, quiet, error=True) error = True if not taxids_with_multiple_offspring_file: message = ('file taxids_with_multiple_offspring not found in ' 'database folder.') shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_output_file(output_file, log_file, quiet): error = False if os.path.isfile(output_file): message = ( 'output file {0} already exists. You can choose to overwrite ' 'existing files with the [--force] argument.'.format( output_file)) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_input_file(input_file, log_file, quiet): error = False if not os.path.isfile(input_file): message = 'input file {0} does not exist.'.format(input_file) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_in_and_output_file(input_file, output_file, log_file, quiet): error = False if input_file == output_file: message = 'input file and output file can not be the same.' shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_top(top, r, log_file, quiet): error = False if top < 50: message = ( 'WARNING: [--top] is set lower than 50. This might conflict ' 'with future runs with higher settings of the ' '[-r / --range] parameter, see README.md.') shared.give_user_feedback(message, log_file, quiet) if top <= r: message = '[--top] should be higher than [-r / --range].' 
shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_fasta(file_, log_file, quiet): error = False if not os.path.isfile(file_): error = True else: with open(file_, 'r') as f1: for n, line in enumerate(f1): if n == 0: if not line.startswith('>'): error = True break if error: message = '{0} is not a fasta file.'.format(file_) shared.give_user_feedback(message, log_file, quiet, error=True) return error def check_whether_ORFs_are_based_on_contigs( contig_names, contig2ORFs, log_file, quiet): for contig in contig2ORFs: if contig not in contig_names: message = ( 'found a protein in the predicted proteins fasta file ' 'that can not be traced back to one of the contigs in the ' 'contigs fasta file: {0}. Proteins should be named ' 'contig_name_#.'.format(contig2ORFs[contig][0])) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) return if __name__ == '__main__': sys.exit('Run \'CAT\' to run CAT or BAT.') CAT-5.2.3/CAT_pack/contigs.py000066400000000000000000000346751401073552600155770ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import decimal import sys import about import check import shared import tax def parse_arguments(): parser = argparse.ArgumentParser( prog='CAT contigs', description='Run Contig Annotation Tool (CAT).', usage='CAT contigs -c -d -t [options] [-h / --help]', add_help=False) required = parser.add_argument_group('Required arguments') shared.add_argument(required, 'contigs_fasta', True) shared.add_argument(required, 'database_folder', True) shared.add_argument(required, 'taxonomy_folder', True) optional = parser.add_argument_group('Optional arguments') shared.add_argument(optional, 'r', False, default=decimal.Decimal(10)) shared.add_argument(optional, 'f', False, default=decimal.Decimal(0.5)) shared.add_argument(optional, 'out_prefix', False, default='./out.CAT') shared.add_argument(optional, 'proteins_fasta', False) shared.add_argument(optional, 
def run():
    """Run the CAT contig classification workflow from the command line.

    The function parses the arguments, decides which steps have to be
    carried out ('predict_proteins', 'align', 'classify'), performs the
    pre-flight checks, optionally runs Prodigal and DIAMOND via the shared
    helpers, and finally writes two tab-separated files:
    '<out_prefix>.contig2classification.txt' and '<out_prefix>.ORF2LCA.txt'.

    The process exits with status 1 if an alignment file is supplied without
    a predicted proteins fasta, or if True is present among the collected
    pre-flight check results.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of the file -- confirm against the upstream
    source before relying on the exact indentation.
    """
    args = parse_arguments()

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Check at which state to start.
    # The four branches cover every combination of "proteins fasta supplied"
    # and "alignment file supplied"; only the last combination is an error.
    step_list = []
    if not args.proteins_fasta and not args.alignment_file:
        message = (
                '\n'
                'CAT is running. Protein prediction, alignment, and contig '
                'classification are carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

        step_list.append('predict_proteins')
        step_list.append('align')
    elif args.proteins_fasta and not args.alignment_file:
        message = (
                '\n'
                'CAT is running. Since a predicted protein fasta is supplied, '
                'only alignment and contig classification are carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

        step_list.append('align')
    elif args.proteins_fasta and args.alignment_file:
        message = (
                '\n'
                'CAT is running. Since a predicted protein fasta and '
                'alignment file are supplied, only contig classification is '
                'carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)
    elif not args.proteins_fasta and args.alignment_file:
        message = (
                'if you want CAT to directly do the classification, you '
                'should not only supply an alignment table but also a '
                'predicted protein fasta file with argument '
                '[-p / --proteins].')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                error=True)

        sys.exit(1)

    # Classification is always the final step.
    step_list.append('classify')

    # Print variables.
    message = (
            'Rarw!\n\n'
            'Supplied command: {0}\n\n'
            'Contigs fasta: {1}\n'
            'Taxonomy folder: {2}\n'
            'Database folder: {3}\n'
            'Parameter r: {4}\n'
            'Parameter f: {5}\n'
            'Log file: {6}\n\n'
            '-----------------\n'.format(
                ' '.join(sys.argv),
                args.contigs_fasta,
                args.taxonomy_folder,
                args.database_folder,
                int(args.r),
                float(args.f),
                args.log_file))
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set variables.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Every check result is collected first so that all problems can be
    # reported before the run is aborted further down.
    errors = []

    errors.append(
            check.check_out_prefix(args.out_prefix, args.log_file, args.quiet))

    if 'predict_proteins' in step_list:
        errors.append(
                check.check_prodigal_binaries(
                    args.path_to_prodigal, args.log_file, args.quiet))

        # Prodigal output paths are derived from the output prefix.
        setattr(args,
                'proteins_fasta',
                '{0}.predicted_proteins.faa'.format(args.out_prefix))
        setattr(args,
                'proteins_gff',
                '{0}.predicted_proteins.gff'.format(args.out_prefix))

        # The output file checks are skipped when --force is given.
        if not args.force:
            errors.append(
                    check.check_output_file(
                        args.proteins_fasta, args.log_file, args.quiet))
            errors.append(
                    check.check_output_file(
                        args.proteins_gff, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
                check.check_diamond_binaries(
                    args.path_to_diamond, args.log_file, args.quiet))

        setattr(args,
                'alignment_file',
                '{0}.alignment.diamond'.format(args.out_prefix))

        if not args.force:
            errors.append(
                    check.check_output_file(
                        args.alignment_file, args.log_file, args.quiet))

    errors.append(
            check.check_folders_for_run(
                args.taxonomy_folder,
                args.nodes_dmp,
                args.names_dmp,
                args.database_folder,
                args.diamond_database,
                args.fastaid2LCAtaxid_file,
                args.taxids_with_multiple_offspring_file,
                step_list,
                args.log_file,
                args.quiet))

    setattr(args,
            'contig2classification_output_file',
            '{0}.contig2classification.txt'.format(args.out_prefix))
    setattr(args,
            'ORF2LCA_output_file',
            '{0}.ORF2LCA.txt'.format(args.out_prefix))

    if not args.force:
        errors.append(
                check.check_output_file(
                    args.contig2classification_output_file,
                    args.log_file,
                    args.quiet))
        errors.append(
                check.check_output_file(
                    args.ORF2LCA_output_file, args.log_file, args.quiet))

    # A user-supplied proteins fasta is only validated when Prodigal is not
    # going to produce the file itself.
    if 'predict_proteins' not in step_list:
        errors.append(
                check.check_fasta(
                    args.proteins_fasta, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
                check.check_top(args.top, args.r, args.log_file, args.quiet))

    # Print all variables.
    shared.print_variables(args, step_list)

    # Abort only after all checks have run and reported.
    if True in errors:
        sys.exit(1)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Start CAT.
    contig_names = shared.import_contig_names(
            args.contigs_fasta, args.log_file, args.quiet)

    if 'predict_proteins' in step_list:
        shared.run_prodigal(
                args.path_to_prodigal,
                args.contigs_fasta,
                args.proteins_fasta,
                args.proteins_gff,
                args.log_file,
                args.quiet)

    contig2ORFs = shared.import_ORFs(
            args.proteins_fasta, args.log_file, args.quiet)

    check.check_whether_ORFs_are_based_on_contigs(
            contig_names, contig2ORFs, args.log_file, args.quiet)

    if 'align' in step_list:
        shared.run_diamond(args)

    (ORF2hits, all_hits) = shared.parse_tabular_alignment(
            args.alignment_file,
            args.one_minus_r,
            args.log_file,
            args.quiet)

    # taxid2rank is returned by the import but not used in this function.
    (taxid2parent, taxid2rank) = tax.import_nodes(
            args.nodes_dmp, args.log_file, args.quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(
            args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
            args.taxids_with_multiple_offspring_file,
            args.log_file,
            args.quiet)

    message = 'CAT is spinning! Files {0} and {1} are created.'.format(
            args.contig2classification_output_file, args.ORF2LCA_output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    n_classified_contigs = 0

    # outf1 receives one (or more) line per contig, outf2 one line per ORF.
    with open(args.contig2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
        outf1.write(
                '# contig\tclassification\treason\tlineage\tlineage scores\n')
        outf2.write('# ORF\tnumber of hits\tlineage\ttop bit-score\n')

        for contig in sorted(contig_names):
            if contig not in contig2ORFs:
                outf1.write('{0}\tno taxid assigned\tno ORFs found\n'.format(
                    contig))

                continue

            LCAs_ORFs = []

            for ORF in contig2ORFs[contig]:
                if ORF not in ORF2hits:
                    outf2.write('{0}\tORF has no hit to database\n'.format(
                        ORF))

                    continue

                n_hits = len(ORF2hits[ORF])

                (taxid, top_bitscore) = tax.find_LCA_for_ORF(
                        ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)

                # A taxid that starts with 'no taxid found' is written as-is
                # instead of being expanded into a lineage.
                if taxid.startswith('no taxid found'):
                    outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
                        ORF, n_hits, taxid, top_bitscore))
                else:
                    lineage = tax.find_lineage(taxid, taxid2parent)

                    if not args.no_stars:
                        lineage = tax.star_lineage(
                                lineage, taxids_with_multiple_offspring)

                    # The lineage is reversed before it is written.
                    outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
                        ORF, n_hits, ';'.join(lineage[::-1]), top_bitscore))

                # NOTE(review): placed after the if/else so that ORFs without
                # a taxid are still passed on, which is what makes the
                # 'no ORFs with taxids found.' result below reachable --
                # confirm against the upstream source.
                LCAs_ORFs.append((taxid, top_bitscore),)

            if len(LCAs_ORFs) == 0:
                outf1.write('{0}\tno taxid assigned\t'
                        'no hits to database\n'.format(contig))

                continue

            (lineages,
                    lineages_scores,
                    based_on_n_ORFs) = tax.find_weighted_LCA(
                            LCAs_ORFs, taxid2parent, args.f)

            # find_weighted_LCA signals failure through these two sentinel
            # strings in place of a list of lineages.
            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tno taxid assigned\t'
                        'hits not found in taxonomy files\n'.format(contig))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write(
                        '{0}\tno taxid assigned\t'
                        'no lineage reached minimum bit-score support\n'
                        ''.format(contig))

                continue

            # The contig has a valid classification.
            n_classified_contigs += 1

            for (i, lineage) in enumerate(lineages):
                if not args.no_stars:
                    lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)
                scores = ['{0:.2f}'.format(score) for
                        score in lineages_scores[i]]

                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write(
                            '{0}\t'
                            'taxid assigned\t'
                            'based on {1}/{2} ORFs\t'
                            '{3}\t'
                            '{4}\n'.format(
                                contig,
                                based_on_n_ORFs,
                                len(contig2ORFs[contig]),
                                ';'.join(lineage[::-1]),
                                ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    outf1.write(
                            '{0}\t'
                            'taxid assigned ({1}/{2})\t'
                            'based on {3}/{4} ORFs\t'
                            '{5}\t'
                            '{6}\n'.format(
                                contig,
                                i + 1,
                                len(lineages),
                                based_on_n_ORFs,
                                len(contig2ORFs[contig]),
                                ';'.join(lineage[::-1]),
                                ';'.join(scores[::-1])))

    message = ('\n-----------------\n\n'
            '{0} CAT is done! {1:,d}/{2:,d} contigs have taxonomy assigned.'
            ''.format(
                shared.timestamp(), n_classified_contigs, len(contig_names)))
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    if args.f < 0.5:
        message = ('\nWARNING: since f is set to smaller than 0.5, one contig '
                'may have multiple classifications.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

    return
def memory_bottleneck(args):
    """Stop the program when the machine has too little memory.

    check.check_memory is asked about args.min_mem and hands back a
    (total memory, error flag) pair. When the flag is set, an error message
    is logged through shared.give_user_feedback and the process exits with
    status 1; otherwise the function simply returns None.
    """
    found_memory, too_little = check.check_memory(args.min_mem)

    # Guard clause: nothing to report when enough memory is present.
    if not too_little:
        return

    template = (
        'at least {0}GB of memory is needed for the database '
        'construction. {1}GB is found on your system. You can try '
        'to find a machine with more memory, or download '
        'preconstructed database files from '
        'tbb.bio.uu.nl/bastiaan/CAT_prepare/.')
    shared.give_user_feedback(
        template.format(args.min_mem, found_memory),
        args.log_file,
        args.quiet,
        error=True)

    sys.exit(1)
def download_prot_accession2taxid_file(
        prot_accession2taxid_file, date, log_file, quiet):
    """Download prot.accession2taxid.FULL.gz plus its md5 file and verify it.

    Arguments:
        prot_accession2taxid_file: local path the mapping file is written
            to; the checksum file is written next to it with '.md5' added.
        date: not used by this function, kept for interface compatibility.
        log_file, quiet: passed through to shared.give_user_feedback and
            check.check_md5_gz.

    The process exits with status 1 when one of the downloads fails.
    """
    base_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/'
    message = 'Downloading mapping file from {0} to taxonomy folder.'.format(
            base_url)
    shared.give_user_feedback(message, log_file, quiet)

    url = '{0}prot.accession2taxid.FULL.gz'.format(base_url)
    md5_file = '{0}.md5'.format(prot_accession2taxid_file)

    # The data file and its checksum are fetched the same way, in this order.
    downloads = (
            (url, prot_accession2taxid_file),
            ('{0}.md5'.format(url), md5_file))
    for (source, destination) in downloads:
        try:
            urllib.request.urlretrieve(source, destination)
        except Exception:
            # Exception instead of a bare except: a KeyboardInterrupt or
            # SystemExit must not be reported as a failed download.
            message = 'download of {0} failed.'.format(source)
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    message = 'Download complete.'
    shared.give_user_feedback(message, log_file, quiet)

    check.check_md5_gz(prot_accession2taxid_file, md5_file, log_file, quiet)

    return
def import_headers_nr(nr_file, log_file, quiet):
    """Collect the protein accessions from the headers of a gzipped nr fasta.

    Each header line may hold several entries separated by the '\\x01'
    character; the accession of an entry is everything before its first
    space. The accession of the first entry serves as the fasta id.

    Arguments:
        nr_file: path to the gzip-compressed fasta file.
        log_file, quiet: passed through to shared.give_user_feedback.

    Returns:
        A tuple (fastaid2prot_accessions, prot_accessions_whitelist): a dict
        mapping each fasta id to the list of accessions in its header, and
        the set of all accessions seen.
    """
    message = 'Loading file {0}.'.format(nr_file)
    shared.give_user_feedback(message, log_file, quiet)

    fastaid2prot_accessions = {}
    prot_accessions_whitelist = set()

    with gzip.open(nr_file, 'rb') as f1:
        for raw_line in f1:
            # Only header lines matter, so sequence lines are skipped before
            # they are decoded.
            if not raw_line.startswith(b'>'):
                continue

            # Strip the line terminator first: otherwise the accession of a
            # final entry without description would end in a newline and
            # never match an entry in prot.accession2taxid.
            header = raw_line.decode('utf-8').rstrip('\r\n')
            entries = header.lstrip('>').split('\x01')

            prot_accessions = [entry.split(' ')[0] for entry in entries]
            fastaid = prot_accessions[0]

            fastaid2prot_accessions[fastaid] = prot_accessions
            prot_accessions_whitelist.update(prot_accessions)

    return (fastaid2prot_accessions, prot_accessions_whitelist)
def make_fastaid2LCAtaxid_file(
        nodes_dmp,
        fastaid2LCAtaxid_file,
        nr_file,
        prot_accession2taxid_file,
        taxid2parent,
        log_file,
        quiet):
    """Write the fasta id to LCA taxid mapping file for the nr database.

    For every fasta header in nr_file the lineages of all its protein
    accessions are looked up and their LCA (tax.find_LCA) is written to
    fastaid2LCAtaxid_file as '<fastaid>\\t<LCA taxid>'. Headers for which no
    lineage at all can be found are counted but not written.

    Arguments:
        nodes_dmp: not used by this function, kept for interface
            compatibility.
        fastaid2LCAtaxid_file: path of the output file.
        nr_file: path to the gzipped nr fasta file.
        prot_accession2taxid_file: path to the gzipped accession mapping.
        taxid2parent: mapping handed to tax.find_lineage.
        log_file, quiet: passed through to shared.give_user_feedback.
    """
    (fastaid2prot_accessions,
            prot_accessions_whitelist) = import_headers_nr(
                    nr_file, log_file, quiet)
    prot_accession2taxid = import_prot_accession2taxid(
            prot_accession2taxid_file,
            prot_accessions_whitelist,
            log_file,
            quiet)

    message = 'Finding LCA of all protein accession numbers in fasta headers.'
    shared.give_user_feedback(message, log_file, quiet)

    no_taxid = 0
    corrected = 0
    total = 0
    with open(fastaid2LCAtaxid_file, 'w') as outf1:
        for fastaid, prot_accessions in fastaid2prot_accessions.items():
            list_of_lineages = []
            for prot_accession in prot_accessions:
                try:
                    taxid = prot_accession2taxid[prot_accession]
                    lineage = tax.find_lineage(taxid, taxid2parent)
                    list_of_lineages.append(lineage)
                except Exception:
                    # This accounts for missing accession numbers in
                    # prot.accession2taxid and missing nodes in nodes.dmp.
                    # Exception instead of a bare except, so that
                    # KeyboardInterrupt and SystemExit are not swallowed.
                    continue

            total += 1

            if len(list_of_lineages) == 0:
                # This accounts for entries that only contain accession numbers
                # that are missing in prot.accession2taxid or whose taxid is
                # missing in nodes.dmp. NOTE that these entries are thus not
                # present in the output file.
                no_taxid += 1

                continue

            LCAtaxid = tax.find_LCA(list_of_lineages)

            outf1.write('{0}\t{1}\n'.format(fastaid, LCAtaxid))

            if (fastaid not in prot_accession2taxid or
                    LCAtaxid != prot_accession2taxid[fastaid]):
                # If the fastaid cannot be found in prot.accession2taxid, but
                # a taxid is given to the fastaid based on secondary accession
                # numbers, or if the taxid of the header is different from the
                # LCA taxid, it is counted as corrected.
                corrected += 1

    def _percentage(count):
        # An nr file without any header gives total == 0; report 0.0% then
        # instead of failing with a ZeroDivisionError after all the work.
        if total == 0:
            return 0.0

        return count / total * 100

    message = ('Done! File {0} is created. '
            '{1:,d} of {2:,d} headers ({3:.1f}%) corrected. '
            '{4:,d} headers ({5:.1f}%) do not have a taxid assigned.'.format(
                fastaid2LCAtaxid_file,
                corrected,
                total,
                _percentage(corrected),
                no_taxid,
                _percentage(no_taxid)))
    shared.give_user_feedback(message, log_file, quiet)

    return
'download_prot_accession2taxid_file' in step_list: setattr(args, 'prot_accession2taxid_file', '{0}{1}.prot.accession2taxid.FULL.gz'.format( args.taxonomy_folder, args.date)) download_prot_accession2taxid_file( args.prot_accession2taxid_file, args.date, args.log_file, args.quiet) if 'download_nr' in step_list: setattr(args, 'nr_file', '{0}{1}.nr.gz'.format(args.database_folder, args.date)) download_nr(args.nr_file, args.log_file, args.quiet) if 'make_diamond_database' in step_list: setattr(args, 'diamond_database_prefix', '{0}{1}.nr'.format(args.database_folder, args.date)) make_diamond_database( args.path_to_diamond, args.nr_file, args.diamond_database_prefix, args.nproc, args.log_file, args.quiet, args.verbose) if ('make_fastaid2LCAtaxid_file' in step_list or 'make_taxids_with_multiple_offspring_file' in step_list): taxid2parent, taxid2rank = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) if 'make_fastaid2LCAtaxid_file' in step_list: setattr(args, 'fastaid2LCAtaxid_file', '{0}{1}.nr.fastaid2LCAtaxid'.format( args.database_folder, args.date)) make_fastaid2LCAtaxid_file( args.nodes_dmp, args.fastaid2LCAtaxid_file, args.nr_file, args.prot_accession2taxid_file, taxid2parent, args.log_file, args.quiet) if 'make_taxids_with_multiple_offspring_file' in step_list: setattr(args, 'taxids_with_multiple_offspring_file', '{0}{1}.nr.taxids_with_multiple_offspring'.format( args.database_folder, args.date)) taxid2offspring = find_offspring( args.nodes_dmp, args.fastaid2LCAtaxid_file, taxid2parent, args.log_file, args.quiet) write_taxids_with_multiple_offspring_file( args.taxids_with_multiple_offspring_file, taxid2offspring, args.log_file, args.quiet) message = ('\n-----------------\n\n' '{0} CAT prepare is done!'.format(shared.timestamp())) shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) if args.nr_file: message = 'You may remove {0} now.'.format(args.nr_file) shared.give_user_feedback(message, args.log_file, args.quiet, 
def run_fresh(args):
    """Construct a completely fresh CAT/BAT database.

    After printing the banner, the function verifies the DIAMOND binary,
    refuses to continue (exit status 1) when an existing taxonomy or
    database folder already holds relevant files, checks the available
    memory, and then hands the full list of construction steps to prepare().
    """
    def say(text, **options):
        # All feedback of this function shares the same log file and
        # quiet setting.
        shared.give_user_feedback(text, args.log_file, args.quiet, **options)

    say('# CAT v{0}.'.format(about.__version__), show_time=False)

    banner = (
        '\n'
        'CAT prepare is running, constructing a fresh database.\n'
        'Rawr!\n\n'
        'WARNING: preparing the database files may take a couple of hours.'
        '\n\n'
        'Supplied command: {0}\n\n'
        'Taxonomy folder: {1}\n'
        'Database folder: {2}\n'
        'Log file: {3}\n\n'
        '-----------------\n')
    say(
        banner.format(
            ' '.join(sys.argv),
            args.taxonomy_folder,
            args.database_folder,
            args.log_file),
        show_time=False)

    # Check diamond path.
    if check.check_diamond_binaries(
            args.path_to_diamond, args.log_file, args.quiet):
        sys.exit(1)

    # An existing taxonomy folder is fine as long as it holds no taxonomy
    # files yet.
    if os.path.isdir(args.taxonomy_folder):
        holds_taxonomy_files = (
            args.nodes_dmp
            or args.names_dmp
            or args.prot_accession2taxid_file)
        if holds_taxonomy_files:
            complaint = (
                'taxonomy folder {0} exists already and contains taxonomy '
                'files. Supply a novel or empty folder if you want '
                'to start fresh, or run CAT prepare --existing.')
            say(complaint.format(args.taxonomy_folder), error=True)

            sys.exit(1)

        say('Taxonomy folder exists already. Taxonomy files will be '
            'downloaded to it.')

    # The same holds for the database folder.
    if os.path.isdir(args.database_folder):
        holds_database_files = (
            args.nr_file
            or args.diamond_database
            or args.fastaid2LCAtaxid_file
            or args.taxids_with_multiple_offspring_file)
        if holds_database_files:
            complaint = (
                'database folder {0} exists already and contains database '
                'files. Supply a novel or empty folder if you want to '
                'start fresh.')
            say(complaint.format(args.database_folder), error=True)

            sys.exit(1)

        say('Database folder exists already. Database file will be '
            'downloaded to it / constructed in it.')

    # Check memory.
    memory_bottleneck(args)

    # A fresh run carries out every step, in this order.
    all_steps = [
        'download_taxonomy_files',
        'download_prot_accession2taxid_file',
        'download_nr',
        'make_diamond_database',
        'make_fastaid2LCAtaxid_file',
        'make_taxids_with_multiple_offspring_file']

    prepare(all_steps, args)

    return
Directory will be created ' 'fresh and taxonomy files downloaded to it.') shared.give_user_feedback(message, args.log_file, args.quiet) else: message = ('Taxonomy folder found.') shared.give_user_feedback(message, args.log_file, args.quiet) if ((not args.nodes_dmp and args.names_dmp) or (args.nodes_dmp and not args.names_dmp)): message = ( 'CAT prepare did not find both nodes.dmp and names.dmp in the ' 'taxonomy folder. They should be downloaded together. Remove ' '{0} and try again.'.format( [file_ for file_ in (args.nodes_dmp, args.names_dmp) if file_][0])) shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) if not args.nodes_dmp and not args.names_dmp: message = ('Nodes.dmp and names.dmp will be downloaded to taxonomy ' 'folder.') shared.give_user_feedback(message, args.log_file, args.quiet) step_list.append('download_taxonomy_files') else: message = 'Nodes.dmp found: {0}.'.format(args.nodes_dmp) shared.give_user_feedback(message, args.log_file, args.quiet) message = 'Names.dmp found: {0}.'.format(args.names_dmp) shared.give_user_feedback(message, args.log_file, args.quiet) if not args.prot_accession2taxid_file: # NOTE that the file will only be downloaded if a new # fastaid2LCAtaxid_file needs to be constructed. message = 'Prot.accession2taxid file not found in taxonomy folder.' shared.give_user_feedback(message, args.log_file, args.quiet) else: message = 'Prot.accession2taxid file found: {0}.'.format( args.prot_accession2taxid_file) shared.give_user_feedback(message, args.log_file, args.quiet) # Check database folder. if not os.path.isdir(args.database_folder): message = ( 'Database folder not found. 
Directory will be created fresh ' 'and necessary database files will be downloaded to ' 'it / constructed in it.') shared.give_user_feedback(message, args.log_file, args.quiet) else: message = ('Database folder found.') shared.give_user_feedback(message, args.log_file, args.quiet) tmp = (args.diamond_database, args.fastaid2LCAtaxid_file, args.taxids_with_multiple_offspring_file) if (not args.nr_file and None in tmp and not all([file_ is None for file_ in tmp])): message = ( 'database folder does not contain an nr file, while some but ' 'not all of the downstream files that depend on it are ' 'present. In order to prevent strange bugs from arising, ' 'remove all files from the database folder and try again.') shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) if (not args.fastaid2LCAtaxid_file and args.taxids_with_multiple_offspring_file): message = ( 'file taxids_with_multiple_offspring exists but ' 'fastaid2LCAtaxid is not found in the database folder whilst ' 'taxids_with_multiple_offspring depends on it. In order to ' 'prevent strange bugs from arising, remove {0} and try again.' ''.format(args.taxids_with_multiple_offspring_file)) shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) whether_to_download_nr = True if (not args.nr_file and args.diamond_database and args.fastaid2LCAtaxid_file and args.taxids_with_multiple_offspring_file): whether_to_download_nr = False if not args.nr_file: if whether_to_download_nr: message = 'Nr file will be downloaded to database folder.' shared.give_user_feedback(message, args.log_file, args.quiet) step_list.append('download_nr') else: pass else: message = 'Nr file found: {0}.'.format(args.nr_file) shared.give_user_feedback(message, args.log_file, args.quiet) if not args.diamond_database: message = 'DIAMOND database will be constructed from the nr file.' 
shared.give_user_feedback(message, args.log_file, args.quiet) step_list.append('make_diamond_database') else: message = 'DIAMOND database found: {0}.'.format(args.diamond_database) shared.give_user_feedback(message, args.log_file, args.quiet) if not args.fastaid2LCAtaxid_file: if not args.prot_accession2taxid_file: message = ('Prot.accession2taxid file will be downloaded to ' 'taxonomy folder.') shared.give_user_feedback(message, args.log_file, args.quiet) step_list.append('download_prot_accession2taxid_file') message = 'File fastaid2LCAtaxid will be created.' shared.give_user_feedback(message, args.log_file, args.quiet) step_list.append('make_fastaid2LCAtaxid_file') else: message = ('Fastaid2LCAtaxid found: {0}.'.format( args.fastaid2LCAtaxid_file)) shared.give_user_feedback(message, args.log_file, args.quiet) if not args.prot_accession2taxid_file: message = 'Prot.accession2taxid file will not be needed.' shared.give_user_feedback(message, args.log_file, args.quiet) if not args.taxids_with_multiple_offspring_file: message = 'File taxids_with_multiple_offspring will be created.' shared.give_user_feedback(message, args.log_file, args.quiet) step_list.append('make_taxids_with_multiple_offspring_file') else: message = 'Taxids_with_multiple_offspring found: {0}'.format( args.taxids_with_multiple_offspring_file) shared.give_user_feedback(message, args.log_file, args.quiet) if not args.nr_file and whether_to_download_nr is False: # This is pushed here just for the logic of the user. message = ( 'NOTE: Database folder contains all the necessary files ' 'except for nr.gz. Since nr.gz is not used by CAT or BAT, ' 'this is fine.') shared.give_user_feedback(message, args.log_file, args.quiet) if (not os.path.isdir(args.taxonomy_folder) and not os.path.isdir(args.database_folder)): message = ( '\n-----------------\n\n' 'WARNING: no taxonomy or database folder was found. CAT ' 'prepare will create them fresh. 
Are you sure you are linking ' 'to existing folders?') shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) if 'make_fastaid2LCAtaxid_file' in step_list: # Check memory. memory_bottleneck(args) if len(step_list) == 0: message = ('All necessary files are found. Existing database does not ' 'need any more work...') shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) sys.exit(0) else: message = 'Ready to fly!\n\n-----------------\n' shared.give_user_feedback(message, args.log_file, args.quiet, show_time=False) prepare(step_list, args) return def run(): args = parse_arguments() if args.fresh: run_fresh(args) else: run_existing(args) return if __name__ == '__main__': sys.exit('Run \'CAT prepare\' to construct a CAT/BAT database.') CAT-5.2.3/CAT_pack/shared.py000066400000000000000000000637001401073552600153660ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import datetime import decimal import gzip import multiprocessing import os import subprocess import sys import check class PathAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): path = os.path.expanduser(values.rstrip('/')) if not path.startswith('/') and not path.startswith('.'): path = './{0}'.format(path) if os.path.isdir(path): path = '{0}/'.format(path) setattr(namespace, self.dest, path) class DecimalAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, decimal.Decimal(values)) class SuffixAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): bin_suffix = '.{0}'.format(values.lstrip('.')) setattr(namespace, self.dest, bin_suffix) def timestamp(): now = datetime.datetime.now() str_ = '[{0}]'.format(now.strftime('%Y-%m-%d %H:%M:%S')) return str_ def add_argument(argument_group, dest, required, default=None, help_=None): if dest == 'contigs_fasta': if help_ is None: help_ = 'Path to contigs fasta 
file.' argument_group.add_argument( '-c', '--contigs_fasta', dest='contigs_fasta', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'bin_fasta': if help is None: help_ = 'Path to bin fasta file.' argument_group.add_argument( '-b', '--bin_fasta', dest='bin_fasta', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'bin_folder': if help_ is None: help_ = 'Path to directory containing bins.' argument_group.add_argument( '-b', '--bin_folder', dest='bin_folder', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'database_folder': if help_ is None: help_ = 'Path to folder that contains database files.' argument_group.add_argument( '-d', '--database_folder', dest='database_folder', metavar='', required=required, type=str, action=PathAction, default=default, help=help_) elif dest == 'taxonomy_folder': if help_ is None: help_ = 'Path to folder that contains taxonomy files.' argument_group.add_argument( '-t', '--taxonomy_folder', dest='taxonomy_folder', metavar='', required=required, type=str, action=PathAction, default=default, help=help_) elif dest == 'bin_suffix': if help_ is None: help_ = ('Suffix of bins in bin folder (default: {0}).' ''.format(default)) argument_group.add_argument( '-s', '--bin_suffix', dest='bin_suffix', metavar='', required=required, type=str, default=default, help=help_) elif dest == 'r': if help_ is None: help_ = 'r parameter [0-49] (default: {0:.0f}).'.format(default) argument_group.add_argument( '-r', '--range', dest='r', metavar='', required=required, type=float, choices = [i for i in range(50)], action=DecimalAction, default=default, help=help_) elif dest == 'f': if help_ is None: help_ = ('f parameter [0-0.99] (default: {0:.2f}).' 
''.format(default)) argument_group.add_argument( '-f', '--fraction', dest='f', metavar='', required=required, type=float, choices = [i / 100 for i in range(0, 100)], action=DecimalAction, default=default, help=help_) elif dest == 'out_prefix': if help_ is None: help_ = 'Prefix for output files (default: {0}).'.format(default) argument_group.add_argument( '-o', '--out_prefix', dest='out_prefix', metavar='', required=required, type=str, action=PathAction, default=default, help=help_) elif dest == 'proteins_fasta': if help_ is None: help_ = ('Path to predicted proteins fasta file. If supplied, the ' 'protein prediction step is skipped.') argument_group.add_argument( '-p', '--proteins_fasta', dest='proteins_fasta', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'alignment_file': if help_ is None: help_ = ( 'Path to alignment table. If supplied, the alignment ' 'step is skipped and classification is carried out ' 'directly. A predicted proteins fasta file should also be ' 'supplied with argument [-p / --proteins].') argument_group.add_argument( '-a', '--diamond_alignment', dest='alignment_file', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'path_to_prodigal': if help_ is None: help_ = ('Path to Prodigal binaries. Supply if CAT/BAT cannot ' 'find Prodigal') argument_group.add_argument( '--path_to_prodigal', dest='path_to_prodigal', metavar='', required=required, type=str, action=PathAction, default=default, help=help_) elif dest == 'path_to_diamond': if help_ is None: help_ = ('Path to DIAMOND binaries. Supply if CAT/BAT cannot find ' 'DIAMOND.') argument_group.add_argument( '--path_to_diamond', dest='path_to_diamond', metavar='', required=required, type=str, action=PathAction, default=default, help=help_) elif dest == 'no_stars': if help_ is None: help_ = 'Suppress marking of suggestive taxonomic assignments.' 
argument_group.add_argument( '--no_stars', dest='no_stars', required=required, action='store_true', help=help_) elif dest == 'force': if help_ is None: help_ = 'Force overwrite existing files.' argument_group.add_argument( '--force', dest='force', required=required, action='store_true', help=help_) elif dest == 'quiet': if help_ is None: help_ = 'Suppress verbosity.' argument_group.add_argument( '-q', '--quiet', dest='quiet', required=required, action='store_true', help=help_) elif dest == 'verbose': if help_ is None: help_ = 'Increase verbosity.' argument_group.add_argument( '--verbose', dest='verbose', required=required, action='store_true', help=help_) elif dest == 'no_log': if help_ is None: help_ = 'Suppress log file.' argument_group.add_argument( '--no_log', dest='no_log', required=required, action='store_true', help=help_) elif dest == 'help': if help_ is None: help_ = 'Show this help message and exit.' argument_group.add_argument( '-h', '--help', action='help', help=help_) elif dest == 'IkwId': if help_ is None: help_ = 'Flag for experimental features.' argument_group.add_argument( '--I_know_what_Im_doing', dest='IkwId', required=required, action='store_true', help=help_) elif dest == 'input_file': if help_ is None: help_ = 'Path to input file.' argument_group.add_argument( '-i', '--input_file', dest='input_file', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'output_file': if help_ is None: help_ = 'Path to output file.' 
argument_group.add_argument( '-o', '--output_file', dest='output_file', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'only_official': if help_ is None: help_ = ('Only output official raxonomic ranks (superkingdom, ' 'phylum, class, order, family, genus, species).') argument_group.add_argument( '--only_official', dest='only_official', required=required, action='store_true', help=help_) elif dest == 'exclude_scores': if help_ is None: help_ = ('Do not include bit-score support scores in the lineage ' 'of a classification output file.') argument_group.add_argument( '--exclude_scores', dest='exclude_scores', required=required, action='store_true', help=help_) elif dest == 'nproc': if help_ is None: help_ = 'Number of cores to deploy by DIAMOND (default: maximum).' argument_group.add_argument( '-n', '--nproc', dest='nproc', metavar='', required=required, type=int, default=default, help=help_) elif dest == 'sensitive': if help_ is None: help_ = 'Run DIAMOND in sensitive mode (default: not enabled).' argument_group.add_argument( '--sensitive', dest='sensitive', required=required, action='store_true', help=help_) elif dest == 'no_self_hits': if help_ is None: help_ = ('Do not report identical self hits by DIAMOND (default: ' 'not enabled).') argument_group.add_argument( '--no_self_hits', dest='no_self_hits', required=required, action='store_true', help=help_) elif dest == 'block_size': if help_ is None: help_ = ( 'DIAMOND block-size parameter (default: {0}). Lower ' 'numbers will decrease memory and temporary disk space ' 'usage.'.format(default)) argument_group.add_argument( '--block_size', dest='block_size', metavar='', required=required, type=float, default=default, help=help_) elif dest == 'index_chunks': if help_ is None: help_ = ( 'DIAMOND index-chunks parameter (default: {0}). Set to ' '1 on high memory machines. 
The parameter has no effect ' 'on temporary disk space usage.'.format(default)) argument_group.add_argument( '--index_chunks', dest='index_chunks', metavar='', required=required, type=int, default=default, help=help_) elif dest == 'tmpdir': if help_ is None: help_ = ('Directory for temporary DIAMOND files (default: ' 'directory to which output files are written).') argument_group.add_argument( '--tmpdir', dest='tmpdir', metavar='', required=required, type=str, action=PathAction, help=help_) elif dest == 'compress': if help_ is None: help_ = 'Compress DIAMOND alignment file (default: not enabled).' argument_group.add_argument( '--compress', dest='compress', required=required, action='store_true', help=help_) elif dest == 'top': if help_ is None: help_ = ( 'DIAMOND top parameter [0-50] (default: {0}). Governs ' 'hits within range of best hit that are written to the ' 'alignment file. This is not the [-r / --range] ' 'parameter! Can only be set with the ' '[--I_know_what_Im_doing] flag, see README.md.' 
''.format(default)) argument_group.add_argument( '--top', dest='top', metavar='', required=required, type=float, choices = [i for i in range(51)], default=default, help=help_) else: sys.exit('Unknown parser dest {0}.'.format(dest)) return def add_all_diamond_arguments(argument_group): add_argument(argument_group, 'nproc', False, default=multiprocessing.cpu_count()) add_argument(argument_group, 'sensitive', False) add_argument(argument_group, 'no_self_hits', False) add_argument(argument_group, 'block_size', False, default=2.0) add_argument(argument_group, 'index_chunks', False, default=4) add_argument(argument_group, 'tmpdir', False) add_argument(argument_group, 'compress', False) add_argument(argument_group, 'top', False, default=50) return def expand_arguments(args): if 'r' in args: setattr(args, 'one_minus_r', (100 - args.r) / 100) if 'out_prefix' in args: if not args.tmpdir: tmpdir = '{0}/'.format(args.out_prefix.rsplit('/', 1)[0]) setattr(args, 'tmpdir', tmpdir) if 'no_log' in args and not args.no_log: if 'fresh' in args and args.fresh: log_file = './CAT_prepare.{0}.fresh.log'.format(args.date) elif 'fresh' in args and not args.fresh: log_file = './CAT_prepare.{0}.existing.log'.format(args.date) else: # Check out_prefix as the log file needs to be written to a valid # location. 
def explore_taxonomy_folder(args):
    """Locate the taxonomy files inside ``args.taxonomy_folder``.

    Sets ``args.nodes_dmp``, ``args.names_dmp`` and
    ``args.prot_accession2taxid_file`` to the path of the matching file, or
    to ``None`` when the folder does not exist or the file is absent. A
    ``prot.accession2taxid.FULL.gz`` file always takes precedence over the
    legacy ``prot.accession2taxid.gz`` file.
    """
    found = {
        'nodes_dmp': None,
        'names_dmp': None,
        'prot_accession2taxid_file': None,
    }

    folder = args.taxonomy_folder
    entries = os.listdir(folder) if os.path.isdir(folder) else []

    for entry in entries:
        path = '{0}{1}'.format(folder, entry)

        if entry == 'nodes.dmp':
            found['nodes_dmp'] = path
            continue
        if entry == 'names.dmp':
            found['names_dmp'] = path
            continue
        if entry.endswith('prot.accession2taxid.FULL.gz'):
            found['prot_accession2taxid_file'] = path
            continue

        # Legacy prot_accession2taxid_file: only used as long as nothing
        # else has been picked up.
        is_legacy = entry.endswith('prot.accession2taxid.gz')
        if is_legacy and found['prot_accession2taxid_file'] is None:
            found['prot_accession2taxid_file'] = path

    for attribute in ('nodes_dmp', 'names_dmp', 'prot_accession2taxid_file'):
        setattr(args, attribute, found[attribute])

    return
def run_diamond(args):
    """Run a DIAMOND blastp homology search configured from ``args``.

    Reads the DIAMOND related attributes of ``args`` (binary path, database,
    query proteins, output file, ``top``, ``nproc``, ``block_size``,
    ``index_chunks``, ``tmpdir``, ``sensitive``, ``no_self_hits``,
    ``compress``, ``verbose``) and reports progress through
    ``give_user_feedback``. When ``args.compress`` is set,
    ``args.alignment_file`` is updated in place to carry a ``.gz`` suffix
    after the search.

    Raises:
        SystemExit: With status 1 if DIAMOND cannot be started or exits with
            a non-zero status.
    """
    if args.sensitive:
        mode = 'sensitive'
    else:
        mode = 'fast'

    if args.compress:
        compression = '1'
    else:
        compression = '0'

    message = (
        'Homology search with DIAMOND is starting. Please be patient. Do '
        'not forget to cite DIAMOND when using CAT or BAT in your '
        'publication.\n'
        '\t\t\tquery: {0}\n'
        '\t\t\tdatabase: {1}\n'
        '\t\t\tmode: {2}\n'
        '\t\t\ttop: {3}\n'
        '\t\t\tno-self-hits: {4}\n'
        '\t\t\tnumber of cores: {5}\n'
        '\t\t\tblock-size (billions of letters): {6}\n'
        '\t\t\tindex-chunks: {7}\n'
        '\t\t\ttmpdir: {8}\n'
        '\t\t\tcompress: {9}'.format(
            args.proteins_fasta,
            args.diamond_database,
            mode,
            args.top,
            args.no_self_hits,
            args.nproc,
            args.block_size,
            args.index_chunks,
            args.tmpdir,
            compression))
    give_user_feedback(message, args.log_file, args.quiet)

    # Building the command cannot fail in a DIAMOND related way, so it is
    # kept outside the try block.
    command = [
        args.path_to_diamond, 'blastp',
        '-d', args.diamond_database,
        '-q', args.proteins_fasta,
        '--top', str(args.top),
        '--matrix', 'BLOSUM62',
        '--evalue', '0.001',
        '-o', args.alignment_file,
        '-p', str(args.nproc),
        '--block-size', str(args.block_size),
        '--index-chunks', str(args.index_chunks),
        '--tmpdir', args.tmpdir,
        '--compress', compression]

    if not args.verbose:
        command += ['--quiet']
    if args.sensitive:
        command += ['--sensitive']
    if args.no_self_hits:
        command += ['--no-self-hits']

    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: DIAMOND exited with a non-zero status.
        # OSError: the binary is missing or not executable.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        message = 'DIAMOND finished abnormally.'
        give_user_feedback(message, args.log_file, args.quiet, error=True)

        sys.exit(1)

    if args.compress:
        setattr(args, 'alignment_file', '{0}.gz'.format(args.alignment_file))

    message = 'Homology search done! File {0} created.'.format(
        args.alignment_file)
    give_user_feedback(message, args.log_file, args.quiet)

    return
def parse_tabular_alignment(
        alignment_file, one_minus_r, log_file, quiet):
    """Collect, per ORF, the hits that are within range of its first hit.

    The table is read line by line. Column 1 is the query ORF, column 2 the
    hit and column 12 the bit-score. The first line seen for an ORF sets its
    top bit-score; following hits are kept while their bit-score is at
    least ``one_minus_r * top_bitscore``. After the first hit that falls
    below that threshold, the remaining lines of that ORF are skipped.

    Args:
        alignment_file: Path to the table; a ``.gz`` suffix selects gzip.
        one_minus_r: Multiplier applied to the top bit-score of each ORF.
        log_file: Passed to ``give_user_feedback``.
        quiet: Passed to ``give_user_feedback``.

    Returns:
        Tuple ``(ORF2hits, all_hits)``: a dict mapping each ORF to a list of
        ``(hit, bitscore)`` tuples, and the set of all retained hit ids.
    """
    message = 'Parsing alignment file {0}.'.format(alignment_file)
    give_user_feedback(message, log_file, quiet)

    if alignment_file.endswith('.gz'):
        # Text mode lets gzip do the utf-8 decoding, so both branches
        # yield str lines.
        f1 = gzip.open(alignment_file, 'rt', encoding='utf-8')
    else:
        f1 = open(alignment_file, 'r')

    ORF2hits = {}
    all_hits = set()

    # None cannot collide with a real query name, unlike a string sentinel.
    ORF = None
    ORF_prefix = None
    top_bitscore = None
    ORF_done = False

    # The context manager closes the file even when a line is malformed.
    with f1:
        for line in f1:
            # Match the whole first column (name plus tab). A bare prefix
            # match would also skip a different ORF whose name merely starts
            # with the finished one ('x_1' versus 'x_10').
            if ORF_done and line.startswith(ORF_prefix):
                # The ORF has already surpassed its minimum allowed
                # bit-score.
                continue

            fields = line.rstrip().split('\t')

            if fields[0] != ORF:
                # A new ORF is reached.
                ORF = fields[0]
                ORF_prefix = '{0}\t'.format(ORF)
                top_bitscore = decimal.Decimal(fields[11])
                ORF2hits[ORF] = []

                ORF_done = False

            bitscore = decimal.Decimal(fields[11])

            if bitscore >= one_minus_r * top_bitscore:
                # The hit has a high enough bit-score to be included.
                hit = fields[1]

                ORF2hits[ORF].append((hit, bitscore),)
                all_hits.add(hit)
            else:
                # The hit is not included because its bit-score is too low.
                ORF_done = True

    return (ORF2hits, all_hits)
def run():
    """Run BAT on a single bin, from argument parsing to output files.

    The function
      1. parses the command line and decides which steps are needed
         (``predict_proteins``, ``align``, and always ``classify``),
      2. runs the pre-flight checks and exits with status 1 if any check
         reported an error,
      3. optionally runs Prodigal and DIAMOND,
      4. writes the ``<out_prefix>.ORF2LCA.txt`` and
         ``<out_prefix>.bin2classification.txt`` tables.
    """
    args = parse_arguments()

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Check at which state to start.
    step_list = []
    if not args.proteins_fasta and not args.alignment_file:
        message = (
                '\n'
                'BAT is running. Protein prediction, alignment, and bin '
                'classification are carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

        step_list.append('predict_proteins')
        step_list.append('align')
    elif args.proteins_fasta and not args.alignment_file:
        message = (
                '\n'
                'BAT is running. Since a predicted protein fasta is supplied, '
                'only alignment and bin classification are carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

        step_list.append('align')
    elif args.proteins_fasta and args.alignment_file:
        message = (
                '\n'
                'BAT is running. Since a predicted protein fasta and '
                'alignment file are supplied, only bin classification is '
                'carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)
    elif not args.proteins_fasta and args.alignment_file:
        # An alignment table without the proteins it was built from cannot
        # be classified, so this combination is rejected.
        message = (
                'if you want BAT to directly classify a single bin, you '
                'should not only supply an alignment table but also a '
                'predicted protein fasta file with argument '
                '[-p / --proteins].')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                error=True)

        sys.exit(1)

    step_list.append('classify')

    # Print variables.
    message = (
            'Rarw!\n\n'
            'Supplied command: {0}\n\n'
            'Bin fasta: {1}\n'
            'Taxonomy folder: {2}\n'
            'Database folder: {3}\n'
            'Parameter r: {4}\n'
            'Parameter f: {5}\n'
            'Log file: {6}\n\n'
            '-----------------\n'.format(
                ' '.join(sys.argv),
                args.bin_fasta,
                args.taxonomy_folder,
                args.database_folder,
                int(args.r),
                float(args.f),
                args.log_file))
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set variables.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Every check appends its result; a True entry marks a failed check
    # (see the ``True in errors`` test further down).
    errors = []

    errors.append(
            check.check_out_prefix(args.out_prefix, args.log_file, args.quiet))

    errors.append(
            check.check_bin_fasta(args.bin_fasta, args.log_file, args.quiet))

    if 'predict_proteins' in step_list:
        errors.append(
                check.check_prodigal_binaries(
                    args.path_to_prodigal, args.log_file, args.quiet))

        # Prodigal output paths are derived from the output prefix.
        setattr(args,
                'proteins_fasta',
                '{0}.predicted_proteins.faa'.format(args.out_prefix))
        setattr(args,
                'proteins_gff',
                '{0}.predicted_proteins.gff'.format(args.out_prefix))

        if not args.force:
            errors.append(
                    check.check_output_file(
                        args.proteins_fasta, args.log_file, args.quiet))
            errors.append(
                    check.check_output_file(
                        args.proteins_gff, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
                check.check_diamond_binaries(
                    args.path_to_diamond, args.log_file, args.quiet))

        setattr(args,
                'alignment_file',
                '{0}.alignment.diamond'.format(args.out_prefix))

        if not args.force:
            errors.append(
                    check.check_output_file(
                        args.alignment_file, args.log_file, args.quiet))

    errors.append(
            check.check_folders_for_run(
                args.taxonomy_folder,
                args.nodes_dmp,
                args.names_dmp,
                args.database_folder,
                args.diamond_database,
                args.fastaid2LCAtaxid_file,
                args.taxids_with_multiple_offspring_file,
                step_list,
                args.log_file,
                args.quiet))

    setattr(args,
            'bin2classification_output_file',
            '{0}.bin2classification.txt'.format(args.out_prefix))
    setattr(args,
            'ORF2LCA_output_file',
            '{0}.ORF2LCA.txt'.format(args.out_prefix))

    if not args.force:
        errors.append(
                check.check_output_file(
                    args.bin2classification_output_file,
                    args.log_file,
                    args.quiet))
        errors.append(
                check.check_output_file(
                    args.ORF2LCA_output_file, args.log_file, args.quiet))

    if 'predict_proteins' not in step_list:
        # The proteins fasta was supplied by the user, so validate it.
        errors.append(
                check.check_fasta(
                    args.proteins_fasta, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
                check.check_top(args.top, args.r, args.log_file, args.quiet))

    # Print all variables.
    shared.print_variables(args, step_list)

    # Exit only after all checks have run, so every problem is reported.
    if True in errors:
        sys.exit(1)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Start BAT.
    contig_names = shared.import_contig_names(
            args.bin_fasta, args.log_file, args.quiet)

    if 'predict_proteins' in step_list:
        shared.run_prodigal(
                args.path_to_prodigal,
                args.bin_fasta,
                args.proteins_fasta,
                args.proteins_gff,
                args.log_file,
                args.quiet)

    contig2ORFs = shared.import_ORFs(
            args.proteins_fasta, args.log_file, args.quiet)

    check.check_whether_ORFs_are_based_on_contigs(
            contig_names, contig2ORFs, args.log_file, args.quiet)

    if 'align' in step_list:
        shared.run_diamond(args)

    (ORF2hits, all_hits) = shared.parse_tabular_alignment(
            args.alignment_file, args.one_minus_r, args.log_file, args.quiet)

    (taxid2parent, taxid2rank) = tax.import_nodes(
            args.nodes_dmp, args.log_file, args.quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(
            args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
            args.taxids_with_multiple_offspring_file,
            args.log_file,
            args.quiet)

    message = 'BAT is flying! Files {0} and {1} are created.'.format(
            args.bin2classification_output_file, args.ORF2LCA_output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    n_classified_bins = 0

    # outf1: one line per bin classification; outf2: one line per ORF.
    with open(args.bin2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
        outf1.write('# bin\tclassification\treason\tlineage\tlineage scores\n')
        outf2.write('# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n')

        # The list contains only a single bin, but I keep the code like this
        # to make the code consistent across bin and bins.
        bin_list = [args.bin_fasta.rsplit('/', 1)[-1]]

        for bin_ in bin_list:
            LCAs_ORFs = []

            for contig in sorted(contig_names):
                if contig not in contig2ORFs:
                    continue

                for ORF in contig2ORFs[contig]:
                    if ORF not in ORF2hits:
                        outf2.write('{0}\t{1}\tORF has no hit to database\n'
                                ''.format(ORF, bin_))

                        continue

                    n_hits = len(ORF2hits[ORF])

                    (taxid, top_bitscore) = tax.find_LCA_for_ORF(
                            ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)

                    if taxid.startswith('no taxid found'):
                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            ORF, bin_, n_hits, taxid, top_bitscore))
                    else:
                        lineage = tax.find_lineage(taxid, taxid2parent)

                        if not args.no_stars:
                            lineage = tax.star_lineage(
                                    lineage, taxids_with_multiple_offspring)

                        # Lineages are written in reversed order.
                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            ORF,
                            bin_,
                            n_hits,
                            ';'.join(lineage[::-1]),
                            top_bitscore))

                    # NOTE(review): placed after the if/else so that ORFs
                    # without a taxid are also collected. This nesting was
                    # reconstructed from whitespace-collapsed source and is
                    # what makes the 'no ORFs with taxids found.' case below
                    # reachable; confirm against upstream.
                    LCAs_ORFs.append((taxid, top_bitscore),)

            if len(LCAs_ORFs) == 0:
                outf1.write('{0}\tno taxid assigned\tno hits to database\n'
                        ''.format(bin_))

                continue

            (lineages,
                    lineages_scores,
                    based_on_n_ORFs) = tax.find_weighted_LCA(
                            LCAs_ORFs, taxid2parent, args.f)

            # find_weighted_LCA signals failure with one of two sentinel
            # strings instead of a list of lineages.
            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tno taxid assigned\t'
                        'hits not found in taxonomy files\n'.format(bin_))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write(
                        '{0}\tno taxid assigned\t'
                        'no lineage reached minimum bit-score support\n'
                        ''.format(bin_))

                continue

            # The bin has a valid classification.
            n_classified_bins += 1

            total_n_ORFs = sum([len(contig2ORFs[contig]) for
                contig in contig_names if contig in contig2ORFs])

            for (i, lineage) in enumerate(lineages):
                if not args.no_stars:
                    lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)
                scores = ['{0:.2f}'.format(score) for
                        score in lineages_scores[i]]

                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write(
                            '{0}\t'
                            'taxid assigned\t'
                            'based on {1}/{2} ORFs\t'
                            '{3}\t'
                            '{4}\n'.format(
                                bin_,
                                based_on_n_ORFs,
                                total_n_ORFs,
                                ';'.join(lineage[::-1]),
                                ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    outf1.write(
                            '{0}\t'
                            'taxid assigned ({1}/{2})\t'
                            'based on {3}/{4} ORFs\t'
                            '{5}\t'
                            '{6}\n'.format(
                                bin_,
                                i + 1,
                                len(lineages),
                                based_on_n_ORFs,
                                total_n_ORFs,
                                ';'.join(lineage[::-1]),
                                ';'.join(scores[::-1])))

    message = ('\n-----------------\n'
            '{0} BAT is done! {1}/1 bin has taxonomy assigned.'.format(
                shared.timestamp(), n_classified_bins))
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    if args.f < 0.5:
        message = ('WARNING: since f is set to smaller than 0.5, one bin '
                'may have multiple classifications.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

    return
def import_contig_lengths(contigs_fasta, log_file, quiet):
    """Return a dict mapping each contig name to its sequence length.

    The contig name is the header up to the first space, without the
    leading ``>``. The length is the summed length of the right-stripped
    sequence lines that follow the header.

    Args:
        contigs_fasta: Path to the contigs fasta file.
        log_file: Passed to ``shared.give_user_feedback``.
        quiet: Passed to ``shared.give_user_feedback``.

    Raises:
        SystemExit: With status 1 if a non-header line is found before the
            first header, i.e. the file is not a fasta file.
    """
    message = 'Gathering contig lengths from {0}.'.format(contigs_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    contig2length = {}

    # Stays None until the first header line has been read.
    contig = None

    with open(contigs_fasta, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('>'):
                contig = line.split(' ')[0].lstrip('>')

                contig2length[contig] = 0
            elif contig is None:
                # Explicit check instead of a bare ``except:`` around the
                # addition, which also swallowed KeyboardInterrupt.
                message = '{0} is not a contigs fasta'.format(
                    contigs_fasta)
                shared.give_user_feedback(
                    message, log_file, quiet, error=True)

                sys.exit(1)
            else:
                contig2length[contig] += len(line)

    return contig2length
args.output_file, args.log_file, args.quiet)) if True in errors: sys.exit(1) contig2length = import_contig_lengths( args.contigs_fasta, args.log_file, args.quiet) message = 'Summarising...' shared.give_user_feedback(message, args.log_file, args.quiet) with open(args.input_file, 'r') as f1: for line in f1: if line.startswith('#'): line = line.split('\t') if line[0] != '# contig': message = '{0} is not a CAT classification file.'.format( args.input_file) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) if line[0] == '# bin': message = ( '{0} appears to be a BAT classification file. ' 'If you want to summarise bin ' 'classifications, simply don\'t supply a ' 'contigs fasta and everything should be fine.' ''.format(args.input_file)) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) try: superkingdom_index = line.index('superkingdom') except: message = ( 'official ranks not found in header of {0}. Make ' 'sure that the CAT classification file is named ' 'with official ranks with \'CAT add_names ' '--only_official\'.'.format(args.input_file)) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) break else: message = 'input file does not have a recognisable header.' shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) length = {} length['no taxid assigned'] = [] ORFs = {} official_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] for rank in official_ranks: length[rank] = {} ORFs[rank] = {} n = 0 contig_trace = set() doubles = set() with open(args.input_file, 'r') as f1: for line in f1: line = line.rstrip() if line.startswith('#'): continue n += 1 line = line.split('\t') contig = line[0] if contig in contig_trace: doubles.add(contig) contig_trace.add(contig) if contig not in contig2length: message = ( 'contig {0} in CAT classification file is not found ' 'in supplied contigs fasta file. 
Are you sure the CAT ' 'classification file is based on the contigs fasta?' ''.format(contig)) shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) if line[1] == 'no taxid assigned': length['no taxid assigned'].append(contig2length[contig]) continue for (i, classification) in enumerate(line[superkingdom_index:]): classification = classification.rsplit(': ', 1)[0].rstrip('*') rank = official_ranks[i] if classification not in length[rank]: length[rank][classification] = [] ORFs[rank][classification] = [] length[rank][classification].append(contig2length[contig]) # NOTE that the total number of ORFs on a contig is reproted, # not only the number of ORFs a classification is based on. ORFs_on_contig = int(line[2].split('/')[1].split(' ')[0]) ORFs[rank][classification].append(ORFs_on_contig) if len(doubles) != 0: message = ( 'some contigs have multiple classifications. CAT summarise ' 'currently does not allow for this. Contigs with multiple ' 'classifications: {0}.'.format(', '.join(list(doubles)))) shared.give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) if n != len(contig2length): message = ( 'the number of classified contigs is not the same as the ' 'number of contigs in contigs fasta. 
def summarise_bins(args):
    """Summarise a named BAT bin classification file per official rank.

    Reads ``args.input_file``, counts for every official rank how many bins
    were assigned to each clade, and writes the counts to
    ``args.output_file``, sorted per rank by descending number of bins.

    Raises:
        SystemExit: With status 1 if a pre-flight check fails, the header is
            missing or not a BAT header, official ranks are absent from the
            header, or a bin occurs more than once.
    """
    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    errors = []

    errors.append(
            check.check_input_file(args.input_file, args.log_file, args.quiet))

    if not args.force:
        errors.append(
                check.check_output_file(
                    args.output_file, args.log_file, args.quiet))

    errors.append(
            check.check_in_and_output_file(
                args.input_file, args.output_file, args.log_file, args.quiet))

    if True in errors:
        sys.exit(1)

    message = 'Summarising...'
    shared.give_user_feedback(message, args.log_file, args.quiet)

    # First pass: find the header line and the column of the first rank.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.split('\t')

            if line[0] != '# bin':
                message = '{0} is not a BAT classification file.'.format(
                        args.input_file)
                shared.give_user_feedback(
                        message, args.log_file, args.quiet, error=True)

                if line[0] == '# contig':
                    message = (
                            '{0} appears to be a CAT classification file. '
                            'If you want to summarise contig '
                            'classifications, supply a contigs fasta with '
                            'argument [-c / --contigs_fasta].'.format(
                                args.input_file))
                    shared.give_user_feedback(
                            message, args.log_file, args.quiet, error=True)

                sys.exit(1)

            try:
                superkingdom_index = line.index('superkingdom')
            except ValueError:
                # list.index raises ValueError when the column is absent.
                message = (
                        'official ranks not found in header of {0}. Make '
                        'sure that the BAT classification file is named '
                        'with official ranks with \'CAT add_names '
                        '--only_official\'.'.format(args.input_file))
                shared.give_user_feedback(
                        message, args.log_file, args.quiet, error=True)

                sys.exit(1)

            break
        else:
            # The loop ended without a break: no line started with '#'.
            message = 'input file does not have a recognisable header.'
            shared.give_user_feedback(message, args.log_file, args.quiet,
                    error=True)

            sys.exit(1)

    n_bins = {}
    n_bins['no taxid assigned'] = 0

    official_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family',
            'genus', 'species']

    for rank in official_ranks:
        n_bins[rank] = {}

    n = 0
    bin_trace = set()
    doubles = set()

    # Second pass: count the bins per clade for every rank.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            # Blank lines carry no classification and would otherwise raise
            # an IndexError below.
            if not line or line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            bin_ = line[0]

            if bin_ in bin_trace:
                doubles.add(bin_)

            bin_trace.add(bin_)

            if line[1] == 'no taxid assigned':
                n_bins['no taxid assigned'] += 1

                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Drop a trailing ': <score>' and any '*' marks from the name.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                rank = official_ranks[i]

                if classification not in n_bins[rank]:
                    n_bins[rank][classification] = 0

                n_bins[rank][classification] += 1

    if len(doubles) != 0:
        message = (
                'some bins have multiple classifications. CAT summarise '
                'currently does not allow for this. Bins with multiple '
                'classifications: {0}.'.format(', '.join(list(doubles))))
        shared.give_user_feedback(message, args.log_file, args.quiet,
                error=True)

        sys.exit(1)

    n_classified_bins = n - n_bins['no taxid assigned']

    # A file with only a header has n == 0; report 0% instead of raising
    # ZeroDivisionError.
    if n == 0:
        percentage_classified = 0.0
    else:
        percentage_classified = n_classified_bins / n * 100

    with open(args.output_file, 'w') as outf1:
        outf1.write('# total number of bins is {0:,d}, of which {1:,d} '
                '({2:.2f}%) have taxonomy assigned.\n'.format(
                    n, n_classified_bins, percentage_classified))
        outf1.write('#\n')
        outf1.write('# rank\tclade\tnumber of bins\n')

        for rank in official_ranks:
            for clade in sorted(n_bins[rank],
                    key=lambda x: n_bins[rank][x], reverse=True):
                outf1.write('{0}\t{1}\t{2}\n'.format(
                    rank, clade, n_bins[rank][clade]))

    message = '{0} is created!'.format(args.output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    return
'classification file or named BAT bin classification file.') CAT-5.2.3/CAT_pack/tax.py000066400000000000000000000207411401073552600147120ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import shared def import_nodes(nodes_dmp, log_file, quiet): message = 'Loading file {0}.'.format(nodes_dmp) shared.give_user_feedback(message, log_file, quiet) taxid2parent = {} taxid2rank = {} with open(nodes_dmp, 'r') as f1: for line in f1: line = line.split('\t') taxid = line[0] parent = line[2] rank = line[4] taxid2parent[taxid] = parent taxid2rank[taxid] = rank return (taxid2parent, taxid2rank) def import_names(names_dmp, log_file, quiet): message = 'Loading file {0}.'.format(names_dmp) shared.give_user_feedback(message, log_file, quiet) taxid2name = {} with open(names_dmp, 'r') as f1: for line in f1: line = line.split('\t') if line[6] == 'scientific name': taxid = line[0] name = line[2] taxid2name[taxid] = name return taxid2name def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet): message = 'Loading file {0}.'.format(fastaid2LCAtaxid_file) shared.give_user_feedback(message, log_file, quiet) fastaid2LCAtaxid = {} with open(fastaid2LCAtaxid_file, 'r') as f1: for line in f1: line = line.rstrip().split('\t') if line[0] in all_hits: # Only include fastaids that are found in hits. 
fastaid2LCAtaxid[line[0]] = line[1] return fastaid2LCAtaxid def import_taxids_with_multiple_offspring( taxids_with_multiple_offspring_file, log_file, quiet): message = 'Loading file {0}.'.format(taxids_with_multiple_offspring_file) shared.give_user_feedback(message, log_file, quiet) taxids_with_multiple_offspring = set() with open(taxids_with_multiple_offspring_file, 'r') as f1: for line in f1: line = line.rstrip() taxids_with_multiple_offspring.add(line) return taxids_with_multiple_offspring def find_lineage(taxid, taxid2parent, lineage=None): if lineage is None: lineage = [] lineage.append(taxid) if taxid2parent[taxid] == taxid: return lineage else: return find_lineage(taxid2parent[taxid], taxid2parent, lineage) def find_LCA(list_of_lineages): overlap = set.intersection(*map(set, list_of_lineages)) for taxid in list_of_lineages[0]: if taxid in overlap: return taxid def find_LCA_for_ORF(hits, fastaid2LCAtaxid, taxid2parent): list_of_lineages = [] top_bitscore = 0 for (hit, bitscore) in hits: if bitscore > top_bitscore: top_bitscore = bitscore try: taxid = fastaid2LCAtaxid[hit] lineage = find_lineage(taxid, taxid2parent) list_of_lineages.append(lineage) except: # The fastaid does not have an associated taxid for some reason. 
pass if len(list_of_lineages) == 0: return ('no taxid found ({0})'.format(';'.join([i[0] for i in hits])), top_bitscore) overlap = set.intersection(*map(set, list_of_lineages)) for taxid in list_of_lineages[0]: if taxid in overlap: return (taxid, top_bitscore) def find_questionable_taxids(lineage, taxids_with_multiple_offspring): questionable_taxids = [] if lineage == ['1']: return questionable_taxids if len(lineage) == 2 and lineage[1:] == ['1']: return questionable_taxids for (i, taxid) in enumerate(lineage): taxid_parent = lineage[i + 1] if taxid_parent in taxids_with_multiple_offspring: return questionable_taxids questionable_taxids.append(taxid) def star_lineage(lineage, taxids_with_multiple_offspring): questionable_taxids = find_questionable_taxids(lineage, taxids_with_multiple_offspring) starred_lineage = [taxid if taxid not in questionable_taxids else '{0}*'.format(taxid) for taxid in lineage] return starred_lineage def find_weighted_LCA(LCAs_ORFs, taxid2parent, f): list_of_lineages = [] list_of_bitscores = [] based_on_n_ORFs = 0 for (taxid, top_bitscore) in LCAs_ORFs: if taxid.startswith('no taxid found'): # Thus the ORFs that are not classified because they don't have an # associated taxid are not taken into account for the # classification of the contig. 
continue lineage = find_lineage(taxid, taxid2parent) list_of_lineages.append(lineage) list_of_bitscores.append(top_bitscore) based_on_n_ORFs += 1 if len(list_of_lineages) == 0: return ( 'no ORFs with taxids found.', 'no ORFs with taxids found.', 'no ORFs with taxids found.') taxid2bitscore = {} for (i, lineage) in enumerate(list_of_lineages): for taxid in lineage: if taxid not in taxid2bitscore: taxid2bitscore[taxid] = 0 taxid2bitscore[taxid] += list_of_bitscores[i] whitelisted_lineages = [] for taxid in taxid2bitscore: if taxid2bitscore[taxid] / sum(list_of_bitscores) > f: lineage = find_lineage(taxid, taxid2parent) whitelisted_lineages.append(lineage) if len(whitelisted_lineages) == 0: return ( 'no lineage whitelisted.', 'no lineage whitelisted.', 'no lineage whitelisted.') whitelisted_lineages = sorted(whitelisted_lineages, key=lambda x: len(x), reverse=True) longest_lineages = [] longest_lineages_scores = [] taxid_trace = set() for whitelisted_lineage in whitelisted_lineages: if whitelisted_lineage[0] not in taxid_trace: longest_lineages.append(whitelisted_lineage) scores = [taxid2bitscore[taxid] / sum(list_of_bitscores) for taxid in whitelisted_lineage] longest_lineages_scores.append(scores) taxid_trace |= set(whitelisted_lineage) return (longest_lineages, longest_lineages_scores, based_on_n_ORFs) def convert_to_names(lineage, taxid2rank, taxid2name, scores=None): names = [] for (i, taxid) in enumerate(lineage): if '*' in taxid: taxid = taxid.rstrip('*') starred = True else: starred = False name = taxid2name[taxid] rank = taxid2rank[taxid] if scores is not None: if starred: names.append('{0}* ({1}): {2}'.format(name, rank, scores[i])) else: names.append('{0} ({1}): {2}'.format(name, rank, scores[i])) else: if starred: names.append('{0}* ({1})'.format(name, rank)) else: names.append('{0} ({1})'.format(name, rank)) return names def convert_to_official_names(lineage, taxid2rank, taxid2name, scores=None): official_ranks = ['superkingdom', 'phylum', 'class', 
'order', 'family', 'genus', 'species'] lineage_ranks = [taxid2rank[taxid.rstrip('*')] for taxid in lineage] official_names = ['no support'] * 7 for (i, rank) in enumerate(official_ranks): if rank in lineage_ranks: index = lineage_ranks.index(rank) taxid = lineage[index] if '*' in taxid: taxid = taxid.rstrip('*') starred = True else: starred = False name = taxid2name[taxid] if scores is not None: if starred: official_names[i] = '{0}*: {1}'.format(name, scores[index]) else: official_names[i] = '{0}: {1}'.format(name, scores[index]) else: if starred: official_names[i] = '{0}*'.format(name) else: official_names[i] = name # Fill the official lineage with NAs if a lower classification is present. index_lowest_classification = 0 for (i, name) in enumerate(official_names): if name != 'no support': index_lowest_classification = i for i in range(index_lowest_classification): if official_names[i] == 'no support': official_names[i] = 'NA' return official_names if __name__ == '__main__': sys.exit('Run \'CAT\' to run CAT or BAT.') CAT-5.2.3/CHANGELOG.md000066400000000000000000000057131401073552600137520ustar00rootroot00000000000000# Changelog ## 5.2.3 Minor bug fix for `CAT add_names`. ## 5.2.2 We have added the DIAMOND specific `--no_self_hits` flag. We have also added some extra checks and removed redundancy from the parser code. Databases constructed by `CAT prepare` now have a slightly different naming scheme. ## 5.2.1 Minor bug fix for `CAT prepare`. ## 5.2 `CAT prepare` now uses the latest taxonomy mapping files from NCBI, significantly expanding taxonomic coverage of proteins in nr. File integrity of downloads is assessed based on md5 checksums. The ORF2LCA output file contains a new column for the number of hits the classification is based on. We have made textual changes to the output files to better reflect the meaning of 'classified' and 'not classified' in different contexts. ## 5.1.2 Code streamlining. 
## 5.1.1 CAT and BAT can now compress the DIAMOND alignment file, and import gzip compressed alignment files. ## 5.1 The code has been rewritten to prepare for future extensions. We have also added the `--verbose` flag. ## 5.0.5. Skip hidden files in bin folder. ## 5.0.4. We have added the `--no_stars` flag alongside a minor bug fix. ## 5.0.3 Bug fix for single bin mode. ## 5.0.2 Floating point numbers have been changed to decimals. ## 5.0.1 Updated license to MIT. ## 5.0 We have simplified the output table format: we have added a 'reason' column, which shows the number of ORFs a classification is based on and the total number of predicted ORFs on a contig/MAG. In case of an unclassified sequence, the reason for this is shown in this column as well. Moreover, `add_names` now has an option to exclude the bit-score support scores from the lineage! ## 4.6 We have added the DIAMOND `--top` parameter and the `--I_know_what_Im_doing` flag for experimental features. ## 4.5 BAT can now be run in single bin mode. The familiar `./CAT bins` is still the go-to option if you want to classify multiple MAGs, but if it's only one MAG you are interested in try out `./CAT bin`! An added benefit of single bin mode is that you can use the alignment and predicted protein files of the BAT run to classify individual contigs within the MAG with CAT, or the other way around. ## 4.4 We have added DIAMOND specific options. This allows you to use sensitive mode, and tune memory and temporary disk space usage during alignment! Moreover, you can now force CAT and BAT to overwrite existing files. ## 4.3.4 We extended some of the pre-flight checks. ## 4.3.3 Minor bug fix. ## 4.3.2 A fruity update: CAT and BAT are now macOS compatible! ## 4.3.1 We removed the psutil dependency. ## 4.3 Prepare now checks whether the RAM of your computer is large enough. If not, not to worry! We have put preconstructed databases online. ## 4.2 Code streamlining. 
## 4.1 CAT and BAT leave much less footprints: the size of the alignment output files is greatly reduced, alignment is now up to 3 times faster than previous releases. ## 4.0 CAT and BAT have been completely rewritten, bumping the version up to 4.0. CAT-5.2.3/LICENSE.md000066400000000000000000000020501401073552600135340ustar00rootroot00000000000000Copyright (c) 2019 Universiteit Utrecht Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
CAT-5.2.3/README.md000066400000000000000000000402751401073552600134220ustar00rootroot00000000000000# CAT and BAT - [Introduction](#introduction) - [Dependencies and where to get them](#dependencies-and-where-to-get-them) - [Installation](#installation) - [Getting started](#getting-started) - [Usage](#usage) - [Interpreting the output files](#interpreting-the-output-files) - [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk) - [Optimising running time, RAM, and disk usage](#optimising-running-time-ram-and-disk-usage) - [Examples](#examples) ## Introduction Contig Annotation Tool (CAT) and Bin Annotation Tool (BAT) are pipelines for the taxonomic classification of long DNA sequences and metagenome assembled genomes (MAGs/bins) of both known and (highly) unknown microorganisms, as generated by contemporary metagenomics studies. The core algorithm of both programs involves gene calling, mapping of predicted ORFs against the nr protein database, and voting-based classification of the entire contig / MAG based on classification of the individual ORFs. CAT and BAT can be run from intermediate steps if files are formatted appropriately (see [Usage](#usage)). A paper describing the algorithm together with extensive benchmarks can be found at https://doi.org/10.1186/s13059-019-1817-x. If you use CAT or BAT in your research, it would be great if you could cite us: * *von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biology. 2019;20:217.* ## Dependencies and where to get them Python 3, https://www.python.org/. DIAMOND, https://github.com/bbuchfink/diamond. Prodigal, https://github.com/hyattpd/Prodigal. CAT and BAT have been thoroughly tested on Linux systems, and should run on macOS as well. ## Installation No installation is required. 
You can run CAT and BAT by supplying the absolute path: ``` $ ./CAT_pack/CAT --help ``` Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT and BAT from anywhere: ``` $ CAT --version ``` *Special note for Mac users: since the macOS file system is case-insensitive by default, adding the CAT\_pack directory to your `$PATH` variable might replace calls to the standard unix `cat` utility. We advise Mac users to run CAT from its absolute path.* CAT and BAT can also be installed via Bioconda, thanks to Silas Kieser: ``` $ conda install -c bioconda cat ``` ## Getting started To get started with CAT and BAT, you will have to get the database files on your system. You can either download preconstructed database files, or generate them yourself which will get you the latest versions of nr and the taxonomy files. ### Downloading the database files. To download the database files, find the most recent version on [tbb.bio.uu.nl/bastiaan/CAT\_prepare/](https://tbb.bio.uu.nl/bastiaan/CAT_prepare/), download and extract, and you are ready to go! ``` $ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz $ tar -xvzf CAT_prepare_20210107.tar.gz ``` Your version of DIAMOND should be the same as with which the database is constructed. For this reason the DIAMOND executable is supplied within the CAT prepare folder. Alternatively, you can find the DIAMOND version used for database construction within the database log file: ``` $ grep version 2021-01-07.CAT_prepare.fresh.log ``` ### Generating the database files yourself. ``` $ CAT prepare --fresh ``` This will download the taxonomy files from NCBI taxonomy to a taxonomy folder, and the nr database to a database folder. A DIAMOND database is constructed from the nr file. CAT prepare also generates a fastaid2LCAtaxid file, as the first accession numbers in the headers of nr are not necessarily the Last Common Ancestor (LCA) of all accession numbers in it. 
Moreover, the file taxids\_with\_multiple\_offspring is generated. CAT prepare will typically take a few hours to create a fresh database, and will use up to 200GB of memory. If some of the files are already on your system (say the taxonomy files and the nr database) you can run: ``` $ CAT prepare --existing -d {folder containing nr} -t {folder containing taxonomy files} ``` CAT prepare will assess which files need to be downloaded and created and start from that point. CAT prepare only checks if the necessary files are there, not if they are correctly formatted. ### Running CAT and BAT. The taxonomy folder and database folder created by CAT prepare are needed in subsequent CAT and BAT runs. They only need to be generated/downloaded once or whenever you want to update the nr database. To run CAT on a contig set, each header in the contig fasta file (the part after `>` and before the first space) needs to be unique. To run BAT on a set of MAGs, each header in a MAG needs to be unique within that MAG. If you are unsure if this is the case, you can just run CAT or BAT, as the appropriate error messages are generated if formatting is incorrect. ### Getting help. If you are unsure what options a program has, you can always add `--help` to a command. This is a great way to get you started with CAT and BAT. ``` $ CAT --help $ CAT contigs --help $ CAT summarise --help ``` ## Usage After you have got the database files on your system, you can run CAT to annotate your contig set: ``` $ CAT contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder} ``` Multiple output files and a log file will be generated. The final classification files will be called `out.CAT.ORF2LCA.txt` and `out.CAT.contig2classification.txt`. 
Alternatively, if you already have a predicted proteins fasta file and/or an alignment table for example from previous runs, you can supply them to CAT, which will then skip the steps that have already been done and start from there: ``` $ CAT contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file} ``` The headers in the predicted proteins fasta file must look like this `>{contig}_{ORFnumber}`, so that CAT can couple contigs to ORFs. The alignment file must be tab-separated, with queried ORF in the first column, nr protein accession number in the second, and bit-score in the 12th. To run BAT on a set of MAGs: ``` $ CAT bins -b {bin folder} -d {database folder} -t {taxonomy folder} ``` Alternatively, BAT can be run on a single MAG: ``` $ CAT bin -b {bin fasta} -d {database folder} -t {taxonomy folder} ``` Multiple output files and a log file will be generated. The final classification files will be called `out.BAT.ORF2LCA.txt` and `out.BAT.bin2classification.txt`. Similarly to CAT, BAT can be run from intermediate steps if gene prediction and alignment have already been carried out once: ``` $ CAT bins -b {bin folder} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file} ``` If BAT is run in single bin mode, you can use these predicted protein and alignment files to classify individual contigs within the MAG with CAT. ``` $ CAT bin -b {bin fasta} -d {database folder} -t {taxonomy folder} $ CAT contigs -c {bin fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file} ``` You can also do this the other way around; start with contig classification and classify the entire MAG with BAT in single bin mode based on the files generated by CAT. 
## Interpreting the output files The ORF2LCA output looks like this: ORF | lineage | bit-score --- | --- | --- contig\_1\_ORF1 | 1;131567;2;1783272 | 574.7 Where the lineage is the full taxonomic lineage of the classification of the ORF, and the bit-score the top-hit bit-score that is assigned to the ORF for voting. The BAT ORF2LCA output file has an extra column where ORFs are linked to the MAG in which they are found. The contig2classification and bin2classification output looks like this: contig or bin | classification | reason | lineage | lineage scores --- | --- | --- | --- | --- contig\_1 | taxid assigned | based on 14/15 ORFs | 1;131567;2;1783272 | 1.00; 1.00; 1.00; 0.78 contig\_2 | taxid assigned (1/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;1416614;1183438\* | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.23;0.23 contig\_2 | taxid assigned (2/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;33072 | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.77 contig\_3 | no taxid assigned | no ORFs found Where the lineage scores represent the fraction of bit-score support for each classification. **Contig\_2 has two classifications.** This can happen if the *f* parameter is chosen below 0.5. For an explanation of the **starred classification**, see [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk). To add names to the taxonomy id's in either output file, run: ``` $ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} ``` This will show you that for example contig\_1 is classified as Terrabacteria group. 
To only get official levels (*i.e.* superkingdom, phylum, ...): ``` $ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official ``` Or, alternatively: ``` $ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official --exclude_scores ``` If you have named a CAT or BAT classification file with official names, you can get a summary of the classification, where total length and number of ORFs supporting a taxon are calculated for contigs, and the number of MAGs per encountered taxon for MAG classification: ``` $ CAT summarise -c {contigs fasta} -i {named CAT classification file} -o {output file} $ CAT summarise -i {named BAT classification file} -o {output file} ``` CAT summarise currently does not support classification files wherein some contigs / MAGs have multiple classifications (as contig\_2 above). ## Marking suggestive taxonomic assignments with an asterisk When we want to confidently go down to the lowest taxonomic level possible for a classification, an important assumption is that on that level conflict between classifications could have arisen. Namely, if there were conflicting classifications, the algorithm would have made the classification more conservative by moving up a level. Since it did not, we can trust the low-level classification. However, it is not always possible for conflict to arise, because in some cases no other sequences from the clade are present in the database. This is true for example for the family Dehalococcoidaceae, which in our databases is the sole representative of the order Dehalococcoidales. Thus, here we cannot confidently state that a classification on the family level is more correct than a classification on the order level. 
For these cases, CAT and BAT mark the lineage with asterisks, starting from the lowest level classification up to the level where conflict could have arisen because the clade contains multiple taxa with database entries. The user is advised to examine starred taxa more carefully, for example by analysing sequence identity between predicted ORFs and hits, or move up the lineage to a confident classification (i.e. the first classification without an asterisk). If you do not want the asterisks in your output files, you can add the `--no_stars` flag to CAT or BAT. ## Optimising running time, RAM, and disk usage CAT and BAT may take a while to run, and may use quite a lot of RAM and disk space. Depending on what you value most, you can tune CAT and BAT to maximize one and minimize others. The classification algorithm itself is fast and is friendly on memory and disk space. The most expensive step is alignment with DIAMOND, hence tuning alignment parameters will have the highest impact: - The `-n / --nproc` argument allows you to choose the number of cores to deploy. - You can choose to run DIAMOND in sensitive mode with the `--sensitive` flag. This will increase sensitivity but will make alignment considerably slower. - Setting the `--block_size` parameter lower will decrease memory and temporary disk space usage. Setting it higher will increase performance. - For high memory machines, it is advised to set `--index_chunks` to 1. This parameter has no effect on temporary disk space usage. - You can specify the location of temporary DIAMOND files with the `--tmpdir` argument. - You can set the DIAMOND --top parameter (see below). ### Setting the DIAMOND --top parameter You can speed up DIAMOND considerably, and at the same time greatly reduce disk usage, by setting the DIAMOND `--top` parameter to lower values. This will govern hits within range of the best hit that are written to the alignment file. 
You have to be very careful to 1) not confuse this parameter with the `-r / --range` parameter, which does a similar cut-off but *after* alignment and 2) be aware that if you want to run CAT or BAT again afterwards with different values of the `-r / --range` parameter, your options will be limited to the range you have chosen with `--top` earlier, because all hits that fall outside this range will not be included in the alignment file. **Importantly**, CAT and BAT currently do not warn you if you choose `-r / --range` in a second run higher than `--top` in a previous one, **so it's up to you to remember this!** If you have understood all this, or you do not plan to tune `-r / --range` at all afterwards, you can add the `--I_know_what_Im_doing` flag and enjoy a huge speedup with much smaller alignment files! For CAT you can for example set `--top 11` and for BAT `--top 6`. ## Examples Getting help for running the prepare utility: ``` $ CAT prepare --help ``` First, create a fresh database. Next, run CAT on a contig set with default parameter settings deploying 16 cores for DIAMOND alignment. 
Finally, name the contig classification output with official names, and create a summary: ``` $ CAT prepare --fresh -d CAT_database/ -t CAT_taxonomy/ $ CAT contigs -c contigs.fasta -d CAT_database/ -t CAT_taxonomy/ -n 16 --out_prefix first_CAT_run $ CAT add_names -i first_CAT_run.contig2classification.txt -o first_CAT_run.contig2classification.official_names.txt -t CAT_taxonomy/ --only_official $ CAT summarise -c contigs.fasta -i first_CAT_run.contig2classification.official_names.txt -o CAT_first_run.summary.txt ``` Run the classification algorithm again with custom parameter settings, and name the contig classification output with all names in the lineage, excluding the scores: ``` $ CAT contigs --range 5 --fraction 0.1 -c contigs.fasta -d CAT_database/ -t CAT_taxonomy/ -p first_CAT_run.predicted_proteins.fasta -a first_CAT_run.alignment.diamond -o second_CAT_run $ CAT add_names -i second_CAT_run.contig2classification.txt -o second_CAT_run.contig2classification.names.txt -t CAT_taxonomy/ --exclude_scores ``` First, run BAT on a set of MAGs with custom parameter settings, suppressing verbosity and not writing a log file. Next, add names to the ORF2LCA output file: ``` $ CAT bins -r 10 -f 0.1 -b ../bins/ -s .fa -d CAT_database/ -t CAT_taxonomy/ -o BAT_run --quiet --no_log $ CAT add_names -i BAT_run.ORF2LCA.txt -o BAT_run.ORF2LCA.names.txt -t CAT_taxonomy/ ``` ### Identifying contamination/mis-binned contigs within a MAG. We often use the combination of CAT/BAT to explore possible contamination within a MAG. Run BAT on a single MAG. Next, classify the contigs within the MAG individually without generating new protein files or DIAMOND alignments. 
``` $ CAT bin -b ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -o BAT.interesting_MAG $ CAT contigs -c ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -p BAT.interesting_MAG.predicted_proteins.faa -a BAT.interesting_MAG.alignment.diamond -o CAT.interesting_MAG ``` Contigs that have a different taxonomic signal than the MAG classification are probably contamination. Alternatively, you can look at contamination from the MAG perspective, by setting the *f* parameter to a low value: ``` $ CAT bin -f 0.01 -b ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -o BAT.interesting_MAG $ CAT add_names -i BAT.interesting_MAG.bin2classification.txt -o BAT.interesting_MAG.bin2classification.names.txt -t CAT_taxonomy/ ``` BAT will output any taxonomic signal with at least 1% support. Low scoring diverging signals are clear signs of contamination!