pax_global_header00006660000000000000000000000064145214260270014515gustar00rootroot0000000000000052 comment=3e2bf361a9944a6a921ce1a44f9718bee0762e06 CAT-5.3/000077500000000000000000000000001452142602700117735ustar00rootroot00000000000000CAT-5.3/CAT_pack/000077500000000000000000000000001452142602700134005ustar00rootroot00000000000000CAT-5.3/CAT_pack/CAT000077500000000000000000000042061452142602700137370ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import about import add_names import bins import contigs import download import prepare import summarise def usage(): message = ( "usage: CAT (download | prepare | contigs | bins | add_names | " "summarise) [-v / --version] [-h / --help]\n" "CAT: error: one of the arguments " "download prepare contigs bins add_names summarise " "is required" ) sys.stdout.write("{0}\n".format(message)) return def version(): message = ("CAT v{0} ({1}) by {2}.".format( about.__version__, about.__date__, about.__author__)) sys.stdout.write("{0}\n".format(message)) return def help(): message = ( "usage: CAT (prepare | contigs | bin | bins | add_names | summarise) " "[-v / --version] [-h / --help]\n\n" "Run Contig Annotation Tool (CAT) or " "Bin Annotation Tool (BAT).\n\n" "Required choice:\n" " download\t\tDownload and preprocess data from NCBI nr or GTDB.\n" " prepare\t\tConstruct database files.\n" " contigs\t\tRun CAT.\n" " bins\t\t\tRun BAT.\n" " add_names\t\tAdd taxonomic names to CAT or BAT output files.\n" " summarise\t\tSummarise a named CAT or BAT classification file." "\n\n" "Optional arguments:\n" " -v, --version\t\tPrint version information and exit.\n" " -h, --help\t\tShow this help message and exit." ) sys.stdout.write("{0}\n".format(message)) return def main(): if len(sys.argv) == 1: usage() elif sys.argv[1] == "download": download.run() elif sys.argv[1] == "prepare": prepare.run() elif sys.argv[1] == "contigs": contigs.run() elif sys.argv[1] == "bins": bins.run() elif sys.argv[1] == "add_names": add_names.run() elif sys.argv[1] == "summarise": summarise.run() elif sys.argv[1] == "-v" or sys.argv[1] == "--version": version() elif sys.argv[1] == "-h" or sys.argv[1] == "--help": help() else: usage() return if __name__ == "__main__": main() CAT-5.3/CAT_pack/about.py000066400000000000000000000001701452142602700150620ustar00rootroot00000000000000#!/usr/bin/env python3 __author__ = "F. A. Bastiaan von Meijenfeldt" __version__ = "5.3" __date__ = "4 November, 2023" CAT-5.3/CAT_pack/add_names.py000066400000000000000000000123601452142602700156670ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import sys import about import check import shared import tax def parse_arguments(): parser = argparse.ArgumentParser( prog="CAT add_names", description="Add taxonomic names to CAT or BAT output files.", usage="CAT add_names -i FILE -o FILE -t DIR [options] [-h / --help]", add_help=False) required = parser.add_argument_group("Required arguments") shared.add_argument( required, "input_file", True, help_=("Path to input file. Can be classification or ORF2LCA output " "file from CAT or BAT.")) shared.add_argument(required, "output_file", True) shared.add_argument(required, "taxonomy_folder", True) optional = parser.add_argument_group("Optional arguments") shared.add_argument(optional, "only_official", False) shared.add_argument(optional, "exclude_scores", False) shared.add_argument(optional, "force", False) shared.add_argument(optional, "quiet", False) shared.add_argument(optional, "help", False) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, "add_names")] if len(extra_args) > 0: sys.exit("error: too much arguments supplied:\n{0}".format( "\n".join(extra_args))) # Add extra arguments. shared.expand_arguments(args) return args def run(): args = parse_arguments() message = "# CAT v{0}.".format(about.__version__) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) errors = [] errors.append( check.check_input_file(args.input_file, args.log_file, args.quiet)) if not args.force: errors.append( check.check_output_file( args.output_file, args.log_file, args.quiet) ) errors.append( check.check_in_and_output_file( args.input_file, args.output_file, args.log_file, args.quiet) ) if True in errors: sys.exit(1) (taxid2parent, taxid2rank) = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) taxid2name = tax.import_names(args.names_dmp, args.log_file, args.quiet) message = "Appending names..." shared.give_user_feedback(message, args.log_file, args.quiet) with open(args.input_file, "r") as f1: for line in f1: if line.startswith("#"): line = line.rstrip().split("\t") if "lineage" in line: lineage_index = line.index("lineage") else: message = ("{0} is not a supported classification file." "".format(args.input_file)) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) try: scores_index = line.index("lineage scores") except: scores_index = None full_length = len(line) break else: message = ("{0} is not a supported classification file." "".format(args.input_file)) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) with open(args.input_file, "r") as f1, open(args.output_file, "w") as outf1: for line in f1: line = line.rstrip() if line.startswith("#"): if args.only_official: outf1.write("{0}\tsuperkingdom\tphylum\tclass\torder\t" "family\tgenus\tspecies\n".format(line)) else: outf1.write("{0}\tfull lineage names\n".format(line)) continue line = line.split("\t") if len(line) != full_length: # Entry does not have a full annotation. outf1.write("{0}\n".format("\t".join(line))) continue if any([c.startswith("no taxid found") for c in line[2:4]]): # ORF has database hits but the accession number is not found # in the taxonomy files. outf1.write("{0}\n".format("\t".join(line))) continue lineage = line[lineage_index].split(";") if scores_index is not None and not args.exclude_scores: scores = line[scores_index].split(";") else: scores = None if args.only_official: names = tax.convert_to_official_names( lineage, taxid2rank, taxid2name, scores ) else: names = tax.convert_to_names( lineage, taxid2rank, taxid2name, scores ) outf1.write("{0}\t{1}\n".format("\t".join(line), "\t".join(names))) message = "Names written to {0}!".format(args.output_file) shared.give_user_feedback(message, args.log_file, args.quiet) return if __name__ == "__main__": sys.exit("Run \'CAT add_names\' to add taxonomic names to CAT or BAT " "output files.") CAT-5.3/CAT_pack/bins.py000066400000000000000000000457521452142602700147220ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import decimal import multiprocessing import os import sys import about import check import shared import tax def parse_arguments(): parser = argparse.ArgumentParser( prog="CAT bins", description="Run Bin Annotation Tool (BAT).", usage="CAT bins -b DIR / FILE -d DIR -t DIR [options] [-h / --help]", add_help=False ) required = parser.add_argument_group("Required arguments") shared.add_argument(required, "bin_fasta_or_folder", True) shared.add_argument(required, "database_folder", True) shared.add_argument(required, "taxonomy_folder", True) optional = parser.add_argument_group("Optional arguments") shared.add_argument(optional, "bin_suffix", False, default=".fna") shared.add_argument(optional, "r", False, default=decimal.Decimal(5)) shared.add_argument(optional, "f", False, default=decimal.Decimal(0.3)) shared.add_argument(optional, "out_prefix", False, default="./out.BAT") shared.add_argument(optional, "proteins_fasta", False) shared.add_argument(optional, "alignment_file", False) shared.add_argument( optional, "path_to_prodigal", False, default="prodigal") shared.add_argument(optional, "path_to_diamond", False, default="diamond") shared.add_argument(optional, "no_stars", False) shared.add_argument(optional, "force", False) shared.add_argument(optional, "quiet", False) shared.add_argument(optional, "verbose", False) shared.add_argument(optional, "no_log", False) shared.add_argument(optional, "help", False) shared.add_argument(optional, "IkwId", False) specific = parser.add_argument_group("DIAMOND specific optional arguments") shared.add_all_diamond_arguments(specific) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, "bins")] if len(extra_args) > 0: sys.exit("error: too much arguments supplied:\n{0}".format( "\n".join(extra_args))) # Check experimental features. if not args.IkwId: if args.top < 11: sys.exit( "error: [--top] can only be set lower than 11 with the " "[--I_know_what_Im_doing] flag. See README.md as to why this " "is the case." ) if args.r > 11 and args.alignment_file: sys.exit( "error: [-r / --range] can only be set higher than 11 in " "combination with [-a / --diamond_alignment] with the " "[--I_know_what_Im_doing] flag. See README.md as to why this " "is the case." ) # Add extra arguments. shared.expand_arguments(args) return args def import_bins(bin_folder, bin_suffix, log_file, quiet): message = "Importing bins from {0}.".format(bin_folder) shared.give_user_feedback(message, log_file, quiet) bin2contigs = {} contig2bin = {} for file_ in os.listdir(bin_folder): if file_.startswith("."): # Skip hidden files. continue if not file_.endswith(bin_suffix): continue if ".concatenated." in file_: # Skip concatenated contig fasta and predicted protein fasta files # from earlier runs. continue # Keep the suffix in the bin name. bin_ = file_ bin2contigs[bin_] = [] with open("{0}{1}".format(bin_folder, file_), "r") as f1: for line in f1: if line.startswith(">"): contig = line.split()[0].rstrip().lstrip(">") if contig in contig2bin: message = ( "BAT has encountered {0} twice, in {1} and in " "{2}. Fasta headers should be unique across bins, " "please remove or rename duplicates." "".format(contig, contig2bin[contig], bin_) ) shared.give_user_feedback( message, log_file, quiet, error=True) sys.exit(1) contig2bin.setdefault(contig, bin_) bin2contigs[bin_].append(contig) if len(bin2contigs) == 1: message = "1 bin found!" else: message = "{0:,d} bins found!".format(len(bin2contigs)) shared.give_user_feedback(message, log_file, quiet) contig_names = set(contig2bin) return (bin2contigs, contig_names) def make_concatenated_fasta( concatenated_fasta, bin2contigs, bin_folder, log_file, quiet): message = "Writing {0}.".format(concatenated_fasta) shared.give_user_feedback(message, log_file, quiet) with open(concatenated_fasta, "w") as outf1: for bin_ in sorted(bin2contigs): with open("{0}{1}".format(bin_folder, bin_), "r") as f1: for line in f1: if line.startswith(">"): contig = line.split()[0].rstrip().lstrip(">") outf1.write(">{0}\n".format(contig)) else: outf1.write(line) return def run(): args = parse_arguments() message = "# CAT v{0}.".format(about.__version__) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False ) # Check at which state to start. step_list = [] if not args.proteins_fasta and not args.alignment_file: message = ( "\n" "BAT is running. Protein prediction, alignment, and bin " "classification are carried out." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) step_list.append("predict_proteins") step_list.append("align") elif args.proteins_fasta and not args.alignment_file: message = ( "\n" "BAT is running. Since a predicted protein fasta is supplied, " "only alignment and bin classification are carried out." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) step_list.append("align") elif args.proteins_fasta and args.alignment_file: message = ( "\n" "BAT is running. Since a predicted protein fasta and alignment " "file are supplied, only bin classification is carried out." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) elif not args.proteins_fasta and args.alignment_file: message = ( "if you want BAT to directly do the classification, you should " "not only supply a DIAMOND alignment table but also a " "predicted protein fasta file with argument [-p / --proteins]." ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) step_list.append("classify") # Print variables. message = ( "Rarw!\n\n" "Supplied command: {0}\n\n" "".format(" ".join(sys.argv)) ) if "bin_folder" in args: message += "Bin folder: {0}\n".format(args.bin_folder) if "bin_fasta" in args: message += "Bin fasta: {0}\n".format(args.bin_fasta) message += ( "Taxonomy folder: {0}\n" "Database folder: {1}\n" "Parameter r: {2}\n" "Parameter f: {3}\n" "Log file: {4}\n\n" "-----------------\n".format( args.taxonomy_folder, args.database_folder, int(args.r), float(args.f), args.log_file) ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) # Check binaries, output files, taxonomy folder and database folder, and # set variables. message = "Doing some pre-flight checks first." shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) errors = [] if "bin_folder" in args: errors.append( check.check_bin_folder( args.bin_folder, args.bin_suffix, args.log_file, args.quiet) ) if "bin_fasta" in args: errors.append( check.check_bin_fasta(args.bin_fasta, args.log_file, args.quiet)) errors.append( check.check_out_prefix(args.out_prefix, args.log_file, args.quiet)) if "predict_proteins" in step_list: errors.append( check.check_prodigal_binaries( args.path_to_prodigal, args.log_file, args.quiet) ) setattr( args, "concatenated_fasta", "{0}.concatenated.fasta".format(args.out_prefix) ) setattr( args, "proteins_fasta", "{0}.concatenated.predicted_proteins.faa".format(args.out_prefix) ) setattr( args, "proteins_gff", "{0}.concatenated.predicted_proteins.gff".format(args.out_prefix) ) if not args.force: errors.append( check.check_output_file( args.concatenated_fasta, args.log_file, args.quiet) ) errors.append( check.check_output_file( args.proteins_fasta, args.log_file, args.quiet) ) errors.append( check.check_output_file( args.proteins_gff, args.log_file, args.quiet) ) if "align" in step_list: errors.append( check.check_diamond_binaries( args.path_to_diamond, args.log_file, args.quiet) ) setattr( args, "alignment_file", "{0}.concatenated.alignment.diamond".format(args.out_prefix) ) if not args.force: errors.append( check.check_output_file( args.alignment_file, args.log_file, args.quiet) ) errors.append( check.check_folders_for_run( args.taxonomy_folder, args.nodes_dmp, args.names_dmp, args.database_folder, args.diamond_database, args.fastaid2LCAtaxid_file, args.taxids_with_multiple_offspring_file, step_list, args.log_file, args.quiet ) ) setattr( args, "bin2classification_output_file", "{0}.bin2classification.txt".format(args.out_prefix) ) setattr( args, "ORF2LCA_output_file", "{0}.ORF2LCA.txt".format(args.out_prefix) ) if not args.force: errors.append( check.check_output_file( args.bin2classification_output_file, args.log_file, args.quiet ) ) errors.append( check.check_output_file( args.ORF2LCA_output_file, args.log_file, args.quiet) ) if "predict_proteins" not in step_list: errors.append( check.check_fasta(args.proteins_fasta, args.log_file, args.quiet)) if "align" in step_list: errors.append( check.check_top(args.top, args.r, args.log_file, args.quiet)) # Print all variables. shared.print_variables(args, step_list) if True in errors: sys.exit(1) message = "Ready to fly!\n\n-----------------\n" shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) # Start BAT. if "bin_folder" in args: (bin2contigs, contig_names) = import_bins( args.bin_folder, args.bin_suffix, args.log_file, args.quiet) bin_folder = args.bin_folder else: contig_names = shared.import_contig_names( args.bin_fasta, args.log_file, args.quiet) bin_folder, bin_ = args.bin_fasta.rsplit('/', 1) bin_folder += '/' bin2contigs = {} bin2contigs[bin_] = sorted(contig_names) if "predict_proteins" in step_list: make_concatenated_fasta( args.concatenated_fasta, bin2contigs, bin_folder, # Note: not in args. args.log_file, args.quiet ) shared.run_prodigal( args.path_to_prodigal, args.concatenated_fasta, args.proteins_fasta, args.proteins_gff, args.log_file, args.quiet ) contig2ORFs = shared.import_ORFs( args.proteins_fasta, args.log_file, args.quiet) check.check_whether_ORFs_are_based_on_contigs( contig_names, contig2ORFs, args.log_file, args.quiet) if "align" in step_list: shared.run_diamond(args) (ORF2hits, all_hits) = shared.parse_tabular_alignment( args.alignment_file, args.one_minus_r, args.log_file, args.quiet) (taxid2parent, taxid2rank) = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid( args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet) taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring( args.taxids_with_multiple_offspring_file, args.log_file, args.quiet) message = "BAT is flying! Files {0} and {1} are created.".format( args.bin2classification_output_file, args.ORF2LCA_output_file) shared.give_user_feedback(message, args.log_file, args.quiet) n_classified_bins = 0 with open(args.bin2classification_output_file, "w") as outf1, open(args.ORF2LCA_output_file, "w") as outf2: outf1.write("# bin\tclassification\treason\tlineage\t" "lineage scores (f: {0:.2f})\n".format(args.f)) outf2.write("# ORF\tbin\tnumber of hits (r: {0})\tlineage\t" "top bit-score\n".format(args.r)) for bin_ in sorted(bin2contigs): LCAs_ORFs = [] for contig in sorted(bin2contigs[bin_]): if contig not in contig2ORFs: continue for ORF in contig2ORFs[contig]: if ORF not in ORF2hits: outf2.write("{0}\t{1}\tORF has no hit to database\n" "".format(ORF, bin_)) continue n_hits = len(ORF2hits[ORF]) (taxid, top_bitscore) = tax.find_LCA_for_ORF( ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent) if taxid.startswith("no taxid found"): outf2.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format( ORF, bin_, n_hits, taxid, top_bitscore)) else: lineage = tax.find_lineage(taxid, taxid2parent) if not args.no_stars: lineage = tax.star_lineage( lineage, taxids_with_multiple_offspring) outf2.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format( ORF, bin_, n_hits, ";".join(lineage[::-1]), top_bitscore )) LCAs_ORFs.append((taxid, top_bitscore),) if len(LCAs_ORFs) == 0: outf1.write("{0}\tno taxid assigned\tno hits to database\n" "".format(bin_)) continue ( lineages, lineages_scores, based_on_n_ORFs ) = tax.find_weighted_LCA( LCAs_ORFs, taxid2parent, args.f) if lineages == "no ORFs with taxids found.": outf1.write("{0}\tno taxid assigned\t" "hits not found in taxonomy files\n".format(bin_)) continue if lineages == "no lineage whitelisted.": outf1.write( "{0}\tno taxid assigned\t" "no lineage reached minimum bit-score support\n" "".format(bin_) ) continue # The bin has a valid classification. n_classified_bins += 1 total_n_ORFs = sum( [len(contig2ORFs[contig]) for contig in bin2contigs[bin_] if contig in contig2ORFs] ) for (i, lineage) in enumerate(lineages): if not args.no_stars: lineage = tax.star_lineage( lineage, taxids_with_multiple_offspring) scores = ["{0:.2f}".format(score) for score in lineages_scores[i]] if len(lineages) == 1: # There is only one classification. outf1.write( "{0}\t" "taxid assigned\t" "based on {1}/{2} ORFs\t" "{3}\t" "{4}\n".format( bin_, based_on_n_ORFs, total_n_ORFs, ";".join(lineage[::-1]), ";".join(scores[::-1]) ) ) else: # There are multiple classifications. outf1.write( "{0}\t" "taxid assigned ({1}/{2})\t" "based on {3}/{4} ORFs\t" "{5}\t" "{6}\n".format( bin_, i + 1, len(lineages), based_on_n_ORFs, total_n_ORFs, ";".join(lineage[::-1]), ";".join(scores[::-1]))) message = ( "\n-----------------\n\n" "{0} BAT is done! {1:,d}/{2:,d} bins ({3:.2f}%) have " "taxonomy assigned.".format( shared.timestamp(), n_classified_bins, len(bin2contigs), n_classified_bins / len(bin2contigs) * 100 ) ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) if args.f < 0.5: message = ("since f is set to smaller than 0.5, one bin " "may have multiple classifications.") shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False, warning=True) return if __name__ == "__main__": sys.exit("Run \'CAT bins\' to run Bin Annotation Tool (BAT).") CAT-5.3/CAT_pack/check.py000066400000000000000000000235251452142602700150360ustar00rootroot00000000000000 #!/usr/bin/env/ python3 import hashlib import os import subprocess import sys import shared def check_md5_gz(gz_file, md5_file, log_file, quiet): message = "Checking file integrity via MD5 checksum." shared.give_user_feedback(message, log_file, quiet) with open(md5_file, "r") as f1: md5_exp = f1.read().strip().split(" ")[0] if md5_exp == "": message = ("no MD5 found in {0}. Integrity of {1} cannot be " "established.".format(md5_file, gz_file)) shared.give_user_feedback(message, log_file, quiet, warning=True) else: md5 = gz_md5(gz_file) if md5 != md5_exp: message = "MD5 of {0} does not check out.".format(gz_file) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) else: message = "MD5 of {0} checks out.".format(gz_file) shared.give_user_feedback(message, log_file, quiet) return def gz_md5(input_gz, block_size=4096): message = "Calculating md5sum for file {0}".format(input_gz) shared.give_user_feedback(message) md5 = hashlib.md5() with open(input_gz, "rb") as f1: for chunk in iter(lambda: f1.read(block_size), b""): md5.update(chunk) return md5.hexdigest() def check_memory(Gb): total_memory = None error = False if sys.platform == "linux" or sys.platform == "linux2": # It"s a Linux! meminfo_file = "/proc/meminfo" with open(meminfo_file, "r") as f1: for line in f1: if line.startswith("MemTotal:"): mem = int(line.split(" ")[-2]) # Mem is given in Kb, convert to Gb. total_memory = mem / 2 ** 20 elif sys.platform == "darwin": # It"s a Mac! meminfo = subprocess.check_output(["sysctl", "hw.memsize"]) mem = int(meminfo.decode("utf-8").rstrip().split(" ")[-1]) # Mem is given in b, convert to Gb. total_memory = mem / 2 ** 30 if total_memory < Gb: error = True return ("{0:.1f}".format(total_memory), error) def check_out_prefix(out_prefix, log_file, quiet): error = False if os.path.isdir(out_prefix): message = ("prefix for output files ({0}) is a directory." "".format(out_prefix)) shared.give_user_feedback(message, log_file, quiet, error=True) error = True dir_ = out_prefix.rsplit("/", 1)[0] if not os.path.isdir(dir_): message = ("cannot find output directory {0} to which output files " "should be written.".format(dir_)) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_prodigal_binaries(path_to_prodigal, log_file, quiet): error = False try: p = subprocess.Popen([path_to_prodigal, "-v"], stderr=subprocess.PIPE) c = p.communicate() output = c[1].decode().rstrip().lstrip() message = "Prodigal found: {0}.".format(output) shared.give_user_feedback(message, log_file, quiet) except OSError: message = ("cannot find Prodigal. Please check whether it is " "installed or the path to the binaries is provided.") shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_diamond_binaries(path_to_diamond, log_file, quiet): error = False try: p = subprocess.Popen([path_to_diamond, "--version"], stdout=subprocess.PIPE) c = p.communicate() output = c[0].decode().rstrip() message = "DIAMOND found: {0}.".format(output) shared.give_user_feedback(message, log_file, quiet) except OSError: message = ("cannot find DIAMOND. Please check whether it is " "installed or the path to the binaries is provided.") shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_bin_folder(bin_folder, bin_suffix, log_file, quiet): error = False if not os.path.isdir(bin_folder): message = "cannot find the bin folder." shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error tmp = [] for file_ in os.listdir(bin_folder): if file_.startswith("."): # Skip hidden files. continue if not file_.endswith(bin_suffix): continue if ".concatenated." in file_: # Skip concatenated contig fasta and predicted protein fasta files # from earlier runs. continue tmp.append(file_) if len(tmp) == 0: message = ( "no bins found with suffix {0} in bin folder. You can set the " "suffix with the [-s / --bin_suffix] argument.".format(bin_suffix) ) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_bin_fasta(bin_fasta, log_file, quiet): error = False if check_fasta(bin_fasta, log_file, quiet): error = True return error def check_folders_for_run( taxonomy_folder, nodes_dmp, names_dmp, database_folder, diamond_database, fastaid2LCAtaxid_file, taxids_with_multiple_offspring_file, step_list, log_file, quiet ): error = False if not os.path.isdir(taxonomy_folder): message = "cannot find the taxonomy folder." shared.give_user_feedback(message, log_file, quiet, error=True) error = True else: if not nodes_dmp or not names_dmp: message = ("nodes.dmp and / or names.dmp not found in the " "taxonomy folder.") shared.give_user_feedback(message, log_file, quiet, error=True) error = True if not os.path.isdir(database_folder): message = "cannot find the database folder." shared.give_user_feedback(message, log_file, quiet, error=True) error = True else: if not diamond_database and "align" in step_list: message = "DIAMOND database not found in database folder." shared.give_user_feedback(message, log_file, quiet, error=True) error = True if not fastaid2LCAtaxid_file: message = "file fastaid2LCAtaxid is not found in database folder." shared.give_user_feedback(message, log_file, quiet, error=True) error = True if not taxids_with_multiple_offspring_file: message = ("file taxids_with_multiple_offspring not found in " "database folder.") shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_output_file(output_file, log_file, quiet): error = False if os.path.isfile(output_file): message = ( "output file {0} already exists. You can choose to overwrite " "existing files with the [--force] argument.".format(output_file) ) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_input_file(input_file, log_file, quiet): error = False if not os.path.isfile(input_file): message = "input file {0} does not exist.".format(input_file) shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_in_and_output_file(input_file, output_file, log_file, quiet): error = False if input_file == output_file: message = "input file and output file cannot be the same." shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_top(top, r, log_file, quiet): error = False if top <= r: message = "[--top] should be higher than [-r / --range]." shared.give_user_feedback(message, log_file, quiet, error=True) error = True return error def check_fasta(file_, log_file, quiet): error = False if not os.path.isfile(file_): error = True else: with open(file_, "r") as f1: for n, line in enumerate(f1): if n == 0: if not line.startswith(">"): error = True break if error: message = "{0} is not a fasta file.".format(file_) shared.give_user_feedback(message, log_file, quiet, error=True) return error def check_whether_ORFs_are_based_on_contigs( contig_names, contig2ORFs, log_file, quiet): overlap = len(contig_names & set(contig2ORFs)) if overlap == 0: message = ( "no ORFs found that can be traced back to one of the contigs " "in the contigs fasta file: {0}. ORFs should be named " "contig_name_#.".format(contig2ORFs[contig][0]) ) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) rel_overlap = overlap / len(contig_names) message = "ORFs found on {0:,d} / {1:,d} contigs ({2:.2f}%).".format( overlap, len(contig_names), rel_overlap * 100) shared.give_user_feedback(message, log_file, quiet) if rel_overlap < 0.97: message = ( "only {0:.2f}% contigs found with ORF predictions. This may " "indicate that some contigs were missing from the protein " "prediction. Please make sure that the protein prediction was " "based on all contigs.".format(rel_overlap * 100) ) shared.give_user_feedback(message, log_file, quiet, warning=True) return if __name__ == "__main__": sys.exit("Run \'CAT\' to run CAT or BAT.") CAT-5.3/CAT_pack/contigs.py000066400000000000000000000353731452142602700154330ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import decimal import sys import about import check import shared import tax def parse_arguments(): parser = argparse.ArgumentParser( prog="CAT contigs", description="Run Contig Annotation Tool (CAT).", usage="CAT contigs -c FILE -d DIR -t DIR [options] [-h / --help]", add_help=False ) required = parser.add_argument_group("Required arguments") shared.add_argument(required, "contigs_fasta", True) shared.add_argument(required, "database_folder", True) shared.add_argument(required, "taxonomy_folder", True) optional = parser.add_argument_group("Optional arguments") shared.add_argument(optional, "r", False, default=decimal.Decimal(10)) shared.add_argument(optional, "f", False, default=decimal.Decimal(0.5)) shared.add_argument(optional, "out_prefix", False, default="./out.CAT") shared.add_argument(optional, "proteins_fasta", False) shared.add_argument(optional, "alignment_file", False) shared.add_argument( optional, "path_to_prodigal", False, default="prodigal") shared.add_argument(optional, "path_to_diamond", False, default="diamond") shared.add_argument(optional, "no_stars", False) shared.add_argument(optional, "force", False) shared.add_argument(optional, "quiet", False) shared.add_argument(optional, "verbose", False) shared.add_argument(optional, "no_log", False) shared.add_argument(optional, "help", False) shared.add_argument(optional, "IkwId", False) specific = parser.add_argument_group("DIAMOND specific optional arguments") shared.add_all_diamond_arguments(specific) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, "contigs")] if len(extra_args) > 0: sys.exit("error: too much arguments supplied:\n{0}".format( "\n".join(extra_args))) # Check experimental features. if not args.IkwId: if args.top < 11: sys.exit( "error: [--top] can only be set lower than 11 with the " "[--I_know_what_Im_doing] flag. See README.md as to why this " "is the case." ) if args.r > 11 and args.alignment_file: sys.exit( "error: [-r / --range] can only be set higher than 11 in " "combination with [-a / --diamond_alignment] with the " "[--I_know_what_Im_doing] flag. See README.md as to why this " "is the case." ) # Add extra arguments. shared.expand_arguments(args) return args def run(): args = parse_arguments() message = "# CAT v{0}.".format(about.__version__) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) # Check at which state to start. step_list = [] if not args.proteins_fasta and not args.alignment_file: message = ( "\n" "CAT is running. Protein prediction, alignment, and contig " "classification are carried out." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) step_list.append("predict_proteins") step_list.append("align") elif args.proteins_fasta and not args.alignment_file: message = ( "\n" "CAT is running. Since a predicted protein fasta is supplied, " "only alignment and contig classification are carried out." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) step_list.append("align") elif args.proteins_fasta and args.alignment_file: message = ( "\n" "CAT is running. Since a predicted protein fasta and alignment " "file are supplied, only contig classification is carried out." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) elif not args.proteins_fasta and args.alignment_file: message = ( "if you want CAT to directly do the classification, you should " "not only supply an alignment table but also a predicted protein " "fasta file with argument [-p / --proteins]." ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) step_list.append("classify") # Print variables. message = ( "Rarw!\n\n" "Supplied command: {0}\n\n" "Contigs fasta: {1}\n" "Taxonomy folder: {2}\n" "Database folder: {3}\n" "Parameter r: {4}\n" "Parameter f: {5}\n" "Log file: {6}\n\n" "-----------------\n".format( " ".join(sys.argv), args.contigs_fasta, args.taxonomy_folder, args.database_folder, int(args.r), float(args.f), args.log_file) ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) # Check binaries, output files, taxonomy folder and database folder, and # set variables. message = "Doing some pre-flight checks first." shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) errors = [] errors.append( check.check_out_prefix(args.out_prefix, args.log_file, args.quiet)) if "predict_proteins" in step_list: errors.append( check.check_prodigal_binaries( args.path_to_prodigal, args.log_file, args.quiet) ) setattr( args, "proteins_fasta", "{0}.predicted_proteins.faa".format(args.out_prefix) ) setattr( args, "proteins_gff", "{0}.predicted_proteins.gff".format(args.out_prefix) ) if not args.force: errors.append( check.check_output_file( args.proteins_fasta, args.log_file, args.quiet) ) errors.append( check.check_output_file( args.proteins_gff, args.log_file, args.quiet) ) if "align" in step_list: errors.append( check.check_diamond_binaries( args.path_to_diamond, args.log_file, args.quiet) ) setattr( args, "alignment_file", "{0}.alignment.diamond".format(args.out_prefix) ) if not args.force: errors.append( check.check_output_file( args.alignment_file, args.log_file, args.quiet) ) errors.append( check.check_folders_for_run( args.taxonomy_folder, args.nodes_dmp, args.names_dmp, args.database_folder, args.diamond_database, args.fastaid2LCAtaxid_file, args.taxids_with_multiple_offspring_file, step_list, args.log_file, args.quiet ) ) setattr( args, "contig2classification_output_file", "{0}.contig2classification.txt".format(args.out_prefix) ) setattr( args, "ORF2LCA_output_file", "{0}.ORF2LCA.txt".format(args.out_prefix) ) if not args.force: errors.append( check.check_output_file( args.contig2classification_output_file, args.log_file, args.quiet ) ) errors.append( check.check_output_file( args.ORF2LCA_output_file, args.log_file, args.quiet) ) if "predict_proteins" not in step_list: errors.append( check.check_fasta( args.proteins_fasta, args.log_file, args.quiet) ) if "align" in step_list: errors.append( check.check_top(args.top, args.r, args.log_file, args.quiet)) # Print all variables. shared.print_variables(args, step_list) if True in errors: sys.exit(1) message = "Ready to fly!\n\n-----------------\n" shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) # Start CAT. contig_names = shared.import_contig_names( args.contigs_fasta, args.log_file, args.quiet) if "predict_proteins" in step_list: shared.run_prodigal( args.path_to_prodigal, args.contigs_fasta, args.proteins_fasta, args.proteins_gff, args.log_file, args.quiet ) contig2ORFs = shared.import_ORFs( args.proteins_fasta, args.log_file, args.quiet) check.check_whether_ORFs_are_based_on_contigs( contig_names, contig2ORFs, args.log_file, args.quiet) if "align" in step_list: shared.run_diamond(args) (ORF2hits, all_hits) = shared.parse_tabular_alignment( args.alignment_file, args.one_minus_r, args.log_file, args.quiet) (taxid2parent, taxid2rank) = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid( args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet) taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring( args.taxids_with_multiple_offspring_file, args.log_file, args.quiet) message = "CAT is spinning! Files {0} and {1} are created.".format( args.contig2classification_output_file, args.ORF2LCA_output_file) shared.give_user_feedback(message, args.log_file, args.quiet) n_classified_contigs = 0 with open(args.contig2classification_output_file, "w") as outf1, open(args.ORF2LCA_output_file, "w") as outf2: outf1.write("# contig\tclassification\treason\tlineage\t" "lineage scores (f: {0})\n".format(args.f)) outf2.write("# ORF\tnumber of hits (r: {0})\tlineage\ttop bit-score\n" "".format(args.r)) for contig in sorted(contig_names): if contig not in contig2ORFs: outf1.write("{0}\tno taxid assigned\tno ORFs found\n".format( contig)) continue LCAs_ORFs = [] for ORF in contig2ORFs[contig]: if ORF not in ORF2hits: outf2.write("{0}\tORF has no hit to database\n".format( ORF)) continue n_hits = len(ORF2hits[ORF]) (taxid, top_bitscore) = tax.find_LCA_for_ORF( ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent) if taxid.startswith("no taxid found"): outf2.write("{0}\t{1}\t{2}\t{3}\n".format( ORF, n_hits, taxid, top_bitscore)) else: lineage = tax.find_lineage(taxid, taxid2parent) if not args.no_stars: lineage = tax.star_lineage( lineage, taxids_with_multiple_offspring) outf2.write("{0}\t{1}\t{2}\t{3}\n".format( ORF, n_hits, ";".join(lineage[::-1]), top_bitscore)) LCAs_ORFs.append((taxid, top_bitscore),) if len(LCAs_ORFs) == 0: outf1.write("{0}\tno taxid assigned\t" "no hits to database\n".format(contig)) continue (lineages, lineages_scores, based_on_n_ORFs) = tax.find_weighted_LCA( LCAs_ORFs, taxid2parent, args.f) if lineages == "no ORFs with taxids found.": outf1.write( "{0}\tno taxid assigned\t" "hits not found in taxonomy files\n".format(contig) ) continue if lineages == "no lineage whitelisted.": outf1.write( "{0}\tno taxid assigned\t" "no lineage reached minimum bit-score support\n" "".format(contig) ) continue # The contig has a valid classification. n_classified_contigs += 1 for (i, lineage) in enumerate(lineages): if not args.no_stars: lineage = tax.star_lineage( lineage, taxids_with_multiple_offspring) scores = ["{0:.2f}".format(score) for score in lineages_scores[i]] if len(lineages) == 1: # There is only one classification. outf1.write( "{0}\t" "taxid assigned\t" "based on {1}/{2} ORFs\t" "{3}\t" "{4}\n".format( contig, based_on_n_ORFs, len(contig2ORFs[contig]), ";".join(lineage[::-1]), ";".join(scores[::-1]) ) ) else: # There are multiple classifications. outf1.write( "{0}\t" "taxid assigned ({1}/{2})\t" "based on {3}/{4} ORFs\t" "{5}\t" "{6}\n".format( contig, i + 1, len(lineages), based_on_n_ORFs, len(contig2ORFs[contig]), ";".join(lineage[::-1]), ";".join(scores[::-1]) ) ) message = ( "\n-----------------\n\n" "{0} CAT is done! {1:,d}/{2:,d} contigs ({3:.2f}%) have " "taxonomy assigned.".format( shared.timestamp(), n_classified_contigs, len(contig_names), n_classified_contigs / len(contig_names) * 100 ) ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) if args.f < 0.5: message = ("since f is set to smaller than 0.5, one contig may have " "multiple classifications.") shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False, warning=True) return if __name__ == "__main__": sys.exit("Run \'CAT contigs\' to run Contig Annotation Tool (CAT).") CAT-5.3/CAT_pack/download.py000066400000000000000000000631271452142602700155720ustar00rootroot00000000000000import argparse from collections import namedtuple import datetime import hashlib import pathlib import shutil import sys import tarfile import urllib.request import urllib.parse import shared import check def parse_arguments(): date = datetime.datetime.now().strftime("%Y-%m-%d") parser = argparse.ArgumentParser( prog="CAT download", description=( "Download and preprocess sequence and taxonomy information. " "Currently supports the NCBI non-redundant (nr) database " "and GTDB." ), usage="CAT download --db (nr | gtdb) -o DIR [options] [-h / --help]", add_help=False, ) required = parser.add_argument_group("Required arguments") shared.add_argument(required, "db", True) shared.add_argument(required, "output_dir", True) optional = parser.add_argument_group("Optional arguments") shared.add_argument(optional, "cleanup", False) shared.add_argument(optional, "quiet", False) shared.add_argument(optional, "no_log", False) shared.add_argument(optional, "help", False) (args, extra_args) = parser.parse_known_args() extra_args = [ arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, "download") ] if len(extra_args) > 0: sys.exit( "error: too many arguments supplied:\n{0}".format( "\n".join(extra_args) ) ) setattr(args, "date", date) shared.expand_arguments(args) return args def download_singleton(target_url, local_path, log_file, quiet): """Download a single file to the specified location.""" try: urllib.request.urlretrieve(target_url, local_path) except: message = "Failed downloading file: {0}.".format(target_url) shared.give_user_feedback(message, log_file, quiet) raise return def multi_download(url_list, output_dir, log_file, quiet, prefix=None): """Download all required nr files in the specified output dir.""" existing_files = list([p.resolve() for p in output_dir.iterdir()]) for url in url_list: url_leaf = url.split("/")[-1] if prefix: output_basename = "{0}.{1}".format(prefix, url_leaf) else: output_basename = url_leaf output_path = output_dir / pathlib.Path(output_basename) if output_path in existing_files: message = ( "Skipping download of file {0}. It already exists.".format( output_path.name ) ) shared.give_user_feedback(message, log_file, quiet) else: message = "Downloading {0}.".format(url_leaf) shared.give_user_feedback(message, log_file, quiet) download_singleton(url, output_path, log_file, quiet) return def check_nr_md5s(data_dir, log_file, quiet): """Check integrity of all files in a dir with their paired .md5 files.""" md5_files = list([p.resolve() for p in data_dir.glob("*.md5")]) for md5_file in md5_files: data_file = md5_file.with_suffix("") check.check_md5_gz(data_file, md5_file, log_file, quiet) return def process_nr(output_dir, log_file, quiet, prefix, cleanup): nr_urls = [ "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5", "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz", "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz.md5", "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz", "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5", ] # Fetch files. multi_download(nr_urls, output_dir, log_file, quiet, prefix) # Check files. check_nr_md5s(output_dir, log_file, quiet) # Process files. tax_tar = "{0}.{1}".format(prefix, "taxdump.tar.gz") tax_tar_path = output_dir / pathlib.Path(tax_tar) with tarfile.open(tax_tar_path, "r:gz") as tar: for dmp in ["names.dmp", "nodes.dmp"]: message = "Extracting {0} from taxdump.tar.gz".format(dmp) shared.give_user_feedback(message, log_file, quiet) tar.extract(dmp, path=output_dir) # Timestamp the extracted dmp file outf1 = output_dir / pathlib.Path(dmp) timestamped_fname = "{0}.{1}".format(prefix, dmp) timestamped_outf1 = output_dir / pathlib.Path(timestamped_fname) outf1.rename(timestamped_outf1) if cleanup is True: targets = [tax_tar_path.resolve()] for i in output_dir.glob("*.md5"): targets.append(i.resolve()) for t in targets: t.unlink() # CAT prepare nr_gz = list(output_dir.glob("*nr.gz"))[0] names_dmp = list(output_dir.glob("*names.dmp"))[0] nodes_dmp = list(output_dir.glob("*nodes.dmp"))[0] acc2taxid_gz = list(output_dir.glob("*accession2taxid.FULL.gz"))[0] message = ( "\n-----------------\n\n" "Done!\n\n" "A CAT database can be build with:\n\n" "CAT prepare \\\n" "--db_fasta {0} \\\n" "--names {1} \\\n" "--nodes {2} \\\n" "--acc2tax {3} \\\n" "--db_dir path/to/prepare_output\n".format( nr_gz.resolve(), names_dmp.resolve(), nodes_dmp.resolve(), acc2taxid_gz.resolve(), ) ) shared.give_user_feedback(message, log_file, quiet, show_time=False) return # GTDB. ## GENERAL. prefixes_to_rank_names = { "d__": "superkingdom", # Using superkingdom for compatibility with NCBI. "p__": "phylum", "o__": "order", "c__": "class", "f__": "family", "g__": "genus", "s__": "species", } fastaRecord = namedtuple( "fastaRecord", ["id", "seq", "uid", "taxid"], ) ## FUNCTIONS. def get_gtdb_latest_version(): """Read the version number from the VERSION file.""" version_url = "https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt" with urllib.request.urlopen(version_url) as f: version_data = f.read().decode() version = "{0} ({2})".format(*version_data.split("\n")) return version def load_gtdb_md5sums(md5sums_file): """Create a dictionary from the MD5SUMS file.""" md5_dict = {} with open(md5sums_file, "r") as f1: for line in f1: fields = [f.strip() for f in line.split()] fname = pathlib.Path(fields[1]).name md5_dict[fname] = fields[0] return md5_dict def check_gtdb_md5s(data_dir, md5_dict, log_file, quiet): for f in data_dir.glob("*.gz"): if f.name not in md5_dict: continue md5 = check.gz_md5(f) if md5_dict[f.name] != md5: message = "MD5 of {0} does not check out.".format(f.resolve()) shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) else: message = "MD5 of {0} checks out.".format(f.resolve()) shared.give_user_feedback(message, log_file, quiet) return def concatenate_taxonomies_gz(ar_tsv_gz, bac_tsv_gz, output_tsv): with open(output_tsv, "w") as outf1: for f in [ar_tsv_gz, bac_tsv_gz]: with shared.optionally_compressed_handle(f, "r") as f1: for line in f1: outf1.write(line) return def parent_child_pairs(lineage_string): """Turn a ";" separated lineage to a list of tuples for child parent. Given a lineage string of the form "d__XXX;p__XXX;o__XXX;c__XXX;f__XXX;g_XXX;s__XXX" prepend "root" and split it into parent-child pairs [ ("root", "d__XXX"), ("d__XXX", "p__XXX"), ("p__XXX", "o__XXX"), ... ] """ with_root = ";".join(["root", lineage_string]) lineage_list = with_root.split(";") pairs = [] for i in range(0, len(lineage_list) - 1): parent, child = lineage_list[i], lineage_list[i + 1] pairs.append((parent, child)) return pairs def write_nodes_dmp(taxonomies_tsv, nodes_dmp): """Write the nodes.dmp from the taxonomy files.""" seen_taxids = [] with open(taxonomies_tsv, "r") as f1, open(nodes_dmp, "w") as outf1: for line in f1: fields = [f.strip() for f in line.split("\t")] lineage_string = fields[1] pairs = parent_child_pairs(lineage_string) for pair in pairs: parent, child = pair[0], pair[1] if parent not in seen_taxids: if parent == "root": outf1.write( "{0}{1}".format( "\t|\t".join(["root", "root", "no rank"]), "\t|\n", ) ) seen_taxids.append("root") else: seen_taxids.append(parent) if child not in seen_taxids: seen_taxids.append(child) outf1.write( "{0}{1}".format( "\t|\t".join( [ child, parent, prefixes_to_rank_names[child[:3]], ] ), "\t|\n", ) ) return def write_names_dmp(taxonomies_tsv, names_dmp): seen_taxids = [] with open(taxonomies_tsv, "r") as f1, open(names_dmp, "w") as outf1: outf1.write( "{0}{1}".format( "\t|\t".join(["root", "root", "scientific name"]), "\t|\n" ) ) for line in f1: taxid = line.split(";")[-1].strip() if taxid not in seen_taxids: outf1.write( "{0}{1}".format( "\t|\t".join([taxid, taxid, "scientific name"]), "\t|\n", ) ) seen_taxids.append(taxid) return def genome_id_to_taxid(taxonomy_tsv): """Return a dictionary with GTDB taxid for each genome.""" mapping = {} with open(taxonomy_tsv, "r") as f1: for line in f1: fields = [f.strip() for f in line.split("\t")] genome_id = fields[0] taxid = fields[1].split(";")[-1] mapping[genome_id] = taxid return mapping def fastaIterator_gz(fasta_in_gz, gid2taxid): """Yield fastaRecord tuples with more information. This is adapted from biopython's SimpleFastaParser. https://github.com/biopython/biopython/blob/eec86d4bcb04bfcf86495f92f12faf3ff98a288d/Bio/SeqIO/FastaIO.py#L24 The gid2taxid dictionary holds the mapping of a given genome accession (RS_GCF_XXXX or GB_GCA_XXXX). The taxid is propagated to all protein sequences of that genome. Positional argunents: fasta_in: pathlib.Path object: Path to the fasta. gid2taxid: dict: A dictionary with taxid for a genome accession of the form {"RS_CCF_XXXXX" : "s__Escherichia coli", ...}. Return: None, if the file is not a valid fasta. Breaks the iteration. Yields fastaRecord objects which are named tuples holding the following information: - id: str: Unique id of the fasta header, anything between the ">" and the first space. - seq: str: The sequence, capitalized and with trailing "*" stripped off - uid: str: Unique id, "_" joined md5sum and length. - taxid: str: GTDB taxonomy that was assigned to the genome. """ origin = fasta_in_gz.name.replace("_protein.faa.gz", "") taxid = gid2taxid[origin] with shared.optionally_compressed_handle(fasta_in_gz, "r") as f1: for line in f1: if line[0] == ">": title = line[1:].rstrip() break else: # No break encountered - probably an empty file. return lines = [] for line in f1: if line[0] == ">": name = title.split(" ")[0] seq = "".join( lines).replace(" ", "").replace("\r", "").rstrip("*") length = len(seq) md5sum = hashlib.md5(seq.encode()).hexdigest() uid = "_".join([md5sum, str(length)]) yield fastaRecord(name, seq, uid, taxid) lines = [] title = line[1:].rstrip() continue lines.append(line.rstrip()) name = title.split(" ")[0] seq = "".join(lines).replace(" ", "").replace("\r", "").rstrip("*") length = len(seq) md5sum = hashlib.md5(seq.encode()).hexdigest() uid = "_".join([md5sum, str(length)]) yield fastaRecord(name, seq, uid, taxid) def extract_duplicates(proteins_dir, gid2taxid, acc2taxid_fp, log_file, quiet): """Get a dictionary of duplicate uids (md5sum + length).""" seen_uids = {} multiplets = {} seq_counter, file_counter = 0, 0 with shared.optionally_compressed_handle(acc2taxid_fp, "w") as outf1: outf1.write("accession.version\ttaxid\n") for f in proteins_dir.rglob("*/*.faa.gz"): file_counter += 1 for record in fastaIterator_gz(f, gid2taxid): # Write an entry. outf1.write("{0}\t{1}\n".format(record.id, record.taxid)) # Check if duplicate. is_multiplet = False if record.uid not in seen_uids: # Store the first occurrence id. seen_uids[record.uid] = record.id else: is_multiplet = True if is_multiplet is True: if record.uid not in multiplets: multiplets[record.uid] = fastaRecord( "\x01".join([seen_uids[record.uid], record.id]), record.seq, None, None, ) else: old_rec = multiplets[record.uid] new_rec = fastaRecord( "\x01".join([old_rec.id, record.id]), old_rec.seq, None, None, ) multiplets[record.uid] = new_rec seq_counter += 1 if file_counter % 1000 == 0 and file_counter != 0: message = "Parsed {0} sequences from {1} files.".format( seq_counter, file_counter ) shared.give_user_feedback(message, log_file, quiet) # This else is part of the outter for-loop. # It executes when the for loop finishes. else: # Create some whitespace for aligned printing. padding = len("[YYYY-MM-DD HH:MM:SS] ") * " " # Calculate the total number of identified multiplets. redundants = sum(map(len, [v for v in multiplets.values()])) message = ( " Total files: {0:>12}\n" "{1}Total sequences: {2:>12}\n" "{3} Multiplets: {4:>12}\n" "{5}of which unique: {6:>12}" "".format( file_counter, padding, seq_counter, padding, redundants, padding, len(multiplets), ) ) shared.give_user_feedback(message, log_file, quiet) return multiplets def write_singletons( proteins_dir, duplicates, gid2taxid, singletons_fp, log_file, quiet): seq_counter, file_counter, skipped = 0, 0, 0 with shared.optionally_compressed_handle(singletons_fp, "w") as outf1: for f in proteins_dir.rglob("*/*.faa.gz"): file_counter += 1 for record in fastaIterator_gz(f, gid2taxid): if record.uid not in duplicates: outf1.write(">{0}\n{1}\n".format(record.id, record.seq)) seq_counter += 1 else: skipped += 1 if file_counter % 1000 == 0 and file_counter != 0: message = ("Written {0} sequences from {1} files ({2} skipped)." "".format(seq_counter, file_counter, skipped)) shared.give_user_feedback(message, log_file, quiet) else: message = ("Written {0} sequences from {1} files ({2} skipped)." "".format(seq_counter, file_counter, skipped)) shared.give_user_feedback(message, log_file, quiet) return def concatenate_trees(bac_tree_fp, ar_tree_fp, all_tree_fp): """Concatenate the newick trees under a common root in a new file.""" # Load the Bacteria tree as a string and make it a subtree. bac_tree = bac_tree_fp.read_text() bac_tree = bac_tree.rstrip().replace( "d__Bacteria;", "'100.0:d__Bacteria':1.0" ) # Load the Archaea tree as a string and make it a subtree. ar_tree = ar_tree_fp.read_text() ar_tree = ar_tree.rstrip().replace("d__Archaea;", "'100.0:d__Archaea':1.0") # Concatenate the subtrees under a node named root. all_tree = "({0},{1})root;\n".format(ar_tree, bac_tree) # Write the file. all_tree_fp.write_text(all_tree) return def process_gtdb(output_dir, log_file, quiet, cleanup=False): # Using "latest" as an entry point. # This needs to be checked for future versions. version = get_gtdb_latest_version() message = "CAT will download files from GTDB {0}.".format(version) shared.give_user_feedback(message, log_file, quiet) gtdb_urls = [ "https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt", "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz", "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz", "https://data.gtdb.ecogenomic.org/releases/latest/MD5SUM.txt", "https://data.gtdb.ecogenomic.org/releases/latest/bac120.tree", "https://data.gtdb.ecogenomic.org/releases/latest/ar53.tree", "https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz", ] # Fetch files. multi_download(gtdb_urls, output_dir, log_file, quiet, prefix=None) # Check files. md5sums_file = output_dir / pathlib.Path("MD5SUM.txt") md5sums_dict = load_gtdb_md5sums(md5sums_file) check_gtdb_md5s(output_dir, md5sums_dict, log_file, quiet) # Concatenate taxonomies. bacteria_tsv_gz = list(output_dir.glob("*bac*_taxonomy*"))[0] archaea_tsv_gz = list(output_dir.glob("*ar*_taxonomy*"))[0] all_taxa_tsv = output_dir / pathlib.Path("all_taxonomies.tsv") if not all_taxa_tsv.exists(): concatenate_taxonomies_gz( archaea_tsv_gz, bacteria_tsv_gz, all_taxa_tsv ) # Concatenate newick trees. bac_tree_fp = list(output_dir.glob("*bac*.tree"))[0] ar_tree_fp = list(output_dir.glob("*ar*.tree"))[0] concatenated_tree_fp = output_dir / pathlib.Path("gtdb.tree") if not concatenated_tree_fp.exists(): message = "Concatenating newick trees." shared.give_user_feedback(message, log_file, quiet) concatenate_trees(bac_tree_fp, ar_tree_fp, concatenated_tree_fp) # Extract protein files from archive. proteins_tar = list(output_dir.glob("*proteins_aa_reps*"))[0] proteins_dir = output_dir / pathlib.Path("protein_faa_reps") if not proteins_dir.is_dir(): with tarfile.open(proteins_tar, "r:gz") as tar: # Fix for CVE-2007-4559. def is_within_directory(directory, target): abs_directory = os.path.abspath(directory) abs_target = os.path.abspath(target) prefix = os.path.commonprefix([abs_directory, abs_target]) return prefix == abs_directory def safe_extract(tar, path=".", members=None, *, numeric_owner=False): for member in tar.getmembers(): member_path = os.path.join(path, member.name) if not is_within_directory(path, member_path): raise Exception("Attempted Path Traversal in Tar File") tar.extractall(path, members, numeric_owner=numeric_owner) safe_extract(tar, output_dir) else: message = "Proteins directory {0} already exists.".format( proteins_dir.resolve() ) shared.give_user_feedback(message, log_file, quiet) # Process files. # NODES. nodes_dmp = output_dir / pathlib.Path("nodes.dmp") if not nodes_dmp.exists(): message = "Writing nodes information to {0}.".format( nodes_dmp.resolve()) shared.give_user_feedback(message, log_file, quiet) write_nodes_dmp(all_taxa_tsv, nodes_dmp) else: message = "Nodes file found : {0}.".format( nodes_dmp.resolve()) shared.give_user_feedback(message, log_file, quiet) # NAMES. names_dmp = output_dir / pathlib.Path("names.dmp") if not names_dmp.exists(): message = "Writing names information to {0}.".format( names_dmp.resolve()) shared.give_user_feedback(message, log_file, quiet) write_names_dmp(all_taxa_tsv, names_dmp) else: message = "Names file found : {0}.".format(names_dmp.resolve()) shared.give_user_feedback(message, log_file, quiet) gid2taxid = genome_id_to_taxid(all_taxa_tsv) # SEQUENCES. duplicates_fp = output_dir / pathlib.Path("dups.fa.gz") singletons_fp = output_dir / pathlib.Path("singletons.fa.gz") all_seqs_fp = output_dir / pathlib.Path("gtdb_seqs.fa.gz") acc2taxid_fp = output_dir / pathlib.Path("prot.accession2taxid.txt.gz") if not acc2taxid_fp.exists(): message = "1st pass: Extracting multiplets." shared.give_user_feedback(message, log_file, quiet) ## 1st pass. ## Write acc2taxid file, extract duplicates in their own fasta. duplicates = extract_duplicates( proteins_dir, gid2taxid, acc2taxid_fp, log_file, quiet) with shared.optionally_compressed_handle(duplicates_fp, "w") as outf1: for rec in duplicates.values(): outf1.write(">{0}\n{1}\n".format(rec.id, rec.seq)) # 2nd pass. # Write the unique sequences to a separate file. message = "2nd pass: Retrieving unique sequences." shared.give_user_feedback(message, log_file, quiet) write_singletons( proteins_dir, duplicates, gid2taxid, singletons_fp, log_file, quiet ) message = "Concatenating sequence files." shared.give_user_feedback(message, log_file, quiet) # Concatenate the two files into one. with shared.optionally_compressed_handle(all_seqs_fp, "w") as outf1: for f in [duplicates_fp, singletons_fp]: with shared.optionally_compressed_handle(f, "r") as f1: for line in f1: outf1.write(line) if cleanup is True: remove_targets = [ proteins_dir, bac_tree_fp, ar_tree_fp, duplicates_fp, singletons_fp, bacteria_tsv_gz, archaea_tsv_gz, all_taxa_tsv, proteins_tar, ] message = "Cleanup specified. Removing unnecessary files and folders." shared.give_user_feedback(message, log_file, quiet) for target in remove_targets: if target.is_dir(): shutil.rmtree(target) else: target.unlink() message = ( "\n-----------------\n\n" "Done!\n\n" "A CAT database can be build with:\n\n" "CAT prepare \\\n" "--db_fasta {0} \\\n" "--names {1} \\\n" "--nodes {2} \\\n" "--acc2tax {3} \\\n" "--db_dir path/to/prepare_output\n".format( all_seqs_fp.resolve(), names_dmp.resolve(), nodes_dmp.resolve(), acc2taxid_fp.resolve(), ) ) shared.give_user_feedback(message, log_file, quiet, show_time=False) return def run(): args = parse_arguments() if not args.output_dir.exists(): args.output_dir.mkdir(parents=True) if args.no_log: log_file = None else: log_fname = "{0}.CAT_download.log".format(args.date) log_file = args.output_dir / pathlib.Path(log_fname) setattr(args, "log_file", log_file) if args.db == "nr": process_nr( args.output_dir, args.log_file, args.quiet, prefix=args.date, cleanup=args.cleanup, ) elif args.db == "gtdb": process_gtdb(args.output_dir, args.log_file, args.quiet, args.cleanup) return if __name__ == "__main__": sys.exit("Run \'CAT download\' to download and preprocess data from " "NCBI nr or GTDB.") CAT-5.3/CAT_pack/prepare.py000066400000000000000000000344461452142602700154230ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import datetime import multiprocessing import os import pathlib import shutil import subprocess import sys import check import shared import tax def parse_arguments(): date = datetime.datetime.now().strftime("%Y-%m-%d") parser = argparse.ArgumentParser( prog="CAT prepare", description="Construct CAT/BAT database files.", usage=( "CAT prepare --db_fasta FILE " "--acc2tax FILE " "--names FILE " "--nodes FILE " "--db_dir DIR " "[options] [-h / --help]" ), add_help=False, ) required = parser.add_argument_group("Required arguments") shared.add_argument(required, "db_fasta", True) shared.add_argument(required, "names_dmp", True) shared.add_argument(required, "nodes_dmp", True) shared.add_argument(required, "acc2tax", True) shared.add_argument(required, "db_dir", True) optional = parser.add_argument_group("Optional arguments") shared.add_argument(optional, "path_to_diamond", False, default="diamond") shared.add_argument( optional, "common_prefix", False, default="{0}_CAT".format(date), help_="Prefix for all files to be created." ) shared.add_argument(optional, "quiet", False) shared.add_argument(optional, "verbose", False) shared.add_argument(optional, "no_log", False) shared.add_argument(optional, "help", False) specific = parser.add_argument_group("DIAMOND specific optional arguments") shared.add_argument( specific, "nproc", False, default=multiprocessing.cpu_count()) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, "prepare")] if len(extra_args) > 0: sys.exit("error: too much arguments supplied:\n{0}".format( "\n".join(extra_args))) # Add extra arguments. setattr(args, "date", date) setattr(args, "min_mem", 200) shared.expand_arguments(args) return args def memory_bottleneck(args): (total_memory, error) = check.check_memory(args.min_mem) if error: message = ( "At least {0}GB of memory is recommended for large database " "construction (e.g. nr). {1}GB is found on your system. You can " "try to find a machine with more memory if you run into issues or " "download preconstructed database files from " "tbb.bio.uu.nl/bastiaan/CAT_prepare/.".format( args.min_mem, total_memory) ) shared.give_user_feedback( message, args.log_file, args.quiet, warning=True) return def make_diamond_database( path_to_diamond, fasta_file, db_dir, common_prefix, nproc, log_file, quiet, verbose, ): message = ("Constructing DIAMOND database {0}.dmnd from {1} using {2} " "cores.".format(common_prefix, fasta_file, nproc)) shared.give_user_feedback(message, log_file, quiet) diamond_database_prefix = db_dir / pathlib.Path(common_prefix) command = [ path_to_diamond, "makedb", "--in", fasta_file, "-d", diamond_database_prefix, "-p", str(nproc) ] if not verbose: command += ["--quiet"] try: subprocess.check_call(command) except: message = "DIAMOND database could not be created." shared.give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) message = "DIAMOND database constructed." shared.give_user_feedback(message, log_file, quiet) return def import_fasta_headers(fasta_file, log_file, quiet): message = "Loading file {0}.".format(fasta_file) shared.give_user_feedback(message, log_file, quiet) fastaid2prot_accessions = {} prot_accessions_whitelist = set() with shared.optionally_compressed_handle(fasta_file, "r") as f1: for line in f1: if not line.startswith(">"): continue # \x01 == ^A, handles multiple fasta headers. # Some legacy format for headers. line = line.lstrip(">").split("\x01") prot_accessions = [i.split(" ")[0].strip() for i in line] fastaid = prot_accessions[0] fastaid2prot_accessions[fastaid] = prot_accessions prot_accessions_whitelist.update(prot_accessions) return (fastaid2prot_accessions, prot_accessions_whitelist) def import_prot_accession2taxid( prot_accession2taxid_file, prot_accessions_whitelist, log_file, quiet): message = "Loading file {0}.".format(prot_accession2taxid_file) shared.give_user_feedback(message, log_file, quiet) prot_accession2taxid = {} with shared.optionally_compressed_handle(prot_accession2taxid_file, "r") as f1: for n, line in enumerate(f1): line = line.rstrip().split("\t") if n == 0: index_1 = line.index("accession.version") index_2 = line.index("taxid") continue prot_accession = line[index_1] if prot_accession in prot_accessions_whitelist: prot_accession2taxid[prot_accession] = line[index_2] return prot_accession2taxid def make_fastaid2LCAtaxid_file( fastaid2LCAtaxid_file, fasta_file, prot_accession2taxid_file, taxid2parent, log_file, quiet ): ( fastaid2prot_accessions, prot_accessions_whitelist, ) = import_fasta_headers(fasta_file, log_file, quiet) prot_accession2taxid = import_prot_accession2taxid( prot_accession2taxid_file, prot_accessions_whitelist, log_file, quiet) message = "Finding LCA of all protein accession numbers in fasta headers." shared.give_user_feedback(message, log_file, quiet) no_taxid = 0 corrected = 0 total = 0 with open(fastaid2LCAtaxid_file, "w") as outf1: for fastaid, prot_accessions in fastaid2prot_accessions.items(): list_of_lineages = [] for prot_accession in prot_accessions: try: taxid = prot_accession2taxid[prot_accession] lineage = tax.find_lineage(taxid, taxid2parent) list_of_lineages.append(lineage) except: # This accounts for missing accession numbers in # prot.accession2taxid and missing nodes in nodes.dmp. continue total += 1 if len(list_of_lineages) == 0: # This accounts for entries that only contain accession numbers # that are missing in prot.accession2taxid or whose taxid is # missing in nodes.dmp. NOTE that these entries are thus not # present in the output file. no_taxid += 1 continue LCAtaxid = tax.find_LCA(list_of_lineages) outf1.write("{0}\t{1}\n".format(fastaid, LCAtaxid)) if ( fastaid not in prot_accession2taxid or LCAtaxid != prot_accession2taxid[fastaid] ): # If the fastaid cannot be found in prot.accession2taxid, but # a taxid is given to the fastaid based on secondary accession # numbers, or if the taxid of the header is different from the # LCA taxid, it is counted as corrected. corrected += 1 message = ( "Done! File {0} is created. " "{1:,d} of {2:,d} headers ({3:.1f}%) corrected. " "{4:,d} headers ({5:.1f}%) do not have a taxid assigned.".format( fastaid2LCAtaxid_file, corrected, total, corrected / total * 100, no_taxid, no_taxid / total * 100, ) ) shared.give_user_feedback(message, log_file, quiet) return def find_offspring(fastaid2LCAtaxid_file, taxid2parent, log_file, quiet): message = "Searching database for taxids with multiple offspring." shared.give_user_feedback(message, log_file, quiet) taxid2offspring = {} with open(fastaid2LCAtaxid_file, "r") as f1: for line in f1: line = line.rstrip().split("\t") taxid = line[1] lineage = tax.find_lineage(taxid, taxid2parent) for (i, taxid) in enumerate(lineage): # The first taxid in the lineage does not have a daughter node. if i == 0: continue if taxid not in taxid2offspring: taxid2offspring[taxid] = set() offspring = lineage[i - 1] taxid2offspring[taxid].add(offspring) return taxid2offspring def write_taxids_with_multiple_offspring_file( taxids_with_multiple_offspring_file, taxid2offspring, log_file, quiet): message = "Writing {0}.".format(taxids_with_multiple_offspring_file) shared.give_user_feedback(message, log_file, quiet) with open(taxids_with_multiple_offspring_file, "w") as outf1: for taxid in taxid2offspring: if len(taxid2offspring[taxid]) >= 2: outf1.write("{0}\n".format(taxid)) return def prepare(step_list, args): shared.print_variables(args, step_list) memory_bottleneck(args) # This is the root dir. db_dir = pathlib.Path(args.db_dir).resolve() db_dir.mkdir(exist_ok=True) if not args.no_log: log_fname = "{0}.log".format(args.common_prefix) log_path = db_dir / pathlib.Path(log_fname) setattr(args, "log_file", log_path) # It should contain... # ... 1. a taxonomy folder with names and nodes. tax_db = db_dir / pathlib.Path("tax") if tax_db.is_dir(): message = "Taxonomy folder {0} exists.".format(tax_db) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) else: message = "Taxonomy folder {0} is created.".format(tax_db) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) tax_db.mkdir() # Check if names and nodes exist together. nodes_dmp_fp = tax_db / pathlib.Path("nodes.dmp") if not nodes_dmp_fp.exists(): message = "Copying nodes.dmp in taxonomy folder." shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) shutil.copyfile(args.nodes_dmp, nodes_dmp_fp) names_dmp_fp = tax_db / pathlib.Path("names.dmp") if not names_dmp_fp.exists(): message = "Copying names.dmp in taxonomy folder." shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) shutil.copyfile(args.names_dmp, names_dmp_fp) # ... 2. a dir with the .dmnd and LCA files. cat_db = db_dir / pathlib.Path("db") if cat_db.is_dir(): message = "Database folder {0} exists.".format(cat_db) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) if any(cat_db.glob("*.dmnd")): message = "A DIAMOND database exists. Skipping creation." shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) else: message = "Database folder {0} is created.".format(cat_db) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) cat_db.mkdir() if "make_diamond_database" in step_list: make_diamond_database( args.path_to_diamond, args.db_fasta, args.database_folder, args.common_prefix, args.nproc, args.log_file, args.quiet, args.verbose, ) if ("make_fastaid2LCAtaxid_file" in step_list or "make_taxids_with_multiple_offspring_file" in step_list): taxid2parent, taxid2rank = tax.import_nodes( args.nodes_dmp, args.log_file, args.quiet) if "make_fastaid2LCAtaxid_file" in step_list: fname = "{0}.fastaid2LCAtaxid".format(args.common_prefix) fpath = cat_db / pathlib.Path(fname) setattr(args, "fastaid2LCAtaxid_file", fpath) make_fastaid2LCAtaxid_file( args.fastaid2LCAtaxid_file, args.db_fasta, args.acc2tax, taxid2parent, args.log_file, args.quiet, ) if "make_taxids_with_multiple_offspring_file" in step_list: fname = "{0}.taxids_with_multiple_offspring".format(args.common_prefix) fpath = cat_db / pathlib.Path(fname) setattr(args, "taxids_with_multiple_offspring_file", fpath) taxid2offspring = find_offspring( args.fastaid2LCAtaxid_file, taxid2parent, args.log_file, args.quiet ) write_taxids_with_multiple_offspring_file( args.taxids_with_multiple_offspring_file, taxid2offspring, args.log_file, args.quiet ) message = "\n-----------------\n\n{0} CAT prepare is done!".format( shared.timestamp()) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) message = ( "\nSupply the following arguments to CAT or BAT if you want to " "use this database:\n" "-d / --database_folder {0}\n" "-t / --taxonomy_folder {1}".format(cat_db, tax_db) ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) return def run(): args = parse_arguments() step_list = [] if not os.path.exists(args.diamond_database): step_list.append("make_diamond_database") if not os.path.exists(args.fastaid2LCAtaxid_file): step_list.append("make_fastaid2LCAtaxid_file") if not os.path.exists(args.taxids_with_multiple_offspring_file): step_list.append("make_taxids_with_multiple_offspring_file") if len(step_list) == 0: message = ( "Nothing to do here! All files exist. " "Please provide a new location or remove one of the files " "created by CAT to launch a build." ) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=True) else: prepare(step_list, args) return if __name__ == "__main__": sys.exit("Run \'CAT prepare\' to construct a CAT/BAT database.") CAT-5.3/CAT_pack/shared.py000066400000000000000000000750621452142602700152320ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import datetime import decimal import gzip import multiprocessing import os import pathlib import subprocess import sys import check class PathAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): path = os.path.expanduser(values.rstrip("/")) if not path.startswith("/") and not path.startswith("."): path = "./{0}".format(path) if os.path.isdir(path): path = "{0}/".format(path) setattr(namespace, self.dest, path) class DecimalAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, decimal.Decimal(values)) class SuffixAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): bin_suffix = ".{0}".format(values.lstrip(".")) setattr(namespace, self.dest, bin_suffix) def timestamp(): now = datetime.datetime.now() str_ = "[{0}]".format(now.strftime("%Y-%m-%d %H:%M:%S")) return str_ def add_argument(argument_group, dest, required, default=None, help_=None): if dest == "contigs_fasta": if help_ is None: help_ = "Path to contigs fasta file." argument_group.add_argument( "-c", "--contigs_fasta", dest="contigs_fasta", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "db_fasta": if help_ is None: help_ = "Path to fasta file containing all sequences." argument_group.add_argument( "--db_fasta", dest="db_fasta", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "bin_fasta_or_folder": if help_ is None: help_ = "Path to bin fasta file or to directory containing bins." argument_group.add_argument( "-b", "--bin_fasta", "--bin_folder", dest="bin_fasta_or_folder", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "db_dir": if help_ is None: help_ = ("Path to directory where CAT/BAT database files will " "be created.") argument_group.add_argument( "--db_dir", dest="db_dir", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "database_folder": if help_ is None: help_ = "Path to directory that contains database files." argument_group.add_argument( "-d", "--database_folder", dest="database_folder", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "taxonomy_folder": if help_ is None: help_ = "Path to directory that contains taxonomy files." argument_group.add_argument( "-t", "--taxonomy_folder", dest="taxonomy_folder", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "names_dmp": if help_ is None: help_ = "Path to names.dmp" argument_group.add_argument( "--names", dest="names_dmp", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "nodes_dmp": if help_ is None: help_ = "Path to nodes.dmp" argument_group.add_argument( "--nodes", dest="nodes_dmp", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "acc2tax": if help_ is None: help_ = "Path to accession2taxid.txt file. Can be gzipped." argument_group.add_argument( "--acc2tax", dest="acc2tax", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "cleanup": if help_ is None: help_ = ("Remove unnecessary files after all data have been " "processed.") argument_group.add_argument( "--cleanup", dest="cleanup", required=required, action="store_true", help=help_ ) elif dest == "bin_suffix": if help_ is None: help_ = "Suffix of bins in bin directory (default: {0}).".format( default) argument_group.add_argument( "-s", "--bin_suffix", dest="bin_suffix", metavar="", required=required, type=str, default=default, help=help_, ) elif dest == "r": if help_ is None: help_ = "r parameter [0-100] (default: {0:.0f}).".format(default) argument_group.add_argument( "-r", "--range", dest="r", metavar="", required=required, type=float, choices=[i for i in range(101)], action=DecimalAction, default=default, help=help_, ) elif dest == "f": if help_ is None: help_ = "f parameter [0-0.99] (default: {0:.2f})." "".format( default) argument_group.add_argument( "-f", "--fraction", dest="f", metavar="", required=required, type=float, choices=[i / 100 for i in range(0, 100)], action=DecimalAction, default=default, help=help_, ) elif dest == "out_prefix": if help_ is None: help_ = "Prefix for output files (default: {0}).".format(default) argument_group.add_argument( "-o", "--out_prefix", dest="out_prefix", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "db": if help_ is None: help_ = "Either 'nr' or 'gtdb'." argument_group.add_argument( "--db", dest="db", metavar="", required=required, type=str, choices=["nr", "gtdb"], default=None, help=help_ ) elif dest == "output_dir": if help_ is None: help_ = "Path to direcotry where data will be stored." argument_group.add_argument( "-o", "--output_dir", dest="output_dir", metavar="", required=required, type=lambda p: pathlib.Path(p).resolve(), help=help_ ) elif dest == "proteins_fasta": if help_ is None: help_ = ( "Path to predicted proteins fasta file. If supplied, the " "protein prediction step is skipped." ) argument_group.add_argument( "-p", "--proteins_fasta", dest="proteins_fasta", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "alignment_file": if help_ is None: help_ = ( "Path to alignment table. If supplied, the alignment step is " "skipped and classification is carried out directly. A " "predicted proteins fasta file should also be supplied with " "argument [-p / --proteins]." ) argument_group.add_argument( "-a", "--diamond_alignment", dest="alignment_file", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "common_prefix": if help_ is None: help_ = "Prefix for all files that will be created" argument_group.add_argument( "--common_prefix", dest="common_prefix", metavar="", required=required, type=str, default=default, help=help_, ) elif dest == "path_to_prodigal": if help_ is None: help_ = ( "Path to Prodigal binaries. Supply if CAT/BAT cannot find " "Prodigal" ) argument_group.add_argument( "--path_to_prodigal", dest="path_to_prodigal", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "path_to_diamond": if help_ is None: help_ = ( "Path to DIAMOND binaries. Supply if CAT/BAT cannot find " "DIAMOND." ) argument_group.add_argument( "--path_to_diamond", dest="path_to_diamond", metavar="", required=required, type=str, action=PathAction, default=default, help=help_, ) elif dest == "no_stars": if help_ is None: help_ = "Suppress marking of suggestive taxonomic assignments." argument_group.add_argument( "--no_stars", dest="no_stars", required=required, action="store_true", help=help_, ) elif dest == "force": if help_ is None: help_ = "Force overwrite existing files." argument_group.add_argument( "--force", dest="force", required=required, action="store_true", help=help_, ) elif dest == "quiet": if help_ is None: help_ = "Suppress verbosity." argument_group.add_argument( "-q", "--quiet", dest="quiet", required=required, action="store_true", help=help_, ) elif dest == "verbose": if help_ is None: help_ = "Increase verbosity." argument_group.add_argument( "--verbose", dest="verbose", required=required, action="store_true", help=help_, ) elif dest == "no_log": if help_ is None: help_ = "Suppress log file." argument_group.add_argument( "--no_log", dest="no_log", required=required, action="store_true", help=help_, ) elif dest == "help": if help_ is None: help_ = "Show this help message and exit." argument_group.add_argument("-h", "--help", action="help", help=help_) elif dest == "IkwId": if help_ is None: help_ = "Flag for experimental features." argument_group.add_argument( "--I_know_what_Im_doing", dest="IkwId", required=required, action="store_true", help=help_, ) elif dest == "input_file": if help_ is None: help_ = "Path to input file." argument_group.add_argument( "-i", "--input_file", dest="input_file", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "output_file": if help_ is None: help_ = "Path to output file." argument_group.add_argument( "-o", "--output_file", dest="output_file", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "only_official": if help_ is None: help_ = ( "Only output official raxonomic ranks (superkingdom, phylum, " "class, order, family, genus, species)." ) argument_group.add_argument( "--only_official", dest="only_official", required=required, action="store_true", help=help_, ) elif dest == "exclude_scores": if help_ is None: help_ = ( "Do not include bit-score support scores in the lineage of a " "classification output file." ) argument_group.add_argument( "--exclude_scores", dest="exclude_scores", required=required, action="store_true", help=help_, ) elif dest == "nproc": if help_ is None: help_ = "Number of cores to deploy by DIAMOND (default: maximum)." argument_group.add_argument( "-n", "--nproc", dest="nproc", metavar="", required=required, type=int, default=default, help=help_, ) elif dest == "sensitive": if help_ is None: help_ = "Run DIAMOND in sensitive mode (default: not enabled)." argument_group.add_argument( "--sensitive", dest="sensitive", required=required, action="store_true", help=help_, ) elif dest == "no_self_hits": if help_ is None: help_ = ( "Do not report identical self hits by DIAMOND (default: " "not enabled)." ) argument_group.add_argument( "--no_self_hits", dest="no_self_hits", required=required, action="store_true", help=help_, ) elif dest == "block_size": if help_ is None: help_ = ( "DIAMOND block-size parameter (default: {0}). Lower numbers " "will decrease memory and temporary disk space usage." "".format(default) ) argument_group.add_argument( "--block_size", dest="block_size", metavar="", required=required, type=float, default=default, help=help_, ) elif dest == "index_chunks": if help_ is None: help_ = ( "DIAMOND index-chunks parameter (default: {0}). Set to 4 on " "low memory machines. The parameter has no effect on " "temporary disk space usage.".format(default) ) argument_group.add_argument( "--index_chunks", dest="index_chunks", metavar="", required=required, type=int, default=default, help=help_, ) elif dest == "tmpdir": if help_ is None: help_ = ( "Directory for temporary DIAMOND files (default: directory to " "which output files are written)." ) argument_group.add_argument( "--tmpdir", dest="tmpdir", metavar="", required=required, type=str, action=PathAction, help=help_, ) elif dest == "compress": if help_ is None: help_ = "Compress DIAMOND alignment file (default: not enabled)." argument_group.add_argument( "--compress", dest="compress", required=required, action="store_true", help=help_, ) elif dest == "top": if help_ is None: help_ = ( "DIAMOND top parameter [0-100] (default: {0}). Governs hits " "within range of best hit that are written to the alignment " "file. This is not the [-r / --range] parameter! See " "README.md.".format(default) ) argument_group.add_argument( "--top", dest="top", metavar="", required=required, type=float, choices=[i for i in range(101)], default=default, help=help_, ) else: sys.exit("Unknown parser dest {0}.".format(dest)) return def add_all_diamond_arguments(argument_group): add_argument( argument_group, "nproc", False, default=multiprocessing.cpu_count() ) add_argument(argument_group, "sensitive", False) add_argument(argument_group, "no_self_hits", False) add_argument(argument_group, "block_size", False, default=12.0) add_argument(argument_group, "index_chunks", False, default=1) add_argument(argument_group, "tmpdir", False) add_argument(argument_group, "compress", False) add_argument(argument_group, "top", False, default=11) return def expand_arguments(args): if "r" in args: setattr(args, "one_minus_r", (100 - args.r) / 100) if "out_prefix" in args: if not args.tmpdir: tmpdir = "{0}/".format(args.out_prefix.rsplit("/", 1)[0]) setattr(args, "tmpdir", tmpdir) # Check out_prefix as the log file needs to be written to a valid # location. error = check.check_out_prefix(args.out_prefix, None, args.quiet) if error: sys.exit(1) log_file = "{0}.log".format(args.out_prefix) with open(log_file, "w") as outf1: pass else: log_file = None setattr(args, "log_file", log_file) if "db_dir" in args: database_folder_path = str( pathlib.Path(args.db_dir) / pathlib.Path("db")) diamond_database_name = "{0}.dmnd".format(args.common_prefix) diamond_database_path = str( database_folder_path / pathlib.Path(diamond_database_name)) taxonomy_folder_path = str( pathlib.Path(args.db_dir) / pathlib.Path("tax")) fastaid2LCAtaxid_fname = "{0}.fastaid2LCAtaxid".format( args.common_prefix) fastaid2LCAtaxid_path = database_folder_path / pathlib.Path( fastaid2LCAtaxid_fname) fastaid2LCAtaxid_file = str(fastaid2LCAtaxid_path) taxids_with_multiple_offspring_fname = ( "{0}.taxids_with_multiple_offspring".format(args.common_prefix)) taxids_with_multiple_offspring_path = ( database_folder_path / pathlib.Path(taxids_with_multiple_offspring_fname) ) taxids_with_multiple_offspring_file = str( taxids_with_multiple_offspring_path) setattr(args, "database_folder", database_folder_path) setattr(args, "taxonomy_folder", taxonomy_folder_path) setattr(args, "diamond_database", diamond_database_path) setattr(args, "fastaid2LCAtaxid_file", fastaid2LCAtaxid_file) setattr( args, "taxids_with_multiple_offspring_file", taxids_with_multiple_offspring_file, ) if "taxonomy_folder" in args and not "db_dir" in args: setattr( args, "taxonomy_folder", "{0}/".format(args.taxonomy_folder.rstrip("/")), ) explore_taxonomy_folder(args) if "database_folder" in args and not "db_dir" in args: setattr( args, "database_folder", "{0}/".format(args.database_folder.rstrip("/")), ) explore_database_folder(args) if "bin_fasta_or_folder" in args: if os.path.isfile(args.bin_fasta_or_folder): setattr(args, "bin_fasta", args.bin_fasta_or_folder) else: setattr(args, "bin_folder", args.bin_fasta_or_folder) return def explore_taxonomy_folder(args): nodes_dmp = None names_dmp = None prot_accession2taxid_file = None if os.path.isdir(args.taxonomy_folder): for file_ in os.listdir(args.taxonomy_folder): if file_ == "nodes.dmp": nodes_dmp = "{0}{1}".format(args.taxonomy_folder, file_) elif file_ == "names.dmp": names_dmp = "{0}{1}".format(args.taxonomy_folder, file_) # No need to check for this. elif file_.endswith("prot.accession2taxid.FULL.gz"): prot_accession2taxid_file = "{0}{1}".format( args.taxonomy_folder, file_) elif ( file_.endswith("prot.accession2taxid.gz") and prot_accession2taxid_file is None ): # Legacy prot_accession2taxid_file. prot_accession2taxid_file = "{0}{1}".format( args.taxonomy_folder, file_) setattr(args, "nodes_dmp", nodes_dmp) setattr(args, "names_dmp", names_dmp) setattr(args, "prot_accession2taxid_file", prot_accession2taxid_file) return def explore_database_folder(args): fasta_file = None diamond_database = None fastaid2LCAtaxid_file = None taxids_with_multiple_offspring_file = None if os.path.isdir(args.database_folder): for file_ in os.listdir(args.database_folder): if file_.endswith( (".fa", ".fasta", ".fna", "fa.gz", "fasta.gz", "fna.gz")): fasta_file = "{0}{1}".format(args.database_folder, file_) elif file_.endswith(".dmnd"): diamond_database = "{0}{1}".format(args.database_folder, file_) elif file_.endswith("fastaid2LCAtaxid"): fastaid2LCAtaxid_file = "{0}{1}".format( args.database_folder, file_) elif file_.endswith("taxids_with_multiple_offspring"): taxids_with_multiple_offspring_file = "{0}{1}".format( args.database_folder, file_) setattr(args, "db_fasta", fasta_file) setattr(args, "diamond_database", diamond_database) setattr(args, "fastaid2LCAtaxid_file", fastaid2LCAtaxid_file) setattr( args, "taxids_with_multiple_offspring_file", taxids_with_multiple_offspring_file, ) return def print_variables(args, step_list=None): if args.verbose: arguments = [ "{0}: {1}".format(k, v) for k, v in sorted(vars(args).items()) ] message = ( "\n-----------------\n\n" "Full list of arguments:\n" "{0}".format("\n".join(arguments)) ) give_user_feedback(message, args.log_file, args.quiet, show_time=False) if step_list is not None: message = "\nStep list: {0}".format(step_list) give_user_feedback( message, args.log_file, args.quiet, show_time=False) message = "\n-----------------\n" give_user_feedback(message, args.log_file, args.quiet, show_time=False) return def give_user_feedback( message, log_file=None, quiet=False, show_time=True, error=False, warning=False ): if error: message = "ERROR: {0}".format(message) if warning: message = "WARNING: {0}".format(message) if show_time: message = "{0} {1}".format(timestamp(), message) message = "{0}\n".format(message) if log_file: with open(log_file, "a") as outf1: outf1.write(message) if not quiet and not error: sys.stdout.write(message) if not quiet and error: sys.stderr.write(message) return def run_prodigal( path_to_prodigal, contigs_fasta, proteins_fasta, proteins_gff, log_file, quiet, ): message = ( "Running Prodigal for ORF prediction. Files {0} and {1} will be " "generated. Do not forget to cite Prodigal when using CAT or BAT in " "your publication.".format(proteins_fasta, proteins_gff) ) give_user_feedback(message, log_file, quiet) try: command = [ path_to_prodigal, "-i", contigs_fasta, "-a", proteins_fasta, "-o", proteins_gff, "-p", "meta", "-g", "11", "-q", "-f", "gff" ] subprocess.check_call(command) except: message = "Prodigal finished abnormally." give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) message = "ORF prediction done!" give_user_feedback(message, log_file, quiet) return def run_diamond(args): if args.sensitive: mode = "sensitive" else: mode = "fast" if args.compress: compression = "1" else: compression = "0" message = ( "Homology search with DIAMOND is starting. Please be patient. Do not " "forget to cite DIAMOND when using CAT or BAT in your publication.\n" "\t\t\tquery: {0}\n" "\t\t\tdatabase: {1}\n" "\t\t\tmode: {2}\n" "\t\t\ttop: {3}\n" "\t\t\tno-self-hits: {4}\n" "\t\t\tnumber of cores: {5}\n" "\t\t\tblock-size (billions of letters): {6}\n" "\t\t\tindex-chunks: {7}\n" "\t\t\ttmpdir: {8}\n" "\t\t\tcompress: {9}".format( args.proteins_fasta, args.diamond_database, mode, args.top, args.no_self_hits, args.nproc, args.block_size, args.index_chunks, args.tmpdir, compression ) ) give_user_feedback(message, args.log_file, args.quiet) try: command = [ args.path_to_diamond, "blastp", "-d", args.diamond_database, "-q", args.proteins_fasta, "--top", str(args.top), "--matrix", "BLOSUM62", "--evalue", "0.001", "-o", args.alignment_file, "-p", str(args.nproc), "--block-size", str(args.block_size), "--index-chunks", str(args.index_chunks), "--tmpdir", args.tmpdir, "--compress", compression ] if not args.verbose: command += ["--quiet"] if args.sensitive: command += ["--sensitive"] if args.no_self_hits: command += ["--no-self-hits"] subprocess.check_call(command) except: message = "DIAMOND finished abnormally." give_user_feedback(message, args.log_file, args.quiet, error=True) sys.exit(1) if args.compress: setattr(args, "alignment_file", "{0}.gz".format(args.alignment_file)) message = "Homology search done! File {0} created.".format( args.alignment_file) give_user_feedback(message, args.log_file, args.quiet) return def import_contig_names(fasta_file, log_file, quiet): message = "Importing contig names from {0}.".format(fasta_file) give_user_feedback(message, log_file, quiet) contig_names = set() with open(fasta_file, "r") as f1: for line in f1: if line.startswith(">"): contig = line.split()[0].lstrip(">").rstrip() if contig in contig_names: message = ( "your fasta file contains duplicate headers. The " "first duplicate encountered is {0}, but there might " "be more...".format(contig) ) give_user_feedback(message, log_file, quiet, error=True) sys.exit(1) contig_names.add(contig) return contig_names def import_ORFs(proteins_fasta, log_file, quiet): message = "Parsing ORF file {0}".format(proteins_fasta) give_user_feedback(message, log_file, quiet) contig2ORFs = {} with open(proteins_fasta, "r") as f1: for line in f1: line = line.rstrip() if line.startswith(">"): ORF = line.split()[0].lstrip(">") contig = ORF.rsplit("_", 1)[0] if contig not in contig2ORFs: contig2ORFs[contig] = [] contig2ORFs[contig].append(ORF) return contig2ORFs def parse_tabular_alignment(alignment_file, one_minus_r, log_file, quiet): message = "Parsing alignment file {0}.".format(alignment_file) give_user_feedback(message, log_file, quiet) compressed = False if alignment_file.endswith(".gz"): compressed = True f1 = gzip.open(alignment_file, "rb") else: f1 = open(alignment_file, "r") ORF2hits = {} all_hits = set() ORF = "first ORF" ORF_done = False for line in f1: if compressed: line = line.decode("utf-8") if line.startswith(ORF) and ORF_done == True: # The ORF has already surpassed its minimum allowed bit-score. continue line = line.rstrip().split("\t") if not line[0] == ORF: # A new ORF is reached. ORF = line[0] top_bitscore = decimal.Decimal(line[11]) ORF2hits[ORF] = [] ORF_done = False bitscore = decimal.Decimal(line[11]) if bitscore >= one_minus_r * top_bitscore: # The hit has a high enough bit-score to be included. hit = line[1] ORF2hits[ORF].append( (hit, bitscore),) all_hits.add(hit) else: # The hit is not included because its bit-score is too low. ORF_done = True f1.close() return (ORF2hits, all_hits) def is_gz(file_path): """Check if given file_paht is gzipped based on suffix.""" if isinstance(file_path, pathlib.Path): file_path = file_path.name return file_path.endswith(".gz") or file_path.endswith(".z") def optionally_compressed_handle(file_path, mode): """Return an appropriate file handle to operate on. Arguments: file_path: str or PathLike: File path. mode: str: The passed mode to open the file on. Return: A file handle either gzip opened or plainly opened for reading/writing/appending in text mode. """ if mode == "r" or mode == "rb": mode = "rt" if mode == "w" or mode == "wb": mode = "wt" if mode == "a" or mode == "ab": mode = "at" if is_gz(file_path): return gzip.open(file_path, mode=mode) else: return open(file_path, mode=mode) if __name__ == "__main__": sys.exit("Run \'CAT\' to run CAT or BAT.") CAT-5.3/CAT_pack/summarise.py000066400000000000000000000363461452142602700157730ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import sys import about import check import shared def parse_arguments(): parser = argparse.ArgumentParser( prog="CAT summarise", description="Summarise a named CAT or BAT classification file.", usage=("CAT summarise -i FILE -o FILE (-c FILE) " "[options] [-h / --help]"), add_help=False) required = parser.add_argument_group("Required arguments") shared.add_argument(required, "input_file", True, help_=( "Path to named CAT contig classification file or BAT bin " "classification file. Currently only official ranks are " "supported, and only classification files containing a single " "classification per contig / bin. If you want to summarise a " "contig classification file, you have to supply the contigs " "fasta file with argument [-c / --contigs_fasta].")) shared.add_argument(required, "output_file", True) optional = parser.add_argument_group("Optional arguments") shared.add_argument( optional, "contigs_fasta", False, help_=("Path to contigs fasta file. Required if you want to summarise " "a contig classification file.") ) shared.add_argument(optional, "force", False) shared.add_argument(optional, "quiet", False) shared.add_argument(optional, "help", False) (args, extra_args) = parser.parse_known_args() extra_args = [arg for (i, arg) in enumerate(extra_args) if (i, arg) != (0, "summarise")] if len(extra_args) > 0: sys.exit("error: too much arguments supplied:\n{0}".format( "\n".join(extra_args))) # Add extra arguments. shared.expand_arguments(args) return args def import_contig_lengths(contigs_fasta, log_file, quiet): message = "Gathering contig lengths from {0}.".format(contigs_fasta) shared.give_user_feedback(message, log_file, quiet) contig2length = {} with open(contigs_fasta, "r") as f1: for line in f1: line = line.rstrip() if line.startswith(">"): contig = line.split()[0].lstrip(">") contig2length[contig] = 0 else: try: contig2length[contig] += len(line) except: message = "{0} is not a contigs fasta".format( contigs_fasta) shared.give_user_feedback( message, log_file, quiet, error=True) sys.exit(1) return contig2length def summarise_contigs(args): message = "# CAT v{0}.".format(about.__version__) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) errors = [] errors.append( check.check_input_file(args.input_file, args.log_file, args.quiet)) if not args.force: errors.append( check.check_output_file( args.output_file, args.log_file, args.quiet) ) errors.append( check.check_in_and_output_file( args.input_file, args.output_file, args.log_file, args.quiet) ) if True in errors: sys.exit(1) contig2length = import_contig_lengths( args.contigs_fasta, args.log_file, args.quiet) message = "Summarising..." shared.give_user_feedback(message, args.log_file, args.quiet) with open(args.input_file, "r") as f1: for line in f1: if line.startswith("#"): line = line.split("\t") if line[0] != "# contig": message = "{0} is not a CAT classification file.".format( args.input_file) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) if line[0] == "# bin": message = ( "{0} appears to be a BAT classification file. If " "you want to summarise bin classifications, " "simply don\'t supply a contigs fasta and " "everything should be fine." "".format(args.input_file) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) try: superkingdom_index = line.index("superkingdom") except: message = ( "official ranks not found in header of {0}. Make sure " "that the CAT classification file is named with " "official ranks with " "\'CAT add_names --only_official\'." "".format(args.input_file) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) break else: message = "input file does not have a recognisable header." shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) length = {} length["no taxid assigned"] = [] ORFs = {} official_ranks = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"] for rank in official_ranks: length[rank] = {} ORFs[rank] = {} n = 0 contig_trace = set() doubles = set() with open(args.input_file, "r") as f1: for line in f1: line = line.rstrip() if line.startswith("#"): continue n += 1 line = line.split("\t") contig = line[0] if contig in contig_trace: doubles.add(contig) contig_trace.add(contig) if contig not in contig2length: message = ( "contig {0} in CAT classification file is not found in " "supplied contigs fasta file. Are you sure the CAT " "classification file is based on the contigs fasta?" "".format(contig) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) if line[1] == "no taxid assigned": length["no taxid assigned"].append(contig2length[contig]) continue for (i, classification) in enumerate(line[superkingdom_index:]): classification = classification.rsplit(": ", 1)[0].rstrip("*") rank = official_ranks[i] if classification not in length[rank]: length[rank][classification] = [] ORFs[rank][classification] = [] length[rank][classification].append(contig2length[contig]) # NOTE that the total number of ORFs on a contig is reported, # not only the number of ORFs a classification is based on. ORFs_on_contig = int(line[2].split("/")[1].split(" ")[0]) ORFs[rank][classification].append(ORFs_on_contig) if len(doubles) != 0: message = ( "some contigs have multiple classifications. CAT summarise " "currently does not allow for this. Contigs with multiple " "classifications: {0}.".format(", ".join(list(doubles))) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) if n != len(contig2length): message = ( "the number of classified contigs is not the same as the number " "of contigs in contigs fasta. Are you sure the CAT classification " "file is based on the contigs fasta?" ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) with open(args.output_file, "w") as outf1: n_contigs = len(contig2length) total_length = sum(contig2length.values()) n_classified_contigs = n_contigs - len(length["no taxid assigned"]) total_classified_length = total_length - sum( length["no taxid assigned"]) outf1.write( "# total number of contigs in {0} is {1:,d} representing " "{2:,d} positions.\n" "".format(args.contigs_fasta, n_contigs, total_length) ) outf1.write( "# {0:,d} contigs have taxonomy assigned ({1:.2f}%) representing " "{2:,d} positions ({3:.2f}%) in {4}.\n" "".format( n_classified_contigs, n_classified_contigs / n_contigs * 100, total_classified_length, total_classified_length / total_length * 100, args.input_file ) ) outf1.write("#\n") outf1.write( "# rank\t" "clade\t" "number of contigs\t" "number of ORFs\t" "number of positions\n" ) for rank in official_ranks: for clade in sorted( length[rank], key=lambda x: sum(length[rank][x]), reverse=True ): outf1.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format( rank, clade, len(length[rank][clade]), sum(ORFs[rank][clade]), sum(length[rank][clade]) )) message = "{0} is created!".format(args.output_file) shared.give_user_feedback(message, args.log_file, args.quiet) return def summarise_bins(args): message = "# CAT v{0}.".format(about.__version__) shared.give_user_feedback( message, args.log_file, args.quiet, show_time=False) errors = [] errors.append( check.check_input_file(args.input_file, args.log_file, args.quiet)) if not args.force: errors.append( check.check_output_file( args.output_file, args.log_file, args.quiet) ) errors.append( check.check_in_and_output_file( args.input_file, args.output_file, args.log_file, args.quiet) ) if True in errors: sys.exit(1) message = "Summarising..." shared.give_user_feedback(message, args.log_file, args.quiet) with open(args.input_file, "r") as f1: for line in f1: if line.startswith("#"): line = line.split("\t") if line[0] != "# bin": message = "{0} is not a BAT classification file.".format( args.input_file) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) if line[0] == "# contig": message = ( "{0} appears to be a CAT classification file. If " "you want to summarise contig classifications, " "supply a contigs fasta with argument " "[-c / --contigs_fasta]." "".format(args.input_file) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) try: superkingdom_index = line.index("superkingdom") except: message = ( "official ranks not found in header of {0}. Make sure " "that the BAT classification file is named with " "official ranks with " "\'CAT add_names --only_official\'." "".format(args.input_file) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) break else: message = "input file does not have a recognisable header." shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) n_bins = {} n_bins["no taxid assigned"] = 0 official_ranks = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"] for rank in official_ranks: n_bins[rank] = {} n = 0 bin_trace = set() doubles = set() with open(args.input_file, "r") as f1: for line in f1: line = line.rstrip() if line.startswith("#"): continue n += 1 line = line.split("\t") bin_ = line[0] if bin_ in bin_trace: doubles.add(bin_) bin_trace.add(bin_) if line[1] == "no taxid assigned": n_bins["no taxid assigned"] += 1 continue for (i, classification) in enumerate(line[superkingdom_index:]): classification = classification.rsplit(": ", 1)[0].rstrip("*") rank = official_ranks[i] if classification not in n_bins[rank]: n_bins[rank][classification] = 0 n_bins[rank][classification] += 1 if len(doubles) != 0: message = ( "some bins have multiple classifications. CAT summarise currently " "does not allow for this. Bins with multiple classifications: {0}." "".format(", ".join(list(doubles))) ) shared.give_user_feedback( message, args.log_file, args.quiet, error=True) sys.exit(1) n_classified_bins = n - n_bins["no taxid assigned"] with open(args.output_file, "w") as outf1: outf1.write( "# total number of bins is {0:,d}, of which {1:,d} " "({2:.2f}%) have taxonomy assigned.\n" "".format(n, n_classified_bins, n_classified_bins / n * 100) ) outf1.write("#\n") outf1.write("# rank\tclade\tnumber of bins\n") for rank in official_ranks: for clade in sorted( n_bins[rank], key=lambda x: n_bins[rank][x], reverse=True): outf1.write("{0}\t{1}\t{2}\n".format( rank, clade, n_bins[rank][clade])) message = "{0} is created!".format(args.output_file) shared.give_user_feedback(message, args.log_file, args.quiet) return def run(): args = parse_arguments() if not args.contigs_fasta: summarise_bins(args) else: summarise_contigs(args) return if __name__ == "__main__": sys.exit("Run \'CAT summarise\' to summarise a named CAT contig " "classification file or named BAT bin classification file.") CAT-5.3/CAT_pack/tax.py000066400000000000000000000210031452142602700145420ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import shared def import_nodes(nodes_dmp, log_file, quiet): message = "Loading file {0}.".format(nodes_dmp) shared.give_user_feedback(message, log_file, quiet) taxid2parent = {} taxid2rank = {} with open(nodes_dmp, "r") as f1: for line in f1: line = line.split("\t") taxid = line[0] parent = line[2] rank = line[4] taxid2parent[taxid] = parent taxid2rank[taxid] = rank return (taxid2parent, taxid2rank) def import_names(names_dmp, log_file, quiet): message = "Loading file {0}.".format(names_dmp) shared.give_user_feedback(message, log_file, quiet) taxid2name = {} with open(names_dmp, "r") as f1: for line in f1: line = line.split("\t") if line[6] == "scientific name": taxid = line[0] name = line[2] taxid2name[taxid] = name return taxid2name def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet): message = "Loading file {0}.".format(fastaid2LCAtaxid_file) shared.give_user_feedback(message, log_file, quiet) fastaid2LCAtaxid = {} with open(fastaid2LCAtaxid_file, "r") as f1: for line in f1: line = line.rstrip().split("\t") if line[0] in all_hits: # Only include fastaids that are found in hits. fastaid2LCAtaxid[line[0]] = line[1] return fastaid2LCAtaxid def import_taxids_with_multiple_offspring( taxids_with_multiple_offspring_file, log_file, quiet): message = "Loading file {0}.".format(taxids_with_multiple_offspring_file) shared.give_user_feedback(message, log_file, quiet) taxids_with_multiple_offspring = set() with open(taxids_with_multiple_offspring_file, "r") as f1: for line in f1: line = line.rstrip() taxids_with_multiple_offspring.add(line) return taxids_with_multiple_offspring def find_lineage(taxid, taxid2parent, lineage=None): if lineage is None: lineage = [] lineage.append(taxid) if taxid2parent[taxid] == taxid: return lineage else: return find_lineage(taxid2parent[taxid], taxid2parent, lineage) def find_LCA(list_of_lineages): overlap = set.intersection(*map(set, list_of_lineages)) for taxid in list_of_lineages[0]: if taxid in overlap: return taxid def find_LCA_for_ORF(hits, fastaid2LCAtaxid, taxid2parent): list_of_lineages = [] top_bitscore = 0 for (hit, bitscore) in hits: if bitscore > top_bitscore: top_bitscore = bitscore try: taxid = fastaid2LCAtaxid[hit] lineage = find_lineage(taxid, taxid2parent) list_of_lineages.append(lineage) except: # The fastaid does not have an associated taxid for some reason. pass if len(list_of_lineages) == 0: return ( "no taxid found ({0})".format(";".join([i[0] for i in hits])), top_bitscore ) overlap = set.intersection(*map(set, list_of_lineages)) for taxid in list_of_lineages[0]: if taxid in overlap: return (taxid, top_bitscore) def find_questionable_taxids(lineage, taxids_with_multiple_offspring): questionable_taxids = [] if lineage == ["1"] or lineage == ["root"]: return questionable_taxids if len(lineage) == 2 and (lineage[1:] == ["1"] or lineage[1:] == ["root"]): return questionable_taxids for (i, taxid) in enumerate(lineage): taxid_parent = lineage[i + 1] if taxid_parent in taxids_with_multiple_offspring: return questionable_taxids questionable_taxids.append(taxid) def star_lineage(lineage, taxids_with_multiple_offspring): questionable_taxids = find_questionable_taxids( lineage, taxids_with_multiple_offspring) starred_lineage = [taxid if taxid not in questionable_taxids else "{0}*".format(taxid) for taxid in lineage] return starred_lineage def find_weighted_LCA(LCAs_ORFs, taxid2parent, f): list_of_lineages = [] list_of_bitscores = [] based_on_n_ORFs = 0 for (taxid, top_bitscore) in LCAs_ORFs: if taxid.startswith("no taxid found"): # Thus the ORFs that are not classified because they don"t have an # associated taxid are not taken into account for the # classification of the contig. continue lineage = find_lineage(taxid, taxid2parent) list_of_lineages.append(lineage) list_of_bitscores.append(top_bitscore) based_on_n_ORFs += 1 if len(list_of_lineages) == 0: return ( "no ORFs with taxids found.", "no ORFs with taxids found.", "no ORFs with taxids found." ) taxid2bitscore = {} for (i, lineage) in enumerate(list_of_lineages): for taxid in lineage: if taxid not in taxid2bitscore: taxid2bitscore[taxid] = 0 taxid2bitscore[taxid] += list_of_bitscores[i] whitelisted_lineages = [] for taxid in taxid2bitscore: if taxid2bitscore[taxid] / sum(list_of_bitscores) > f: lineage = find_lineage(taxid, taxid2parent) whitelisted_lineages.append(lineage) if len(whitelisted_lineages) == 0: return ( "no lineage whitelisted.", "no lineage whitelisted.", "no lineage whitelisted." ) whitelisted_lineages = sorted(whitelisted_lineages, key=lambda x: len(x), reverse=True) longest_lineages = [] longest_lineages_scores = [] taxid_trace = set() for whitelisted_lineage in whitelisted_lineages: if whitelisted_lineage[0] not in taxid_trace: longest_lineages.append(whitelisted_lineage) scores = [taxid2bitscore[taxid] / sum(list_of_bitscores) for taxid in whitelisted_lineage] longest_lineages_scores.append(scores) taxid_trace |= set(whitelisted_lineage) return (longest_lineages, longest_lineages_scores, based_on_n_ORFs) def convert_to_names(lineage, taxid2rank, taxid2name, scores=None): names = [] for (i, taxid) in enumerate(lineage): if "*" in taxid: taxid = taxid.rstrip("*") starred = True else: starred = False name = taxid2name[taxid] rank = taxid2rank[taxid] if scores is not None: if starred: names.append("{0}* ({1}): {2}".format(name, rank, scores[i])) else: names.append("{0} ({1}): {2}".format(name, rank, scores[i])) else: if starred: names.append("{0}* ({1})".format(name, rank)) else: names.append("{0} ({1})".format(name, rank)) return names def convert_to_official_names(lineage, taxid2rank, taxid2name, scores=None): official_ranks = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"] lineage_ranks = [taxid2rank[taxid.rstrip("*")] for taxid in lineage] official_names = ["no support"] * 7 for (i, rank) in enumerate(official_ranks): if rank in lineage_ranks: index = lineage_ranks.index(rank) taxid = lineage[index] if "*" in taxid: taxid = taxid.rstrip("*") starred = True else: starred = False name = taxid2name[taxid] if scores is not None: if starred: official_names[i] = "{0}*: {1}".format(name, scores[index]) else: official_names[i] = "{0}: {1}".format(name, scores[index]) else: if starred: official_names[i] = "{0}*".format(name) else: official_names[i] = name # Fill the official lineage with NAs if a lower classification is present. index_lowest_classification = 0 for (i, name) in enumerate(official_names): if name != "no support": index_lowest_classification = i for i in range(index_lowest_classification): if official_names[i] == "no support": official_names[i] = "NA" return official_names if __name__ == "__main__": sys.exit("Run \'CAT\' to run CAT or BAT.") CAT-5.3/CHANGELOG.md000066400000000000000000000065701452142602700136140ustar00rootroot00000000000000# Changelog ## 5.3 * GTDB support. * Sequence databases (NCBI nr or GTDB) can be downloaded with `CAT download`, and CAT databases constructed with `CAT prepare`. * Sensible defaults of DIAMOND parameters for high memory machines: `--top 11 --block_size 12 --index_chunks 1`. * You can now run CAT first, and then run BAT with the predicted proteins fasta and alignment file generated by CAT. * Preparations for Read Annotation Tool (RAT). ## 5.2.3 Minor bug fix for `CAT add_names`. ## 5.2.2 We have added the DIAMOND specific `--no_self_hits` flag. We have also added some extra checks and removed redundancy from the parser code. Databases constructed by `CAT prepare` now have a slightly different naming scheme. ## 5.2.1 Minor bug fix for `CAT prepare`. ## 5.2 `CAT prepare` now uses the latest taxonomy mapping files from NCBI, significantly expanding taxonomic coverage of proteins in nr. File integrity of downloads is assessed based on md5 checksums. The ORF2LCA output file contains a new column for the number of hits the classification is based on. We have made textual changes to the output files to better reflect the meaning of 'classified' and 'not classified' in different contexts. ## 5.1.2 Code streamlining. ## 5.1.1 CAT and BAT can now compress the DIAMOND alignment file, and import gzip compressed alignment files. ## 5.1 The code has been rewritten to prepare for future extensions. We have also added the `--verbose` flag. ## 5.0.5. Skip hidden files in bin folder. ## 5.0.4. We have added the `--no_stars` flag alongside a minor bug fix. ## 5.0.3 Bug fix for single bin mode. ## 5.0.2 Floating point numbers have been changed to decimals. ## 5.0.1 Updated license to MIT. ## 5.0 We have simplified the output table format: we have added a 'reason' column, which shows the number of ORFs a classification is based on and the total number of predicted ORFs on a contig/MAG. In case of an unclassified sequence, the reason for this is shown in this column as well. Moreover, `add_names` now has an option to exclude the bit-score support scores from the lineage! ## 4.6 We have added the DIAMOND `--top` parameter and the `--I_know_what_Im_doing` flag for experimental features. ## 4.5 BAT can now be run in single bin mode. The familiar `./CAT bins` is still the go-to option if you want to classify multiple MAGs, but if it's only one MAG you are interested in try out `./CAT bin`! An added benefit of single bin mode is that you can use the alignment and predicted protein files of the BAT run to classify individual contigs within the MAG with CAT, or the other way around. ## 4.4 We have added DIAMOND specific options. This allows you to use sensitive mode, and tune memory and temporary disk space usage during alignment! Moreover, you can now force CAT and BAT to overwrite existing files. ## 4.3.4 We extended some of the pre-flight checks. ## 4.3.3 Minor bug fix. ## 4.3.2 A fruity update: CAT and BAT are now macOS compatible! ## 4.3.1 We removed the psutil dependency. ## 4.3 Prepare now checks whether the RAM of your computer is large enough. If not, not to worry! We have put preconstructed databases online. ## 4.2 Code streamlining. ## 4.1 CAT and BAT leave much less footprints: the size of the alignment output files is greatly reduced, alignment is now up to 3 times faster than previous releases. ## 4.0 CAT and BAT have been completely rewritten, bumping the version up to 4.0. CAT-5.3/LICENSE.md000066400000000000000000000020501452142602700133740ustar00rootroot00000000000000Copyright (c) 2019 Universiteit Utrecht Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. CAT-5.3/README.md000066400000000000000000000504421452142602700132570ustar00rootroot00000000000000# CAT and BAT - [Introduction](#introduction) - [Dependencies and where to get them](#dependencies-and-where-to-get-them) - [Installation](#installation) - [Getting started](#getting-started) - [Usage](#usage) - [Interpreting the output files](#interpreting-the-output-files) - [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk) - [Optimising running time, RAM, and disk usage](#optimising-running-time-ram-and-disk-usage) - [Examples](#examples) ## Introduction Contig Annotation Tool (CAT) and Bin Annotation Tool (BAT) are pipelines for the taxonomic classification of long DNA sequences and metagenome assembled genomes (MAGs/bins) of both known and (highly) unknown microorganisms, as generated by contemporary metagenomics studies. The core algorithm of both programs involves gene calling, mapping of predicted ORFs against a protein database, and voting-based classification of the entire contig / MAG based on classification of the individual ORFs. CAT and BAT can be run from intermediate steps if files are formated appropriately (see [Usage](#usage)). A paper describing the algorithm together with extensive benchmarks can be found at https://doi.org/10.1186/s13059-019-1817-x. If you use CAT or BAT in your research, it would be great if you could cite us: * *von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biology. 2019;20:217.* ## Dependencies and where to get them Python 3, https://www.python.org/. DIAMOND, https://github.com/bbuchfink/diamond. Prodigal, https://github.com/hyattpd/Prodigal. CAT and BAT have been thoroughly tested on Linux systems, and should run on macOS as well. ## Installation No installation is required. You can run CAT and BAT by supplying the absolute path: ``` $ ./CAT_pack/CAT --help ``` Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT and BAT from anywhere: ``` $ CAT --version ``` *Special note for Mac users: since the macOS file system is case-insensitive by default, adding the CAT\_pack directory to your `$PATH` variable might replace calls to the standard unix `cat` utility. We advise Mac users to run CAT from its absolute path.* CAT and BAT can also be installed via Bioconda, thanks to Silas Kieser: ``` $ conda install -c bioconda cat ``` ## Getting started To get started with CAT and BAT, you will have to get the database files on your system. You can either download preconstructed database files, or generate them yourself. ### Downloading preconstructed database files To download the database files, find the most recent version on [tbb.bio.uu.nl/bastiaan/CAT\_prepare/](https://tbb.bio.uu.nl/bastiaan/CAT_prepare/), download and extract, and you are ready to go! ``` $ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz $ tar -xvzf CAT_prepare_20210107.tar.gz ``` Your version of DIAMOND should be the same as with which the database is constructed. For this reason the DIAMOND executable is supplied within the CAT prepare folder. Alternatively, you can find the DIAMOND version used for database construction within the database log file: ``` $ grep version 2021-01-07.CAT_prepare.fresh.log ``` ### Preparing a CAT database You *must* have the following input ready before you launch a `CAT prepare` run. 1. A fasta file containing all protein sequences you want to include in your database. 2. A `names.dmp` file that contains mappings of taxids to their ranks and scientific names. The format must be the same as the NCBI standard `names.dmp` (uses `\t|\t` as field separator). An example would look like this: ``` 1 | root | | scientific name | 2 | Bacteria | | scientific name | 562 | Escherichia coli | scientific name | ``` 3. A `nodes.dmp` file that describes the child-parent relationship of the nodes in the taxonomy tree and their (official) rank. The format must be the same as the NCBI standard `nodes.dmp` (uses `\t|\t` as the field separator. An example would look like this: ``` 1 | 1 | root | 2 | 1 | superkingdom | 1224 | 2 | phylum | 1236 | 1224 | class | 91437 | 1236 | order | 543 | 91347 | family | 561 | 543 | genus | 562 | 561 | species | ``` For more information on the `nodes.dmp` and `names.dmp` files, see the [NCBI taxdump_readme.txt](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_readme.txt). 4. A 2-column, tab-separated file containing the mapping of each sequence in the fasta file to a taxid in the taxonomy. This *must* contain the header `accession.version taxid`. An example would look like this ``` accession.version taxid protein_1 562 protein_2 123456 ``` Once all of the above requirements are met you can run `CAT prepare`. All the input needs to be explicitly specified for `CAT prepare` to work. E.g. ``` CAT prepare \ --db_fasta path/to/fasta \ --names path/to/names.dmp \ --nodes path/to/nodes.dmp \ --acc2tax path/to/acc2taxid.txt.gz \ --db_dir path/to/output_dir ``` will create the `output_dir` that will look like this ``` output_dir ├── 2021-11-17_CAT.log ├── db │   ├── 2021-11-17_CAT.dmnd │   ├── 2021-11-17_CAT.fastaid2LCAtaxid │   └── 2021-11-17_CAT.taxids_with_multiple_offspring └── tax ├── names.dmp └── nodes.dmp ``` Notes: - Two subdirs are created `db` and `tax` that contain the necessary files. - The `nodes.dmp` and `names.dmp` in the `tax` directory are copied from their original location. This is to ensure that the `-t` flag of the rest of CAT modules works. - The default prefix is `_CAT`. You can customize it with the `--common_prefix` option. For all command line options available see ``` $ CAT prepare -h ``` ### Downloading raw data for nr and GTDB The `download` module can be used to download and process raw data, in preparation for building a new CAT database. This will ensure that all input dependencies are met and correctly formatted for `CAT prepare`. Currently, two databases are supported, NCBI's nr and GTDB proteins. * NCBI non-redundant protein database ( aka `nr`) Command: ``` $ CAT download -db nr -o path/to/nr_data_dir ``` Download the fasta file with the protein sequences, their mapping to a taxid and the taxonomy information from the NCBI's ftp site. * [Genome Taxonomy Database](https://gtdb.ecogenomic.org/) proteins Command: ``` $ CAT download -db gtdb -o path/to/gtdb_data_dir ``` The files required to build a CAT database are provided by the [GTDB downloads page](https://gtdb.ecogenomic.org/downloads). `CAT download` fetches the necessary files and does some additional processing to get them ready for `CAT prepare`: - The taxonomy information, provided for each genome from GTDB, is transformed into the NCBI style `nodes.dmp` and `names.dmp`. The species level annotation from GTDB is used as the unique taxid identifier. For example, all proteins coming from a representative genome for species `Escherichia coli` are assigned a taxid of `s__Escherichia coli`. All proteins from that genome get its taxid. - Fasta files containing protein sequences are extracted from the provided `gtdb_proteins_aa_reps.tar.gz` and are subjected to a round of deduplication. This is to reduce the redundancy in the DIAMOND database to be created, thus simplifying the alignment process. Exact duplicate sequences are identified based on a combination of the MD5 sum of the protein sequences and their length. Only one representative sequence is kept, with information on the rest of the accessions identified as duplicates encoded in the fasta header. This information is later used by `CAT prepare` to assign the LCA of the protein sequence appropriately in the `.fastaid2LCAtaxid` file. - The mapping of **all** protein sequences (duplicates or not) to their respective taxonomy is created. This is also used by `CAT prepare` for proper LCA identification. - In addition, the newick formatted trees for Bacteria and Archaea are downloaded and - artificially - concatenated under a single `root` node, to produce an `all.tree` file. This can come in handy for downstream analyses tools that require a phylogeny to be present to calculate diversity indices based on some metric that takes that information into account. This is **not** required for `CAT`. When the download and processing of the files is finished successfully you can build a CAT database with `CAT prepare`. For all command line options available see ``` $ CAT download -h ``` ### Running CAT and BAT. The taxonomy folder and database folder created by CAT prepare are needed in subsequent CAT and BAT runs. They only need to be generated/downloaded once or whenever you want to update the database. To run CAT on a contig set, each header in the contig fasta file (the part after `>` and before the first space) needs to be unique. To run BAT on set of MAGs, each header in a MAG needs to be unique within that MAG. If you are unsure if this is the case, you can just run CAT or BAT, as the appropriate error messages are generated if formatting is incorrect. ### Getting help. If you are unsure what options a program has, you can always add `--help` to a command. This is a great way to get you started with CAT and BAT. ``` $ CAT --help $ CAT contigs --help $ CAT summarise --help ``` ## Usage After you have got the database files on your system, you can run CAT to annotate your contig set: ``` $ CAT contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder} ``` Multiple output files and a log file will be generated. The final classification files will be called `out.CAT.ORF2LCA.txt` and `out.CAT.contig2classification.txt`. Alternatively, if you already have a predicted proteins fasta file and/or an alignment table for example from previous runs, you can supply them to CAT, which will then skip the steps that have already been done and start from there: ``` $ CAT contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file} ``` The headers in the predicted proteins fasta file must look like this `>{contig}_{ORFnumber}`, so that CAT can couple contigs to ORFs. The alignment file must be tab-seperated, with queried ORF in the first column, protein accession number in the second, and bit-score in the 12th. To run BAT on a set of MAGs: ``` $ CAT bins -b {bin folder} -d {database folder} -t {taxonomy folder} ``` Alternatively, BAT can be run on a single MAG: ``` $ CAT bin -b {bin fasta} -d {database folder} -t {taxonomy folder} ``` Multiple output files and a log file will be generated. The final classification files will be called `out.BAT.ORF2LCA.txt` and `out.BAT.bin2classification.txt`. Similarly to CAT, BAT can be run from intermidate steps if gene prediction and alignment have already been carried out once: ``` $ CAT bins -b {bin folder} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file} ``` If BAT is run in single bin mode, you can use these predicted protein and alignment files to classify individual contigs within the MAG with CAT. ``` $ CAT bin -b {bin fasta} -d {database folder} -t {taxonomy folder} $ CAT contigs -c {bin fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file} ``` You can also do this the other way around; start with contig classification and classify the entire MAG with BAT in single bin mode based on the files generated by CAT. ## Interpreting the output files The ORF2LCA output looks like this: ORF | lineage | bit-score --- | --- | --- contig\_1\_ORF1 | 1;131567;2;1783272 | 574.7 Where the lineage is the full taxonomic lineage of the classification of the ORF, and the bit-score the top-hit bit-score that is assigned to the ORF for voting. The BAT ORF2LCA output file has an extra column where ORFs are linked to the MAG in which they are found. The contig2classification and bin2classification output looks like this: contig or bin | classification | reason | lineage | lineage scores --- | --- | --- | --- | --- contig\_1 | taxid assigned | based on 14/15 ORFs | 1;131567;2;1783272 | 1.00; 1.00; 1.00; 0.78 contig\_2 | taxid assigned (1/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;1416614;1183438\* | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.23;0.23 contig\_2 | taxid assigned (2/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;33072 | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.77 contig\_3 | no taxid assigned | no ORFs found Where the lineage scores represent the fraction of bit-score support for each classification. **Contig\_2 has two classifications.** This can happen if the *f* parameter is chosen below 0.5. For an explanation of the **starred classification**, see [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk). To add names to the taxonomy id's in either output file, run: ``` $ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} ``` This will show you that for example contig\_1 is classified as Terrabacteria group. To only get official levels (*i.e.* superkingdom, phylum, ...): ``` $ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official ``` Or, alternatively: ``` $ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official --exclude_scores ``` If you have named a CAT or BAT classification file with official names, you can get a summary of the classification, where total length and number of ORFs supporting a taxon are calculated for contigs, and the number of MAGs per encountered taxon for MAG classification: ``` $ CAT summarise -c {contigs fasta} -i {named CAT classification file} -o {output file} $ CAT summarise -i {named BAT classification file} -o {output file} ``` CAT summarise currently does not support classification files wherein some contigs / MAGs have multiple classifications (as contig\_2 above). ## Marking suggestive taxonomic assignments with an asterisk When we want to confidently go down to the lowest taxonomic level possible for a classification, an important assumption is that on that level conflict between classifications could have arisen. Namely, if there were conflicting classifications, the algorithm would have made the classification more conservative by moving up a level. Since it did not, we can trust the low-level classification. However, it is not always possible for conflict to arise, because in some cases no other sequences from the clade are present in the database. This is true for example for the family Dehalococcoidaceae, which in our databases is the sole representative of the order Dehalococcoidales. Thus, here we cannot confidently state that an classification on the family level is more correct than an classification on the order level. For these cases, CAT and BAT mark the lineage with asterisks, starting from the lowest level classification up to the level where conflict could have arisen because the clade contains multiple taxa with database entries. The user is advised to examine starred taxa more carefully, for example by analysing sequence identity between predicted ORFs and hits, or move up the lineage to a confident classification (i.e. the first classification without an asterisk). If you do not want the asterisks in your output files, you can add the `--no_stars` flag to CAT or BAT. ## Optimising running time, RAM, and disk usage CAT and BAT may take a while to run, and may use quite a lot of RAM and disk space. Depending on what you value most, you can tune CAT and BAT to maximize one and minimize others. The classification algorithm itself is fast and is friendly on memory and disk space. The most expensive step is alignment with DIAMOND, hence tuning alignment parameters will have the highest impact: - The `-n / --nproc` argument allows you to choose the number of cores to deploy. - You can choose to run DIAMOND in sensitive mode with the `--sensitive` flag. This will increase sensitivity but will make alignment considerably slower. - Setting the `--block_size` parameter lower will decrease memory and temporary disk space usage. Setting it higher will increase performance. - For high memory machines, it is adviced to set `--index_chunks` to 1. This parameter has no effect on temprary disk space usage. - You can specify the location of temporary DIAMOND files with the `--tmpdir` argument. - You can set the DIAMOND --top parameter (see below). ### Setting the DIAMOND --top parameter You can speed up DIAMOND considerably, and at the same time greatly reduce disk usage, by setting the DIAMOND `--top` parameter to lower values. This will govern hits within range of the best hit that are written to the alignment file. You have to be very carefull to 1) not confuse this parameter with the `r / --range` parameter, which does a similar cut-off but *after* alignment and 2) be aware that if you want to run CAT or BAT again afterwards with different values of the `-r / --range` parameter, your options will be limited to the range you have chosen with `--top` earlier, because all hits that fall outside this range will not be included in the alignment file. **Importantly**, CAT and BAT currently do not warn you if you choose `-r / --range` in a second run higher than `--top` in a previous one, **so it's up to you to remember this!** If you have understood all this, or you do not plan to tune `-r / --range` at all afterwards, you can add the `--I_know_what_Im_doing` flag and enjoy a huge speedup with much smaller alignment files! For CAT you can for example set `--top 11` and for BAT `--top 6`. ## Examples Getting help for running the prepare utility: ``` $ CAT prepare --help ``` First, create a fresh database. Next, run CAT on a contig set with default parameter settings deploying 16 cores for DIAMOND alignment. Finally, name the contig classification output with official names, and create a summary: ``` $ CAT prepare --fresh -d CAT_database/ -t CAT_taxonomy/ $ CAT contigs -c contigs.fasta -d CAT_database/ -t CAT_taxonomy/ -n 16 --out_prefix first_CAT_run $ CAT add_names -i first_CAT_run.contig2classification.txt -o first_CAT_run.contig2classification.official_names.txt -t CAT_taxonomy/ --only_official $ CAT summarise -c contigs.fasta -i first_CAT_run.contig2classification.official_names.txt -o CAT_first_run.summary.txt ``` Run the classification algorithm again with custom parameter settings, and name the contig classification output with all names in the lineage, excluding the scores: ``` $ CAT contigs --range 5 --fraction 0.1 -c contigs.fasta -d CAT_database/ -t CAT_taxonomy/ -p first_CAT_run.predicted_proteins.fasta -a first_CAT_run.alignment.diamond -o second_CAT_run $ CAT add_names -i second_CAT_run.contig2classification.txt -o second_CAT_run.contig2classification.names.txt -t CAT_taxonomy/ --exclude_scores ``` First, run BAT on a set of MAGs with custom parameter settings, suppressing verbosity and not writing a log file. Next, add names to the ORF2LCA output file: ``` $ CAT bins -r 10 -f 0.1 -b ../bins/ -s .fa -d CAT_database/ -t CAT_taxonomy/ -o BAT_run --quiet --no_log $ CAT add_names -i BAT_run.ORF2LCA.txt -o BAT_run.ORF2LCA.names.txt -t CAT_taxonomy/ ``` ### Identifying contamination/mis-binned contigs within a MAG. We often use the combination of CAT/BAT to explore possible contamination within a MAG. Run BAT on a single MAG. Next, classify the contigs within the MAG individually without generating new protein files or DIAMOND alignments. ``` $ CAT bin -b ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -o BAT.interesting_MAG $ CAT contigs -c ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -p BAT.interesting_MAG.predicted_proteins.faa -a BAT.interesting_MAG.alignment.diamond -o CAT.interesting_MAG ``` Contigs that have a different taxonomic signal than the MAG classification are probably contamination. Alternatively, you can look at contamination from the MAG perspective, by setting the *f* parameter to a low value: ``` $ CAT bin -f 0.01 -b ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -o BAT.interesting_MAG $ CAT add_names -i BAT.interesting_MAG.bin2classification.txt -o BAT.interesting_MAG.bin2classification.names.txt -t CAT_taxonomy/ ``` BAT will output any taxonomic signal with at least 1% support. Low scoring diverging signals are clear signs of contamination! CAT-5.3/tests/000077500000000000000000000000001452142602700131355ustar00rootroot00000000000000CAT-5.3/tests/data/000077500000000000000000000000001452142602700140465ustar00rootroot00000000000000CAT-5.3/tests/data/contigs/000077500000000000000000000000001452142602700155145ustar00rootroot00000000000000CAT-5.3/tests/data/contigs/small_contigs.fa000066400000000000000000000075071452142602700206730ustar00rootroot00000000000000>species_2 EFG1759503.1 GTCGTCGTCGTCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG TCAAAGCCATTCGGTATGGAACACACCTTCTTTATCAATGCGCTTATAAGTATGTGCTCCGAAATAGTCA CGCTGTGCCTGGATCAGGTTTGCAGGCAGAACAGCGGCACGGTAGCTGTCATAATAGGCAACCGCAGCAG CGAAGGTAGGAACCGGGATACCGTTCTGTACTGCGTAAGCGACAACTTCGCGCAGCGCCTGCTGGTAGTT ATCGGCAATTTGCTTGAAGTACGGAGCCAGCAGCAGGTTAGCGATCTGTGGATTTTCGATATAAGCATCG GTGATTTTCTGCAGGAACTGCGCACGGATGATGCAGCCAGCACGGAAAATCTTCGCGATTTCGCCGTAGT TCAGATCCCAGTTGTACTCTTCAGACGCAGCACGCAGCTGAGAGAAGCCCTGAGCGTAAGAAACGATTTT GCCCAGATACAGAGCACGACGAACTTTTTCGATGAACTCACCCTTGTCGCCTGCTGGCTGTGCTTGCGGG CCAGAGAGAACTTTAGATGCGGCAACACGCTGCTCTTTCAGAGAAGAGATATAACGTGCAAACACAGACT CGGTAATCAGCGACAGTGGTTCACCGAGATCCAGCGCGCTCTGGCTGGTCCATTTACCGGTACCTTTGTT AGCCGCTTCATCCAGAATCACATCAACCAGGTAGTTACCGTCTTCATCTTTTTTGGTGAAGATATCTTTG GTGATGTCGATCAGGTAGCTGCTCAGTTCACCGTTATTCCACTCGGTAAAGGTCTGCGCCAGTTCTTCGT TGGTGAGGTTCAGGCCACCTTTCAGCAAAGAATAGGCTTCAGCAATCAGCTGCATATCACCGTATTCAAT ACCGTTGTGAACCATCTTCACATAGTGACCTGCGCCATCGGCACCTATATAGGTCACGCATGGCTCACCG TCTTCAGCCACTGCGGCGATTTTGGTCAGGATCGGTGCAACCAGTTCATAGGCTTCTTTCTGCCCACCAG GCATGATAGAAGGGCCTTTCAGCGCACCTTCTTCACCACCGGAGACACCGGTGCCGATGAAGTTAAAGCC TTCTGCAGAAAGCTCACGGTTACGGCGAATGGTGTCCTGGAAGAAGGTGTTACCACCATCAATGATGATG TCACCTTTGTCGAGGTATGGCTTAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG CCCCCCCCCCCCCCCCC >class_3 PYI97175.1;WP_137987990.1 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG ATGATTACTCCTGTATCAGAAGAAGGAAACGGGAAACGGTTCGTCTCACACGCGCCAGGTTTTTGGCCGC AGACGCCCACGGAACTCTGGAATGATTGGAAATGGCAGCTCAAAAACCGGGTTACGTCACTCGCTCACTT GGAACAACACCTGGATCTTAGCGATGAGGAGCGCAGCGGTGTTTTGCTCTCGGGCGACAAACTCGCGCTG GCCGTAACGCCGCATTTTTTTAATCTGGTGCCGCGAAACAACCCTGAGGACCCAATCCGCCGTCAGGTAA TTCCGCGGATAGAGGAAACGTGGACGTCCCCTTACGACATGGCCGATCCGTGCGGCGAAGATTCGCACAT GCCGGTGCCGGGATTAGTGCACAGGTATCCCGACCGTGTCCTGTTCTTGGTGACGGATCGTTGCGCGAGC TACTGCCGCTACTGCACCCGAAGCCGCGTGGTGAGCGGAGTTGGTGAACAAGAATTGCATACAAATTTCG AGGAAGCGTTCCGTTATCTTCAACAACACAACGAAGTGCGCGATGTGCTCTTGAGTGGTGGCGACGCTCT CATTTTCAGCGACGACAAAATTGACAAGCTGCTTTCGCGACTGCGGTCAATTAAGCACATCGAGTTTGTT CGCATCGGCACGCGCGTTCCGATTTTCCTACCGCAACGCATCACGCCTGACTTATGCGCGCTGCTCGCCA AACATCATCCGCTCTGGATGAGTGTGCATGTAAATCATCCGCGCGAGCTGACGATTGAAGTGAAGGAAGC GTTAGAGCGTCTTGTGAACGCTGGCATTCCATTGGGAAACCAAAGCGTGCTCCTTGCGGGTGTGAACGAC GATCTTGAAACGATGAAAACGCTCGTGCACAAGCTTCTTATGTGTCGGGTGCGCCCTTATTACATTTACC AATGCGACCTCATCAACGGATCATCGCACTTGCGGACCTCGGTTGCCAAAGGGATCGAGATCATCGAGGG CTTGCGCGGCCACACCACCGGTTACGCGGTGCCGCAATTCGTTATCGACGCTCCCGGCGGCGGCGGGAAG GTGCCGATTAATCCGGGCTACGTCCTGTATCACGACAACGAAAAAATCGTGATTCGCAATTACGAAGGCA AAATCTTCGAGTACCCGGAAACTGGAAACGAGAACGTCCAGTTTGCGCCGCAGCGCGAGTATCACGACGA GTATCTCTATTCTTGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG ATGACAGATCAAGTCAGATGTGCGGTGCTGGGATTAGGACGATTAGGGTATTTTCATGCGAAGCATTTAG TCAGCGAAGTGCGGGGAGCCGAGCTTGCGGCCGTCTGTGACCCGATGAAGGGAAAGGCGGAAACGTGTGC GAAAGAATTAGGAATCGCCAAATGGACGGAGAATCCGTATGACCTGTTGGAAGATCACACGATTGATGCC GTCATCATTGTCACACCGACGAGCACACATGCGGAAATGATTATGAAGGCTGCTGAAAACGGCAAGCACA TCTTTGTTGAAAAGCCGCTCACCCTGAGTCTTGAGGAATCTAAAGAAGTCATGAAAAAAATCGAAGAAAC GGGCGTCATCTGCCAGGTCGGTTTTATGAGGCGGTTTGATCCGGCATACGCCGACGCGAAAAGAAGAATC GACGCCGGGGAAATCGGCAGGCCCATTTATTATAAAGGATTTACAAGAGATCAAGGCGCACCGCCCGCGG AATTTATCAAACATAGCGGAGGGCTTTTCATTGACTGTTCGATTCATGATTACGACATCGCGAGATATCT CATGAATGCCGAAGTTACGTCCGTCTGCGGGCACGGGAGGATTTTAAAGCACCCGTTCATGGAAGAGTGC GGCGACGTTGATCAGGCATTAACGTATCTTGAATTTGATTCGGGCGCGGCCGGCGATGTGGAGGCAAGCC GAAATTCTCCGTACGGGCATGATATCCGAGCCGAAATTATCGGGACGGCGGGAAGCATCTTGGTCGGTAC GCTGCGGAAAAGCCATGTCACCATTTTAACGGAGTCAGGCAGCAGCTACGAAATTATTCCGGATTTTCAA GCCCGCTTTAAAGACGCTTACCGTCTGGAGCTTGAGCACTTTGCCGAATGTGTGAAAAAAGGGGAAATGC CGATTGTGACAGACGTCGATGCCACGATTAACTTAGAAATCGGGATCGCCGCGACGGAATCGTTTAAAAC CGGAAGGCCTGTAAAGCTTACCCCGGGCGCCTTCGGATATGCCGGATTATGAGGGGGGGGGGGGGGGGGG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG CAT-5.3/tests/data/prepare/000077500000000000000000000000001452142602700155045ustar00rootroot00000000000000CAT-5.3/tests/data/prepare/lineages.txt000066400000000000000000000005221452142602700200330ustar00rootroot00000000000000KJX92028.1 root;superkingdom_1;phylum_1;class_1;order_1;family_1;genus_1;species_1 EFG1759503.1 root;superkingdom_1;phylum_2;class_2;order_2;family_2;genus_2;species_2 PYI97175.1 root;superkingdom_2;phylum_3;class_3;order_3;family_3;genus_3;species_3 WP_137987990.1 root;superkingdom_2;phylum_3;class_3;order_4;family_4;genus_4;species_4 CAT-5.3/tests/data/prepare/names.dmp000066400000000000000000000016741452142602700173210ustar00rootroot000000000000001 | root | | scientific name | 2 | superkingdom_1 | | scientific name | 3 | superkingdom_2 | | scientific name | 4 | phylum_1 | | scientific name | 5 | phylum_2 | | scientific name | 6 | phylum_3 | | scientific name | 7 | class_1 | | scientific name | 8 | class_2 | | scientific name | 9 | class_3 | | scientific name | 10 | order_1 | | scientific name | 11 | order_2 | | scientific name | 12 | order_3 | | scientific name | 13 | order_4 | | scientific name | 14 | family_1 | | scientific name | 15 | family_2 | | scientific name | 16 | family_3 | a> | scientific name | 17 | family_4 | > | scientific name | 18 | genus_1 | ria> | scientific name | 19 | genus_2 | ria> | scientific name | 20 | genus_3 | ria> | scientific name | 21 | genus_4 | ria> | scientific name | 22 | species_1 | proka | scientific name | 23 | species_2 | proka | scientific name | 24 | species_3 | proka | scientific name | 25 | species_4 | proka | scientific name | CAT-5.3/tests/data/prepare/nodes.dmp000066400000000000000000000026701452142602700173230ustar00rootroot000000000000001 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | 2 | 1 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | 3 | 1 | superkingdom | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 4 | 2 | phylum | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 5 | 2 | phylum | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 6 | 3 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 7 | 4 | class | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | 8 | 5 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 9 | 6 | class | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 10 | 7 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 11 | 8 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 12 | 9 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 13 | 9 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 14 | 10 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 15 | 11 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 16 | 12 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 17 | 13 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 18 | 14 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 19 | 15 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 20 | 16 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 21 | 17 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 22 | 18 | species | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 23 | 19 | species | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 24 | 20 | species | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 25 | 21 | species | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | CAT-5.3/tests/data/prepare/prot2acc.txt000066400000000000000000000001261452142602700177610ustar00rootroot00000000000000accession.version taxid KJX92028.1 22 EFG1759503.1 23 PYI97175.1 24 WP_137987990.1 25 CAT-5.3/tests/data/prepare/small.fa.gz000066400000000000000000000023171452142602700175460ustar00rootroot00000000000000]ob5Wp(hlBWWtVHjbd#aY)e)H}?Vrx~{=ߞn{\^η\;nV?=>}ro/vooiME4,:tWR@p59EЌ IY T3du 6Rur֔ lޜU2F-M0CnDI;:˿Fmp, g9Dv76P߁Z9f$sB@IZE3UM GDjxufG,#RPMQ-w>;OK~|;NO.O}?}ߟnO&uvϦ|<|ˇǻOIb"9fv6unG*J$wF1TTa2 G1krdbke}~."6LՃad ߮~qt~q]7_.&aw9{7vw3|ǗoS4sjz5g\z(6qS$\lbmdhvs>Ne#*#v+7kBlD̀^sq^[UM JzhCJ aߜ*q ^Z5\d &[ !fM!ޝQ b*JYz [wNϱ[^Wy e'I*H+;;I{Ek$e*#F482vJxVўa&z|X=>VLYzowutǟW9ͶմP!*4ly V3+9Rk(dtp"Bɕو譇 p9+fZQ!uQsvX"#cDB0 f ֍bW@%g59uf>k*k5&gF7nƢԥVCڰ? `kWlZĺe|ָF$ӿ