pal2nal.v14.1/0000755002342000234200000000000013171315744012477 5ustar mikitamikitapal2nal.v14.1/for_paml/0000755002342000234200000000000011666056602014300 5ustar mikitamikitapal2nal.v14.1/for_paml/test.cnt0000644002342000234200000000323411666056602015767 0ustar mikitamikita seqfile = test.codon treefile = test.tree outfile = test.codeml noisy = 0 * 0,1,2,3,9: how much rubbish on the screen verbose = 0 * 1: detailed output, 0: concise output runmode = -2 * 0: user tree; 1: semi-automatic; 2: automatic * 3: StepwiseAddition; (4,5):PerturbationNNI; -2: pairwise cleandata = 1 * "I added on 07/07/2004" Mikita Suyama seqtype = 1 * 1:codons; 2:AAs; 3:codons-->AAs CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table model = 2 * models for codons: * 0:one, 1:b, 2:2 or more dN/dS ratios for branches NSsites = 0 * dN/dS among sites. 0:no variation, 1:neutral, 2:positive icode = 0 * 0:standard genetic code; 1:mammalian mt; 2-10:see below Mgene = 0 * 0:rates, 1:separate; 2:pi, 3:kappa, 4:all fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated kappa = 2 * initial or fixed kappa fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate omega = 1 * initial or fixed omega, for codons or codon-transltd AAs fix_alpha = 1 * 0: estimate gamma shape parameter; 1: fix it at alpha alpha = .0 * initial or fixed alpha, 0:infinity (constant rate) Malpha = 0 * different alphas for genes ncatG = 4 * # of categories in the dG or AdG models of rates clock = 0 * 0: no clock, unrooted tree, 1: clock, rooted tree getSE = 0 * 0: don't want them, 1: want S.E.s of estimates RateAncestor = 0 * (1/0): rates (alpha>0) or ancestral states (alpha=0) method = 0 * 0: simultaneous; 1: one branch at a time pal2nal.v14.1/for_paml/test.codeml.ori0000644002342000234200000001313411666056602017236 0ustar mikitamikitaCODONML (in paml 3.14, March 2005) test.codon Model: several dN/dS ratios for branches Codon frequencies: F3x4 ns = 2 ls = 177 # site patterns = 105 2 1 1 1 1 1 1 1 1 1 4 3 2 1 2 1 1 4 1 1 2 1 1 1 7 2 3 1 2 1 2 1 3 1 1 2 2 8 1 1 1 1 1 1 7 1 1 1 3 1 4 1 1 4 1 1 3 2 1 6 1 1 1 2 1 1 1 6 2 1 3 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 BC070280 CTA AAC TGC ATC GTC GCT GTG TCC CAG AAC ATG GGC ATC GGC AAG AAC GGG GAC CTG CCC CCA CCG CTC AGG AAT GAA TTC CAG AGA ACC ACA ACC TCT TCA GTA GAA GGT AAA CAG CTG GTG ATT ATG GGT AAG ACC TGG TCC ATT CCT GAG CGA CCT TTA GGT AGA GTT CTC AGC GAA CTC CCT CCA CAA GGA GCT CAT TTT CTT TCC AGT CTA GAT GAT GCC CTT ACT CCA GCA GTA ATG CTC TGG ATA GTT GGT TAT GCC GGC CAT CTT GTG AGG CAA GAA ACG ATT TTG CTG CCA TAC GTC CAG GTA GAG pseudogene ... ... ... ..T ... AA. .AT ... ... ..G ... ... ... AT. .G. ..T ... ... ... ... ..T .A. ... .AA ... A.. ... ..A ... ... ... C.. ... ... .C. ..G ... ... G.A T.A ..A T.. T.A AT. ... .A. ... ..G ... A.. ... .A. ... ... TA. .T. ... G.. ..T ... TC. ..A ..G ... A.. C.. .C. ... ... GA. ... ..G .G. ... ... .G. .T. .T. ... CA. G.. T.T .TT .C. ..G ..A ... T.. .A. ... T.. ... T.. ..G C.. ... GG. ..A ..C ... ... ..G G.. ... ..A Codon usage in sequences -------------------------------------------------------------- Phe TTT 6 10 | Ser TCT 3 3 | Tyr TAT 3 4 | Cys TGT 0 0 TTC 3 3 | TCC 3 3 | TAC 2 2 | TGC 1 1 Leu TTA 4 7 | TCA 1 1 | *** TAA 0 0 | *** TGA 0 0 TTG 1 0 | TCG 0 1 | TAG 0 0 | Trp TGG 2 2 -------------------------------------------------------------- Leu CTT 4 2 | Pro CCT 3 5 | His CAT 2 1 | Arg CGT 0 1 CTC 5 2 | CCC 1 2 | CAC 0 0 | CGC 0 0 CTA 3 3 | CCA 6 3 | Gln CAA 3 6 | CGA 1 0 CTG 3 2 | CCG 1 1 | CAG 4 3 | CGG 0 0 -------------------------------------------------------------- Ile ATT 5 6 | Thr ACT 1 1 | Asn AAT 7 9 | Ser AGT 3 4 ATC 3 3 | ACC 3 1 | AAC 3 2 | AGC 1 0 ATA 1 1 | ACA 2 3 | Lys AAA 8 11 | Arg AGA 3 3 Met ATG 6 4 | ACG 1 1 | AAG 9 8 | AGG 2 2 -------------------------------------------------------------- Val GTT 4 3 | Ala GCT 2 0 | Asp GAT 5 5 | Gly GGT 5 4 GTC 2 3 | GCC 2 1 | GAC 4 6 | GGC 5 3 GTA 3 2 | GCA 1 2 | Glu GAA 11 8 | GGA 1 1 GTG 3 4 | GCG 0 0 | GAG 5 7 | GGG 1 1 -------------------------------------------------------------- Codon position x base (3x4) table for each sequence. #1: BC070280 position 1: T:0.16384 C:0.20339 A:0.32768 G:0.30508 position 2: T:0.31638 C:0.16949 A:0.37288 G:0.14124 position 3: T:0.29944 C:0.21469 A:0.27119 G:0.21469 #2: pseudogene position 1: T:0.20904 C:0.17514 A:0.33333 G:0.28249 position 2: T:0.31073 C:0.15819 A:0.40678 G:0.12429 position 3: T:0.32768 C:0.18079 A:0.28814 G:0.20339 Sums of codon usage counts ------------------------------------------------------------------------------ Phe F TTT 16 | Ser S TCT 6 | Tyr Y TAT 7 | Cys C TGT 0 TTC 6 | TCC 6 | TAC 4 | TGC 2 Leu L TTA 11 | TCA 2 | *** * TAA 0 | *** * TGA 0 TTG 1 | TCG 1 | TAG 0 | Trp W TGG 4 ------------------------------------------------------------------------------ Leu L CTT 6 | Pro P CCT 8 | His H CAT 3 | Arg R CGT 1 CTC 7 | CCC 3 | CAC 0 | CGC 0 CTA 6 | CCA 9 | Gln Q CAA 9 | CGA 1 CTG 5 | CCG 2 | CAG 7 | CGG 0 ------------------------------------------------------------------------------ Ile I ATT 11 | Thr T ACT 2 | Asn N AAT 16 | Ser S AGT 7 ATC 6 | ACC 4 | AAC 5 | AGC 1 ATA 2 | ACA 5 | Lys K AAA 19 | Arg R AGA 6 Met M ATG 10 | ACG 2 | AAG 17 | AGG 4 ------------------------------------------------------------------------------ Val V GTT 7 | Ala A GCT 2 | Asp D GAT 10 | Gly G GGT 9 GTC 5 | GCC 3 | GAC 10 | GGC 8 GTA 5 | GCA 3 | Glu E GAA 19 | GGA 2 GTG 7 | GCG 0 | GAG 12 | GGG 2 ------------------------------------------------------------------------------ Codon position x base (3x4) table, overall position 1: T:0.18644 C:0.18927 A:0.33051 G:0.29379 position 2: T:0.31356 C:0.16384 A:0.38983 G:0.13277 position 3: T:0.31356 C:0.19774 A:0.27966 G:0.20904 Nei & Gojobori 1986. dN/dS (dN, dS) (Note: This matrix is not used in later m.l. analysis. Use runmode = -2 for ML pairwise comparison.) BC070280 pseudogene 0.5224 (0.1421 0.2721) pairwise comparison, codon frequencies: F3x4. 2 (pseudogene) ... 1 (BC070280) lnL =-1014.258355 0.52723 1.94064 0.59797 t= 0.5272 S= 129.6 N= 401.4 dN/dS= 0.5980 dN= 0.1510 dS= 0.2525 pal2nal.v14.1/for_paml/test.tree0000644002342000234200000000003011666056602016131 0ustar mikitamikita(BC070280, pseudogene); pal2nal.v14.1/test.aln0000644002342000234200000000131711666056602014156 0ustar mikitamikitaCLUSTAL W (1.82) multiple sequence alignment BC070280 MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQNLVIMGKKTWFS pseudogene ----LNCIVNVSQKMGIIRNGDLP*PQLKNKF2-FQRMTTPSSAEGKENLVFLIRKNWFS ################# ########################## BC070280 IPEKNRPLKGRINLVLSRELKEPPQGAHFLSRSLDDALKLTEQPELANKVDMLWIVGGSS pseudogene ITEKNQPLKYIINLVVSRESKEPPQRPPFLD*SLGDALKRIEQLKLANKQDVFFTVGGSS ############### ##################### BC070280 VYKEAMNHPGHLKLFVTRIMQDFESDTFF-PEIDLEKYKLLPEYP-GVLSDVQEEKGIKY pseudogene VYKESMN*-DHFKLFVTWIMQDFQSDTFFS4EGDLEKYKLLPEYPQGVVSDVEEEKGIKY BC070280 KFEVYEKND pseudogene KFEVYEKND pal2nal.v14.1/test.nuc0000644002342000234200000000335011666056602014170 0ustar mikitamikita>BC0700280 dihydrofolate reductase (human) TGTAACGAGC GGGCTCGGAG GTCCTCCCGC TGCTGTCATG GTTGGTTCGC TAAACTGCAT CGTCGCTGTG TCCCAGAACA TGGGCATCGG CAAGAACGGG GACCTGCCCT GGCCACCGCT CAGGAATGAA TTCAGATATT TCCAGAGAAT GACCACAACC TCTTCAGTAG AAGGTAAACA GAATCTGGTG ATTATGGGTA AGAAGACCTG GTTCTCCATT CCTGAGAAGA ATCGACCTTT AAAGGGTAGA ATTAATTTAG TTCTCAGCAG AGAACTCAAG GAACCTCCAC AAGGAGCTCA TTTTCTTTCC AGAAGTCTAG ATGATGCCTT AAAACTTACT GAACAACCAG AATTAGCAAA TAAAGTAGAC ATGCTCTGGA TAGTTGGTGG CAGTTCTGTT TATAAGGAAG CCATGAATCA CCCAGGCCAT CTTAAACTAT TTGTGACAAG GATCATGCAA GACTTTGAAA GTGACACGTT TTTTCCAGAA ATTGATTTGG AGAAATATAA ACTTCTGCCA GAATACCCAG GTGTTCTCTC TGATGTCCAG GAGGAGAAAG GCATTAAGTA CAAATTTGAA GTATATGAGA AGAATGATTA ATATGAAGGT GTTTTCTAGT TTAAGTTGTT CCCCCTCCCT CTGAAAAAAG TATGTATTTT TACATTAGAA AAGGTTTTTT GTTGACTTTA GATCTATAAT TATTTCTAAG CAACTTGTTT TTATTCCCCA CTACTCTTGT CTCTATCAGA TACCATTTAT GAGACATTCT TGCTATAACT AAGTGCTTCT CCAAGACCCC AACTGAGTCC CCAGCACCTG CTACAGTGAG CTGCCATTCC ACACCCATCA CATGTGGCAC TCTTGCCAGT CCTTGACATT GTCGGGCTTT TCACATGTTG GTAATATTTA TTAAAGATGA AGATCCACAT ACCCTTCAAA AAAAAAAAAA AAAAAAAAAA AAAAAAA >pseudogene dihydrofolate reductase pseudogene (human) CTAAACTGCA TTGTCAATGA TTCCCAGAAG ATGGGCATCA TCAGGAATGG GGACCTGCCC TGACCTCAGC TCAAAAATAA ATTCGATTCC AAAGAATGAC CACACCCTCT TCAGCAGAGG GTAAAGAAAA TTTAGTATTT TTAATTAGGA AGAACTGGTT CTCGATTACT GAGAAGAATC AACCTTTAAA GTATATAATT AATTTAGTTG TCAGTAGAGA ATCCAAGGAA CCACCGCAAA GACCTCCTTT TCTTGACTAA AGTCTGGGTG ATGCCTTAAA ACGTATTGAG CAACTAAAAT TAGCAAATAA ACAAGACGTG TTTTTTACAG TGGGAGGCAG TTCTGTTTAT AAGGAATCCA TGAATTGAGA CCATTTTAAA CTATTTGTGA CATGGATCAT GCAGGACTTT CAAAGTGACA CGTTTTTTTC CCCTAGAAGG TGATTTAGAG AAATATAAAC TTCTCCCAGA ATACCCACAA GGTGTTGTCT CTGATGTGGA GGAGGAGAAA GGCATTAAGT ACAAATTTGA AGTATATGAA AAGAATGAT pal2nal.v14.1/README0000644002342000234200000001561013171315351013354 0ustar mikitamikita#=================================================================# # # # PAL2NAL: robust conversion of protein sequence alignments # # into the corresponding codon-based DNA alignments # # # # version 1.0 June 15, 2005 # # version 1.1 October 11, 2005 # # version 2.1 March 28, 2006 # # version 8.0 May 29, 2006 # # version 10.0 August 1, 2006 # # version 11.0 August 9, 2006 # # version 12.0 February 2, 2007 # # version 12.1 June 22, 2009 # # version 12.2 September 9, 2009 # # version 13.0 July 26, 2010 # # version 14.0 December 2, 2011 # # version 14.1 October 17, 2017 # # # # Mikita Suyama (mikita@bioreg.kyushu-u.ac.jp) # # # #=================================================================# #------------------# # What is pal2nal? #------------------# PAL2NAL is a program that converts a multiple sequence alignment of proteins and the corresponding DNA (or mRNA) sequences into a codon-based DNA alignment. The program automatically assigns the corresponding codon sequence even if the input DNA sequence has mismatches with the input protein sequence, or contains UTRs, polyA tails. It can also deal with frame shifts in the input alignment, which is suitable for the analysis of pseudogenes. The resulting codon-based DNA alignment can further be subjected to the calculation of synonymous (Ks) and non-synonymous (Ka) substitution rates. The script is licensed under GPL v2. #-----------# # Reference #-----------# If you use PAL2NAL, please cite the following paper: - Mikita Suyama, David Torrents, and Peer Bork (2006) PAL2NAL: robust conversion of protein sequence alignment into the corresponding codon alignments. Nucleic Acids Res. 34:W609-W612. #-------# # Files #-------# The distribution version should contain the following files: README - This document pal2nal.pl - The script (version 14) (written in Perl) test.aln - test data (protein alignment) test.nuc - test data (DNA sequences) for_paml (directory) test.cnt - control file for codeml test.tree - tree file used for codeml test.codeml.ori - an example of codeml output #-------# # Usage #-------# Usage: pal2nal.pl pep.aln nuc.fasta [nuc.fasta...] [options] pep.aln: protein alignment either in CLUSTAL or FASTA format - works not only a pairwise alignment but also the alignment with more than 2 sequences. - if there are frame shifts in your alignment, you have to specify those positions by numbers: for example '2' in the alignment means that there are only two bases (i.e. one base deletion) (see 'test.aln'). nuc.fasta: DNA sequences (single multiple fasta format file, or may be separated files) Options: -h Show help -output (clustal|paml|fasta|codon) Output format, default = clustal -blockonly Show only user specified blocks '#' under CLUSTAL alignment (see example) -nogap remove columns with gaps and inframe stop codons -nomismatch remove mismatched codons (mismatch between pep and cDNA) from the output -codontable (1(default)|2|3|4|5|6|9|10|11|12|13|14|15|16|21|22|23) NCBI GenBank codon table 1 Universal code 2 Vertebrate mitochondrial code 3 Yeast mitochondrial code 4 Mold, Protozoan, and Coelenterate Mitochondrial code and Mycoplasma/Spiroplasma code 5 Invertebrate mitochondrial 6 Ciliate, Dasycladacean and Hexamita nuclear code 9 Echinoderm and Flatworm mitochondrial code 10 Euplotid nuclear code 11 Bacterial, archaeal and plant plastid code 12 Alternative yeast nuclear code 13 Ascidian mitochondrial code 14 Alternative flatworm mitochondrial code 15 Blepharisma nuclear code 16 Chlorophycean mitochondrial code 21 Trematode mitochondrial code 22 Scenedesmus obliquus mitochondrial code 23 Thraustochytrium mitochondrial code -html HTML output (only for the web server) -nostderr No STDERR messages (only for the web server) - The correspondence of IDs between pep.aln and nuc.fasta is automatically checked: - If you use the same IDs in both pep.aln and nuc.fasta, the sequences don't have to be in the same order. - If not, the order of the sequences in pep.aln and nuc.fasta has to be the same. - IDs in pep.aln are used in the output. Example: pal2nal.pl test.aln test.nuc -output paml -nogap #---------------------------------# # How to calculate Ks, Ka values? #---------------------------------# To calclate Ks, Ka values, you need the codeml program, which is included in the PAML package. You can download PAML from http://abacus.gene.ucl.ac.uk/software/paml.html As an example, a control file (test.cnt) and a tree file (test.tree) are in the "for_paml" sub-directory. These control file and tree file are designed for the 'test' data used in PAL2NAL. Example: pal2nal.pl test.aln test.nuc -output paml -nogap > for_paml/test.codon cd for_paml codeml test.cnt You can find the output of codeml in "test.codeml". Ks, Ka values are very end of the output file. Just for comparison, there is a sample output file, "test.codeml.ori". #------------# # WWW server #------------# http://www.bork.embl.de/pal2nal or http://www.genome.med.kyoto-u.ac.jp/cgi-bin/suyama/pal2nal/index.cgi (not working) #---------# # Contact #---------# If you have any questions or comments, please email me: Mikita Suyama Medical Institute of Bioregulation Kyushu University, 812-8582 Fukuoka, JAPAN tel: +81 92 642 6384 fax: +81 92 642 6562 email: mikita@bioreg.kyushu-u.ac.jp pal2nal.v14.1/pal2nal.pl0000755002342000234200000020132313171315741014366 0ustar mikitamikita#!/usr/bin/perl # pal2nal.pl (c) 2017 Mikita Suyama # # #----------------------------------------# # This script is licensed under GPL v2.0 # #----------------------------------------# # # # # pal2nal.pl (v14.1) Mikita Suyama # # Usage: pal2nal.pl pep.aln nuc.fasta [nuc.fasta...] [options] > output # # pep.aln: protein alignment either in CLUSTAL or FASTA format # # nuc.fasta: DNA sequences (single multi-fasta or separated files) # # Options: -output (clustal(default)|paml|fasta|codon) # -nogap # -html # -nostderr # -blockonly # -nomismatch # -codontable N (default=1(universal)) # # - IDs in pep.aln are used in the output. # # - Sequence order is automatically checked (see the comment of v12). # # - In-frame stop -> "*" or "_" # # - Frameshift # # Gene HCDG # Pseudo HC1G 2 del in pseudo # # Gene HCDG # Pseudo HC2G 1 del in pseudo # # Gene ND-TY # Pseudo ND1TY 1 ins in pseudo # # Gene ND-TY # Pseudo ND2TY 2 ins in pseudo # # Gene EREQK # Pseudo EK4QK 1 ins in pseudo # # # example: # # pep.aln # AA1 ACDEFGARH1G-F* # AA2 A2DDW-A*H-G2F* # # nuc.fasta # >NUC1 # GCTTGTGATGAATTTGGTGCTCGTCATGGGGTTTTAA # >NUC2 # GCGGGGACGACTGGGCGTAGCACGGGGGTTTTGA # # output # NUC1 GCTTGTGATGAATTTGGTGCTCGTCATG--GGG---TTTTAA # NUC2 GCGGG-GACGACTGG---GCGTAGCAC---GGGGG-TTTTGA # # # v14.1 # - licensed under GPL v2.0 # 2017/10/17 # v14. # - NCBI GenBank codon table # 1 Universal code # 2 Vertebrate mitochondrial code # 3 Yeast mitochondrial code # 4 Mold, Protozoan, and Coelenterate Mitochondrial code # and Mycoplasma/Spiroplasma code # 5 Invertebrate mitochondrial # 6 Ciliate, Dasycladacean and Hexamita nuclear code # 9 Echinoderm and Flatworm mitochondrial code # 10 Euplotid nuclear code # 11 Bacterial, archaeal and plant plastid code # 12 Alternative yeast nuclear code # 13 Ascidian mitochondrial code # 14 Alternative flatworm mitochondrial code # 15 Blepharisma nuclear code # 16 Chlorophycean mitochondrial code # 21 Trematode mitochondrial code # 22 Scenedesmus obliquus mitochondrial code # 23 Thraustochytrium mitochondrial code # # - Initiation Met is treated separately. # 2011/12/02 # # v13. # - file type (Win, UNIX, Mac) # new-line: # Win CR+LF \x0D\x0A # UNIX LF \x0A # Mac CR \x0D # # undef $/; # s/\x0D\x0A|\x0D|\x0A/\n/g; # # OS: $^O # 2010/07/26 # # - run bl2seq for inconsistency check if it happens on linux. # 2008/04/23 # v12.1 # - If there is a mismatch, the first atg(lower-case) couldn't # recognized as Met. # 2009/06/22 # # v12. # - About the input files: # Now this script automatically check the following # two possibilities: # - If the same IDs are used in pep.aln and nuc.fasta, # then you don't have to care about the order of # the seqs. # - If not, # the same order of the seqs in pep.aln and nuc.fasta # is assumed. # # - In case of 'inconsistency between...', run bl2seq if available # # - Some '1 while' replacement was very slow if the string # very long (ca 10k). The routine is modified for speed-up. # # 02/02/2007 # v11. # - Mac/DOS file (^M = \r). # # - bug fixed: # If aa = '4' (frame shift), 'codon' output # had 1 aa shift. # 09/08/2006 # v10. # - input DNA can be upper/lower cases # (in older version, only upper case was allowed). # # - don't report warning if the first Met is GTG # # - default input aln type: clustal # 01/08/2006 # v9.3. # - "codon" was added to the output options. # - if ($html), print "
" and "
". # 20/06/2006 # # v9.2. # - pn2codon is modified (to v4) # from: 10 residue window (incl '-') # to: 10 residue window (excl '-') # 19/06/2006 # v9.1. # - to handle '/' followed by '*' # # $aaseq[$i] =~ s/\/(-*)[A-Z]/-${1}2/g; # | # v # $aaseq[$i] =~ s/\/(-*)[A-Z\*]/-${1}2/g; # 16/06/2006 # v9. # - Frameshift in tfastx/tfasty # # \ -> +1 # # / -> -1 (e.g. PG/A -> PG-2) # 06/06/2006 # v8. # - automatic detection of the input alignment format # (-a is no longer used.) # 14/02/2006 # - new option: # -nomismatch -> remove mismatched codons (mismatch between # pep and cDNA) from the output. # # -codontable (universal(default)|vmitochondria) # 26/03/2006 # v7. # - new option: # # -a input_alignment_type # # clustal (default) CLUSTALW format # # fasta # # gblocks Gblocks txt format *-gb.txt (-p=t) # # # # in the case of "-a gblocks", only the codons in conserved # # blocks are converted into codon alignment. # 27/01/2006 # # - if clustal alignment contains # # - new option: # -blockonly # 03/02/2006 # # - show warning message only if the position is marked '#'. # # - -i, -j options are deleted. # 08/02/2006 # v6. # # - new option: # # -fasta -> set this option if input pep file is # # not in *.aln format but in *.fasta format. # # 16/08/2005 # # v4. # - new option: # -html -> for the web server (STDERR messages -> STDOUT) # (mismatch in color) # -nostderr -> for Ks,Ka calc on the web server (NO STDERR messages) # 25/04/2005 # # v3. # - renamed back to "pal2nal.pl" # - subroutine (pn2codon) is modified: # - reverse translation table (%p2c) was replaced # - if there is no exact match, try fragment anchors # - return value is changed to hash # - new options: # -output clustal|paml|fasta # clustal format (default) # paml # fasta # -nogap -> remove all gaped codons and stop codons # # 22/04/2005 # v2. # - nuc.fasta can be either one multiple fasta file or several files. # The order of seqs should be the same as those in the pep.aln file. # 13/07/2004 # # old script name: pal2nal.pl # 06/09/2000 Mikita Suyama # # - modified for multiple seqs. 17/10/2000 # # - AA-ids and NUC-ids must be the same -> TURNED OFF 05/05/2002 # Now, only the order is the matter. # AA-ids are used throughout; NUC-ids are not used at all. # # - old condition: AA = NUC # new condition: AA <= NUC ## NUC(cDNA) can be longer than AA ## # # - renamed from "pal2nal.pl" to codon_aln.pl 12/07/2004 # $| = 1; undef $/; $myos = $^O; if ($#ARGV < 1) { & showhelp(); exit; } else { $getoutform = 0; $nogap = 0; $html = 0; $nostderr = 0; $fasta = 0; undef($alnfile); undef(@nucfiles); $outform = "clustal"; $blockonly = 0; $nomismatch = 0; $getcodontable = 0; $codontable = 1; foreach $i (0..$#ARGV) { if ($ARGV[$i] eq "-h") { & showhelp(); } elsif ($ARGV[$i] eq "-output") { $getoutform = 1; } elsif ($getoutform) { $outform = $ARGV[$i]; if ($outform ne "clustal" && $outform ne "paml" && $outform ne "fasta" && $outform ne "codon") { print STDERR "\nERROR: valid output format: clustal, paml, fasta, or codon\n\n"; exit; } $getoutform = 0; } elsif ($ARGV[$i] eq "-blockonly") { $blockonly = 1; } elsif ($ARGV[$i] eq "-nogap") { $nogap = 1; } elsif ($ARGV[$i] eq "-nomismatch") { $nomismatch = 1; } elsif ($ARGV[$i] eq "-codontable") { $getcodontable = 1; } elsif ($getcodontable) { $codontable = $ARGV[$i]; if ($codontable != 1 && $codontable != 2 && $codontable != 3 && $codontable != 4 && $codontable != 5 && $codontable != 6 && $codontable != 9 && $codontable != 10 && $codontable != 11 && $codontable != 12 && $codontable != 13 && $codontable != 14 && $codontable != 15 && $codontable != 16 && $codontable != 21 && $codontable != 22 && $codontable != 23) { print STDERR "\nERROR: invalid codontable number, $codontable!!\n\n"; exit; } $getcodontable = 0; } elsif ($ARGV[$i] eq "-html") { $html = 1; } elsif ($ARGV[$i] eq "-nostderr") { $nostderr = 1; } elsif (!$alnfile) { $alnfile = $ARGV[$i]; } else { push(@nucfiles, $ARGV[$i]); } } } if ($html) { print "
\n";
}

#---------------#
# check options  ("-outform codon" is not valid with -blockonly, -nogap, -nomismatch.)
#---------------#

if ($outform eq "codon" &&
    ($blockonly || $nogap || $nomismatch)) {
    if ($html) {
        print "\nERROR:  if \"codon(Output format)\" is selected, don't use \"Remove gaps, inframe stop codons\" or \"Remove mismatches\" or \"Use only selected positions\".\n\n";
    } else {
        print STDERR "\nERROR:  \"-outform codon\" is not valid with -blockonly, -nogap, -nomismatch\n\n";
    }
    exit;
}


#---------------------#
#  Get nuc sequences
#---------------------#

undef(@nucid);
undef(%id2nucseq);
$nseq = -1;
foreach $i (0..$#nucfiles) {
    open(NUCFILE, "< $nucfiles[$i]") || die "Can't open $nucfiles[$i]";
    $nucfiledata = ;
    close(NUCFILE);
    $nucfiledata =~ s/\x0D\x0A|\x0D|\x0A/\n/g;
    foreach (split(/\n/, $nucfiledata)) {
        if (!/^#/ && /\S+/) {
            if (/^>(\S+)/) {
                ++$nseq;
                $tmpnucid = $1;
                push(@nucid, $tmpnucid);
            } else {
                s/[^a-zA-Z]//g;
                $id2nucseq{$tmpnucid} .= $_;
            }
        }
    }
}


#-------------------#
#  Get aa alignemt
#-------------------#

undef(@aaid);
undef(%id2aaaln);
undef(%aaidcnt);
undef(@aaseq);
undef($gblockseq);

$gettype = 1;
open(ALNFILE, "< $alnfile") || die "Can't open $alnfile";
while () {
    chomp;
    if ($gettype && !/^#/ && /\S+/) {
        if (/^CLUSTAL/) {
            $inalntype = "clustal";
        } elsif (/^>/) {
            $inalntype = "fasta";
        } elsif (/^Gblocks/) {
            $inalntype = "gblocks";
        } else {
            $inalntype = "clustal";
        }

        $gettype = 0;
    }
}
close(ALNFILE);

open(ALNFILE, "< $alnfile") || die "Can't open $alnfile";
$getblock = 0;
$aafiledata = ;
close(ALNFILE);
$aafiledata =~ s/\x0D\x0A|\x0D|\x0A/\n/g;
foreach (split(/\n/, $aafiledata)) {
    if ($inalntype eq "clustal") {
        if (!/^CLUSTAL/ && /^\S+/ && !/^#/) {
            s/\s+$//;
            @dat = split(/\s+/, $_);
            ++$aaidcnt{$dat[0]};
            push(@aaid, $dat[0]) if ($aaidcnt{$dat[0]} == 1);
            $dat[1] =~ tr/a-z/A-Z/;
            $id2aaaln{$dat[0]} .= $dat[1];

            $tmplen = length($_);

            /^\S+\s+/;
            $idspc = length($&);
            $subalnlen = length($dat[1]);

            $getblock = 1;
        } elsif ($getblock) {
            $_ .= ' ' x ($tmplen - length($_));
            $gblockseq .= substr($_, $idspc, $subalnlen);

            $getblock = 0;
        }
    } elsif ($inalntype eq "fasta") {
        if (/^>(\S+)/) {
            $tmpid = $1;
            push(@aaid, $tmpid);
        } else {
            s/\s+//g;
            tr/a-z/A-Z/;
            $id2aaaln{$tmpid} .= $_;
        }
    } elsif ($inalntype eq "gblocks") {
        $getaln = 0;
        if (/^\s+\=/) {
            $getaln = 1;
        } elsif (/^Parameters/) {
            $getaln = 0;
        } elsif ($getaln) {
            @dat = split(/\s+/, $_);
            if (/^Gblocks/) {
                $gblockseq .= $dat[1];
            } elsif (/^\S+/) {
                ++$aaidcnt{$dat[0]};
                push(@aaid, $dat[0]) if ($aaidcnt{$dat[0]} == 1);
                $dat[1] =~ tr/a-z/A-Z/;
                $id2aaaln{$dat[0]} .= $dat[1];
            }
        }
    }
}

foreach $i (0..$#aaid) {
    push(@aaseq, $id2aaaln{$aaid[$i]});
}


#------------------------------------#
#  For frameshifts in tfastx/tfasty
#------------------------------------#

foreach $i (0..$#aaid) {
    $aaseq[$i] =~ s/\\/1/g;
    $aaseq[$i] =~ s/\/(-*)[A-Z\*]/-${1}2/g;
}


#-------------------------------------#
#  Check the input seqs (pep <=> nuc)
#-------------------------------------#

if ($#aaid != $#nucid) {
    $naa = $#aaid + 1;
    $nnuc = $#nucid + 1;
    if ($html) {
        print "\nERROR: number of input seqs differ (aa: $naa;  nuc: $nnuc)!!\n\n";
    } else {
        print STDERR "\nERROR: number of input seqs differ (aa: $naa;  nuc: $nnuc)!!\n\n";
        print STDERR "   aa  '@aaid'\n";
        print STDERR "   nuc '@nucid'\n";
    }
    exit;
}


#---------------------------------#
# corresponding IDs or same order
#---------------------------------#

@commonids = & common_elem(*aaid, *nucid);

sub common_elem {
    local(*aarr, *barr) = @_;

    local(%mark, @comarr);
    grep($mark{$_}++, @aarr);
    @comarr = grep($mark{$_}, @barr);
}
        
if ($#commonids == $#aaid) {
    $idcorrespondence = "sameID";
} else {
    $idcorrespondence = "ordered";
}


#-------------------#
#  Codon sequences
#-------------------#

undef(@codonseq);
undef(%aaidpos2mismatch);
undef(@outmessage);
foreach $i (0..$#aaid) {
    if ($idcorrespondence eq "sameID") {
        $tmpnucid = $aaid[$i];
    } elsif ($idcorrespondence eq "ordered") {
        $tmpnucid = $nucid[$i];
    } else {
        print STDERR "\nERROR in ID correspondence.\n\n";
        exit;
    }

    %codonout = & pn2codon($aaseq[$i], $id2nucseq{$tmpnucid}, $codontable);
    @message = @{$codonout{'message'}};

    foreach $j (0..$#message) {
        $outl = "WARNING: $aaid[$i] $message[$j]";
        push(@outmessage, $outl);
        @dat = split(/\s+/, $message[$j]);
        $dat[1] =~ s/:$//;
        $aaidpos2mismatch{"$aaid[$i] $dat[1]"} = 1;
    }

    if ($codonout{'result'} == 1 || $codonout{'result'} == 2) {
        push(@codonseq, $codonout{'codonseq'});
    } else {
        $erraa = $aaseq[$i];
        $erraa =~ s/-//g;
        # 1 while $erraa =~ s/(.{60})(.+)/$1\n$2/;
        @frags = & my1while($erraa, 60);
        $erraa = join("\n", @frags);

        $errnuc = $id2nucseq{$tmpnucid};
        $errnuc =~ s/-//g;
        # 1 while $errnuc =~ s/(.{60})(.+)/$1\n$2/;
        @frags = & my1while($errnuc, 60);
        $errnuc = join("\n", @frags);

        if ($html) {
            print "
\n"; print "

ERROR in your input files!

\n"; print "
\n";
            print "#---  ERROR: inconsistency between the following pep and nuc seqs  ---#\n";

            print ">$aaid[$i]\n";
            print "$erraa\n";

            print ">$tmpnucid\n";
            print "$errnuc\n";

            print "
\n"; $tmpwhich = "tmp/tmpwhich.$$"; } else { print STDERR "#--- ERROR: inconsistency between the following pep and nuc seqs ---#\n"; print STDERR ">$aaid[$i]\n"; print STDERR "$erraa\n"; print STDERR ">$tmpnucid\n"; print STDERR "$errnuc\n"; $tmpwhich = "tmpwhich.$$"; } if ($myos eq "linux") { system("which bl2seq > $tmpwhich"); $foundbl2seq = 0; open(TMPWHICH, "< $tmpwhich") || die "Can't open $tmpwhich"; while () { chomp; $foundbl2seq = 1 if (/bl2seq$/); } close(TMPWHICH); unlink($tmpwhich); if ($foundbl2seq) { #------------------------------------------# # run bl2seq (if the command is available) #------------------------------------------# if ($html) { $erraafile = "tmp/erraafile.$$"; $errnucfile = "tmp/errnucfile.$$"; $tblnout = "tmp/tbln.out.$$"; } else { $erraafile = "erraafile.$$"; $errnucfile = "errnucfile.$$"; $tblnout = "tbln.out.$$"; } open(ERRAAFILE, "> $erraafile") || die "Can't open $erraafile"; print ERRAAFILE ">$aaid[$i]\n"; print ERRAAFILE "$erraa\n"; close(ERRAAFILE); open(ERRNUCFILE, "> $errnucfile") || die "Can't open $errnucfile"; print ERRNUCFILE ">$tmpnucid\n"; print ERRNUCFILE "$errnuc\n"; close(ERRNUCFILE); system("bl2seq -p tblastn -F F -i $erraafile -j $errnucfile -o $tblnout"); if ($html) { print "
\n"; print "
\n"; print "

Check the following TBLASTN output.


\n"; print "
\n";
                    print "      your pep -> 'Query'\n";
                    print "      your nuc -> 'Sbjct'\n";
                    print "
\n"; } else { print STDERR "\n"; print STDERR "\n"; print STDERR " ###----- Check the following TBLASTN output: -----###\n"; print STDERR " ###----- your pep -> 'Query' -----###\n"; print STDERR " ###----- your nuc -> 'Sbjct' -----###\n"; print STDERR "\n"; } open(TBLNOUT, "< $tblnout") || die "Can't open $tblnout"; $tblnoutdata = ; close(TBLNOUT); $tblnoutdata =~ s/\x0D\x0A|\x0D|\x0A/\n/g; foreach (split(/\n/, $tblnoutdata)) { if ($html) { print "$_\n"; } else { print STDERR "$_\n"; } } if ($html) { print "
\n"; } unlink($erraafile); unlink($errnucfile); unlink($tblnout); } else { if ($html) { print "
\n"; print "
\n"; print "Run bl2seq (-p tblastn) or GeneWise to see the inconsistency.
\n"; print "
\n"; } else { print STDERR "\n"; print STDERR "\n"; print STDERR "Run bl2seq (-p tblastn) or GeneWise to see the inconsistency.\n"; print STDERR "\n"; } } } else { #### non-linux environment print STDERR "\n"; print STDERR "\n"; print STDERR "Run bl2seq (-p tblastn) or GeneWise to see the inconsistency.\n"; print STDERR "\n"; } exit; } } #-------------------# # Warning in '#'? #-------------------# if ($gblockseq =~ /#/ && $blockonly) { undef(@newoutmessage); foreach $i (0..$#outmessage) { $mpos = (split(/\s+/, $outmessage[$i]))[3]; $mpos =~ s/://; if (substr($gblockseq, $mpos - 1, 1) eq "#") { push(@newoutmessage, $outmessage[$i]); } } @outmessage = @newoutmessage; } #--------------------# # Warning messages #--------------------# if ($codontable != 1) { # push(@outmessage, "Codon table $codontable is used"); splice(@outmessage, 0, 0, "Codontable $codontable is used"); } if (!$nostderr) { foreach $j (0..$#outmessage) { if ($html) { if ($j == 0 && !$nomismatch) { print "#------------------------------------------------------------------------#\n"; } if ($nomismatch) { # print "# $outmessage[$j] (excluded from the output)\n"; } else { print "# $outmessage[$j]\n"; } if ($j == $#outmessage && !$nomismatch) { print "#------------------------------------------------------------------------#\n\n"; } } else { if ($j == 0 && !$nomismatch) { print STDERR "#------------------------------------------------------------------------#\n"; print STDERR "# Input files: $alnfile @nucfiles\n"; } if ($nomismatch) { # print STDERR "# $outmessage[$j] (exlucded from the output)\n"; } else { print STDERR "# $outmessage[$j]\n"; } if ($j == $#outmessage && !$nomismatch) { print STDERR "#------------------------------------------------------------------------#\n\n"; } } } } undef(%errorpos); foreach $j (0..$#outmessage) { @dat = split(/\s+/, $outmessage[$j]); $tmperrpos = $dat[3]; $tmperrpos =~ s/:$//; $tmperrpos -= 1; $errorpos{$tmperrpos} = 1; } #----------------------------------# # Make an AA-based NUC alignment #----------------------------------# undef(@tmppos); undef(@codonaln); undef(@coloraln); undef($maskseq); foreach $i (0..length($aaseq[0]) - 1) { $tmpmax = 0; $apos = $i + 1; #-----------# # gblocks ? #-----------# $putcodon = 1; if ($gblockseq =~ /#/ && substr($gblockseq, $i, 1) ne "#" && $blockonly) { $putcodon = 0; } if ($nomismatch && $errorpos{$i}) { $putcodon = 0; } foreach $k (0..$#aaid) { $tmpaa = substr($aaseq[$k], $i, 1); if ($tmpaa !~ /\d/) { $tmplen = 3; } else { $tmplen = (int(($tmpaa - 1) / 3) + 1) * 3; # 1, 2, 3 -> 3 # 4, 5, 6 -> 6 # 7, 8, 9 -> 9 } $tmpmax = $tmplen if ($tmpmax < $tmplen); } foreach $k (0..$#aaid) { $tmpaa = substr($aaseq[$k], $i, 1); if ($tmpaa !~ /\d/) { if ($tmpaa eq '-') { # if ($putcodon || (!$blockonly && !$nomismatch)) { if ($putcodon) { $codonaln[$k] .= '-' x $tmpmax; if ($aaidpos2mismatch{"$aaid[$k] $apos"}) { $coloraln[$k] .= 'R' x $tmpmax; } else { $coloraln[$k] .= '-' x $tmpmax; } } } elsif ($tmpaa =~ /[A-Z]/ || $tmpaa eq '*') { # if ($putcodon || (!$blockonly && !$nomismatch)) { if ($putcodon) { $codonaln[$k] .= substr($codonseq[$k], $tmppos[$k], 3); if ($aaidpos2mismatch{"$aaid[$k] $apos"}) { $coloraln[$k] .= 'RRR'; } else { $coloraln[$k] .= '---'; } } $tmppos[$k] += 3; # if ($putcodon || (!$blockonly && !$nomismatch)) { if ($putcodon) { $codonaln[$k] .= '-' x ($tmpmax - 3) if ($tmpmax - 3 > 0); if ($aaidpos2mismatch{"$aaid[$k] $apos"}) { $coloraln[$k] .= 'R' x ($tmpmax - 3) if ($tmpmax - 3 > 0); } else { $coloraln[$k] .= '-' x ($tmpmax - 3) if ($tmpmax - 3 > 0); } } } } elsif ($tmpaa =~ /\d/) { # if ($putcodon || (!$blockonly && !$nomismatch)) { if ($putcodon) { $codonaln[$k] .= substr($codonseq[$k], $tmppos[$k], $tmpaa); if ($aaidpos2mismatch{"$aaid[$k] $apos"}) { $coloraln[$k] .= 'R' x $tmpaa; } else { $coloraln[$k] .= '-' x $tmpaa; } } $tmppos[$k] += $tmpaa; # if ($putcodon || (!$blockonly &&!$nomismatch)) { if ($putcodon) { $codonaln[$k] .= '-' x ($tmpmax - $tmpaa); if ($aaidpos2mismatch{"$aaid[$k] $apos"}) { $coloraln[$k] .= 'R' x ($tmpmax - $tmpaa); } else { $coloraln[$k] .= '-' x ($tmpmax - $tmpaa); } } } } if (!$blockonly) { # if ($putcodon && substr($gblockseq, $i, 1) eq "#") { # $maskseq .= '#' x $tmpmax; # } else { # $maskseq .= ' ' x $tmpmax; # } if ($putcodon) { if (substr($gblockseq, $i, 1) eq "#") { $maskseq .= "#" x $tmpmax; } else { $maskseq .= " " x $tmpmax; } } } } #-----------# # -nogap? #-----------# $alilen = length($codonaln[0]); if ($nogap) { $tmppos = 0; undef(@nogapaln); undef(@nogapcoloraln); undef($nogapmaskseq); while ($tmppos < $alilen) { $outok = 1; foreach $i (0..$#codonaln) { $tmpcodon = substr($codonaln[$i], $tmppos, 3); $outok = 0 if ($tmpcodon =~ /-/); $outok = 0 if ($tmpcodon =~ /(((U|T)A(A|G|R))|((T|U)GA))/); } if ($outok) { foreach $i (0..$#codonaln) { $tmpcodon = substr($codonaln[$i], $tmppos, 3); $nogapaln[$i] .= $tmpcodon; $tmpcolorcodon = substr($coloraln[$i], $tmppos, 3); $nogapcoloraln[$i] .= $tmpcolorcodon; } $nogapmaskseq .= substr($maskseq, $tmppos, 3); } $tmppos += 3; } @codonaln = @nogapaln; @coloraln = @nogapcoloraln; $maskseq = $nogapmaskseq; } #----------# # Output #----------# $maxn = 0; foreach $i (0..$#aaid) { $maxn = length($aaid[$i]) if ($maxn < length($aaid[$i])); } $maxn = 10 if ($maxn < 10); $alilen = length($codonaln[0]); # foreach $i (0..$#aaid) { # 1 while $codonaln[$i] =~ s/(.{60})(.+)/$1\n$2/; # 1 while $coloraln[$i] =~ s/(.{60})(.+)/$1\n$2/; # } # 1 while $maskseq =~ s/(.{60})(.+)/$1\n$2/; undef(%aaid2codonarr); undef(%aaid2colorarr); foreach $i (0..$#aaid) { @{$aaid2codonarr{$aaid[$i]}} = & my1while($codonaln[$i], 60); @{$aaid2colorarr{$aaid[$i]}} = & my1while($coloraln[$i], 60); } @maskarr = & my1while($maskseq, 60); if ($outform eq "clustal") { #-----------# # clustal #-----------# print "CLUSTAL W multiple sequence alignment\n"; print "\n"; @output1 = @{$aaid2codonarr{$aaid[0]}}; if ($html) { foreach $i (0..$#output1) { foreach $k (0..$#aaid) { printf "%-${maxn}s ", $aaid[$k]; $outf = ${$aaid2codonarr{$aaid[$k]}}[$i]; $outr = ${$aaid2colorarr{$aaid[$k]}}[$i]; $rlen = length($outf); if ($outr =~ /R/) { foreach $l (0..$rlen - 1) { $tmpnuc = substr($outf, $l, 1); $tmpr = substr($outr, $l, 1); if ($tmpr eq "R") { print "$tmpnuc"; } else { print "$tmpnuc"; } } print "\n"; } else { print "$outf\n"; } } $outmask = $maskarr[$i]; printf "%-${maxn}s $outmask\n", ' ' if (!$blockonly && $gblockseq =~ /#/); print "\n"; } } else { foreach $i (0..$#output1) { foreach $k (0..$#aaid) { $outf = ${$aaid2codonarr{$aaid[$k]}}[$i]; printf "%-${maxn}s $outf\n", $aaid[$k]; } $outmask = $maskarr[$i]; printf "%-${maxn}s $outmask\n", ' ' if (!$blockonly && $gblockseq =~ /#/); print "\n"; } } } elsif ($outform eq "codon") { #-------# # codon #-------# # @outaa = @aaseq; undef(@outaa); $withn = 0; foreach $j (0..$#aaid) { $withn = 1 if ($aaseq[$j] =~ /\d/); } if ($withn) { $alnlen = length($aaseq[0]); foreach $i (0..$alnlen - 1) { $maxaan = 0; foreach $j (0..$#aaid) { $tmpaa = substr($aaseq[$j], $i, 1); if ($tmpaa =~ /\d/ && $tmpaa > $maxaan) { $maxaan = $tmpaa; } } if ($maxaan >= 4) { $tmplen = int(($tmpaa - 1) / 3) + 1; # 4, 5, 6 -> 2 # 7, 8, 9 -> 3 } else { $tmplen = 1; } foreach $j (0..$#aaid) { $pushaa = '-' x $tmplen; $tmpaa = substr($aaseq[$j], $i, 1); substr($pushaa, 0, 1) = $tmpaa; $outaa[$j] .= $pushaa; } } } else { @outaa = @aaseq; } undef(%aaid2aaarr); foreach $i (0..$#aaid) { # 1 while $outaa[$i] =~ s/(.{20})(.+)/$1\n$2/; @{$aaid2aaarr{$aaid[$i]}} = & my1while($outaa[$i], 20); } @output1 = @{$aaid2codonarr{$aaid[0]}}; if ($html) { foreach $i (0..$#output1) { foreach $k (0..$#aaid) { printf "%-${maxn}s ", ''; $outa = ${$aaid2aaarr{$aaid[$k]}}[$i]; # 1 while $outa =~ s/(\S)(\S+)/$1 $2/; @frags = & my1while($outa, 1); $outa = join(" ", @frags); $outf = ${$aaid2codonarr{$aaid[$k]}}[$i]; # 1 while $outf =~ s/(\S{3})(\S+)/$1 $2/; @frags = & my1while($outf, 3); $outf = join(" ", @frags); $outr = ${$aaid2colorarr{$aaid[$k]}}[$i]; # 1 while $outr =~ s/(\S{3})(\S+)/$1 $2/; @frags = & my1while($outr, 3); $outr = join(" ", @frags); if ($outr =~ /R/) { $rlen = length($outf); foreach $l (0..$rlen - 1) { $tmppep = substr($outa, $l, 1); $tmpr = substr($outr, $l, 1); if ($tmpr eq "R") { print "$tmppep"; } else { print "$tmppep"; } } print "\n"; printf "%-${maxn}s ", $aaid[$k]; foreach $l (0..$rlen - 1) { $tmpnuc = substr($outf, $l, 1); $tmpr = substr($outr, $l, 1); if ($tmpr eq "R") { print "$tmpnuc"; } else { print "$tmpnuc"; } } print "\n"; } else { print "$outa\n"; printf "%-${maxn}s ", $aaid[$k]; print "$outf\n"; } } $outmask = $maskarr[$i]; $outmask =~ s/\s/-/g; # 1 while $outmask =~ s/(\S{3})(\S+)/$1 $2/; @frags = & my1while($outmask, 3); $outmask = join(" ", @frags); $outmask =~ s/-/ /g; printf "%-${maxn}s $outmask\n", ' ' if (!$blockonly && $gblockseq =~ /#/); print "\n"; } } else { foreach $i (0..$#output1) { foreach $k (0..$#aaid) { $outa = ${$aaid2aaarr{$aaid[$k]}}[$i]; # 1 while $outa =~ s/(\S)(\S+)/$1 $2/; @frags = & my1while($outa, 1); $outa = join(" ", @frags); printf "%-${maxn}s $outa\n", ''; $outf = ${$aaid2codonarr{$aaid[$k]}}[$i]; # 1 while $outf =~ s/(\S{3})(\S+)/$1 $2/; @frags = & my1while($outf, 3); $outf = join(" ", @frags); printf "%-${maxn}s $outf\n", $aaid[$k]; } $outmask = $maskarr[$i]; $outmask =~ s/\s/-/g; # 1 while $outmask =~ s/(\S{3})(\S+)/$1 $2/; @frags = & my1while($outmask, 3); $outmask = join(" ", @frags); $outmask =~ s/-/ /g; printf "%-${maxn}s $outmask\n", ' ' if (!$blockonly && $gblockseq =~ /#/); print "\n"; } } } elsif ($outform eq "paml") { #--------# # paml #--------# $nseq = $#aaid + 1; printf " %3d %6d\n", $nseq, $alilen; foreach $i (0..$#aaid) { print "$aaid[$i]\n"; if ($html) { @outf = @{$aaid2codonarr{$aaid[$i]}}; @outr = @{$aaid2colorarr{$aaid[$i]}}; foreach $k (0..$#outf) { if ($outr[$k] =~ /R/) { $lenf = length($outf[$k]); foreach $l (0..$lenf - 1) { $tmpnuc = substr($outf[$k], $l, 1); $tmpr = substr($outr[$k], $l, 1); if ($tmpr eq "R") { print "$tmpnuc"; } else { print "$tmpnuc"; } } print "\n"; } else { print "$outf[$k]\n"; } } } else { $outcodon = join("\n", @{$aaid2codonarr{$aaid[$i]}}); print "$outcodon\n"; } } } elsif ($outform eq "fasta") { #---------# # fasta #---------# foreach $i (0..$#aaid) { print ">$aaid[$i]\n"; if ($html) { @outf = @{$aaid2codonarr{$aaid[$i]}}; @outr = @{$aaid2colorarr{$aaid[$i]}}; foreach $k (0..$#outf) { if ($outr[$k] =~ /R/) { $lenf = length($outf[$k]); foreach $l (0..$lenf - 1) { $tmpnuc = substr($outf[$k], $l, 1); $tmpr = substr($outr[$k], $l, 1); if ($tmpr eq "R") { print "$tmpnuc"; } else { print "$tmpnuc"; } } print "\n"; } else { print "$outf[$k]\n"; } } } else { $outcodon = join("\n", @{$aaid2codonarr{$aaid[$i]}}); print "$outcodon\n"; } } } if ($html) { print "\n"; } #----------------------------------------------------------------------- sub pn2codon { # pn2codon v6 # # input: $pep protein sequence # termination -> "_" or "*"; # frameshift -> digit # "-" or "." -> gap # $nuc DNA or RNA sequence (lower/upper case letters) # # $codontable (corresponds to codon tables used in GenBank # 1 Universal code # 2 Vertebrate mitochondrial code # 3 Yeast mitochondrial code # 4 Mold, Protozoan, and Coelenterate Mitochondrial code # and Mycoplasma/Spiroplasma code # 5 Invertebrate mitochondrial # 6 Ciliate, Dasycladacean and Hexamita nuclear code # 9 Echinoderm and Flatworm mitochondrial code # 10 Euplotid nuclear code # 11 Bacterial, archaeal and plant plastid code # 12 Alternative yeast nuclear code # 13 Ascidian mitochondrial code # 14 Alternative flatworm mitochondrial code # 15 Blepharisma nuclear code # 16 Chlorophycean mitochondrial code # 21 Trematode mitochondrial code # 22 Scenedesmus obliquus mitochondrial code # 23 Thraustochytrium mitochondrial code # # return: hash # $retval{'codonseq'}: codon seq (w/o gap) # $retval{'message'}: error/warning messame (array) # $retval{'result'}: 1: OK, 2: mismatch, -1: no match found # # # 05/05/2002 Mikita Suyama # 12/07/2004 # # v2 22/04/2005 # - reverse translation table (%p2c) was replaced # - if there is no exact match, try fragment anchors # - return value is changed to hash # # v3 26/03/2006 # - codon table for vertebrate mitochondria (vmitochondria) is added # # v4 # - from: 10 residue window (incl '-') # to: 10 residue window (excl '-') # 19/06/2006 # v5 # - bug fix: # just before 'does not correspond' # /$p2c{$tmpaa}/ -> /$p2c{$tmpaa}/i # ^ # - the first Met can be (A|G|C|R)TG # 01/08/2006 # v5.1 # - but fix: # if ($tmpcodon !~ /((A|C|G|R)TG)/i) { # ^ # 2009/06/22 # v6 # - start Met -> aa-code "B" in %p2c # - NCBI codon tables # 2011/11/13 # local($pep, $nuc, $codontable) = @_; local(%p2c); local($peplen, $qcodon, $codon); local($tmpaa, $message, $modpep, @qcodon, @fncodon, $wholecodon); local($i, $j, $anclen, @anchor, $peppos, $tmpcodon, $codonpos); local(%retval); if ($codontable == 1) { #-----------# # Universal Code (transl_table=1) #-----------# %p2c = ( "B" => "((U|T|C|Y|A)(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)A(A|G|R))|((T|U)GA))", "*" => "(((U|T)A(A|G|R))|((T|U)GA))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 2) { #--------------------------# # Vertebrate Mitochondrial Code (transl_table=2) #--------------------------# %p2c = ( "B" => "((A(U|T).)|G(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "(CG.)", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y))", "_" => "(((U|T)A(A|G|R))|(AG(A|G|R)))", "*" => "(((U|T)A(A|G|R))|(AG(A|G|R)))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)(A|G|R))", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 3) { #--------------------------# # Yeast Mitochondrial Code (transl_table=3) #--------------------------# %p2c = ( "B" => "(A(U|T)(A|G|R))", "L" => "((U|T)(U|T)(A|G|R))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "((AC.)|(C(U|T).))", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y))", "_" => "((U|T)A(A|G|R))", "*" => "((U|T)A(A|G|R))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)(A|G|R))", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 4) { #-----------# # Mold, Protozoan, and Coelenterate Mitochondrial Code # and Mycoplasma/Spiroplasma Code (transl_table=4) #-----------# %p2c = ( "B" => "((A(U|T).)|((U|T)(U|T)(A|G|R))|(C(U|T)G)|(G(U|T)G))", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "((U|T)A(A|G|R))", "*" => "((U|T)A(A|G|R))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 5) { #-----------# # Invertebrate Mitochondrial Code (transl_table=5) #-----------# %p2c = ( "B" => "((A(U|T).)|((U|T|A|G|R)(U|T)G))", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "(CG.)", "S" => "(((U|T)C.)|(AG.))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y))", "_" => "((U|T)A(A|G|R))", "*" => "((U|T)A(A|G|R))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)(A|G|R))", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 6) { #-----------# # Ciliate, Dasycladacean and Hexamita Nuclear Code (transl_table=6) #-----------# %p2c = ( "B" => "(A(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "((T|U)GA)", "*" => "((T|U)GA)", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "((CA(A|G|R))|((U|T)A(A|G|R)))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 9) { #-----------# # Echinoderm and Flatworm Mitochondrial Code (transl_table=9) #-----------# %p2c = ( "B" => "((A|G|R)(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "(CG.)", "S" => "(((U|T)C.)|(AG.))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "((U|T)A(A|G|R))", "*" => "((U|T)A(A|G|R))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AAG)", "N" => "(AA(U|T|C|Y|A))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 10) { #-----------# # Euplotid Nuclear Code (transl_table=10) #-----------# %p2c = ( "B" => "(A(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)A(A|G|R))|((T|U)GA))", "*" => "(((U|T)A(A|G|R))|((T|U)GA))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 11) { #-----------# # Bacterial, Archaeal and Plant Plastid Code (transl_table=11) #-----------# %p2c = ( "B" => "((A(U|T)G)|(.(U|T)G))", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)A(A|G|R))|((T|U)GA))", "*" => "(((U|T)A(A|G|R))|((T|U)GA))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 12) { #-----------# # Alternative Yeast Nuclear Code (transl_table=12) #-----------# %p2c = ( "B" => "((A|C)(U|T)G)", "L" => "((C(U|T)(U|T|C|Y|A))|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y))|(C(U|T)G))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)A(A|G|R))|((T|U)GA))", "*" => "(((U|T)A(A|G|R))|((T|U)GA))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 13) { #-----------# # Ascidian Mitochondrial Code (transl_table=13) #-----------# %p2c = ( "B" => "((T|U|A|G|R)(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "(CG.)", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "((GG.)|(AG(A|G|R)))", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y))", "_" => "((U|T)A(A|G|R))", "*" => "((U|T)A(A|G|R))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)(A|G|R))", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 14) { #-----------# # Alternative Flatworm Mitochondrial Code (transl_table=14) #-----------# %p2c = ( "B" => "(A(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "(CG.)", "S" => "(((U|T)C.)|(AG.))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "((U|T)AG)", "*" => "((U|T)AG)", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AAG)", "N" => "(AA(U|T|C|Y|A))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y|A))", "M" => "(A(U|T)G)", "W" => "((U|T)G(G|A|R))", "X" => "...", ); } elsif ($codontable == 15) { #-----------# # Blepharisma Nuclear Code (transl_table=15) #-----------# %p2c = ( "B" => "(A(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)AA)|((T|U)GA))", "*" => "(((U|T)AA)|((T|U)GA))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "((CA(A|G|R))|((U|T)AG))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 16) { #-----------# # Chlorophycean MitochondrialCode (transl_table=16) #-----------# %p2c = ( "B" => "(A(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R))|((U|T)AG))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)AA)|((T|U)GA))", "*" => "(((U|T)AA)|((T|U)GA))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 21) { #-----------# # Trematode Mitochondrial Code (transl_table=21) #-----------# %p2c = ( "B" => "((A|G|R)(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))", "R" => "(CG.)", "S" => "(((U|T)C.)|(AG.))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y))", "_" => "((U|T)A(A|G|R))", "*" => "((U|T)A(A|G|R))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AAG)", "N" => "(AA(U|T|C|Y|A))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)(A|G|R))", "W" => "((U|T)G(A|G|R))", "X" => "...", ); } elsif ($codontable == 22) { #-----------# # Scenedesmus obliquus mitochondrial Code (transl_table=22) #-----------# %p2c = ( "B" => "(A(U|T)G)", "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R))|((T|U)AG))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C(U|T|C|Y|G))|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "((U|T)(C|A|G|R)A)", "*" => "((U|T)(C|A|G|R)A)", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } elsif ($codontable == 23) { #-----------# # Thraustochytrium Mitochondrial Code (transl_table=23) #-----------# %p2c = ( "B" => "(((A|G|R)(U|T)G)|(A(U|T)(U|T)))", "L" => "((C(U|T).)|((U|T)(U|T)G))", "R" => "((CG.)|(AG(A|G|R)))", "S" => "(((U|T)C.)|(AG(U|T|C|Y)))", "A" => "(GC.)", "G" => "(GG.)", "P" => "(CC.)", "T" => "(AC.)", "V" => "(G(U|T).)", "I" => "(A(U|T)(U|T|C|Y|A))", "_" => "(((U|T)A(A|G|R))|((T|U)GA)|((T|U)(T|U)A))", "*" => "(((U|T)A(A|G|R))|((T|U)GA)|((T|U)(T|U)A))", "C" => "((U|T)G(U|T|C|Y))", "D" => "(GA(U|T|C|Y))", "E" => "(GA(A|G|R))", "F" => "((U|T)(U|T)(U|T|C|Y))", "H" => "(CA(U|T|C|Y))", "K" => "(AA(A|G|R))", "N" => "(AA(U|T|C|Y))", "Q" => "(CA(A|G|R))", "Y" => "((U|T)A(U|T|C|Y))", "M" => "(A(U|T)G)", "W" => "((U|T)GG)", "X" => "...", ); } #---------------------------------------------------------------# # make codon sequence, $qcodon, with all possible combinations #---------------------------------------------------------------# $peplen = length($pep); undef($qcodon); foreach $i (0..$peplen - 1) { $peppos = $i + 1; $tmpaa = substr($pep, $i, 1); if ($tmpaa =~ /[ACDEFGHIKLMNPQRSTVWY_\*XU]/) { if ($qcodon !~ /\w/ && substr($pep, $i, 1) eq "M") { #---------------# # the first Met #---------------# $qcodon .= $p2c{"B"}; } else { $qcodon .= $p2c{substr($pep, $i, 1)}; } } elsif ($tmpaa =~ /\d/) { $qcodon .= "." x $tmpaa; } elsif ($tmpaa =~ /[-\.]/) { # nothing to do } else { $message = "pepAlnPos $peppos: $tmpaa unknown AA type. Taken as 'X'"; push(@{$retval{'message'}}, $message); $qcodon .= $p2c{'X'}; } } # print "$qcodon\n"; #-----------------------------# # does $nuc contain $qcodon ? #-----------------------------# if ($nuc =~ /$qcodon/i) { $codon = $&; $retval{'codonseq'} = $codon; $retval{'result'} = 1; } else { #-------------------# # make 10 aa anchor #-------------------# # undef(@{$retval{'message'}}); # $modpep = $pep; # 1 while $modpep =~ s/(.{10})(.{10,})/$1\n$2/; # @anchor = split(/\n/, $modpep); undef(@preanchor); undef($tmpanc); $nanc = 0; foreach $i (0..$peplen - 1) { $tmpaa = substr($pep, $i, 1); $tmpanc .= $tmpaa; ++$nanc if ($tmpaa !~ /-/); if ($nanc == 10 || $i == $peplen - 1) { push(@preanchor, $tmpanc); undef($tmpanc); $nanc = 0; } } undef(@anchor); $lastanchorlen = length($preanchor[-1]); if ($lastanchorlen < 10) { foreach $i (0..$#preanchor - 1) { if ($i < $#preanchor - 1) { push(@anchor, $preanchor[$i]); } else { push(@anchor, "$preanchor[$i]$preanchor[$i + 1]"); } } } else { @anchor = @preanchor; } undef($wholecodon); foreach $i (0..$#anchor) { # print " $anchor[$i]\n"; $anclen = length($anchor[$i]); undef(@qcodon); undef(@fncodon); foreach $j (0..$anclen - 1) { $peppos = $i * 10 + $j + 1; $tmpaa = substr($anchor[$i], $j, 1); if ($tmpaa =~ /[ACDEFGHIKLMNPQRSTVWY_\*XU]/) { if ($i == 0 && $qcodon[$i] !~ /\w/ && $tmpaa eq "M") { #----------------------------------# # the first Met can be AGT|GTG|CTG #----------------------------------# $qcodon[$i] .= "((A|C|G|R)TG)"; } else { $qcodon[$i] .= $p2c{$tmpaa}; } $fncodon[$i] .= $p2c{'X'}; } elsif ($tmpaa =~ /\d/) { $qcodon[$i] .= "." x $tmpaa; $fncodon[$i] .= "." x $tmpaa; } elsif ($tmpaa =~ /[-\.]/) { # nothing to do } else { #del $message = "pepAlnPos $peppos: $tmpaa unknown AA type. Replaced by 'X'"; #del push(@{$retval{'message'}}, $message); $qcodon[$i] .= $p2c{'X'}; $fncodon[$i] .= $p2c{'X'}; } } if ($nuc =~ /$qcodon[$i]/i) { $wholecodon .= $qcodon[$i]; } else { $wholecodon .= $fncodon[$i]; } } if ($nuc =~ /$wholecodon/i) { $codon = $&; $codonpos = 0; $tmpnaa = 0; foreach $i (0..$peplen - 1) { $peppos = $i + 1; $tmpaa = substr($pep, $i, 1); undef($tmpcodon); if ($tmpaa !~ /\d/ && $tmpaa !~ /-/) { ++$tmpnaa; $tmpcodon = substr($codon, $codonpos, 3); $codonpos += 3; if ($tmpnaa == 1 && $tmpaa eq "M") { if ($tmpcodon !~ /((A|C|G|R)TG)/i) { $message = "pepAlnPos $peppos: $tmpaa does not correspond to $tmpcodon"; push(@{$retval{'message'}}, $message); } } elsif ($tmpcodon !~ /$p2c{$tmpaa}/i) { $message = "pepAlnPos $peppos: $tmpaa does not correspond to $tmpcodon"; push(@{$retval{'message'}}, $message); } } elsif ($tmpaa =~ /\d/i) { $tmpcodon = substr($codon, $codonpos, $tmpaa); $codonpos += $tmpaa; } # print "$tmpaa $tmpcodon\n"; } $codon; $retval{'codonseq'} = $codon; $retval{'result'} = 2; } else { $retval{'result'} = -1; } } return(%retval); } #----------------------------------------------------------------------- sub my1while { local($seq, $wlen) = @_; local($seqlen, $tmppos, @frags); $seqlen = length($seq); $tmppos = 1; while ($tmppos <= $seqlen) { $tmpfrag = substr($seq, $tmppos - 1, $wlen); push(@frags, $tmpfrag); $tmppos += $wlen; } return(@frags); } #--------------------------------------------------------- sub showhelp { print STDERR<